From 1a878af7fdb94c7248e8d75d2262f5a738acd24f Mon Sep 17 00:00:00 2001 From: Amin Abdulrahman Date: Tue, 5 Mar 2024 13:12:33 +0100 Subject: [PATCH 01/18] Dilithium invNTT tests --- .../ntt_dilithium/intt_dilithium_1234_5678.s | 1 + .../intt_dilithium_1234_5678_twiddles.s | 1 + .../ntt_dilithium/intt_dilithium_123_45678.s | 1 + .../intt_dilithium_123_456_78_twiddles.s | 1 + tests/ntt_dilithium/main.c | 23 +- .../manual/intt_dilithium_1234_5678.s | 514 ++++++++++++++++ .../intt_dilithium_1234_5678_twiddles.s | 541 +++++++++++++++++ .../manual/intt_dilithium_123_45678.s | 523 ++++++++++++++++ .../intt_dilithium_123_456_78_twiddles.s | 557 ++++++++++++++++++ .../manual/ntt_dilithium_1234_5678.s | 16 +- 10 files changed, 2164 insertions(+), 14 deletions(-) create mode 120000 asm/manual/ntt_dilithium/intt_dilithium_1234_5678.s create mode 120000 asm/manual/ntt_dilithium/intt_dilithium_1234_5678_twiddles.s create mode 120000 asm/manual/ntt_dilithium/intt_dilithium_123_45678.s create mode 120000 asm/manual/ntt_dilithium/intt_dilithium_123_456_78_twiddles.s create mode 100644 tests/ntt_dilithium/manual/intt_dilithium_1234_5678.s create mode 100644 tests/ntt_dilithium/manual/intt_dilithium_1234_5678_twiddles.s create mode 100644 tests/ntt_dilithium/manual/intt_dilithium_123_45678.s create mode 100644 tests/ntt_dilithium/manual/intt_dilithium_123_456_78_twiddles.s diff --git a/asm/manual/ntt_dilithium/intt_dilithium_1234_5678.s b/asm/manual/ntt_dilithium/intt_dilithium_1234_5678.s new file mode 120000 index 0000000..6b61898 --- /dev/null +++ b/asm/manual/ntt_dilithium/intt_dilithium_1234_5678.s @@ -0,0 +1 @@ +../../../slothy/examples/naive/aarch64/intt_dilithium_1234_5678.s \ No newline at end of file diff --git a/asm/manual/ntt_dilithium/intt_dilithium_1234_5678_twiddles.s b/asm/manual/ntt_dilithium/intt_dilithium_1234_5678_twiddles.s new file mode 120000 index 0000000..07d8e03 --- /dev/null +++ b/asm/manual/ntt_dilithium/intt_dilithium_1234_5678_twiddles.s @@ -0,0 +1 @@ 
+../../../slothy/examples/naive/aarch64/intt_dilithium_1234_5678_twiddles.s \ No newline at end of file diff --git a/asm/manual/ntt_dilithium/intt_dilithium_123_45678.s b/asm/manual/ntt_dilithium/intt_dilithium_123_45678.s new file mode 120000 index 0000000..a88552d --- /dev/null +++ b/asm/manual/ntt_dilithium/intt_dilithium_123_45678.s @@ -0,0 +1 @@ +../../../slothy/examples/naive/aarch64/intt_dilithium_123_45678.s \ No newline at end of file diff --git a/asm/manual/ntt_dilithium/intt_dilithium_123_456_78_twiddles.s b/asm/manual/ntt_dilithium/intt_dilithium_123_456_78_twiddles.s new file mode 120000 index 0000000..9bbf71b --- /dev/null +++ b/asm/manual/ntt_dilithium/intt_dilithium_123_456_78_twiddles.s @@ -0,0 +1 @@ +../../../slothy/examples/naive/aarch64/intt_dilithium_123_456_78_twiddles.s \ No newline at end of file diff --git a/tests/ntt_dilithium/main.c b/tests/ntt_dilithium/main.c index 3055725..2e433d9 100644 --- a/tests/ntt_dilithium/main.c +++ b/tests/ntt_dilithium/main.c @@ -45,6 +45,8 @@ void ntt_dilithium_123_45678_w_scalar(int32_t *); void ntt_dilithium_123_45678_manual_st4(int32_t *); void ntt_dilithium_1234_5678(int32_t *); void ntt_dilithium_1234_5678_manual_st4(int32_t *); +void intt_dilithium_1234_5678(int32_t *); +void intt_dilithium_123_45678(int32_t *); // A55 void ntt_dilithium_123_45678_opt_a55(int32_t *); void ntt_dilithium_123_45678_manual_st4_opt_a55(int32_t *); @@ -197,7 +199,7 @@ static void ntt_u32_C(int32_t *a){ * @param q modulus * @return int 1 if there is an error, 0 otherwise */ -/*static int precomp_gs_negacyclic(T *twiddles, size_t n, T root, T q){ +static int precomp_gs_negacyclic(T *twiddles, size_t n, T root, T q){ //powers = [pow(root, -(i+1), q) for i in range(n)] T powers[n]; T rootInverse = base_root_inv; @@ -210,7 +212,7 @@ static void ntt_u32_C(int32_t *a){ twiddles[i] = powers[i]; } return 0; -}*/ +} /** * @brief Computes a Gentleman--Sande inverse FFT @@ -231,13 +233,13 @@ static void ntt_u32_C(int32_t *a){ * @param n 
size of the input * @param q modulus */ -/*static void invntt_u32_tomont_C(T *a){ +static void invntt_u32_tomont_C(T *a){ size_t logn = log2(NTT_SIZE); precomp_gs_negacyclic(roots, NTT_SIZE, base_root, modulus); int32_t *twiddles = roots; // printf("\n"); for(size_t i=0; i < logn; i++){ - // printf("layer: %ld\n", i+1); + // printf("layer: %ld\n", i+1); size_t distance = 1< + +// NOTE +// We use a lot of trivial macros to simplify the parsing burden for Slothy +// The macros are not unfolded by Slothy and thus interpreted as instructions, +// which are easier to parse due to e.g. the lack of size specifiers and simpler +// syntax for pre and post increment for loads and stores. +// +// Eventually, NeLight should include a proper parser for AArch64, +// but for initial investigations, the below is enough. + +.macro ldr_vo vec, base, offset + ldr qform_\vec, [\base, #\offset] +.endm +.macro ldr_vi vec, base, inc + ldr qform_\vec, [\base], #\inc +.endm +.macro str_vo vec, base, offset + str qform_\vec, [\base, #\offset] +.endm +.macro str_vi vec, base, inc + str qform_\vec, [\base], #\inc +.endm +.macro vsub d,a,b + sub \d\().4s, \a\().4s, \b\().4s +.endm +.macro vadd d,a,b + add \d\().4s, \a\().4s, \b\().4s +.endm +.macro vqrdmulh d,a,b + sqrdmulh \d\().4s, \a\().4s, \b\().4s +.endm +.macro vmul d,a,b + mul \d\().4s, \a\().4s, \b\().4s +.endm +.macro vmls d,a,b + mls \d\().4s, \a\().4s, \b\().4s +.endm +.macro vqrdmulhq d,a,b,i + sqrdmulh \d\().4s, \a\().4s, \b\().s[\i] +.endm +.macro vmulq d,a,b,i + mul \d\().4s, \a\().4s, \b\().s[\i] +.endm +.macro vmlsq d,a,b,i + mls \d\().4s, \a\().4s, \b\().s[\i] +.endm +.macro trn1_d d,a,b + trn1 \d\().2d, \a\().2d, \b\().2d +.endm +.macro trn2_d d,a,b + trn2 \d\().2d, \a\().2d, \b\().2d +.endm +.macro trn1_s d,a,b + trn1 \d\().4s, \a\().4s, \b\().4s +.endm +.macro trn2_s d,a,b + trn2 \d\().4s, \a\().4s, \b\().4s +.endm + +.macro mulmodq dst, src, const, idx0, idx1 + vmulq \dst, \src, \const, \idx0 + vqrdmulhq \src, \src, 
\const, \idx1 + vmls \dst, \src, modulus +.endm + +.macro mulmod dst, src, const, const_twisted + vmul \dst, \src, \const + vqrdmulh \src, \src, \const_twisted + vmls \dst, \src, modulus +.endm + +.macro montg_reduce a + srshr tmp.4S, \a\().4S, #23 + vmls \a, tmp, modulus +.endm + +.macro canonical_reduce a, modulus_half, neg_modulus_half, tmp1, tmp2 + cmge \tmp1\().4s, \neg_modulus_half\().4s, \a\().4s + cmge \tmp2\().4s, \a\().4s, \modulus_half\().4s + sub \tmp2\().4s, \tmp1\().4s, \tmp2\().4s + vmls \a, \tmp2, modulus +.endm + +.macro gs_butterfly a, b, root, idx0, idx1 + vsub tmp, \a, \b + vadd \a, \a, \b + mulmodq \b, tmp, \root, \idx0, \idx1 +.endm + +.macro mulmod_v dst, src, const, const_twisted + vmul \dst, \src, \const + vqrdmulh \src, \src, \const_twisted + vmls \dst, \src, modulus +.endm + +.macro gs_butterfly_v a, b, root, root_twisted + vsub tmp, \a, \b + vadd \a, \a, \b + mulmod \b, tmp, \root, \root_twisted +.endm + +.macro mul_ninv dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, src0, src1, src2, src3, src4, src5, src6, src7 + mulmod \dst0, \src0, ninv, ninv_tw + mulmod \dst1, \src1, ninv, ninv_tw + mulmod \dst2, \src2, ninv, ninv_tw + mulmod \dst3, \src3, ninv, ninv_tw + mulmod \dst4, \src4, ninv, ninv_tw + mulmod \dst5, \src5, ninv, ninv_tw + mulmod \dst6, \src6, ninv, ninv_tw + mulmod \dst7, \src7, ninv, ninv_tw +.endm + +.macro load_roots_1234 r_ptr + ldr_vi root0, \r_ptr, (8*16) + ldr_vo root1, \r_ptr, (-8*16 + 1*16) + ldr_vo root2, \r_ptr, (-8*16 + 2*16) + ldr_vo root3, \r_ptr, (-8*16 + 3*16) + ldr_vo root4, \r_ptr, (-8*16 + 4*16) + ldr_vo root5, \r_ptr, (-8*16 + 5*16) + ldr_vo root6, \r_ptr, (-8*16 + 6*16) + ldr_vo root7, \r_ptr, (-8*16 + 7*16) +.endm + +.macro load_next_roots_56 root0, r_ptr0 + ldr_vi \root0, \r_ptr0, 16 +.endm + +.macro load_next_roots_6 root0, r_ptr0 + ldr_vi \root0, \r_ptr0, 8 +.endm + +.macro load_next_roots_78 root0, root0_tw, root1, root1_tw, root2, root2_tw, r_ptr1 + ldr_vi \root0, \r_ptr1, (6*16) + ldr_vo \root0_tw, 
\r_ptr1, (-6*16 + 1*16) + ldr_vo \root1, \r_ptr1, (-6*16 + 2*16) + ldr_vo \root1_tw, \r_ptr1, (-6*16 + 3*16) + ldr_vo \root2, \r_ptr1, (-6*16 + 4*16) + ldr_vo \root2_tw, \r_ptr1, (-6*16 + 5*16) +.endm + +.macro transpose4 data + trn1_s t0, \data\()0, \data\()1 + trn2_s t1, \data\()0, \data\()1 + trn1_s t2, \data\()2, \data\()3 + trn2_s t3, \data\()2, \data\()3 + + trn2_d \data\()2, t0, t2 + trn2_d \data\()3, t1, t3 + trn1_d \data\()0, t0, t2 + trn1_d \data\()1, t1, t3 +.endm + +.macro save_gprs // slothy:no-unfold + sub sp, sp, #(16*6) + stp x19, x20, [sp, #16*0] + stp x19, x20, [sp, #16*0] + stp x21, x22, [sp, #16*1] + stp x23, x24, [sp, #16*2] + stp x25, x26, [sp, #16*3] + stp x27, x28, [sp, #16*4] + str x29, [sp, #16*5] +.endm + +.macro restore_gprs // slothy:no-unfold + ldp x19, x20, [sp, #16*0] + ldp x21, x22, [sp, #16*1] + ldp x23, x24, [sp, #16*2] + ldp x25, x26, [sp, #16*3] + ldp x27, x28, [sp, #16*4] + ldr x29, [sp, #16*5] + add sp, sp, #(16*6) +.endm + +.macro save_vregs // slothy:no-unfold + sub sp, sp, #(16*4) + stp d8, d9, [sp, #16*0] + stp d10, d11, [sp, #16*1] + stp d12, d13, [sp, #16*2] + stp d14, d15, [sp, #16*3] +.endm + +.macro restore_vregs // slothy:no-unfold + ldp d8, d9, [sp, #16*0] + ldp d10, d11, [sp, #16*1] + ldp d12, d13, [sp, #16*2] + ldp d14, d15, [sp, #16*3] + add sp, sp, #(16*4) +.endm + +#define STACK_SIZE 16 +#define STACK0 0 + +.macro restore a, loc // slothy:no-unfold + ldr \a, [sp, #\loc\()] +.endm +.macro save loc, a // slothy:no-unfold + str \a, [sp, #\loc\()] +.endm +.macro push_stack // slothy:no-unfold + save_gprs + save_vregs + sub sp, sp, #STACK_SIZE +.endm + +.macro pop_stack // slothy:no-unfold + add sp, sp, #STACK_SIZE + restore_vregs + restore_gprs +.endm + +.data +.p2align 4 +roots: +#include "intt_dilithium_1234_5678_twiddles.s" +.text + + .global intt_dilithium_1234_5678 + .global _intt_dilithium_1234_5678 + +.p2align 4 +modulus_addr: .quad 8380417 +ninv_addr: .quad 16382 +ninv_tw_addr: .quad 4197891 
+intt_dilithium_1234_5678: +_intt_dilithium_1234_5678: + push_stack + + inp .req x0 + in .req x1 + count .req x2 + r_ptr0 .req x3 + r_ptr1 .req x4 + xtmp .req x5 + + data0 .req v8 + data1 .req v9 + data2 .req v10 + data3 .req v11 + data4 .req v12 + data5 .req v13 + data6 .req v14 + data7 .req v15 + data8 .req v16 + data9 .req v17 + data10 .req v18 + data11 .req v19 + data12 .req v20 + data13 .req v21 + data14 .req v22 + data15 .req v23 + + qform_data0 .req q8 + qform_data1 .req q9 + qform_data2 .req q10 + qform_data3 .req q11 + qform_data4 .req q12 + qform_data5 .req q13 + qform_data6 .req q14 + qform_data7 .req q15 + qform_data8 .req q16 + qform_data9 .req q17 + qform_data10 .req q18 + qform_data11 .req q19 + qform_data12 .req q20 + qform_data13 .req q21 + qform_data14 .req q22 + qform_data15 .req q23 + + root0 .req v0 + root1 .req v1 + root2 .req v2 + root3 .req v3 + root0_tw .req v4 + root1_tw .req v5 + root2_tw .req v6 + root3_tw .req v7 + + + qform_root0 .req q0 + qform_root1 .req q1 + qform_root2 .req q2 + qform_root3 .req q3 + qform_root0_tw .req q4 + qform_root1_tw .req q5 + qform_root2_tw .req q6 + qform_root3_tw .req q7 + + + tmp .req v24 + qform_tmp .req q24 + t0 .req v25 + t1 .req v26 + t2 .req v27 + t3 .req v28 + + modulus .req v29 + + ASM_LOAD(r_ptr0, roots) + ASM_LOAD(r_ptr1, roots_l45) + + ASM_LOAD(xtmp, modulus_addr) + ld1r {modulus.4s}, [xtmp] + + save STACK0, inp + + mov count, #16 + + .p2align 2 +layer5678_start: + // manual_ld4 + // ldr_vo data0, inp, (16*0) + // ldr_vo data1, inp, (16*1) + // ldr_vo data2, inp, (16*2) + // ldr_vo data3, inp, (16*3) + // transpose4 data + + ld4 {data0.4S, data1.4S, data2.4S, data3.4S}, [inp] + + load_next_roots_78 root0, root0_tw, root1, root1_tw, root2, root2_tw, r_ptr0 + + gs_butterfly_v data0, data1, root1, root1_tw + gs_butterfly_v data2, data3, root2, root2_tw + gs_butterfly_v data0, data2, root0, root0_tw + gs_butterfly_v data1, data3, root0, root0_tw + + transpose4 data + + load_next_roots_6 root1, 
r_ptr1 + load_next_roots_56 root0, r_ptr1 + + gs_butterfly data0, data1, root0, 0, 1 + gs_butterfly data2, data3, root0, 2, 3 + gs_butterfly data0, data2, root1, 0, 1 + gs_butterfly data1, data3, root1, 0, 1 + + montg_reduce data0 + montg_reduce data1 + + str_vi data0, inp, (16*4) + str_vo data1, inp, (-16*4 + 1*16) + str_vo data2, inp, (-16*4 + 2*16) + str_vo data3, inp, (-16*4 + 3*16) +// layer5678_end: + subs count, count, #1 + cbnz count, layer5678_start + + .unreq root0_tw + .unreq root1_tw + .unreq root2_tw + .unreq root3_tw + .unreq qform_root0_tw + .unreq qform_root1_tw + .unreq qform_root2_tw + .unreq qform_root3_tw + .unreq t0 + .unreq t1 + + root4 .req v4 + root5 .req v5 + root6 .req v6 + root7 .req v7 + qform_root4 .req q4 + qform_root5 .req q5 + qform_root6 .req q6 + qform_root7 .req q7 + ninv .req v25 + ninv_tw .req v26 + modulus_half .req v30 + neg_modulus_half .req v31 + + + restore in, STACK0 + mov count, #4 + + ASM_LOAD(xtmp, ninv_addr) + ld1r {ninv.4s}, [xtmp] + ASM_LOAD(xtmp, ninv_tw_addr) + ld1r {ninv_tw.4s}, [xtmp] + + ushr modulus_half.4S, modulus.4S, #1 + neg neg_modulus_half.4S, modulus_half.4S + + load_roots_1234 r_ptr1 + + .p2align 2 +layer1234_start: + ldr_vo data0, in, 0 + ldr_vo data1, in, (1*(512/8)) + ldr_vo data2, in, (2*(512/8)) + ldr_vo data3, in, (3*(512/8)) + ldr_vo data4, in, (4*(512/8)) + ldr_vo data5, in, (5*(512/8)) + ldr_vo data6, in, (6*(512/8)) + ldr_vo data7, in, (7*(512/8)) + ldr_vo data8, in, (8*(512/8)) + ldr_vo data9, in, (9*(512/8)) + ldr_vo data10, in, (10*(512/8)) + ldr_vo data11, in, (11*(512/8)) + ldr_vo data12, in, (12*(512/8)) + ldr_vo data13, in, (13*(512/8)) + ldr_vo data14, in, (14*(512/8)) + ldr_vo data15, in, (15*(512/8)) + + // layer4 + gs_butterfly data0, data1, root3, 2, 3 + gs_butterfly data2, data3, root4, 0, 1 + gs_butterfly data4, data5, root4, 2, 3 + gs_butterfly data6, data7, root5, 0, 1 + gs_butterfly data8, data9, root5, 2, 3 + gs_butterfly data10, data11, root6, 0, 1 + gs_butterfly data12, 
data13, root6, 2, 3 + gs_butterfly data14, data15, root7, 0, 1 + + // layer3 + gs_butterfly data0, data2, root1, 2, 3 + gs_butterfly data1, data3, root1, 2, 3 + gs_butterfly data4, data6, root2, 0, 1 + gs_butterfly data5, data7, root2, 0, 1 + gs_butterfly data8, data10, root2, 2, 3 + gs_butterfly data9, data11, root2, 2, 3 + gs_butterfly data12, data14, root3, 0, 1 + gs_butterfly data13, data15, root3, 0, 1 + + // layer2 + gs_butterfly data0, data4, root0, 2, 3 + gs_butterfly data1, data5, root0, 2, 3 + gs_butterfly data2, data6, root0, 2, 3 + gs_butterfly data3, data7, root0, 2, 3 + gs_butterfly data8, data12, root1, 0, 1 + gs_butterfly data9, data13, root1, 0, 1 + gs_butterfly data10, data14, root1, 0, 1 + gs_butterfly data11, data15, root1, 0, 1 + + // layer 1 + gs_butterfly data0, data8, root0, 0, 1 + gs_butterfly data1, data9, root0, 0, 1 + gs_butterfly data2, data10, root0, 0, 1 + gs_butterfly data3, data11, root0, 0, 1 + gs_butterfly data4, data12, root0, 0, 1 + gs_butterfly data5, data13, root0, 0, 1 + gs_butterfly data6, data14, root0, 0, 1 + gs_butterfly data7, data15, root0, 0, 1 + + canonical_reduce data8, modulus_half, neg_modulus_half, t2, t3 + canonical_reduce data9, modulus_half, neg_modulus_half, t2, t3 + canonical_reduce data10, modulus_half, neg_modulus_half, t2, t3 + canonical_reduce data11, modulus_half, neg_modulus_half, t2, t3 + canonical_reduce data12, modulus_half, neg_modulus_half, t2, t3 + canonical_reduce data13, modulus_half, neg_modulus_half, t2, t3 + canonical_reduce data14, modulus_half, neg_modulus_half, t2, t3 + canonical_reduce data15, modulus_half, neg_modulus_half, t2, t3 + + str_vo data8, in, (8*(512/8)) + str_vo data9, in, (9*(512/8)) + str_vo data10, in, (10*(512/8)) + str_vo data11, in, (11*(512/8)) + str_vo data12, in, (12*(512/8)) + str_vo data13, in, (13*(512/8)) + str_vo data14, in, (14*(512/8)) + str_vo data15, in, (15*(512/8)) + + mul_ninv data8, data9, data10, data11, data12, data13, data14, data15, data0, data1, 
data2, data3, data4, data5, data6, data7 + + canonical_reduce data8, modulus_half, neg_modulus_half, t2, t3 + canonical_reduce data9, modulus_half, neg_modulus_half, t2, t3 + canonical_reduce data10, modulus_half, neg_modulus_half, t2, t3 + canonical_reduce data11, modulus_half, neg_modulus_half, t2, t3 + canonical_reduce data12, modulus_half, neg_modulus_half, t2, t3 + canonical_reduce data13, modulus_half, neg_modulus_half, t2, t3 + canonical_reduce data14, modulus_half, neg_modulus_half, t2, t3 + canonical_reduce data15, modulus_half, neg_modulus_half, t2, t3 + + str_vi data8, in, (16) + str_vo data9, in, (-16 + 1*(512/8)) + str_vo data10, in, (-16 + 2*(512/8)) + str_vo data11, in, (-16 + 3*(512/8)) + str_vo data12, in, (-16 + 4*(512/8)) + str_vo data13, in, (-16 + 5*(512/8)) + str_vo data14, in, (-16 + 6*(512/8)) + str_vo data15, in, (-16 + 7*(512/8)) + +// layer1234_end: + subs count, count, #1 + cbnz count, layer1234_start + + pop_stack + ret diff --git a/tests/ntt_dilithium/manual/intt_dilithium_1234_5678_twiddles.s b/tests/ntt_dilithium/manual/intt_dilithium_1234_5678_twiddles.s new file mode 100644 index 0000000..3589db6 --- /dev/null +++ b/tests/ntt_dilithium/manual/intt_dilithium_1234_5678_twiddles.s @@ -0,0 +1,541 @@ + +/// +/// Copyright (c) 2022 Arm Limited +/// Copyright (c) 2022 Hanno Becker +/// Copyright (c) 2023 Amin Abdulrahman, Matthias Kannwischer +/// SPDX-License-Identifier: MIT +/// +/// Permission is hereby granted, free of charge, to any person obtaining a copy +/// of this software and associated documentation files (the "Software"), to deal +/// in the Software without restriction, including without limitation the rights +/// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +/// copies of the Software, and to permit persons to whom the Software is +/// furnished to do so, subject to the following conditions: +/// +/// The above copyright notice and this permission notice shall be included in all +/// copies or 
substantial portions of the Software. +/// +/// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +/// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +/// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +/// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +/// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +/// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +/// SOFTWARE. +/// + +roots_l67: +.word -1744507 +.word 2236726 +.word 1922253 +.word 3818627 +.word -447030292 +.word 573161516 +.word 492577742 +.word 978523985 +.word 731434 +.word 781875 +.word 3773731 +.word -3531229 +.word 187430119 +.word 200355636 +.word 967019376 +.word -904878186 +.word -1054478 +.word -1900052 +.word 3974485 +.word 303005 +.word -270210213 +.word -486888731 +.word 1018462631 +.word 77645096 +.word 2354215 +.word -1011223 +.word 327848 +.word -348812 +.word 603268097 +.word -259126110 +.word 84011120 +.word -89383150 +.word 392707 +.word 1716814 +.word 2193087 +.word -3123762 +.word 100631253 +.word 439933955 +.word 561979013 +.word -800464680 +.word -2926054 +.word 3014420 +.word -2358373 +.word 2185084 +.word -749801963 +.word 772445769 +.word -604333585 +.word 559928242 +.word 459163 +.word 653275 +.word -2312838 +.word 3467665 +.word 117660617 +.word 167401858 +.word -592665232 +.word 888589898 +.word 1514152 +.word -3430436 +.word 553718 +.word 1103344 +.word 388001774 +.word -879049958 +.word 141890356 +.word 282732136 +.word -140244 +.word -860144 +.word -508145 +.word -3105558 +.word -35937555 +.word -220412084 +.word -130212265 +.word -795799901 +.word 2778788 +.word -2683270 +.word 2775755 +.word -1356448 +.word 712065019 +.word -687588511 +.word 711287812 +.word -347590090 +.word 770441 +.word -214880 +.word -3020393 +.word 11879 +.word 197425671 +.word -55063046 +.word -773976352 +.word 3043996 +.word -545376 
+.word -3363542 +.word 1370517 +.word -3994671 +.word -139752717 +.word -861908357 +.word 351195274 +.word -1023635298 +.word -3374250 +.word -2925816 +.word 1226661 +.word -3901472 +.word -864652284 +.word -749740976 +.word 314332144 +.word -999753034 +.word 3369273 +.word -2028038 +.word -1723229 +.word -2569011 +.word 863376927 +.word -519685171 +.word -441577800 +.word -658309618 +.word -1163598 +.word -1665318 +.word 1615530 +.word -3980599 +.word -298172236 +.word -426738094 +.word 413979908 +.word -1020029345 +.word -621164 +.word -3035980 +.word -2461387 +.word 1317678 +.word -159173408 +.word -777970524 +.word -630730945 +.word 337655269 +.word 4022750 +.word -4148469 +.word -3009748 +.word 338420 +.word 1030830548 +.word -1063046068 +.word -771248568 +.word 86720197 +.word -749577 +.word 2612853 +.word -2647994 +.word 3033742 +.word -192079267 +.word 669544140 +.word -678549029 +.word 777397036 +.word 2362063 +.word 1300016 +.word 4182915 +.word -3482206 +.word 605279149 +.word 333129378 +.word 1071872863 +.word -892316032 +.word 1834526 +.word 1187885 +.word 1393159 +.word -1994046 +.word 470097680 +.word 304395785 +.word 356997292 +.word -510974714 +.word 724804 +.word -507927 +.word -2491325 +.word 1476985 +.word 185731180 +.word -130156402 +.word -638402564 +.word 378477722 +.word 2254727 +.word 2391089 +.word -1787943 +.word 2579253 +.word 577774276 +.word 612717067 +.word -458160776 +.word 660934133 +.word 2743411 +.word 1179613 +.word 2033807 +.word -2105286 +.word 702999655 +.word 302276083 +.word 521163479 +.word -539479988 +.word -527981 +.word -586241 +.word 2374402 +.word 1623354 +.word -135295244 +.word -150224382 +.word 608441020 +.word 415984810 +.word -3258457 +.word 3250154 +.word -235407 +.word -1736313 +.word -834980303 +.word 832852657 +.word -60323094 +.word -444930577 +.word 2178965 +.word 1879878 +.word 3472069 +.word 1921994 +.word 558360247 +.word 481719139 +.word 889718424 +.word 492511373 +.word 818761 +.word -2039144 +.word 
-4040196 +.word 458740 +.word 209807681 +.word -522531086 +.word -1035301089 +.word 117552223 +.word 3197248 +.word -1987814 +.word 3488383 +.word 4166425 +.word 819295484 +.word -509377762 +.word 893898890 +.word 1067647297 +.word 2218467 +.word -613238 +.word -2513018 +.word -141835 +.word 568482643 +.word -157142369 +.word -643961400 +.word -36345249 +.word 1310261 +.word 1354892 +.word 89301 +.word -2998219 +.word 335754661 +.word 347191365 +.word 22883400 +.word -768294260 +.word 3334383 +.word -2462444 +.word -169688 +.word 565603 +.word 854436357 +.word -631001801 +.word -43482586 +.word 144935890 +.word 12417 +.word -2642980 +.word 3838479 +.word -2296099 +.word 3181859 +.word -677264190 +.word 983611064 +.word -588375860 +.word -1254190 +.word -3195676 +.word -1239911 +.word -3747250 +.word -321386456 +.word -818892658 +.word -317727459 +.word -960233614 +.word 2962264 +.word -1148858 +.word -482649 +.word -1528066 +.word 759080783 +.word -294395108 +.word -123678909 +.word -391567239 +.word 3180456 +.word 3611750 +.word 1727088 +.word 1772588 +.word 814992530 +.word 925511710 +.word 442566669 +.word 454226054 +.word 268456 +.word -2387513 +.word -2192938 +.word 4146264 +.word 68791907 +.word -611800717 +.word -561940831 +.word 1062481036 +.word -4158088 +.word 1109516 +.word 2983781 +.word -2811291 +.word -1065510939 +.word 284313712 +.word 764594519 +.word -720393920 +.word 2455377 +.word -635956 +.word 3768948 +.word 3410568 +.word 629190881 +.word -162963861 +.word 965793731 +.word 873958779 +.word 250446 +.word 3551006 +.word -2678278 +.word 1685153 +.word 64176841 +.word 909946047 +.word -686309310 +.word 431820817 +.word 3815725 +.word -1937570 +.word -2028118 +.word -2508980 +.word 977780347 +.word -496502727 +.word -519705671 +.word -642926661 +.word 3759465 +.word -1596822 +.word 2454145 +.word -822541 +.word 963363710 +.word -409185979 +.word 628875181 +.word -210776307 +.word 3956944 +.word 1979497 +.word -1009365 +.word 27812 +.word 1013967746 
+.word 507246529 +.word -258649997 +.word 7126831 +.word 274060 +.word 3121440 +.word 3222807 +.word -4183372 +.word 70227934 +.word 799869667 +.word 825844983 +.word -1071989969 +.word 3716946 +.word 2296397 +.word 3965306 +.word -87208 +.word 952468207 +.word 588452222 +.word 1016110510 +.word -22347069 +.word 3284915 +.word 3956745 +.word -636927 +.word -1182243 +.word 841760171 +.word 1013916752 +.word -163212680 +.word -302950022 +.word -3852015 +.word 2635473 +.word -1277625 +.word -3073009 +.word -987079667 +.word 675340520 +.word -327391679 +.word -787459213 +.word -2772600 +.word 1780227 +.word 1455890 +.word 1935420 +.word -710479343 +.word 456183549 +.word 373072124 +.word 495951789 +.word 59148 +.word -2660408 +.word 2659525 +.word -1753 +.word 15156688 +.word -681730119 +.word 681503850 +.word -449207 +roots_l45: +.word -2283733 +.word -585207070 +.word -1858416 +.word -476219497 +.word -3345963 +.word -857403734 +.word -2815639 +.word -721508096 +.word -1853806 +.word -475038184 +.word -2917338 +.word -747568486 +.word 3585098 +.word 918682129 +.word -3870317 +.word -991769559 +.word -556856 +.word -142694469 +.word 642628 +.word 164673562 +.word -3192354 +.word -818041395 +.word 2897314 +.word 742437332 +.word -1460718 +.word -374309300 +.word 3950053 +.word 1012201926 +.word 1716988 +.word 439978542 +.word -2453983 +.word -628833668 +.word 1935799 +.word 496048908 +.word -3756790 +.word -962678241 +.word -1714295 +.word -439288460 +.word 3574466 +.word 915957677 +.word 817536 +.word 209493775 +.word 3227876 +.word 827143915 +.word -1759347 +.word -450833045 +.word -3415069 +.word -875112161 +.word 1335936 +.word 342333886 +.word -2156050 +.word -552488273 +.word -3241972 +.word -830756018 +.word -676590 +.word -173376332 +.word 4018989 +.word 1029866791 +.word -2071829 +.word -530906624 +.word 434125 +.word 111244624 +.word 3506380 +.word 898510625 +.word -1095468 +.word -280713909 +.word 3524442 +.word 903139016 +.word -928749 +.word -237992130 
+.word -394148 +.word -101000509 +.word 1674615 +.word 429120452 +.word -1159875 +.word -297218217 +.word -3704823 +.word -949361686 +.word -2663378 +.word -682491182 +.word -2101410 +.word -538486762 +.word 3110818 +.word 797147778 +.word 4063053 +.word 1041158200 +.word 3586446 +.word 919027554 +.word -2740543 +.word -702264730 +.word 3370349 +.word 863652652 +.word -3182878 +.word -815613168 +.word -3602218 +.word -923069133 +roots_l0123: +.word -294725 +.word -75523344 +.word -3761513 +.word -963888510 +.word -3765607 +.word -964937599 +.word 3201430 +.word 820367122 +.word 3145678 +.word 806080660 +.word 2883726 +.word 738955404 +.word 3201494 +.word 820383522 +.word 1221177 +.word 312926867 +.word -557458 +.word -142848732 +.word 1005239 +.word 257592709 +.word -3764867 +.word -964747974 +.word -2129892 +.word -545785280 +.word -2682288 +.word -687336873 +.word -3542485 +.word -907762539 +.word 601683 +.word 154181397 +.word 0 +.word 0 \ No newline at end of file diff --git a/tests/ntt_dilithium/manual/intt_dilithium_123_45678.s b/tests/ntt_dilithium/manual/intt_dilithium_123_45678.s new file mode 100644 index 0000000..8b85eb4 --- /dev/null +++ b/tests/ntt_dilithium/manual/intt_dilithium_123_45678.s @@ -0,0 +1,523 @@ + +// Needed to provide ASM_LOAD directive +#include + +// NOTE +// We use a lot of trivial macros to simplify the parsing burden for Slothy +// The macros are not unfolded by Slothy and thus interpreted as instructions, +// which are easier to parse due to e.g. the lack of size specifiers and simpler +// syntax for pre and post increment for loads and stores. +// +// Eventually, NeLight should include a proper parser for AArch64, +// but for initial investigations, the below is enough. 
+xtmp0 .req x10 +xtmp1 .req x11 + +.macro ldr_vo vec, base, offset + ldr qform_\vec, [\base, #\offset] +.endm + +.macro ldr_vi vec, base, inc + ldr qform_\vec, [\base], #\inc +.endm + +.macro str_vo vec, base, offset + str qform_\vec, [\base, #\offset] +.endm +.macro str_vi vec, base, inc + str qform_\vec, [\base], #\inc +.endm +.macro vqrdmulh d,a,b + sqrdmulh \d\().4s, \a\().4s, \b\().4s +.endm +.macro vmls d,a,b + mls \d\().4s, \a\().4s, \b\().4s +.endm +.macro vqrdmulhq d,a,b,i + sqrdmulh \d\().4s, \a\().4s, \b\().s[\i] +.endm +.macro vqdmulhq d,a,b,i + sqdmulh \d\().4s, \a\().4s, \b\().4s[\i] +.endm +.macro vmulq d,a,b,i + mul \d\().4s, \a\().4s, \b\().s[\i] +.endm +.macro vmlsq d,a,b,i + mls \d\().4s, \a\().4s, \b\().s[\i] +.endm + +.macro mulmodq dst, src, const, idx0, idx1 + vmulq \dst, \src, \const, \idx0 + vqrdmulhq \src, \src, \const, \idx1 + vmlsq \dst, \src, consts, 0 +.endm + +.macro mulmod dst, src, const, const_twisted + mul \dst\().4s, \src\().4s, \const\().4s + vqrdmulh \src, \src, \const_twisted + vmlsq \dst, \src, consts, 0 +.endm + +.macro montg_reduce a + srshr tmp.4S, \a\().4S, #23 + vmls \a, tmp, consts +.endm + +.macro canonical_reduce a, modulus_half, neg_modulus_half, tmp1, tmp2 + cmge \tmp1\().4s, \neg_modulus_half\().4s, \a\().4s + cmge \tmp2\().4s, \a\().4s, \modulus_half\().4s + sub \tmp2\().4s, \tmp1\().4s, \tmp2\().4s + vmls \a, \tmp2, modulus +.endm + +.macro gs_butterfly a, b, root, idx0, idx1 + sub tmp.4s, \a\().4s, \b\().4s + add \a\().4s, \a\().4s, \b\().4s + mulmodq \b, tmp, \root, \idx0, \idx1 +.endm + +.macro mulmod_v dst, src, const, const_twisted + vmul \dst, \src, \const + vqrdmulh \src, \src, \const_twisted + vmls \dst, \src, modulus +.endm + +.macro gs_butterfly_v a, b, root, root_twisted + sub tmp.4s, \a\().4s, \b\().4s + add \a\().4s, \a\().4s, \b\().4s + mulmod \b, tmp, \root, \root_twisted +.endm + +.macro mul_ninv dst0, dst1, dst2, dst3, src0, src1, src2, src3 + mulmod \dst0, \src0, ninv, ninv_tw + mulmod \dst1, 
\src1, ninv, ninv_tw + mulmod \dst2, \src2, ninv, ninv_tw + mulmod \dst3, \src3, ninv, ninv_tw +.endm + +.macro load_vectors a0, a1, a2, a3, addr + ldr_vo \a0, \addr, (16*0) + ldr_vo \a1, \addr, (16*1) + ldr_vo \a2, \addr, (16*2) + ldr_vo \a3, \addr, (16*3) +.endm + +.macro load_vectors_with_offset a0, a1, a2, a3, addr, offset + ldr_vo \a0, \addr, (16*0 + (\offset)) + ldr_vo \a1, \addr, (16*1 + (\offset)) + ldr_vo \a2, \addr, (16*2 + (\offset)) + ldr_vo \a3, \addr, (16*3 + (\offset)) +.endm + +.macro store_vectors_with_inc a0, a1, a2, a3, addr, inc + str_vi \a0, \addr, \inc + str_vo \a1, \addr, (-(\inc) + 16*1) + str_vo \a2, \addr, (-(\inc) + 16*2) + str_vo \a3, \addr, (-(\inc) + 16*3) +.endm + +.macro vec_to_scalar_matrix out, in + vext \out\()_00, \in\()0, 0 + vext \out\()_01, \in\()0, 1 + vext \out\()_10, \in\()1, 0 + vext \out\()_11, \in\()1, 1 + vext \out\()_20, \in\()2, 0 + vext \out\()_21, \in\()2, 1 + vext \out\()_30, \in\()3, 0 + vext \out\()_31, \in\()3, 1 +.endm + +.macro store_scalar_matrix_with_inc x, addr, inc + str \x\()t_00, [\addr], #( \inc) + str \x\()t_01, [\addr, #(-\inc + 8*1)] + str \x\()t_10, [\addr, #(-\inc + 8*2)] + str \x\()t_11, [\addr, #(-\inc + 8*3)] + str \x\()t_20, [\addr, #(-\inc + 8*4)] + str \x\()t_21, [\addr, #(-\inc + 8*5)] + str \x\()t_30, [\addr, #(-\inc + 8*6)] + str \x\()t_31, [\addr, #(-\inc + 8*7)] +.endm + +.macro vext gpr_out, vec_in, lane + umov \gpr_out\(), \vec_in\().d[\lane] +.endm + +.macro load_roots_123 + ldr_vi root0, r_ptr0, 64 + ldr_vo root1, r_ptr0, (-64 + 16) + ldr_vo root2, r_ptr0, (-64 + 32) + ldr_vo root3, r_ptr0, (-64 + 48) +.endm + +.macro load_roots_456 + ldr_vi root0, r_ptr0, 64 + ldr_vo root1, r_ptr0, (-64 + 16) + ldr_vo root2, r_ptr0, (-64 + 32) + ldr_vo root3, r_ptr0, (-64 + 48) +.endm + +.macro load_roots_78_part1 + ldr_vi root0, r_ptr1, (12*16) + ldr_vo root0_tw, r_ptr1, (-12*16 + 1*16) + ldr_vo root1, r_ptr1, (-12*16 + 2*16) + ldr_vo root1_tw, r_ptr1, (-12*16 + 3*16) + ldr_vo root2, r_ptr1, 
(-12*16 + 4*16) + ldr_vo root2_tw, r_ptr1, (-12*16 + 5*16) +.endm + +.macro load_roots_78_part2 + ldr_vo root0, r_ptr1, (-12*16 + 6*16) + ldr_vo root0_tw, r_ptr1, (-12*16 + 7*16) + ldr_vo root1, r_ptr1, (-12*16 + 8*16) + ldr_vo root1_tw, r_ptr1, (-12*16 + 9*16) + ldr_vo root2, r_ptr1, (-12*16 + 10*16) + ldr_vo root2_tw, r_ptr1, (-12*16 + 11*16) +.endm + +.macro transpose4 data0, data1, data2, data3 + trn1 t0.4s, \data0\().4s, \data1\().4s + trn2 t1.4s, \data0\().4s, \data1\().4s + trn1 t2.4s, \data2\().4s, \data3\().4s + trn2 t3.4s, \data2\().4s, \data3\().4s + + trn2 \data2\().2d, t0.2d, t2.2d + trn2 \data3\().2d, t1.2d, t3.2d + trn1 \data0\().2d, t0.2d, t2.2d + trn1 \data1\().2d, t1.2d, t3.2d +.endm + +.macro transpose_single data_out0, data_out1, data_out2, data_out3, data_in0, data_in1, data_in2, data_in3 + trn1 \data_out0\().4s, \data_in0\().4s, \data_in1\().4s + trn2 \data_out1\().4s, \data_in0\().4s, \data_in1\().4s + trn1 \data_out2\().4s, \data_in2\().4s, \data_in3\().4s + trn2 \data_out3\().4s, \data_in2\().4s, \data_in3\().4s +.endm + +.macro save_gprs // slothy:no-unfold + sub sp, sp, #(16*6) + stp x19, x20, [sp, #16*0] + stp x19, x20, [sp, #16*0] + stp x21, x22, [sp, #16*1] + stp x23, x24, [sp, #16*2] + stp x25, x26, [sp, #16*3] + stp x27, x28, [sp, #16*4] + stp x29, x30, [sp, #16*5] +.endm + +.macro restore_gprs // slothy:no-unfold + ldp x19, x20, [sp, #16*0] + ldp x21, x22, [sp, #16*1] + ldp x23, x24, [sp, #16*2] + ldp x25, x26, [sp, #16*3] + ldp x27, x28, [sp, #16*4] + ldp x29, x30, [sp, #16*5] + add sp, sp, #(16*6) +.endm + +.macro save_vregs // slothy:no-unfold + sub sp, sp, #(16*4) + stp d8, d9, [sp, #16*0] + stp d10, d11, [sp, #16*1] + stp d12, d13, [sp, #16*2] + stp d14, d15, [sp, #16*3] +.endm + +.macro restore_vregs // slothy:no-unfold + ldp d8, d9, [sp, #16*0] + ldp d10, d11, [sp, #16*1] + ldp d12, d13, [sp, #16*2] + ldp d14, d15, [sp, #16*3] + add sp, sp, #(16*4) +.endm + +#define STACK_SIZE 16 +#define STACK0 0 + +.macro restore a, loc // 
@slothy:no-unfold + ldr \a, [sp, #\loc\()] +.endm +.macro save loc, a // @slothy:no-unfold + str \a, [sp, #\loc\()] +.endm +.macro push_stack // @slothy:no-unfold + save_gprs + save_vregs + sub sp, sp, #STACK_SIZE +.endm + +.macro pop_stack // @slothy:no-unfold + add sp, sp, #STACK_SIZE + restore_vregs + restore_gprs +.endm + +.data +.p2align 4 +roots: +#include "intt_dilithium_123_456_78_twiddles.s" +.text + + .global intt_dilithium_123_45678 + .global _intt_dilithium_123_45678 + +.p2align 4 +const_addr: .word 8380417 + .word 0 + .word 0 + .word 0 +ninv_addr: .quad 16382 +ninv_tw_addr: .quad 4197891 +intt_dilithium_123_45678: +_intt_dilithium_123_45678: + push_stack + + in .req x0 + inp .req x1 + inpp .req x2 + count .req x3 + r_ptr0 .req x4 + r_ptr1 .req x5 + xtmp .req x6 + + data0 .req v9 + data1 .req v10 + data2 .req v11 + data3 .req v12 + data4 .req v13 + data5 .req v14 + data6 .req v15 + data7 .req v16 + + qform_data0 .req q9 + qform_data1 .req q10 + qform_data2 .req q11 + qform_data3 .req q12 + qform_data4 .req q13 + qform_data5 .req q14 + qform_data6 .req q15 + qform_data7 .req q16 + + qform_v0 .req q0 + qform_v1 .req q1 + qform_v2 .req q2 + qform_v3 .req q3 + qform_v4 .req q4 + qform_v5 .req q5 + qform_v6 .req q6 + qform_v7 .req q7 + qform_v8 .req q8 + qform_v9 .req q9 + qform_v10 .req q10 + qform_v11 .req q11 + qform_v12 .req q12 + qform_v13 .req q13 + qform_v14 .req q14 + qform_v15 .req q15 + qform_v16 .req q16 + qform_v17 .req q17 + qform_v18 .req q18 + qform_v19 .req q19 + qform_v20 .req q20 + qform_v21 .req q21 + qform_v22 .req q22 + qform_v23 .req q23 + qform_v24 .req q24 + qform_v25 .req q25 + qform_v26 .req q26 + qform_v27 .req q27 + qform_v28 .req q28 + qform_v29 .req q29 + qform_v30 .req q30 + qform_v31 .req q31 + + x_00 .req x10 + x_01 .req x11 + x_10 .req x12 + x_11 .req x13 + x_20 .req x14 + x_21 .req x15 + x_30 .req x16 + x_31 .req x17 + + xt_00 .req x_00 + xt_01 .req x_20 + xt_10 .req x_10 + xt_11 .req x_30 + xt_20 .req x_01 + xt_21 .req x_21 +
xt_30 .req x_11 + xt_31 .req x_31 + + root0 .req v0 + root1 .req v1 + root2 .req v2 + root3 .req v3 + + qform_root0 .req q0 + qform_root1 .req q1 + qform_root2 .req q2 + qform_root3 .req q3 + + tmp .req v24 + t0 .req v25 + t1 .req v26 + t2 .req v27 + t3 .req v28 + tp0 .req v17 + tp1 .req v18 + tp2 .req v19 + tp3 .req v20 + + consts .req v8 + qform_consts .req q8 + + ASM_LOAD(r_ptr0, roots_l345) + ASM_LOAD(r_ptr1, roots_l67) + + ASM_LOAD(xtmp, const_addr) + ld1r {consts.4s}, [xtmp] + save STACK0, in + + restore inp, STACK0 + mov inp, in + add inpp, inp, #64 + mov count, #8 + + root0_tw .req v4 + root1_tw .req v5 + root2_tw .req v6 + root3_tw .req v7 + qform_root0_tw .req q4 + qform_root1_tw .req q5 + qform_root2_tw .req q6 + qform_root3_tw .req q7 + + .p2align 2 +layer45678_start: + // Standard way using vector instructions + ld4 {data0.4S, data1.4S, data2.4S, data3.4S}, [inp] + ld4 {data4.4S, data5.4S, data6.4S, data7.4S}, [inpp] + + load_roots_78_part1 + + // Layer 8 Part 1 + gs_butterfly_v data0, data1, root1, root1_tw + gs_butterfly_v data2, data3, root2, root2_tw + // Layer 7 Part 1 + gs_butterfly_v data0, data2, root0, root0_tw + gs_butterfly_v data1, data3, root0, root0_tw + + load_roots_78_part2 + + // Layer 8 Part 2 + gs_butterfly_v data4, data5, root1, root1_tw + gs_butterfly_v data6, data7, root2, root2_tw + // Layer 7 Part 2 + gs_butterfly_v data4, data6, root0, root0_tw + gs_butterfly_v data5, data7, root0, root0_tw + + transpose4 data0, data1, data2, data3 + transpose4 data4, data5, data6, data7 + + load_roots_456 + + // Layer 6 + gs_butterfly data0, data1, root1, 2, 3 + gs_butterfly data2, data3, root2, 0, 1 + gs_butterfly data4, data5, root2, 2, 3 + gs_butterfly data6, data7, root3, 0, 1 + + // Layer 5 + gs_butterfly data0, data2, root0, 2, 3 + gs_butterfly data1, data3, root0, 2, 3 + gs_butterfly data4, data6, root1, 0, 1 + gs_butterfly data5, data7, root1, 0, 1 + + // Layer 4 + gs_butterfly data0, data4, root0, 0, 1 + gs_butterfly data1, data5, 
root0, 0, 1 + gs_butterfly data2, data6, root0, 0, 1 + gs_butterfly data3, data7, root0, 0, 1 + + // Standard way using vector instructions + + str_vi data0, inp, (16*4) + str_vo data1, inp, (-16*4 + 1*16) + str_vo data2, inp, (-16*4 + 2*16) + str_vo data3, inp, (-16*4 + 3*16) + + str_vi data4, inpp, (16*4) + str_vo data5, inpp, (-16*4 + 1*16) + str_vo data6, inpp, (-16*4 + 2*16) + str_vo data7, inpp, (-16*4 + 3*16) + + add inp, inp, #64 + add inpp, inpp, #64 + + subs count, count, #1 + cbnz count, layer45678_start + +// ----------------------------------------------------------------------------- + + ninv .req v25 + ninv_tw .req v26 + + ASM_LOAD(xtmp, ninv_addr) + ld1r {ninv.4s}, [xtmp] + ASM_LOAD(xtmp, ninv_tw_addr) + ld1r {ninv_tw.4s}, [xtmp] + + mov count, #8 + ASM_LOAD(r_ptr0, roots_l012) + load_roots_123 + + .p2align 2 +layer123_start: + + ldr_vo data0, in, 0 + ldr_vo data1, in, (1*(1024/8)) + ldr_vo data2, in, (2*(1024/8)) + ldr_vo data3, in, (3*(1024/8)) + ldr_vo data4, in, (4*(1024/8)) + ldr_vo data5, in, (5*(1024/8)) + ldr_vo data6, in, (6*(1024/8)) + ldr_vo data7, in, (7*(1024/8)) + + gs_butterfly data0, data1, root1, 2, 3 + gs_butterfly data2, data3, root2, 0, 1 + gs_butterfly data4, data5, root2, 2, 3 + gs_butterfly data6, data7, root3, 0, 1 + + gs_butterfly data0, data2, root0, 2, 3 + gs_butterfly data1, data3, root0, 2, 3 + gs_butterfly data4, data6, root1, 0, 1 + gs_butterfly data5, data7, root1, 0, 1 + + // root0[0] includes ninv, manually computed. 
+ gs_butterfly data0, data4, root0, 0, 1 + gs_butterfly data1, data5, root0, 0, 1 + gs_butterfly data2, data6, root0, 0, 1 + gs_butterfly data3, data7, root0, 0, 1 + + montg_reduce data4 + montg_reduce data5 + montg_reduce data6 + montg_reduce data7 + + str_vo data4, in, (4*(1024/8)) + str_vo data5, in, (5*(1024/8)) + str_vo data6, in, (6*(1024/8)) + str_vo data7, in, (7*(1024/8)) + + mul_ninv data4, data5, data6, data7, data0, data1, data2, data3 + + str_vi data4, in, (16) + str_vo data5, in, (-16 + 1*(1024/8)) + str_vo data6, in, (-16 + 2*(1024/8)) + str_vo data7, in, (-16 + 3*(1024/8)) + + subs count, count, #1 + cbnz count, layer123_start + + pop_stack + ret diff --git a/tests/ntt_dilithium/manual/intt_dilithium_123_456_78_twiddles.s b/tests/ntt_dilithium/manual/intt_dilithium_123_456_78_twiddles.s new file mode 100644 index 0000000..43e0d17 --- /dev/null +++ b/tests/ntt_dilithium/manual/intt_dilithium_123_456_78_twiddles.s @@ -0,0 +1,557 @@ + +/// +/// Copyright (c) 2022 Arm Limited +/// Copyright (c) 2022 Hanno Becker +/// Copyright (c) 2023 Amin Abdulrahman, Matthias Kannwischer +/// SPDX-License-Identifier: MIT +/// +/// Permission is hereby granted, free of charge, to any person obtaining a copy +/// of this software and associated documentation files (the "Software"), to deal +/// in the Software without restriction, including without limitation the rights +/// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +/// copies of the Software, and to permit persons to whom the Software is +/// furnished to do so, subject to the following conditions: +/// +/// The above copyright notice and this permission notice shall be included in all +/// copies or substantial portions of the Software. +/// +/// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +/// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +/// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +/// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +/// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +/// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +/// SOFTWARE. +/// + +roots_l67: +.word -1744507 +.word 2236726 +.word 1922253 +.word 3818627 +.word -447030292 +.word 573161516 +.word 492577742 +.word 978523985 +.word 731434 +.word 781875 +.word 3773731 +.word -3531229 +.word 187430119 +.word 200355636 +.word 967019376 +.word -904878186 +.word -1054478 +.word -1900052 +.word 3974485 +.word 303005 +.word -270210213 +.word -486888731 +.word 1018462631 +.word 77645096 +.word 2354215 +.word -1011223 +.word 327848 +.word -348812 +.word 603268097 +.word -259126110 +.word 84011120 +.word -89383150 +.word 392707 +.word 1716814 +.word 2193087 +.word -3123762 +.word 100631253 +.word 439933955 +.word 561979013 +.word -800464680 +.word -2926054 +.word 3014420 +.word -2358373 +.word 2185084 +.word -749801963 +.word 772445769 +.word -604333585 +.word 559928242 +.word 459163 +.word 653275 +.word -2312838 +.word 3467665 +.word 117660617 +.word 167401858 +.word -592665232 +.word 888589898 +.word 1514152 +.word -3430436 +.word 553718 +.word 1103344 +.word 388001774 +.word -879049958 +.word 141890356 +.word 282732136 +.word -140244 +.word -860144 +.word -508145 +.word -3105558 +.word -35937555 +.word -220412084 +.word -130212265 +.word -795799901 +.word 2778788 +.word -2683270 +.word 2775755 +.word -1356448 +.word 712065019 +.word -687588511 +.word 711287812 +.word -347590090 +.word 770441 +.word -214880 +.word -3020393 +.word 11879 +.word 197425671 +.word -55063046 +.word -773976352 +.word 3043996 +.word -545376 +.word -3363542 +.word 1370517 +.word -3994671 +.word -139752717 +.word -861908357 +.word 351195274 +.word -1023635298 +.word -3374250 +.word -2925816 +.word 1226661 +.word -3901472 +.word -864652284 +.word -749740976 +.word 314332144 +.word -999753034 +.word 
3369273 +.word -2028038 +.word -1723229 +.word -2569011 +.word 863376927 +.word -519685171 +.word -441577800 +.word -658309618 +.word -1163598 +.word -1665318 +.word 1615530 +.word -3980599 +.word -298172236 +.word -426738094 +.word 413979908 +.word -1020029345 +.word -621164 +.word -3035980 +.word -2461387 +.word 1317678 +.word -159173408 +.word -777970524 +.word -630730945 +.word 337655269 +.word 4022750 +.word -4148469 +.word -3009748 +.word 338420 +.word 1030830548 +.word -1063046068 +.word -771248568 +.word 86720197 +.word -749577 +.word 2612853 +.word -2647994 +.word 3033742 +.word -192079267 +.word 669544140 +.word -678549029 +.word 777397036 +.word 2362063 +.word 1300016 +.word 4182915 +.word -3482206 +.word 605279149 +.word 333129378 +.word 1071872863 +.word -892316032 +.word 1834526 +.word 1187885 +.word 1393159 +.word -1994046 +.word 470097680 +.word 304395785 +.word 356997292 +.word -510974714 +.word 724804 +.word -507927 +.word -2491325 +.word 1476985 +.word 185731180 +.word -130156402 +.word -638402564 +.word 378477722 +.word 2254727 +.word 2391089 +.word -1787943 +.word 2579253 +.word 577774276 +.word 612717067 +.word -458160776 +.word 660934133 +.word 2743411 +.word 1179613 +.word 2033807 +.word -2105286 +.word 702999655 +.word 302276083 +.word 521163479 +.word -539479988 +.word -527981 +.word -586241 +.word 2374402 +.word 1623354 +.word -135295244 +.word -150224382 +.word 608441020 +.word 415984810 +.word -3258457 +.word 3250154 +.word -235407 +.word -1736313 +.word -834980303 +.word 832852657 +.word -60323094 +.word -444930577 +.word 2178965 +.word 1879878 +.word 3472069 +.word 1921994 +.word 558360247 +.word 481719139 +.word 889718424 +.word 492511373 +.word 818761 +.word -2039144 +.word -4040196 +.word 458740 +.word 209807681 +.word -522531086 +.word -1035301089 +.word 117552223 +.word 3197248 +.word -1987814 +.word 3488383 +.word 4166425 +.word 819295484 +.word -509377762 +.word 893898890 +.word 1067647297 +.word 2218467 +.word -613238 +.word 
-2513018 +.word -141835 +.word 568482643 +.word -157142369 +.word -643961400 +.word -36345249 +.word 1310261 +.word 1354892 +.word 89301 +.word -2998219 +.word 335754661 +.word 347191365 +.word 22883400 +.word -768294260 +.word 3334383 +.word -2462444 +.word -169688 +.word 565603 +.word 854436357 +.word -631001801 +.word -43482586 +.word 144935890 +.word 12417 +.word -2642980 +.word 3838479 +.word -2296099 +.word 3181859 +.word -677264190 +.word 983611064 +.word -588375860 +.word -1254190 +.word -3195676 +.word -1239911 +.word -3747250 +.word -321386456 +.word -818892658 +.word -317727459 +.word -960233614 +.word 2962264 +.word -1148858 +.word -482649 +.word -1528066 +.word 759080783 +.word -294395108 +.word -123678909 +.word -391567239 +.word 3180456 +.word 3611750 +.word 1727088 +.word 1772588 +.word 814992530 +.word 925511710 +.word 442566669 +.word 454226054 +.word 268456 +.word -2387513 +.word -2192938 +.word 4146264 +.word 68791907 +.word -611800717 +.word -561940831 +.word 1062481036 +.word -4158088 +.word 1109516 +.word 2983781 +.word -2811291 +.word -1065510939 +.word 284313712 +.word 764594519 +.word -720393920 +.word 2455377 +.word -635956 +.word 3768948 +.word 3410568 +.word 629190881 +.word -162963861 +.word 965793731 +.word 873958779 +.word 250446 +.word 3551006 +.word -2678278 +.word 1685153 +.word 64176841 +.word 909946047 +.word -686309310 +.word 431820817 +.word 3815725 +.word -1937570 +.word -2028118 +.word -2508980 +.word 977780347 +.word -496502727 +.word -519705671 +.word -642926661 +.word 3759465 +.word -1596822 +.word 2454145 +.word -822541 +.word 963363710 +.word -409185979 +.word 628875181 +.word -210776307 +.word 3956944 +.word 1979497 +.word -1009365 +.word 27812 +.word 1013967746 +.word 507246529 +.word -258649997 +.word 7126831 +.word 274060 +.word 3121440 +.word 3222807 +.word -4183372 +.word 70227934 +.word 799869667 +.word 825844983 +.word -1071989969 +.word 3716946 +.word 2296397 +.word 3965306 +.word -87208 +.word 952468207 +.word 
588452222 +.word 1016110510 +.word -22347069 +.word 3284915 +.word 3956745 +.word -636927 +.word -1182243 +.word 841760171 +.word 1013916752 +.word -163212680 +.word -302950022 +.word -3852015 +.word 2635473 +.word -1277625 +.word -3073009 +.word -987079667 +.word 675340520 +.word -327391679 +.word -787459213 +.word -2772600 +.word 1780227 +.word 1455890 +.word 1935420 +.word -710479343 +.word 456183549 +.word 373072124 +.word 495951789 +.word 59148 +.word -2660408 +.word 2659525 +.word -1753 +.word 15156688 +.word -681730119 +.word 681503850 +.word -449207 +roots_l345: +.word 1221177 +.word 312926867 +.word -2283733 +.word -585207070 +.word -2815639 +.word -721508096 +.word -1858416 +.word -476219497 +.word -3345963 +.word -857403734 +.word -1853806 +.word -475038184 +.word -2917338 +.word -747568486 +.word 0 +.word 0 +.word -557458 +.word -142848732 +.word 3585098 +.word 918682129 +.word 642628 +.word 164673562 +.word -3870317 +.word -991769559 +.word -556856 +.word -142694469 +.word -3192354 +.word -818041395 +.word 2897314 +.word 742437332 +.word 0 +.word 0 +.word 1005239 +.word 257592709 +.word -1460718 +.word -374309300 +.word -2453983 +.word -628833668 +.word 3950053 +.word 1012201926 +.word 1716988 +.word 439978542 +.word 1935799 +.word 496048908 +.word -3756790 +.word -962678241 +.word 0 +.word 0 +.word -3764867 +.word -964747974 +.word -1714295 +.word -439288460 +.word 3227876 +.word 827143915 +.word 3574466 +.word 915957677 +.word 817536 +.word 209493775 +.word -1759347 +.word -450833045 +.word -3415069 +.word -875112161 +.word 0 +.word 0 +.word -2129892 +.word -545785280 +.word 1335936 +.word 342333886 +.word -676590 +.word -173376332 +.word -2156050 +.word -552488273 +.word -3241972 +.word -830756018 +.word 4018989 +.word 1029866791 +.word -2071829 +.word -530906624 +.word 0 +.word 0 +.word -2682288 +.word -687336873 +.word 434125 +.word 111244624 +.word 3524442 +.word 903139016 +.word 3506380 +.word 898510625 +.word -1095468 +.word -280713909 +.word 
-928749 +.word -237992130 +.word -394148 +.word -101000509 +.word 0 +.word 0 +.word -3542485 +.word -907762539 +.word 1674615 +.word 429120452 +.word -2663378 +.word -682491182 +.word -1159875 +.word -297218217 +.word -3704823 +.word -949361686 +.word -2101410 +.word -538486762 +.word 3110818 +.word 797147778 +.word 0 +.word 0 +.word 601683 +.word 154181397 +.word 4063053 +.word 1041158200 +.word 3370349 +.word 863652652 +.word 3586446 +.word 919027554 +.word -2740543 +.word -702264730 +.word -3182878 +.word -815613168 +.word -3602218 +.word -923069133 +.word 0 +.word 0 +roots_l012: +.word -294725 +.word -75523344 +.word -3761513 +.word -963888510 +.word -3765607 +.word -964937599 +.word 3201430 +.word 820367122 +.word 3145678 +.word 806080660 +.word 2883726 +.word 738955404 +.word 3201494 +.word 820383522 +.word 0 +.word 0 \ No newline at end of file diff --git a/tests/ntt_dilithium/manual/ntt_dilithium_1234_5678.s b/tests/ntt_dilithium/manual/ntt_dilithium_1234_5678.s index c8371e8..ae63345 100644 --- a/tests/ntt_dilithium/manual/ntt_dilithium_1234_5678.s +++ b/tests/ntt_dilithium/manual/ntt_dilithium_1234_5678.s @@ -137,7 +137,7 @@ trn1 \data\()1\().2d, t1.2d, t3.2d .endm -.macro save_gprs // slothy:no-unfold +.macro save_gprs // @slothy:no-unfold sub sp, sp, #(16*6) stp x19, x20, [sp, #16*0] stp x19, x20, [sp, #16*0] @@ -148,7 +148,7 @@ str x29, [sp, #16*5] .endm -.macro restore_gprs // slothy:no-unfold +.macro restore_gprs // @slothy:no-unfold ldp x19, x20, [sp, #16*0] ldp x21, x22, [sp, #16*1] ldp x23, x24, [sp, #16*2] @@ -158,7 +158,7 @@ add sp, sp, #(16*6) .endm -.macro save_vregs // slothy:no-unfold +.macro save_vregs // @slothy:no-unfold sub sp, sp, #(16*4) stp d8, d9, [sp, #16*0] stp d10, d11, [sp, #16*1] @@ -166,7 +166,7 @@ stp d14, d15, [sp, #16*3] .endm -.macro restore_vregs // slothy:no-unfold +.macro restore_vregs // @slothy:no-unfold ldp d8, d9, [sp, #16*0] ldp d10, d11, [sp, #16*1] ldp d12, d13, [sp, #16*2] @@ -177,19 +177,19 @@ #define STACK_SIZE 
16 #define STACK0 0 -.macro restore a, loc // slothy:no-unfold +.macro restore a, loc // @slothy:no-unfold ldr \a, [sp, #\loc\()] .endm -.macro save loc, a // slothy:no-unfold +.macro save loc, a // @slothy:no-unfold str \a, [sp, #\loc\()] .endm -.macro push_stack // slothy:no-unfold +.macro push_stack // @slothy:no-unfold save_gprs save_vregs sub sp, sp, #STACK_SIZE .endm -.macro pop_stack // slothy:no-unfold +.macro pop_stack // @slothy:no-unfold add sp, sp, #STACK_SIZE restore_vregs restore_gprs From e4cd6916f157f482937acba8e8e79a59fc3658d0 Mon Sep 17 00:00:00 2001 From: Amin Abdulrahman Date: Tue, 5 Mar 2024 14:29:23 +0100 Subject: [PATCH 02/18] manual_ld4 for Dilithium invNTT --- .../intt_dilithium_1234_5678_manual_ld4.s | 1 + .../intt_dilithium_123_45678_manual_ld4.s | 1 + tests/ntt_dilithium/main.c | 20 + .../intt_dilithium_1234_5678_manual_ld4.s | 512 +++++++++++++++++ .../intt_dilithium_123_45678_manual_ld4.s | 532 ++++++++++++++++++ 5 files changed, 1066 insertions(+) create mode 120000 asm/manual/ntt_dilithium/intt_dilithium_1234_5678_manual_ld4.s create mode 120000 asm/manual/ntt_dilithium/intt_dilithium_123_45678_manual_ld4.s create mode 100644 tests/ntt_dilithium/manual/intt_dilithium_1234_5678_manual_ld4.s create mode 100644 tests/ntt_dilithium/manual/intt_dilithium_123_45678_manual_ld4.s diff --git a/asm/manual/ntt_dilithium/intt_dilithium_1234_5678_manual_ld4.s b/asm/manual/ntt_dilithium/intt_dilithium_1234_5678_manual_ld4.s new file mode 120000 index 0000000..0a75f2c --- /dev/null +++ b/asm/manual/ntt_dilithium/intt_dilithium_1234_5678_manual_ld4.s @@ -0,0 +1 @@ +../../../slothy/examples/naive/aarch64/intt_dilithium_1234_5678_manual_ld4.s \ No newline at end of file diff --git a/asm/manual/ntt_dilithium/intt_dilithium_123_45678_manual_ld4.s b/asm/manual/ntt_dilithium/intt_dilithium_123_45678_manual_ld4.s new file mode 120000 index 0000000..70f5584 --- /dev/null +++ b/asm/manual/ntt_dilithium/intt_dilithium_123_45678_manual_ld4.s @@ -0,0 +1 @@ 
+../../../slothy/examples/naive/aarch64/intt_dilithium_123_45678_manual_ld4.s \ No newline at end of file diff --git a/tests/ntt_dilithium/main.c b/tests/ntt_dilithium/main.c index 2e433d9..07311fd 100644 --- a/tests/ntt_dilithium/main.c +++ b/tests/ntt_dilithium/main.c @@ -46,7 +46,9 @@ void ntt_dilithium_123_45678_manual_st4(int32_t *); void ntt_dilithium_1234_5678(int32_t *); void ntt_dilithium_1234_5678_manual_st4(int32_t *); void intt_dilithium_1234_5678(int32_t *); +void intt_dilithium_1234_5678_manual_ld4(int32_t *); void intt_dilithium_123_45678(int32_t *); +void intt_dilithium_123_45678_manual_ld4(int32_t *); // A55 void ntt_dilithium_123_45678_opt_a55(int32_t *); void ntt_dilithium_123_45678_manual_st4_opt_a55(int32_t *); @@ -335,7 +337,9 @@ MAKE_TEST(asm_123_45678_manual_st4,0,ntt_dilithium_123_45678_manual_st4,ntt_u32_ MAKE_TEST(asm_1234_5678,0,ntt_dilithium_1234_5678,ntt_u32_C,0,0) MAKE_TEST(asm_1234_5678_manual_st4,0,ntt_dilithium_1234_5678_manual_st4,ntt_u32_C,0,0) MAKE_TEST(asm_1234_5678_inv,0,intt_dilithium_1234_5678,invntt_u32_tomont_C,0,1) +MAKE_TEST(asm_1234_5678_inv_manual_ld4,0,intt_dilithium_1234_5678_manual_ld4,invntt_u32_tomont_C,0,1) MAKE_TEST(asm_123_45678_inv,0,intt_dilithium_123_45678,invntt_u32_tomont_C,0,1) +MAKE_TEST(asm_123_45678_inv_manual_ld4,0,intt_dilithium_123_45678_manual_ld4,invntt_u32_tomont_C,0,1) // A55 MAKE_TEST(asm_123_45678_opt_a55,0,ntt_dilithium_123_45678_opt_a55,ntt_u32_C,0,0) MAKE_TEST(asm_123_45678_manual_st4_opt_a55,0,ntt_dilithium_123_45678_manual_st4_opt_a55,ntt_u32_C,0,0) @@ -395,6 +399,10 @@ MAKE_BENCH(asm_123_45678_w_scalar,ntt_dilithium_123_45678_w_scalar) MAKE_BENCH(asm_123_45678_manual_st4,ntt_dilithium_123_45678_manual_st4) MAKE_BENCH(asm_1234_5678,ntt_dilithium_1234_5678) MAKE_BENCH(asm_1234_5678_manual_st4,ntt_dilithium_1234_5678_manual_st4) +MAKE_BENCH(asm_1234_5678_inv,intt_dilithium_1234_5678) +MAKE_BENCH(asm_1234_5678_inv_manual_ld4,intt_dilithium_1234_5678_manual_ld4) 
+MAKE_BENCH(asm_123_45678_inv,intt_dilithium_123_45678) +MAKE_BENCH(asm_123_45678_inv_manual_ld4,intt_dilithium_123_45678_manual_ld4) // A55 MAKE_BENCH(asm_123_45678_opt_a55,ntt_dilithium_123_45678_opt_a55) MAKE_BENCH(asm_123_45678_manual_st4_opt_a55,ntt_dilithium_123_45678_manual_st4_opt_a55) @@ -432,6 +440,10 @@ int main( void ) bench_ntt_asm_123_45678_manual_st4(); bench_ntt_asm_1234_5678(); bench_ntt_asm_1234_5678_manual_st4(); + bench_ntt_asm_1234_5678_inv(); + bench_ntt_asm_1234_5678_inv_manual_ld4(); + bench_ntt_asm_123_45678_inv(); + bench_ntt_asm_123_45678_inv_manual_ld4(); // A55 bench_ntt_asm_123_45678_opt_a55(); bench_ntt_asm_123_45678_manual_st4_opt_a55(); @@ -485,10 +497,18 @@ int main( void ) { return 1; } + if (test_ntt_asm_1234_5678_inv_manual_ld4() != 0) + { + return 1; + } if (test_ntt_asm_123_45678_inv() != 0) { return 1; } + if (test_ntt_asm_123_45678_inv_manual_ld4() != 0) + { + return 1; + } // A55 if (test_ntt_asm_123_45678_opt_a55() != 0) { diff --git a/tests/ntt_dilithium/manual/intt_dilithium_1234_5678_manual_ld4.s b/tests/ntt_dilithium/manual/intt_dilithium_1234_5678_manual_ld4.s new file mode 100644 index 0000000..e7e3c1d --- /dev/null +++ b/tests/ntt_dilithium/manual/intt_dilithium_1234_5678_manual_ld4.s @@ -0,0 +1,512 @@ +/// +/// Copyright (c) 2022 Arm Limited +/// Copyright (c) 2022 Hanno Becker +/// Copyright (c) 2023 Amin Abdulrahman, Matthias Kannwischer +/// SPDX-License-Identifier: MIT +/// +/// Permission is hereby granted, free of charge, to any person obtaining a copy +/// of this software and associated documentation files (the "Software"), to deal +/// in the Software without restriction, including without limitation the rights +/// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +/// copies of the Software, and to permit persons to whom the Software is +/// furnished to do so, subject to the following conditions: +/// +/// The above copyright notice and this permission notice shall be included in 
all +/// copies or substantial portions of the Software. +/// +/// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +/// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +/// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +/// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +/// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +/// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +/// SOFTWARE. +/// + +// Needed to provide ASM_LOAD directive +#include <hal_env.h> + +// NOTE +// We use a lot of trivial macros to simplify the parsing burden for Slothy. +// The macros are not unfolded by Slothy and thus interpreted as instructions, +// which are easier to parse due to e.g. the lack of size specifiers and simpler +// syntax for pre- and post-increment for loads and stores. +// +// Eventually, Slothy should include a proper parser for AArch64, +// but for initial investigations, the below is enough.
+ +.macro ldr_vo vec, base, offset + ldr qform_\vec, [\base, #\offset] +.endm +.macro ldr_vi vec, base, inc + ldr qform_\vec, [\base], #\inc +.endm +.macro str_vo vec, base, offset + str qform_\vec, [\base, #\offset] +.endm +.macro str_vi vec, base, inc + str qform_\vec, [\base], #\inc +.endm +.macro vsub d,a,b + sub \d\().4s, \a\().4s, \b\().4s +.endm +.macro vadd d,a,b + add \d\().4s, \a\().4s, \b\().4s +.endm +.macro vqrdmulh d,a,b + sqrdmulh \d\().4s, \a\().4s, \b\().4s +.endm +.macro vmul d,a,b + mul \d\().4s, \a\().4s, \b\().4s +.endm +.macro vmls d,a,b + mls \d\().4s, \a\().4s, \b\().4s +.endm +.macro vqrdmulhq d,a,b,i + sqrdmulh \d\().4s, \a\().4s, \b\().s[\i] +.endm +.macro vmulq d,a,b,i + mul \d\().4s, \a\().4s, \b\().s[\i] +.endm +.macro vmlsq d,a,b,i + mls \d\().4s, \a\().4s, \b\().s[\i] +.endm +.macro trn1_d d,a,b + trn1 \d\().2d, \a\().2d, \b\().2d +.endm +.macro trn2_d d,a,b + trn2 \d\().2d, \a\().2d, \b\().2d +.endm +.macro trn1_s d,a,b + trn1 \d\().4s, \a\().4s, \b\().4s +.endm +.macro trn2_s d,a,b + trn2 \d\().4s, \a\().4s, \b\().4s +.endm + +.macro mulmodq dst, src, const, idx0, idx1 + vmulq \dst, \src, \const, \idx0 + vqrdmulhq \src, \src, \const, \idx1 + vmls \dst, \src, modulus +.endm + +.macro mulmod dst, src, const, const_twisted + vmul \dst, \src, \const + vqrdmulh \src, \src, \const_twisted + vmls \dst, \src, modulus +.endm + +.macro montg_reduce a + srshr tmp.4S, \a\().4S, #23 + vmls \a, tmp, modulus +.endm + +.macro canonical_reduce a, modulus_half, neg_modulus_half, tmp1, tmp2 + cmge \tmp1\().4s, \neg_modulus_half\().4s, \a\().4s + cmge \tmp2\().4s, \a\().4s, \modulus_half\().4s + sub \tmp2\().4s, \tmp1\().4s, \tmp2\().4s + vmls \a, \tmp2, modulus +.endm + +.macro gs_butterfly a, b, root, idx0, idx1 + vsub tmp, \a, \b + vadd \a, \a, \b + mulmodq \b, tmp, \root, \idx0, \idx1 +.endm + +.macro mulmod_v dst, src, const, const_twisted + vmul \dst, \src, \const + vqrdmulh \src, \src, \const_twisted + vmls \dst, \src, modulus +.endm + +.macro 
gs_butterfly_v a, b, root, root_twisted + vsub tmp, \a, \b + vadd \a, \a, \b + mulmod \b, tmp, \root, \root_twisted +.endm + +.macro mul_ninv dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, src0, src1, src2, src3, src4, src5, src6, src7 + mulmod \dst0, \src0, ninv, ninv_tw + mulmod \dst1, \src1, ninv, ninv_tw + mulmod \dst2, \src2, ninv, ninv_tw + mulmod \dst3, \src3, ninv, ninv_tw + mulmod \dst4, \src4, ninv, ninv_tw + mulmod \dst5, \src5, ninv, ninv_tw + mulmod \dst6, \src6, ninv, ninv_tw + mulmod \dst7, \src7, ninv, ninv_tw +.endm + +.macro load_roots_1234 r_ptr + ldr_vi root0, \r_ptr, (8*16) + ldr_vo root1, \r_ptr, (-8*16 + 1*16) + ldr_vo root2, \r_ptr, (-8*16 + 2*16) + ldr_vo root3, \r_ptr, (-8*16 + 3*16) + ldr_vo root4, \r_ptr, (-8*16 + 4*16) + ldr_vo root5, \r_ptr, (-8*16 + 5*16) + ldr_vo root6, \r_ptr, (-8*16 + 6*16) + ldr_vo root7, \r_ptr, (-8*16 + 7*16) +.endm + +.macro load_next_roots_56 root0, r_ptr0 + ldr_vi \root0, \r_ptr0, 16 +.endm + +.macro load_next_roots_6 root0, r_ptr0 + ldr_vi \root0, \r_ptr0, 8 +.endm + +.macro load_next_roots_78 root0, root0_tw, root1, root1_tw, root2, root2_tw, r_ptr1 + ldr_vi \root0, \r_ptr1, (6*16) + ldr_vo \root0_tw, \r_ptr1, (-6*16 + 1*16) + ldr_vo \root1, \r_ptr1, (-6*16 + 2*16) + ldr_vo \root1_tw, \r_ptr1, (-6*16 + 3*16) + ldr_vo \root2, \r_ptr1, (-6*16 + 4*16) + ldr_vo \root2_tw, \r_ptr1, (-6*16 + 5*16) +.endm + +.macro transpose4 data + trn1_s t0, \data\()0, \data\()1 + trn2_s t1, \data\()0, \data\()1 + trn1_s t2, \data\()2, \data\()3 + trn2_s t3, \data\()2, \data\()3 + + trn2_d \data\()2, t0, t2 + trn2_d \data\()3, t1, t3 + trn1_d \data\()0, t0, t2 + trn1_d \data\()1, t1, t3 +.endm + +.macro save_gprs // @slothy:no-unfold + sub sp, sp, #(16*6) + stp x19, x20, [sp, #16*0] + stp x19, x20, [sp, #16*0] + stp x21, x22, [sp, #16*1] + stp x23, x24, [sp, #16*2] + stp x25, x26, [sp, #16*3] + stp x27, x28, [sp, #16*4] + str x29, [sp, #16*5] +.endm + +.macro restore_gprs // @slothy:no-unfold + ldp x19, x20, [sp, #16*0] + ldp
x21, x22, [sp, #16*1] + ldp x23, x24, [sp, #16*2] + ldp x25, x26, [sp, #16*3] + ldp x27, x28, [sp, #16*4] + ldr x29, [sp, #16*5] + add sp, sp, #(16*6) +.endm + +.macro save_vregs // @slothy:no-unfold + sub sp, sp, #(16*4) + stp d8, d9, [sp, #16*0] + stp d10, d11, [sp, #16*1] + stp d12, d13, [sp, #16*2] + stp d14, d15, [sp, #16*3] +.endm + +.macro restore_vregs // @slothy:no-unfold + ldp d8, d9, [sp, #16*0] + ldp d10, d11, [sp, #16*1] + ldp d12, d13, [sp, #16*2] + ldp d14, d15, [sp, #16*3] + add sp, sp, #(16*4) +.endm + +#define STACK_SIZE 16 +#define STACK0 0 + +.macro restore a, loc // @slothy:no-unfold + ldr \a, [sp, #\loc\()] +.endm +.macro save loc, a // @slothy:no-unfold + str \a, [sp, #\loc\()] +.endm +.macro push_stack // @slothy:no-unfold + save_gprs + save_vregs + sub sp, sp, #STACK_SIZE +.endm + +.macro pop_stack // @slothy:no-unfold + add sp, sp, #STACK_SIZE + restore_vregs + restore_gprs +.endm + +.data +.p2align 4 +roots: +#include "intt_dilithium_1234_5678_twiddles.s" +.text + + .global intt_dilithium_1234_5678_manual_ld4 + .global _intt_dilithium_1234_5678_manual_ld4 + +.p2align 4 +modulus_addr: .quad 8380417 +ninv_addr: .quad 16382 +ninv_tw_addr: .quad 4197891 +intt_dilithium_1234_5678_manual_ld4: +_intt_dilithium_1234_5678_manual_ld4: + push_stack + + inp .req x0 + in .req x1 + count .req x2 + r_ptr0 .req x3 + r_ptr1 .req x4 + xtmp .req x5 + + data0 .req v8 + data1 .req v9 + data2 .req v10 + data3 .req v11 + data4 .req v12 + data5 .req v13 + data6 .req v14 + data7 .req v15 + data8 .req v16 + data9 .req v17 + data10 .req v18 + data11 .req v19 + data12 .req v20 + data13 .req v21 + data14 .req v22 + data15 .req v23 + + qform_data0 .req q8 + qform_data1 .req q9 + qform_data2 .req q10 + qform_data3 .req q11 + qform_data4 .req q12 + qform_data5 .req q13 + qform_data6 .req q14 + qform_data7 .req q15 + qform_data8 .req q16 + qform_data9 .req q17 + qform_data10 .req q18 + qform_data11 .req q19 + qform_data12 .req q20 + qform_data13 .req q21 + qform_data14 .req
q22 + qform_data15 .req q23 + + root0 .req v0 + root1 .req v1 + root2 .req v2 + root3 .req v3 + root0_tw .req v4 + root1_tw .req v5 + root2_tw .req v6 + root3_tw .req v7 + + + qform_root0 .req q0 + qform_root1 .req q1 + qform_root2 .req q2 + qform_root3 .req q3 + qform_root0_tw .req q4 + qform_root1_tw .req q5 + qform_root2_tw .req q6 + qform_root3_tw .req q7 + + + tmp .req v24 + qform_tmp .req q24 + t0 .req v25 + t1 .req v26 + t2 .req v27 + t3 .req v28 + + modulus .req v29 + + ASM_LOAD(r_ptr0, roots) + ASM_LOAD(r_ptr1, roots_l45) + + ASM_LOAD(xtmp, modulus_addr) + ld1r {modulus.4s}, [xtmp] + + save STACK0, inp + + mov count, #16 + + .p2align 2 +layer5678_start: + // manual_ld4 + ldr_vo data0, inp, (16*0) + ldr_vo data1, inp, (16*1) + ldr_vo data2, inp, (16*2) + ldr_vo data3, inp, (16*3) + transpose4 data + + load_next_roots_78 root0, root0_tw, root1, root1_tw, root2, root2_tw, r_ptr0 + + gs_butterfly_v data0, data1, root1, root1_tw + gs_butterfly_v data2, data3, root2, root2_tw + gs_butterfly_v data0, data2, root0, root0_tw + gs_butterfly_v data1, data3, root0, root0_tw + + transpose4 data + + load_next_roots_6 root1, r_ptr1 + load_next_roots_56 root0, r_ptr1 + + gs_butterfly data0, data1, root0, 0, 1 + gs_butterfly data2, data3, root0, 2, 3 + gs_butterfly data0, data2, root1, 0, 1 + gs_butterfly data1, data3, root1, 0, 1 + + montg_reduce data0 + montg_reduce data1 + + str_vi data0, inp, (16*4) + str_vo data1, inp, (-16*4 + 1*16) + str_vo data2, inp, (-16*4 + 2*16) + str_vo data3, inp, (-16*4 + 3*16) +// layer5678_end: + subs count, count, #1 + cbnz count, layer5678_start + + .unreq root0_tw + .unreq root1_tw + .unreq root2_tw + .unreq root3_tw + .unreq qform_root0_tw + .unreq qform_root1_tw + .unreq qform_root2_tw + .unreq qform_root3_tw + .unreq t0 + .unreq t1 + + root4 .req v4 + root5 .req v5 + root6 .req v6 + root7 .req v7 + qform_root4 .req q4 + qform_root5 .req q5 + qform_root6 .req q6 + qform_root7 .req q7 + ninv .req v25 + ninv_tw .req v26 + modulus_half 
.req v30 + neg_modulus_half .req v31 + + + restore in, STACK0 + mov count, #4 + + ASM_LOAD(xtmp, ninv_addr) + ld1r {ninv.4s}, [xtmp] + ASM_LOAD(xtmp, ninv_tw_addr) + ld1r {ninv_tw.4s}, [xtmp] + + ushr modulus_half.4S, modulus.4S, #1 + neg neg_modulus_half.4S, modulus_half.4S + + load_roots_1234 r_ptr1 + + .p2align 2 +layer1234_start: + ldr_vo data0, in, 0 + ldr_vo data1, in, (1*(512/8)) + ldr_vo data2, in, (2*(512/8)) + ldr_vo data3, in, (3*(512/8)) + ldr_vo data4, in, (4*(512/8)) + ldr_vo data5, in, (5*(512/8)) + ldr_vo data6, in, (6*(512/8)) + ldr_vo data7, in, (7*(512/8)) + ldr_vo data8, in, (8*(512/8)) + ldr_vo data9, in, (9*(512/8)) + ldr_vo data10, in, (10*(512/8)) + ldr_vo data11, in, (11*(512/8)) + ldr_vo data12, in, (12*(512/8)) + ldr_vo data13, in, (13*(512/8)) + ldr_vo data14, in, (14*(512/8)) + ldr_vo data15, in, (15*(512/8)) + + // layer4 + gs_butterfly data0, data1, root3, 2, 3 + gs_butterfly data2, data3, root4, 0, 1 + gs_butterfly data4, data5, root4, 2, 3 + gs_butterfly data6, data7, root5, 0, 1 + gs_butterfly data8, data9, root5, 2, 3 + gs_butterfly data10, data11, root6, 0, 1 + gs_butterfly data12, data13, root6, 2, 3 + gs_butterfly data14, data15, root7, 0, 1 + + // layer3 + gs_butterfly data0, data2, root1, 2, 3 + gs_butterfly data1, data3, root1, 2, 3 + gs_butterfly data4, data6, root2, 0, 1 + gs_butterfly data5, data7, root2, 0, 1 + gs_butterfly data8, data10, root2, 2, 3 + gs_butterfly data9, data11, root2, 2, 3 + gs_butterfly data12, data14, root3, 0, 1 + gs_butterfly data13, data15, root3, 0, 1 + + // layer2 + gs_butterfly data0, data4, root0, 2, 3 + gs_butterfly data1, data5, root0, 2, 3 + gs_butterfly data2, data6, root0, 2, 3 + gs_butterfly data3, data7, root0, 2, 3 + gs_butterfly data8, data12, root1, 0, 1 + gs_butterfly data9, data13, root1, 0, 1 + gs_butterfly data10, data14, root1, 0, 1 + gs_butterfly data11, data15, root1, 0, 1 + + // layer 1 + gs_butterfly data0, data8, root0, 0, 1 + gs_butterfly data1, data9, root0, 0, 1 + 
gs_butterfly data2, data10, root0, 0, 1 + gs_butterfly data3, data11, root0, 0, 1 + gs_butterfly data4, data12, root0, 0, 1 + gs_butterfly data5, data13, root0, 0, 1 + gs_butterfly data6, data14, root0, 0, 1 + gs_butterfly data7, data15, root0, 0, 1 + + canonical_reduce data8, modulus_half, neg_modulus_half, t2, t3 + canonical_reduce data9, modulus_half, neg_modulus_half, t2, t3 + canonical_reduce data10, modulus_half, neg_modulus_half, t2, t3 + canonical_reduce data11, modulus_half, neg_modulus_half, t2, t3 + canonical_reduce data12, modulus_half, neg_modulus_half, t2, t3 + canonical_reduce data13, modulus_half, neg_modulus_half, t2, t3 + canonical_reduce data14, modulus_half, neg_modulus_half, t2, t3 + canonical_reduce data15, modulus_half, neg_modulus_half, t2, t3 + + str_vo data8, in, (8*(512/8)) + str_vo data9, in, (9*(512/8)) + str_vo data10, in, (10*(512/8)) + str_vo data11, in, (11*(512/8)) + str_vo data12, in, (12*(512/8)) + str_vo data13, in, (13*(512/8)) + str_vo data14, in, (14*(512/8)) + str_vo data15, in, (15*(512/8)) + + mul_ninv data8, data9, data10, data11, data12, data13, data14, data15, data0, data1, data2, data3, data4, data5, data6, data7 + + canonical_reduce data8, modulus_half, neg_modulus_half, t2, t3 + canonical_reduce data9, modulus_half, neg_modulus_half, t2, t3 + canonical_reduce data10, modulus_half, neg_modulus_half, t2, t3 + canonical_reduce data11, modulus_half, neg_modulus_half, t2, t3 + canonical_reduce data12, modulus_half, neg_modulus_half, t2, t3 + canonical_reduce data13, modulus_half, neg_modulus_half, t2, t3 + canonical_reduce data14, modulus_half, neg_modulus_half, t2, t3 + canonical_reduce data15, modulus_half, neg_modulus_half, t2, t3 + + str_vi data8, in, (16) + str_vo data9, in, (-16 + 1*(512/8)) + str_vo data10, in, (-16 + 2*(512/8)) + str_vo data11, in, (-16 + 3*(512/8)) + str_vo data12, in, (-16 + 4*(512/8)) + str_vo data13, in, (-16 + 5*(512/8)) + str_vo data14, in, (-16 + 6*(512/8)) + str_vo data15, in, (-16 + 
7*(512/8)) + +// layer1234_end: + subs count, count, #1 + cbnz count, layer1234_start + + pop_stack + ret diff --git a/tests/ntt_dilithium/manual/intt_dilithium_123_45678_manual_ld4.s b/tests/ntt_dilithium/manual/intt_dilithium_123_45678_manual_ld4.s new file mode 100644 index 0000000..c0cd992 --- /dev/null +++ b/tests/ntt_dilithium/manual/intt_dilithium_123_45678_manual_ld4.s @@ -0,0 +1,532 @@ + +// Needed to provide ASM_LOAD directive +#include + +// NOTE +// We use a lot of trivial macros to simplify the parsing burden for Slothy +// The macros are not unfolded by Slothy and thus interpreted as instructions, +// which are easier to parse due to e.g. the lack of size specifiers and simpler +// syntax for pre and post increment for loads and stores. +// +// Eventually, NeLight should include a proper parser for AArch64, +// but for initial investigations, the below is enough. +xtmp0 .req x10 +xtmp1 .req x11 + +.macro ldr_vo vec, base, offset + ldr qform_\vec, [\base, #\offset] +.endm + +.macro ldr_vi vec, base, inc + ldr qform_\vec, [\base], #\inc +.endm + +.macro str_vo vec, base, offset + str qform_\vec, [\base, #\offset] +.endm +.macro str_vi vec, base, inc + str qform_\vec, [\base], #\inc +.endm +.macro vqrdmulh d,a,b + sqrdmulh \d\().4s, \a\().4s, \b\().4s +.endm +.macro vmls d,a,b + mls \d\().4s, \a\().4s, \b\().4s +.endm +.macro vqrdmulhq d,a,b,i + sqrdmulh \d\().4s, \a\().4s, \b\().s[\i] +.endm +.macro vqdmulhq d,a,b,i + sqdmulh \d\().4s, \a\().4s, \b\().4s[\i] +.endm +.macro vmulq d,a,b,i + mul \d\().4s, \a\().4s, \b\().s[\i] +.endm +.macro vmlsq d,a,b,i + mls \d\().4s, \a\().4s, \b\().s[\i] +.endm + +.macro mulmodq dst, src, const, idx0, idx1 + vmulq \dst, \src, \const, \idx0 + vqrdmulhq \src, \src, \const, \idx1 + vmlsq \dst, \src, consts, 0 +.endm + +.macro mulmod dst, src, const, const_twisted + mul \dst\().4s, \src\().4s, \const\().4s + vqrdmulh \src, \src, \const_twisted + vmlsq \dst, \src, consts, 0 +.endm + +.macro montg_reduce a + srshr tmp.4S, 
\a\().4S, #23 + vmls \a, tmp, consts +.endm + +.macro canonical_reduce a, modulus_half, neg_modulus_half, tmp1, tmp2 + cmge \tmp1\().4s, \neg_modulus_half\().4s, \a\().4s + cmge \tmp2\().4s, \a\().4s, \modulus_half\().4s + sub \tmp2\().4s, \tmp1\().4s, \tmp2\().4s + vmls \a, \tmp2, modulus +.endm + +.macro gs_butterfly a, b, root, idx0, idx1 + sub tmp.4s, \a\().4s, \b\().4s + add \a\().4s, \a\().4s, \b\().4s + mulmodq \b, tmp, \root, \idx0, \idx1 +.endm + +.macro mulmod_v dst, src, const, const_twisted + vmul \dst, \src, \const + vqrdmulh \src, \src, \const_twisted + vmls \dst, \src, modulus +.endm + +.macro gs_butterfly_v a, b, root, root_twisted + sub tmp.4s, \a\().4s, \b\().4s + add \a\().4s, \a\().4s, \b\().4s + mulmod \b, tmp, \root, \root_twisted +.endm + +.macro mul_ninv dst0, dst1, dst2, dst3, src0, src1, src2, src3 + mulmod \dst0, \src0, ninv, ninv_tw + mulmod \dst1, \src1, ninv, ninv_tw + mulmod \dst2, \src2, ninv, ninv_tw + mulmod \dst3, \src3, ninv, ninv_tw +.endm + +.macro load_vectors a0, a1, a2, a3, addr + ldr_vo \a0, \addr, (16*0) + ldr_vo \a1, \addr, (16*1) + ldr_vo \a2, \addr, (16*2) + ldr_vo \a3, \addr, (16*3) +.endm + +.macro load_vectors_with_offset a0, a1, a2, a3, addr, offset + ldr_vo \a0, \addr, (16*0 + (\offset)) + ldr_vo \a1, \addr, (16*1 + (\offset)) + ldr_vo \a2, \addr, (16*2 + (\offset)) + ldr_vo \a3, \addr, (16*3 + (\offset)) +.endm + +.macro store_vectors_with_inc a0, a1, a2, a3, addr, inc + str_vi \a0, \addr, \inc + str_vo \a1, \addr, (-(\inc) + 16*1) + str_vo \a2, \addr, (-(\inc) + 16*2) + str_vo \a3, \addr, (-(\inc) + 16*3) +.endm + +.macro vec_to_scalar_matrix out, in + vext \out\()_00, \in\()0, 0 + vext \out\()_01, \in\()0, 1 + vext \out\()_10, \in\()1, 0 + vext \out\()_11, \in\()1, 1 + vext \out\()_20, \in\()2, 0 + vext \out\()_21, \in\()2, 1 + vext \out\()_30, \in\()3, 0 + vext \out\()_31, \in\()3, 1 +.endm + +.macro store_scalar_matrix_with_inc x, addr, inc + str \x\()t_00, [\addr], #( \inc) + str \x\()t_01, [\addr, #(-\inc + 
8*1)] + str \x\()t_10, [\addr, #(-\inc + 8*2)] + str \x\()t_11, [\addr, #(-\inc + 8*3)] + str \x\()t_20, [\addr, #(-\inc + 8*4)] + str \x\()t_21, [\addr, #(-\inc + 8*5)] + str \x\()t_30, [\addr, #(-\inc + 8*6)] + str \x\()t_31, [\addr, #(-\inc + 8*7)] +.endm + +.macro vext gpr_out, vec_in, lane + umov \gpr_out\(), \vec_in\().d[\lane] +.endm + +.macro load_roots_123 + ldr_vi root0, r_ptr0, 64 + ldr_vo root1, r_ptr0, (-64 + 16) + ldr_vo root2, r_ptr0, (-64 + 32) + ldr_vo root3, r_ptr0, (-64 + 48) +.endm + +.macro load_roots_456 + ldr_vi root0, r_ptr0, 64 + ldr_vo root1, r_ptr0, (-64 + 16) + ldr_vo root2, r_ptr0, (-64 + 32) + ldr_vo root3, r_ptr0, (-64 + 48) +.endm + +.macro load_roots_78_part1 + ldr_vi root0, r_ptr1, (12*16) + ldr_vo root0_tw, r_ptr1, (-12*16 + 1*16) + ldr_vo root1, r_ptr1, (-12*16 + 2*16) + ldr_vo root1_tw, r_ptr1, (-12*16 + 3*16) + ldr_vo root2, r_ptr1, (-12*16 + 4*16) + ldr_vo root2_tw, r_ptr1, (-12*16 + 5*16) +.endm + +.macro load_roots_78_part2 + ldr_vo root0, r_ptr1, (-12*16 + 6*16) + ldr_vo root0_tw, r_ptr1, (-12*16 + 7*16) + ldr_vo root1, r_ptr1, (-12*16 + 8*16) + ldr_vo root1_tw, r_ptr1, (-12*16 + 9*16) + ldr_vo root2, r_ptr1, (-12*16 + 10*16) + ldr_vo root2_tw, r_ptr1, (-12*16 + 11*16) +.endm + +.macro transpose4 data0, data1, data2, data3 + trn1 t0.4s, \data0\().4s, \data1\().4s + trn2 t1.4s, \data0\().4s, \data1\().4s + trn1 t2.4s, \data2\().4s, \data3\().4s + trn2 t3.4s, \data2\().4s, \data3\().4s + + trn2 \data2\().2d, t0.2d, t2.2d + trn2 \data3\().2d, t1.2d, t3.2d + trn1 \data0\().2d, t0.2d, t2.2d + trn1 \data1\().2d, t1.2d, t3.2d +.endm + +.macro transpose_single data_out0, data_out1, data_out2, data_out3, data_in0, data_in1, data_in2, data_in3 + trn1 \data_out0\().4s, \data_in0\().4s, \data_in1\().4s + trn2 \data_out1\().4s, \data_in0\().4s, \data_in1\().4s + trn1 \data_out2\().4s, \data_in2\().4s, \data_in3\().4s + trn2 \data_out3\().4s, \data_in2\().4s, \data_in3\().4s +.endm + +.macro save_gprs // slothy:no-unfold + sub sp, sp, 
#(16*6) + stp x19, x20, [sp, #16*0] + stp x19, x20, [sp, #16*0] + stp x21, x22, [sp, #16*1] + stp x23, x24, [sp, #16*2] + stp x25, x26, [sp, #16*3] + stp x27, x28, [sp, #16*4] + stp x29, x30, [sp, #16*5] +.endm + +.macro restore_gprs // slothy:no-unfold + ldp x19, x20, [sp, #16*0] + ldp x21, x22, [sp, #16*1] + ldp x23, x24, [sp, #16*2] + ldp x25, x26, [sp, #16*3] + ldp x27, x28, [sp, #16*4] + ldp x29, x30, [sp, #16*5] + add sp, sp, #(16*6) +.endm + +.macro save_vregs // slothy:no-unfold + sub sp, sp, #(16*4) + stp d8, d9, [sp, #16*0] + stp d10, d11, [sp, #16*1] + stp d12, d13, [sp, #16*2] + stp d14, d15, [sp, #16*3] +.endm + +.macro restore_vregs // slothy:no-unfold + ldp d8, d9, [sp, #16*0] + ldp d10, d11, [sp, #16*1] + ldp d12, d13, [sp, #16*2] + ldp d14, d15, [sp, #16*3] + add sp, sp, #(16*4) +.endm + +#define STACK_SIZE 16 +#define STACK0 0 + +.macro restore a, loc // slothy:no-unfold + ldr \a, [sp, #\loc\()] +.endm +.macro save loc, a // slothy:no-unfold + str \a, [sp, #\loc\()] +.endm +.macro push_stack // slothy:no-unfold + save_gprs + save_vregs + sub sp, sp, #STACK_SIZE +.endm + +.macro pop_stack // slothy:no-unfold + add sp, sp, #STACK_SIZE + restore_vregs + restore_gprs +.endm + +.data +.p2align 4 +roots: +#include "intt_dilithium_123_456_78_twiddles.s" +.text + + .global intt_dilithium_123_45678_manual_ld4 + .global _intt_dilithium_123_45678_manual_ld4 + +.p2align 4 +const_addr: .word 8380417 + .word 0 + .word 0 + .word 0 +ninv_addr: .quad 16382 +ninv_tw_addr: .quad 4197891 +intt_dilithium_123_45678_manual_ld4: +_intt_dilithium_123_45678_manual_ld4: + push_stack + + in .req x0 + inp .req x1 + inpp .req x2 + count .req x3 + r_ptr0 .req x4 + r_ptr1 .req x5 + xtmp .req x6 + + data0 .req v9 + data1 .req v10 + data2 .req v11 + data3 .req v12 + data4 .req v13 + data5 .req v14 + data6 .req v15 + data7 .req v16 + + qform_data0 .req q9 + qform_data1 .req q10 + qform_data2 .req q11 + qform_data3 .req q12 + qform_data4 .req q13 + qform_data5 .req q14 + qform_data6 
.req q15 + qform_data7 .req q16 + + qform_v0 .req q0 + qform_v1 .req q1 + qform_v2 .req q2 + qform_v3 .req q3 + qform_v4 .req q4 + qform_v5 .req q5 + qform_v6 .req q6 + qform_v7 .req q7 + qform_v8 .req q8 + qform_v9 .req q9 + qform_v10 .req q10 + qform_v11 .req q11 + qform_v12 .req q12 + qform_v13 .req q13 + qform_v14 .req q14 + qform_v15 .req q15 + qform_v16 .req q16 + qform_v17 .req q17 + qform_v18 .req q18 + qform_v19 .req q19 + qform_v20 .req q20 + qform_v21 .req q21 + qform_v22 .req q22 + qform_v23 .req q23 + qform_v24 .req q24 + qform_v25 .req q25 + qform_v26 .req q26 + qform_v27 .req q27 + qform_v28 .req q28 + qform_v29 .req q29 + qform_v30 .req q30 + qform_v31 .req q31 + + x_00 .req x10 + x_01 .req x11 + x_10 .req x12 + x_11 .req x13 + x_20 .req x14 + x_21 .req x15 + x_30 .req x16 + x_31 .req x17 + + xt_00 .req x_00 + xt_01 .req x_20 + xt_10 .req x_10 + xt_11 .req x_30 + xt_20 .req x_01 + xt_21 .req x_21 + xt_30 .req x_11 + xt_31 .req x_31 + + root0 .req v0 + root1 .req v1 + root2 .req v2 + root3 .req v3 + + qform_root0 .req q0 + qform_root1 .req q1 + qform_root2 .req q2 + qform_root3 .req q3 + + tmp .req v24 + t0 .req v25 + t1 .req v26 + t2 .req v27 + t3 .req v28 + tp0 .req v17 + tp1 .req v18 + tp2 .req v19 + tp3 .req v20 + + consts .req v8 + qform_consts .req q8 + + ASM_LOAD(r_ptr0, roots_l345) + ASM_LOAD(r_ptr1, roots_l67) + + ASM_LOAD(xtmp, const_addr) + ld1r {consts.4s}, [xtmp] + save STACK0, in + + restore inp, STACK0 + mov inp, in + add inpp, inp, #64 + mov count, #8 + + root0_tw .req v4 + root1_tw .req v5 + root2_tw .req v6 + root3_tw .req v7 + qform_root0_tw .req q4 + qform_root1_tw .req q5 + qform_root2_tw .req q6 + qform_root3_tw .req q7 + + .p2align 2 +layer45678_start: + // Manual ld4 using vector instructions + ldr_vo data0, inp, 0 + ldr_vo data1, inp, 16 + ldr_vo data2, inp, 32 + ldr_vo data3, inp, 48 + transpose4 data0, data1, data2, data3 + + ldr_vo data4, inpp, 0 + ldr_vo data5, inpp, 16 + ldr_vo data6, inpp, 32 + ldr_vo data7, inpp, 48 + 
transpose4 data4, data5, data6, data7 + + load_roots_78_part1 + + // Layer 8 Part 1 + gs_butterfly_v data0, data1, root1, root1_tw + gs_butterfly_v data2, data3, root2, root2_tw + // Layer 7 Part 1 + gs_butterfly_v data0, data2, root0, root0_tw + gs_butterfly_v data1, data3, root0, root0_tw + + load_roots_78_part2 + + // Layer 8 Part 2 + gs_butterfly_v data4, data5, root1, root1_tw + gs_butterfly_v data6, data7, root2, root2_tw + // Layer 7 Part 2 + gs_butterfly_v data4, data6, root0, root0_tw + gs_butterfly_v data5, data7, root0, root0_tw + + transpose4 data0, data1, data2, data3 + transpose4 data4, data5, data6, data7 + + load_roots_456 + + // Layer 6 + gs_butterfly data0, data1, root1, 2, 3 + gs_butterfly data2, data3, root2, 0, 1 + gs_butterfly data4, data5, root2, 2, 3 + gs_butterfly data6, data7, root3, 0, 1 + + // Layer 5 + gs_butterfly data0, data2, root0, 2, 3 + gs_butterfly data1, data3, root0, 2, 3 + gs_butterfly data4, data6, root1, 0, 1 + gs_butterfly data5, data7, root1, 0, 1 + + // Layer 4 + gs_butterfly data0, data4, root0, 0, 1 + gs_butterfly data1, data5, root0, 0, 1 + gs_butterfly data2, data6, root0, 0, 1 + gs_butterfly data3, data7, root0, 0, 1 + + // Standard way using vector instructions + + str_vi data0, inp, (16*4) + str_vo data1, inp, (-16*4 + 1*16) + str_vo data2, inp, (-16*4 + 2*16) + str_vo data3, inp, (-16*4 + 3*16) + + str_vi data4, inpp, (16*4) + str_vo data5, inpp, (-16*4 + 1*16) + str_vo data6, inpp, (-16*4 + 2*16) + str_vo data7, inpp, (-16*4 + 3*16) + + add inp, inp, #64 + add inpp, inpp, #64 + + subs count, count, #1 + cbnz count, layer45678_start + +// ----------------------------------------------------------------------------- + + ninv .req v25 + ninv_tw .req v26 + + ASM_LOAD(xtmp, ninv_addr) + ld1r {ninv.4s}, [xtmp] + ASM_LOAD(xtmp, ninv_tw_addr) + ld1r {ninv_tw.4s}, [xtmp] + + mov count, #8 + ASM_LOAD(r_ptr0, roots_l012) + load_roots_123 + + .p2align 2 +layer123_start: + + ldr_vo data0, in, 0 + ldr_vo data1, in, 
(1*(1024/8)) + ldr_vo data2, in, (2*(1024/8)) + ldr_vo data3, in, (3*(1024/8)) + ldr_vo data4, in, (4*(1024/8)) + ldr_vo data5, in, (5*(1024/8)) + ldr_vo data6, in, (6*(1024/8)) + ldr_vo data7, in, (7*(1024/8)) + + gs_butterfly data0, data1, root1, 2, 3 + gs_butterfly data2, data3, root2, 0, 1 + gs_butterfly data4, data5, root2, 2, 3 + gs_butterfly data6, data7, root3, 0, 1 + + gs_butterfly data0, data2, root0, 2, 3 + gs_butterfly data1, data3, root0, 2, 3 + gs_butterfly data4, data6, root1, 0, 1 + gs_butterfly data5, data7, root1, 0, 1 + + // root0[0] includes ninv, manually computed. + gs_butterfly data0, data4, root0, 0, 1 + gs_butterfly data1, data5, root0, 0, 1 + gs_butterfly data2, data6, root0, 0, 1 + gs_butterfly data3, data7, root0, 0, 1 + + montg_reduce data4 + montg_reduce data5 + montg_reduce data6 + montg_reduce data7 + + str_vo data4, in, (4*(1024/8)) + str_vo data5, in, (5*(1024/8)) + str_vo data6, in, (6*(1024/8)) + str_vo data7, in, (7*(1024/8)) + + mul_ninv data4, data5, data6, data7, data0, data1, data2, data3 + + str_vi data4, in, (16) + str_vo data5, in, (-16 + 1*(1024/8)) + str_vo data6, in, (-16 + 2*(1024/8)) + str_vo data7, in, (-16 + 3*(1024/8)) + + subs count, count, #1 + cbnz count, layer123_start + + pop_stack + ret From b0c82c108ad31f462ecd066fe67ef5563ee66ed9 Mon Sep 17 00:00:00 2001 From: Amin Abdulrahman Date: Tue, 5 Mar 2024 17:07:23 +0100 Subject: [PATCH 03/18] Kyber Reference Code * Switch to code by Matthias * Add ref for invNTT --- tests/ntt_kyber/main.c | 257 +++++++++++++++++++++++++++++++---------- 1 file changed, 196 insertions(+), 61 deletions(-) diff --git a/tests/ntt_kyber/main.c b/tests/ntt_kyber/main.c index d94d6f3..621c879 100644 --- a/tests/ntt_kyber/main.c +++ b/tests/ntt_kyber/main.c @@ -34,6 +34,7 @@ #include #include #include +#include #define WARMUP_ITERATIONS 1000 #define ITER_PER_TEST 1000 @@ -89,25 +90,31 @@ void ntt_kyber_123_4567_scalar_store_opt_m1_icestorm(int16_t *); #include #include "neonntt.h" 
#include "pqclean.h" + +#define T int16_t +#define T2 int32_t + /* * Test cases - */ +*/ int16_t base_root = 17; int16_t modulus = 3329; uint16_t modulus_inv_u16 = 62209; +int16_t ninvR = 2285; // TODO FIX +int16_t base_root_inv = 1175; int16_t roots [NTT_ROOT_ORDER / 2] __attribute__((aligned(16))) = { 0 }; uint16_t roots_twisted[NTT_ROOT_ORDER / 2] __attribute__((aligned(16))) = { 0 }; -void build_roots() -{ - for( unsigned i=0; i < NTT_ROOT_ORDER / 2; i++ ) - { - roots[i] = mod_pow_s16( base_root, i, modulus ); - roots_twisted[i] = roots[i] * modulus_inv_u16; - } -} +// void build_roots() +// { +// for( unsigned i=0; i < NTT_ROOT_ORDER / 2; i++ ) +// { +// roots[i] = mod_pow_s16( base_root, i, modulus ); +// roots_twisted[i] = roots[i] * modulus_inv_u16; +// } +// } unsigned bit_reverse( unsigned in, unsigned width ) { @@ -126,46 +133,173 @@ static int cmp_uint64_t(const void *a, const void *b) return (int)((*((const uint64_t *)a)) - (*((const uint64_t *)b))); } -void ntt_s16_C( int16_t *src ) -{ - int16_t res[NTT_SIZE]; - build_roots(); +// NTT FFT reference code form +// https://github.com/mkannwischer/polymul/blob/072248095f5ef14f874e73772525cab68fb9d454/C/07incomplete.c +// slightly modified - for( unsigned t=0; t= ( NTT_ROOT_ORDER / 2 ) ); - exp = exp % ( NTT_ROOT_ORDER / 2 ); - - cur = mod_mul_s16( src[NTT_LAYER_STRIDE*j+t], - roots[exp], - modulus ); - - if( !sub ) - tmp = mod_add_s16( tmp, cur, modulus ); - else - tmp = mod_sub_s16( tmp, cur, modulus ); - } - res[NTT_LAYER_STRIDE*i+t] = tmp; +/** + * @brief Bitreverse an array of length n inplace + * + * @param src array + * @param n length of array + */ +void bitreverse(T *src, size_t n){ + for(size_t i = 0, j = 0; i < n; i++){ + if(i < j){ + src[i] += src[j]; + src[i] -= (src[j] = (src[i] - src[j])); } + for(size_t k = n >> 1; (j ^= k) < k; k >>=1); } +} - mod_reduce_buf_s16_signed( res, NTT_SIZE, modulus ); - memcpy( src, res, sizeof( res ) ); +/** + * @brief Precompute the required twiddle factors 
for a incomplete negacyclic Cooley--Tukey FFT + * + * First layer: [-1] = [root^(n/2)] + * Second layer: [sqrt(-1), -sqrt(-1)] = [root^(n/4), root^(3n/4)] + * Third layer: [sqrt(root^(n/4)), -sqrt(root^(n/4)), sqrt(root^(3n/4)), -sqrt(root^(3n/4))] + =[root^(n/8), root^(5n/8), root^(3n/8), root^(7n/8)] + * ... + * + * @param twiddles output buffer for the twiddles. needs to hold (2^numLayers)-1 twiddles + * @param n number of coefficients in polynomials (not size of the NTT) + * @param root 2*(2^numLayers)-th primitive root of unity modulo q + * @param q modulus + * @param numLayers number of layers in the NTT. Needs to be <= log n + * @return int 1 if there is an error, 0 otherwise + */ +static int precomp_ct_negacyclic(T *twiddles, size_t n, T root, T q, size_t numLayers){ + //powers = [pow(root, i, q) for i in range(2**numLayers//2)] + T powers[(1< Date: Tue, 5 Mar 2024 17:07:37 +0100 Subject: [PATCH 04/18] Unify types for Dilithium test --- tests/ntt_dilithium/main.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/tests/ntt_dilithium/main.c b/tests/ntt_dilithium/main.c index 07311fd..aad95cb 100644 --- a/tests/ntt_dilithium/main.c +++ b/tests/ntt_dilithium/main.c @@ -109,7 +109,7 @@ static int cmp_uint64_t(const void *a, const void *b) * @param src array * @param n length of array */ -void bitreverse(int32_t *src, size_t n){ +void bitreverse(T *src, size_t n){ for(size_t i = 0, j = 0; i < n; i++){ if(i < j){ src[i] += src[j]; @@ -133,12 +133,12 @@ void bitreverse(int32_t *src, size_t n){ * @param q modulus * @return int 1 if there is an error, 0 otherwise */ -static int precomp_ct_negacyclic(int32_t *twiddles, size_t n, int32_t root, int32_t q){ +static int precomp_ct_negacyclic(T *twiddles, size_t n, T root, T q){ - int32_t powers[n]; + T powers[n]; powers[0] = 1; for(size_t i=1;i Date: Wed, 6 Mar 2024 15:43:40 +0100 Subject: [PATCH 05/18] Kyber invNTT test --- asm/manual/ntt_kyber/intt_kyber_123_4567.s | 1 + 
.../ntt_kyber/intt_kyber_123_45_67_twiddles.s | 1 + tests/ntt_kyber/main.c | 154 +++--- tests/ntt_kyber/manual/intt_kyber_123_4567.s | 466 +++++++++++++++++ .../manual/intt_kyber_123_45_67_twiddles.s | 494 ++++++++++++++++++ 5 files changed, 1054 insertions(+), 62 deletions(-) create mode 120000 asm/manual/ntt_kyber/intt_kyber_123_4567.s create mode 120000 asm/manual/ntt_kyber/intt_kyber_123_45_67_twiddles.s create mode 100644 tests/ntt_kyber/manual/intt_kyber_123_4567.s create mode 100644 tests/ntt_kyber/manual/intt_kyber_123_45_67_twiddles.s diff --git a/asm/manual/ntt_kyber/intt_kyber_123_4567.s b/asm/manual/ntt_kyber/intt_kyber_123_4567.s new file mode 120000 index 0000000..95209bb --- /dev/null +++ b/asm/manual/ntt_kyber/intt_kyber_123_4567.s @@ -0,0 +1 @@ +../../../slothy/examples/naive/aarch64/intt_kyber_123_4567.s \ No newline at end of file diff --git a/asm/manual/ntt_kyber/intt_kyber_123_45_67_twiddles.s b/asm/manual/ntt_kyber/intt_kyber_123_45_67_twiddles.s new file mode 120000 index 0000000..764f1d8 --- /dev/null +++ b/asm/manual/ntt_kyber/intt_kyber_123_45_67_twiddles.s @@ -0,0 +1 @@ +../../../slothy/examples/naive/aarch64/intt_kyber_123_45_67_twiddles.s \ No newline at end of file diff --git a/tests/ntt_kyber/main.c b/tests/ntt_kyber/main.c index 621c879..c4ab12f 100644 --- a/tests/ntt_kyber/main.c +++ b/tests/ntt_kyber/main.c @@ -43,6 +43,7 @@ /* Add declarationa for ASM NTTs here */ // base void ntt_kyber_123_4567(int16_t *); +void intt_kyber_123_4567(int16_t *); void ntt_kyber_123_4567_scalar_load(int16_t *); void ntt_kyber_123_4567_scalar_load_store(int16_t *); void ntt_kyber_123_4567_scalar_store(int16_t *); @@ -101,7 +102,8 @@ void ntt_kyber_123_4567_scalar_store_opt_m1_icestorm(int16_t *); int16_t base_root = 17; int16_t modulus = 3329; uint16_t modulus_inv_u16 = 62209; -int16_t ninvR = 2285; // TODO FIX +int16_t ninv = 1441; // TODO FIX +int16_t ninvR = 512; // TODO for from_mont int16_t base_root_inv = 1175; int16_t roots [NTT_ROOT_ORDER / 2] 
__attribute__((aligned(16))) = { 0 }; @@ -196,20 +198,20 @@ static int precomp_ct_negacyclic(T *twiddles, size_t n, T root, T q, size_t numL * @param numLayers number of layers in the NTT. Needs to be <= log n * @return int 1 if there is an error, 0 otherwise */ -// static int precomp_gs_negacyclic(T *twiddles, size_t n, T root, T q, size_t numLayers){ -// //powers = [pow(root, -(i+1), q) for i in range(2**numLayers)] -// T powers[(1<%d, %d\n", a[idx0], a[idx1]); + } + } + } -// // Note: Half of these multiplications can be merged into the last -// // layer of butterflies by pre-computing (twiddle*ninv)%q -// for(size_t i=0;i + +// NOTE +// We use a lot of trivial macros to simplify the parsing burden for Slothy +// The macros are not unfolded by Slothy and thus interpreted as instructions, +// which are easier to parse due to e.g. the lack of size specifiers and simpler +// syntax for pre and post increment for loads and stores. +// +// Eventually, NeLight should include a proper parser for AArch64, +// but for initial investigations, the below is enough. 
+ +.macro ldr_vo vec, base, offset + ldr qform_\vec, [\base, #\offset] +.endm + +.macro ldr_vi vec, base, inc + ldr qform_\vec, [\base], #\inc +.endm + +.macro str_vo vec, base, offset + str qform_\vec, [\base, #\offset] +.endm +.macro str_vi vec, base, inc + str qform_\vec, [\base], #\inc +.endm + +.macro vqrdmulh d,a,b + sqrdmulh \d\().8h, \a\().8h, \b\().8h +.endm +.macro vmlsq d,a,b,i + mls \d\().8h, \a\().8h, \b\().h[\i] +.endm +.macro vqrdmulhq d,a,b,i + sqrdmulh \d\().8h, \a\().8h, \b\().h[\i] +.endm +.macro vqdmulhq d,a,b,i + sqdmulh \d\().8h, \a\().8h, \b\().h[\i] +.endm +.macro vmulq d,a,b,i + mul \d\().8h, \a\().8h, \b\().h[\i] +.endm + +.macro mulmodq dst, src, const, idx0, idx1 + vmulq \dst, \src, \const, \idx0 + vqrdmulhq \src, \src, \const, \idx1 + vmlsq \dst, \src, consts, 0 +.endm + +.macro mulmod dst, src, const, const_twisted + mul \dst\().8h, \src\().8h, \const\().8h + vqrdmulh \src, \src, \const_twisted + vmlsq \dst, \src, consts, 0 +.endm + +.macro gs_butterfly a, b, root, idx0, idx1 + sub tmp.8h, \a\().8h, \b\().8h + add \a\().8h, \a\().8h, \b\().8h + mulmodq \b, tmp, \root, \idx0, \idx1 +.endm + +.macro mulmod_v dst, src, const, const_twisted + mul \dst\().8h, \src\().8h, \const\().8h + vqrdmulh \src, \src, \const_twisted + vmlsq \dst, \src, consts, 0 +.endm + +.macro gs_butterfly_v a, b, root, root_twisted + sub tmp.8h, \a\().8h, \b\().8h + add \a\().8h, \a\().8h, \b\().8h + mulmod \b, tmp, \root, \root_twisted +.endm + +.macro mul_ninv dst0, dst1, dst2, dst3, src0, src1, src2, src3 + mulmod \dst0, \src0, ninv, ninv_tw + mulmod \dst1, \src1, ninv, ninv_tw + mulmod \dst2, \src2, ninv, ninv_tw + mulmod \dst3, \src3, ninv, ninv_tw +.endm + +.macro barrett_reduce a + vqdmulhq t0, \a, consts, 1 + srshr t0.8h, t0.8h, #11 + vmlsq \a, t0, consts, 0 +.endm + +.macro load_roots_123 + ldr_vi root0, r_ptr0, 32 + ldr_vo root1, r_ptr0, -16 +.endm + +.macro load_next_roots_45 + ldr_vi root0, r_ptr0, 16 +.endm + +.macro load_next_roots_67 + ldr_vi root0, 
r_ptr1, (6*16) + ldr_vo root0_tw, r_ptr1, (-6*16 + 1*16) + ldr_vo root1, r_ptr1, (-6*16 + 2*16) + ldr_vo root1_tw, r_ptr1, (-6*16 + 3*16) + ldr_vo root2, r_ptr1, (-6*16 + 4*16) + ldr_vo root2_tw, r_ptr1, (-6*16 + 5*16) +.endm + +.macro transpose4 data + trn1 t0.4s, \data\()0\().4s, \data\()1\().4s + trn2 t1.4s, \data\()0\().4s, \data\()1\().4s + trn1 t2.4s, \data\()2\().4s, \data\()3\().4s + trn2 t3.4s, \data\()2\().4s, \data\()3\().4s + + trn2 \data\()2\().2d, t0.2d, t2.2d + trn2 \data\()3\().2d, t1.2d, t3.2d + trn1 \data\()0\().2d, t0.2d, t2.2d + trn1 \data\()1\().2d, t1.2d, t3.2d +.endm + +.macro transpose_single data_out, data_in + trn1 \data_out\()0\().4s, \data_in\()0\().4s, \data_in\()1\().4s + trn2 \data_out\()1\().4s, \data_in\()0\().4s, \data_in\()1\().4s + trn1 \data_out\()2\().4s, \data_in\()2\().4s, \data_in\()3\().4s + trn2 \data_out\()3\().4s, \data_in\()2\().4s, \data_in\()3\().4s +.endm + +.macro save_gprs // slothy:no-unfold + sub sp, sp, #(16*6) + stp x19, x20, [sp, #16*0] + stp x19, x20, [sp, #16*0] + stp x21, x22, [sp, #16*1] + stp x23, x24, [sp, #16*2] + stp x25, x26, [sp, #16*3] + stp x27, x28, [sp, #16*4] + str x29, [sp, #16*5] +.endm + +.macro restore_gprs // slothy:no-unfold + ldp x19, x20, [sp, #16*0] + ldp x21, x22, [sp, #16*1] + ldp x23, x24, [sp, #16*2] + ldp x25, x26, [sp, #16*3] + ldp x27, x28, [sp, #16*4] + ldr x29, [sp, #16*5] + add sp, sp, #(16*6) +.endm + +.macro save_vregs // slothy:no-unfold + sub sp, sp, #(16*4) + stp d8, d9, [sp, #16*0] + stp d10, d11, [sp, #16*1] + stp d12, d13, [sp, #16*2] + stp d14, d15, [sp, #16*3] +.endm + +.macro restore_vregs // slothy:no-unfold + ldp d8, d9, [sp, #16*0] + ldp d10, d11, [sp, #16*1] + ldp d12, d13, [sp, #16*2] + ldp d14, d15, [sp, #16*3] + add sp, sp, #(16*4) +.endm + +#define STACK_SIZE 16 +#define STACK0 0 + +.macro restore a, loc // slothy:no-unfold + ldr \a, [sp, #\loc\()] +.endm +.macro save loc, a // slothy:no-unfold + str \a, [sp, #\loc\()] +.endm +.macro push_stack // 
slothy:no-unfold + save_gprs + save_vregs + sub sp, sp, #STACK_SIZE +.endm + +.macro pop_stack // slothy:no-unfold + add sp, sp, #STACK_SIZE + restore_vregs + restore_gprs +.endm + +.data +.p2align 4 +roots: +#include "intt_kyber_123_45_67_twiddles.s" +.text + + .global intt_kyber_123_4567 + .global _intt_kyber_123_4567 + +.p2align 4 +const_addr: .short 3329 + .short 20159 + .short 0 + .short 0 + .short 0 + .short 0 + .short 0 + .short 0 +ninv_addr: .short 512 + .short 512 + .short 512 + .short 512 + .short 512 + .short 512 + .short 512 + .short 512 +ninv_tw_addr: .short 5040 + .short 5040 + .short 5040 + .short 5040 + .short 5040 + .short 5040 + .short 5040 + .short 5040 + +intt_kyber_123_4567: +_intt_kyber_123_4567: + push_stack + + in .req x0 + inp .req x1 + count .req x2 + r_ptr0 .req x3 + r_ptr1 .req x4 + xtmp .req x5 + + qform_v0 .req q0 + qform_v1 .req q1 + qform_v2 .req q2 + qform_v3 .req q3 + qform_v4 .req q4 + qform_v5 .req q5 + qform_v6 .req q6 + qform_v7 .req q7 + qform_v8 .req q8 + qform_v9 .req q9 + qform_v10 .req q10 + qform_v11 .req q11 + qform_v12 .req q12 + qform_v13 .req q13 + qform_v14 .req q14 + qform_v15 .req q15 + qform_v16 .req q16 + qform_v17 .req q17 + qform_v18 .req q18 + qform_v19 .req q19 + qform_v20 .req q20 + qform_v21 .req q21 + qform_v22 .req q22 + qform_v23 .req q23 + qform_v24 .req q24 + qform_v25 .req q25 + qform_v26 .req q26 + qform_v27 .req q27 + qform_v28 .req q28 + qform_v29 .req q29 + qform_v30 .req q30 + qform_v31 .req q31 + + data0 .req v8 + data1 .req v9 + data2 .req v10 + data3 .req v11 + data4 .req v12 + data5 .req v13 + data6 .req v14 + data7 .req v15 + + x_00 .req x10 + x_01 .req x11 + x_10 .req x12 + x_11 .req x13 + x_20 .req x14 + x_21 .req x15 + x_30 .req x16 + x_31 .req x17 + + xt_00 .req x_00 + xt_01 .req x_20 + xt_10 .req x_10 + xt_11 .req x_30 + xt_20 .req x_01 + xt_21 .req x_21 + xt_30 .req x_11 + xt_31 .req x_31 + + qform_data0 .req q8 + qform_data1 .req q9 + qform_data2 .req q10 + qform_data3 .req q11 + 
qform_data4 .req q12 + qform_data5 .req q13 + qform_data6 .req q14 + qform_data7 .req q15 + + root0 .req v0 + root1 .req v1 + root2 .req v2 + root0_tw .req v4 + root1_tw .req v5 + root2_tw .req v6 + + consts .req v7 + qform_consts .req q7 + + qform_root0 .req q0 + qform_root1 .req q1 + qform_root2 .req q2 + qform_root0_tw .req q4 + qform_root1_tw .req q5 + qform_root2_tw .req q6 + + tmp .req v24 + t0 .req v25 + t1 .req v26 + t2 .req v27 + t3 .req v28 + + ASM_LOAD(r_ptr0, roots_l34) + ASM_LOAD(r_ptr1, roots_l56) + + ASM_LOAD(xtmp, const_addr) + ld1 {consts.8h}, [xtmp] + + save STACK0, in + + mov inp, in + mov count, #8 + + .p2align 2 +layer4567_start: + ldr_vo data0, inp, (16*0) + ldr_vo data1, inp, (16*1) + ldr_vo data2, inp, (16*2) + ldr_vo data3, inp, (16*3) + + transpose4 data // manual ld4 + + load_next_roots_67 + + // Layer 7 + gs_butterfly_v data0, data1, root1, root1_tw + gs_butterfly_v data2, data3, root2, root2_tw + // Layer 6 + gs_butterfly_v data0, data2, root0, root0_tw + gs_butterfly_v data1, data3, root0, root0_tw + + transpose4 data + + load_next_roots_45 + + // Layer 5 + gs_butterfly data0, data1, root0, 2, 3 + gs_butterfly data2, data3, root0, 4, 5 + // Layer 4 + gs_butterfly data0, data2, root0, 0, 1 + gs_butterfly data1, data3, root0, 0, 1 + + // or Montgomery? 
+ barrett_reduce data0 + barrett_reduce data1 + barrett_reduce data2 + barrett_reduce data3 + str_vi data0, inp, (64) + str_vo data1, inp, (-64 + 16*1) + str_vo data2, inp, (-64 + 16*2) + str_vo data3, inp, (-64 + 16*3) + + subs count, count, #1 + cbnz count, layer4567_start + + // --------------------------------------------------------------------- + + ninv .req v29 + ninv_tw .req v30 + + ASM_LOAD(xtmp, ninv_addr) + ld1r {ninv.8h}, [xtmp] + ASM_LOAD(xtmp, ninv_tw_addr) + ld1r {ninv_tw.8h}, [xtmp] + + mov count, #4 + ASM_LOAD(r_ptr0, roots_l012) + load_roots_123 + + .p2align 2 + +layer123_start: + + ldr_vo data0, in, 0 + ldr_vo data1, in, (1*(512/8)) + ldr_vo data2, in, (2*(512/8)) + ldr_vo data3, in, (3*(512/8)) + ldr_vo data4, in, (4*(512/8)) + ldr_vo data5, in, (5*(512/8)) + ldr_vo data6, in, (6*(512/8)) + ldr_vo data7, in, (7*(512/8)) + + gs_butterfly data0, data1, root0, 6, 7 + gs_butterfly data2, data3, root1, 0, 1 + gs_butterfly data4, data5, root1, 2, 3 + gs_butterfly data6, data7, root1, 4, 5 + + gs_butterfly data0, data2, root0, 2, 3 + gs_butterfly data1, data3, root0, 2, 3 + gs_butterfly data4, data6, root0, 4, 5 + gs_butterfly data5, data7, root0, 4, 5 + + gs_butterfly data0, data4, root0, 0, 1 + gs_butterfly data1, data5, root0, 0, 1 + gs_butterfly data2, data6, root0, 0, 1 + gs_butterfly data3, data7, root0, 0, 1 + + // barrett_reduce data4 // JUST TEMPORARY for canonical output + // barrett_reduce data5 // JUST TEMPORARY for canonical output + // barrett_reduce data6 // JUST TEMPORARY for canonical output + // barrett_reduce data7 // JUST TEMPORARY for canonical output + str_vo data4, in, (4*(512/8)) + str_vo data5, in, (5*(512/8)) + str_vo data6, in, (6*(512/8)) + str_vo data7, in, (7*(512/8)) + + mul_ninv data4, data5, data6, data7, data0, data1, data2, data3 + + // barrett_reduce data4 // JUST TEMPORARY for canonical output + // barrett_reduce data5 // JUST TEMPORARY for canonical output + // barrett_reduce data6 // JUST TEMPORARY for canonical 
output + // barrett_reduce data7 // JUST TEMPORARY for canonical output + + str_vi data4, in, (16) + str_vo data5, in, (-16 + 1*(512/8)) + str_vo data6, in, (-16 + 2*(512/8)) + str_vo data7, in, (-16 + 3*(512/8)) + + + subs count, count, #1 + cbnz count, layer123_start + + pop_stack + ret diff --git a/tests/ntt_kyber/manual/intt_kyber_123_45_67_twiddles.s b/tests/ntt_kyber/manual/intt_kyber_123_45_67_twiddles.s new file mode 100644 index 0000000..1e50a61 --- /dev/null +++ b/tests/ntt_kyber/manual/intt_kyber_123_45_67_twiddles.s @@ -0,0 +1,494 @@ + +/// +/// Copyright (c) 2022 Arm Limited +/// Copyright (c) 2022 Hanno Becker +/// Copyright (c) 2023 Amin Abdulrahman, Matthias Kannwischer +/// SPDX-License-Identifier: MIT +/// +/// Permission is hereby granted, free of charge, to any person obtaining a copy +/// of this software and associated documentation files (the "Software"), to deal +/// in the Software without restriction, including without limitation the rights +/// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +/// copies of the Software, and to permit persons to whom the Software is +/// furnished to do so, subject to the following conditions: +/// +/// The above copyright notice and this permission notice shall be included in all +/// copies or substantial portions of the Software. +/// +/// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +/// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +/// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +/// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +/// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +/// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +/// SOFTWARE. 
+/// + +roots_l56: +.short -910 +.short -910 +.short -1227 +.short -1227 +.short 219 +.short 219 +.short 855 +.short 855 +.short -8957 +.short -8957 +.short -12078 +.short -12078 +.short 2156 +.short 2156 +.short 8416 +.short 8416 +.short 1175 +.short 1175 +.short 394 +.short 394 +.short -1029 +.short -1029 +.short -1212 +.short -1212 +.short 11566 +.short 11566 +.short 3878 +.short 3878 +.short -10129 +.short -10129 +.short -11930 +.short -11930 +.short -885 +.short -885 +.short 1219 +.short 1219 +.short 1455 +.short 1455 +.short 1607 +.short 1607 +.short -8711 +.short -8711 +.short 11999 +.short 11999 +.short 14322 +.short 14322 +.short 15818 +.short 15818 +.short -648 +.short -648 +.short -1481 +.short -1481 +.short 712 +.short 712 +.short 682 +.short 682 +.short -6378 +.short -6378 +.short -14578 +.short -14578 +.short 7008 +.short 7008 +.short 6713 +.short 6713 +.short -886 +.short -886 +.short 1179 +.short 1179 +.short -1026 +.short -1026 +.short -1092 +.short -1092 +.short -8721 +.short -8721 +.short 11605 +.short 11605 +.short -10099 +.short -10099 +.short -10749 +.short -10749 +.short 554 +.short 554 +.short -1143 +.short -1143 +.short -403 +.short -403 +.short 525 +.short 525 +.short 5453 +.short 5453 +.short -11251 +.short -11251 +.short -3967 +.short -3967 +.short 5168 +.short 5168 +.short 927 +.short 927 +.short -1534 +.short -1534 +.short 461 +.short 461 +.short -1438 +.short -1438 +.short 9125 +.short 9125 +.short -15099 +.short -15099 +.short 4538 +.short 4538 +.short -14155 +.short -14155 +.short 735 +.short 735 +.short -561 +.short -561 +.short -757 +.short -757 +.short -319 +.short -319 +.short 7235 +.short 7235 +.short -5522 +.short -5522 +.short -7451 +.short -7451 +.short -3140 +.short -3140 +.short 863 +.short 863 +.short 1230 +.short 1230 +.short 556 +.short 556 +.short -1063 +.short -1063 +.short 8495 +.short 8495 +.short 12107 +.short 12107 +.short 5473 +.short 5473 +.short -10463 +.short -10463 +.short -452 +.short -452 +.short -807 
+.short -807 +.short -1435 +.short -1435 +.short 1010 +.short 1010 +.short -4449 +.short -4449 +.short -7943 +.short -7943 +.short -14125 +.short -14125 +.short 9942 +.short 9942 +.short -1645 +.short -1645 +.short 780 +.short 780 +.short 109 +.short 109 +.short 1031 +.short 1031 +.short -16192 +.short -16192 +.short 7678 +.short 7678 +.short 1073 +.short 1073 +.short 10148 +.short 10148 +.short 1239 +.short 1239 +.short -375 +.short -375 +.short 1292 +.short 1292 +.short -1584 +.short -1584 +.short 12196 +.short 12196 +.short -3691 +.short -3691 +.short 12717 +.short 12717 +.short -15592 +.short -15592 +.short 1414 +.short 1414 +.short -1320 +.short -1320 +.short -33 +.short -33 +.short 464 +.short 464 +.short 13918 +.short 13918 +.short -12993 +.short -12993 +.short -325 +.short -325 +.short 4567 +.short 4567 +.short -641 +.short -641 +.short 992 +.short 992 +.short 941 +.short 941 +.short 1021 +.short 1021 +.short -6309 +.short -6309 +.short 9764 +.short 9764 +.short 9262 +.short 9262 +.short 10050 +.short 10050 +.short -268 +.short -268 +.short -733 +.short -733 +.short 892 +.short 892 +.short -939 +.short -939 +.short -2638 +.short -2638 +.short -7215 +.short -7215 +.short 8780 +.short 8780 +.short -9243 +.short -9243 +.short -632 +.short -632 +.short 816 +.short 816 +.short 1352 +.short 1352 +.short -650 +.short -650 +.short -6221 +.short -6221 +.short 8032 +.short 8032 +.short 13308 +.short 13308 +.short -6398 +.short -6398 +.short 642 +.short 642 +.short -952 +.short -952 +.short 1540 +.short 1540 +.short -1651 +.short -1651 +.short 6319 +.short 6319 +.short -9371 +.short -9371 +.short 15159 +.short 15159 +.short -16251 +.short -16251 +.short -1461 +.short -1461 +.short 1482 +.short 1482 +.short 540 +.short 540 +.short 1626 +.short 1626 +.short -14381 +.short -14381 +.short 14588 +.short 14588 +.short 5315 +.short 5315 +.short 16005 +.short 16005 +.short 1274 +.short 1274 +.short 1052 +.short 1052 +.short 1025 +.short 1025 +.short -1197 +.short -1197 
+.short 12540 +.short 12540 +.short 10355 +.short 10355 +.short 10089 +.short 10089 +.short -11782 +.short -11782 +.short 279 +.short 279 +.short 1173 +.short 1173 +.short -233 +.short -233 +.short 667 +.short 667 +.short 2746 +.short 2746 +.short 11546 +.short 11546 +.short -2293 +.short -2293 +.short 6565 +.short 6565 +.short 314 +.short 314 +.short -756 +.short -756 +.short 48 +.short 48 +.short -1409 +.short -1409 +.short 3091 +.short 3091 +.short -7441 +.short -7441 +.short 472 +.short 472 +.short -13869 +.short -13869 +.short 1573 +.short 1573 +.short 76 +.short 76 +.short -331 +.short -331 +.short -289 +.short -289 +.short 15483 +.short 15483 +.short 748 +.short 748 +.short -3258 +.short -3258 +.short -2845 +.short -2845 +.short -1100 +.short -1100 +.short -723 +.short -723 +.short 680 +.short 680 +.short 568 +.short 568 +.short -10828 +.short -10828 +.short -7117 +.short -7117 +.short 6693 +.short 6693 +.short 5591 +.short 5591 +.short 1041 +.short 1041 +.short -1637 +.short -1637 +.short -583 +.short -583 +.short -17 +.short -17 +.short 10247 +.short 10247 +.short -16113 +.short -16113 +.short -5739 +.short -5739 +.short -167 +.short -167 +roots_l34: +.short 1583 +.short 15582 +.short -821 +.short -8081 +.short 1355 +.short 13338 +.short 0 +.short 0 +.short -569 +.short -5601 +.short 450 +.short 4429 +.short 936 +.short 9213 +.short 0 +.short 0 +.short 69 +.short 679 +.short 447 +.short 4400 +.short -535 +.short -5266 +.short 0 +.short 0 +.short 543 +.short 5345 +.short 1235 +.short 12156 +.short -1426 +.short -14036 +.short 0 +.short 0 +.short -797 +.short -7845 +.short -1333 +.short -13121 +.short 1089 +.short 10719 +.short 0 +.short 0 +.short -193 +.short -1900 +.short -56 +.short -551 +.short 283 +.short 2786 +.short 0 +.short 0 +.short 1410 +.short 13879 +.short -1476 +.short -14529 +.short -1339 +.short -13180 +.short 0 +.short 0 +.short -1062 +.short -10453 +.short 882 +.short 8682 +.short -296 +.short -2914 +.short 0 +.short 0 +roots_l012: +// 
layer 0 root modified to include ninv +.short 266 // originally: 1600 +.short 2618 // originally: 15749 +.short 40 +.short 394 +.short 749 +.short 7373 +.short -848 +.short -8347 +.short 1432 +.short 14095 +.short -630 +.short -6201 +.short 687 +.short 6762 +.short 0 +.short 0 \ No newline at end of file From 819a6ef0dce9c2ae307f8cea83c6cfa70df1ae22 Mon Sep 17 00:00:00 2001 From: Amin Abdulrahman Date: Wed, 6 Mar 2024 15:43:59 +0100 Subject: [PATCH 06/18] Minor Dilithium test changes --- tests/ntt_dilithium/main.c | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/tests/ntt_dilithium/main.c b/tests/ntt_dilithium/main.c index aad95cb..13893da 100644 --- a/tests/ntt_dilithium/main.c +++ b/tests/ntt_dilithium/main.c @@ -336,10 +336,10 @@ MAKE_TEST(asm_123_45678_w_scalar,0,ntt_dilithium_123_45678_w_scalar,ntt_u32_C,0, MAKE_TEST(asm_123_45678_manual_st4,0,ntt_dilithium_123_45678_manual_st4,ntt_u32_C,0,0) MAKE_TEST(asm_1234_5678,0,ntt_dilithium_1234_5678,ntt_u32_C,0,0) MAKE_TEST(asm_1234_5678_manual_st4,0,ntt_dilithium_1234_5678_manual_st4,ntt_u32_C,0,0) -MAKE_TEST(asm_1234_5678_inv,0,intt_dilithium_1234_5678,invntt_u32_tomont_C,0,1) -MAKE_TEST(asm_1234_5678_inv_manual_ld4,0,intt_dilithium_1234_5678_manual_ld4,invntt_u32_tomont_C,0,1) -MAKE_TEST(asm_123_45678_inv,0,intt_dilithium_123_45678,invntt_u32_tomont_C,0,1) -MAKE_TEST(asm_123_45678_inv_manual_ld4,0,intt_dilithium_123_45678_manual_ld4,invntt_u32_tomont_C,0,1) +MAKE_TEST(asm_1234_5678_inv,1,intt_dilithium_1234_5678,invntt_u32_tomont_C,0,1) +MAKE_TEST(asm_1234_5678_inv_manual_ld4,1,intt_dilithium_1234_5678_manual_ld4,invntt_u32_tomont_C,0,1) +MAKE_TEST(asm_123_45678_inv,1,intt_dilithium_123_45678,invntt_u32_tomont_C,0,1) +MAKE_TEST(asm_123_45678_inv_manual_ld4,1,intt_dilithium_123_45678_manual_ld4,invntt_u32_tomont_C,0,1) // A55 MAKE_TEST(asm_123_45678_opt_a55,0,ntt_dilithium_123_45678_opt_a55,ntt_u32_C,0,0) 
MAKE_TEST(asm_123_45678_manual_st4_opt_a55,0,ntt_dilithium_123_45678_manual_st4_opt_a55,ntt_u32_C,0,0) @@ -363,6 +363,7 @@ MAKE_TEST(asm_1234_5678_manual_st4_opt_m1_icestorm,0,ntt_dilithium_1234_5678_man // Other MAKE_TEST(neonntt_fwd,0,ntt,ntt_u32_C,0,0) MAKE_TEST(pqclean_ntt_fwd,0,pqclean_ntt,ntt_u32_C,0,0) +MAKE_TEST(neonntt_inv,1,invntt_tomont,invntt_u32_tomont_C,0,0) uint64_t t0, t1; uint64_t cycles[TEST_COUNT]; @@ -586,6 +587,11 @@ int main( void ) { return 1; } + if (test_ntt_neonntt_inv() != 0) + { + return 1; + } + return(0); } From 2cb82611316c52a83ffdecb4dc2d344e5c497a18 Mon Sep 17 00:00:00 2001 From: Amin Abdulrahman Date: Fri, 8 Mar 2024 15:51:25 +0100 Subject: [PATCH 07/18] Kyber clean invNTT * Fix reductions * Add test against neon-ntt --- tests/ntt_kyber/main.c | 52 +++++++++++--------- tests/ntt_kyber/manual/intt_kyber_123_4567.s | 21 +++----- 2 files changed, 37 insertions(+), 36 deletions(-) diff --git a/tests/ntt_kyber/main.c b/tests/ntt_kyber/main.c index c4ab12f..d017164 100644 --- a/tests/ntt_kyber/main.c +++ b/tests/ntt_kyber/main.c @@ -319,7 +319,7 @@ void buf_bitrev_4( int16_t *src ) } } -#define MAKE_TEST_FWD(var,inv,func,ref_func,rev4,reduction_included) \ +#define MAKE_TEST_FWD(var,inv,func,ref_func,rev4,reduction_included,canonical_result) \ int test_ntt_ ## var () \ { \ debug_printf( "test ntt_kyber %-50s ", #func "\0"); \ @@ -332,8 +332,9 @@ int test_ntt_ ## var () \ \ /* Step 1: Reference NTT */ \ memcpy( src_copy, src, sizeof( src ) ); \ - (ref_func)( src_copy ); \ - mod_reduce_buf_s16_signed( src_copy, NTT_SIZE, modulus ); \ + (ref_func)( src_copy ); \ + if (canonical_result) \ + mod_reduce_buf_s16_signed( src_copy, NTT_SIZE, modulus ); \ \ if( rev4 ) \ buf_bitrev_4( src_copy ); \ @@ -355,25 +356,27 @@ int test_ntt_ ## var () \ return( 0 ); \ } // Clean -MAKE_TEST_FWD(asm, 0, ntt_kyber_123_4567, ntt_ct,0,1) -MAKE_TEST_FWD(asm_123_4567_scalar_load, 0, ntt_kyber_123_4567_scalar_load, ntt_ct,0,1) 
-MAKE_TEST_FWD(asm_123_4567_scalar_load_store, 0, ntt_kyber_123_4567_scalar_load_store, ntt_ct,0,1) -MAKE_TEST_FWD(asm_123_4567_scalar_store, 0, ntt_kyber_123_4567_scalar_store, ntt_ct,0,1) -MAKE_TEST_FWD(asm_1234_567, 0, ntt_kyber_1234_567, ntt_ct,0,1) -// the output is small, but not canonically reduced. Same as NEON NTT paper -MAKE_TEST_FWD(asm_123_4567_inv, 1, intt_kyber_123_4567, invntt_gs,0,0) +MAKE_TEST_FWD(asm, 0, ntt_kyber_123_4567, ntt_ct,0,1,1) +MAKE_TEST_FWD(asm_123_4567_scalar_load, 0, ntt_kyber_123_4567_scalar_load, ntt_ct,0,1,1) +MAKE_TEST_FWD(asm_123_4567_scalar_load_store, 0, ntt_kyber_123_4567_scalar_load_store, ntt_ct,0,1,1) +MAKE_TEST_FWD(asm_123_4567_scalar_store, 0, ntt_kyber_123_4567_scalar_store, ntt_ct,0,1,1) +MAKE_TEST_FWD(asm_1234_567, 0, ntt_kyber_1234_567, ntt_ct,0,1,1) +// Clean invNTT +MAKE_TEST_FWD(asm_123_4567_inv, 1, intt_kyber_123_4567, invntt_gs,0,0,1) +// Check against neon-ntt for comparability +MAKE_TEST_FWD(asm_vs_neonntt_123_4567_inv, 1, intt_kyber_123_4567, invntt,0,1,0) // A55 -MAKE_TEST_FWD(asm_123_4567_manual_st4_opt_a55, 0, ntt_kyber_123_4567_manual_st4_opt_a55, ntt_ct,0,1) -MAKE_TEST_FWD(asm_123_4567_opt_a55, 0, ntt_kyber_123_4567_opt_a55, ntt_ct,0,1) -MAKE_TEST_FWD(asm_123_4567_scalar_load_opt_a55, 0, ntt_kyber_123_4567_scalar_load_opt_a55, ntt_ct,0,1) -MAKE_TEST_FWD(asm_123_4567_scalar_load_store_opt_a55, 0, ntt_kyber_123_4567_scalar_load_store_opt_a55, ntt_ct,0,1) -MAKE_TEST_FWD(asm_123_4567_scalar_store_opt_a55, 0, ntt_kyber_123_4567_scalar_store_opt_a55, ntt_ct,0,1) +MAKE_TEST_FWD(asm_123_4567_manual_st4_opt_a55, 0, ntt_kyber_123_4567_manual_st4_opt_a55, ntt_ct,0,1,1) +MAKE_TEST_FWD(asm_123_4567_opt_a55, 0, ntt_kyber_123_4567_opt_a55, ntt_ct,0,1,1) +MAKE_TEST_FWD(asm_123_4567_scalar_load_opt_a55, 0, ntt_kyber_123_4567_scalar_load_opt_a55, ntt_ct,0,1,1) +MAKE_TEST_FWD(asm_123_4567_scalar_load_store_opt_a55, 0, ntt_kyber_123_4567_scalar_load_store_opt_a55, ntt_ct,0,1,1) 
+MAKE_TEST_FWD(asm_123_4567_scalar_store_opt_a55, 0, ntt_kyber_123_4567_scalar_store_opt_a55, ntt_ct,0,1,1) // A72 -MAKE_TEST_FWD(asm_123_4567_manual_st4_opt_a72, 0, ntt_kyber_123_4567_manual_st4_opt_a72, ntt_ct,0,1) -MAKE_TEST_FWD(asm_123_4567_opt_a72, 0, ntt_kyber_123_4567_opt_a72, ntt_ct,0,1) -MAKE_TEST_FWD(asm_123_4567_scalar_load_opt_a72, 0, ntt_kyber_123_4567_scalar_load_opt_a72, ntt_ct,0,1) -MAKE_TEST_FWD(asm_123_4567_scalar_load_store_opt_a72, 0, ntt_kyber_123_4567_scalar_load_store_opt_a72, ntt_ct,0,1) -MAKE_TEST_FWD(asm_123_4567_scalar_store_opt_a72, 0, ntt_kyber_123_4567_scalar_store_opt_a72, ntt_ct,0,1) +MAKE_TEST_FWD(asm_123_4567_manual_st4_opt_a72, 0, ntt_kyber_123_4567_manual_st4_opt_a72, ntt_ct,0,1,1) +MAKE_TEST_FWD(asm_123_4567_opt_a72, 0, ntt_kyber_123_4567_opt_a72, ntt_ct,0,1,1) +MAKE_TEST_FWD(asm_123_4567_scalar_load_opt_a72, 0, ntt_kyber_123_4567_scalar_load_opt_a72, ntt_ct,0,1,1) +MAKE_TEST_FWD(asm_123_4567_scalar_load_store_opt_a72, 0, ntt_kyber_123_4567_scalar_load_store_opt_a72, ntt_ct,0,1,1) +MAKE_TEST_FWD(asm_123_4567_scalar_store_opt_a72, 0, ntt_kyber_123_4567_scalar_store_opt_a72, ntt_ct,0,1,1) // M1 Firestorm MAKE_TEST_FWD(asm_123_4567_opt_m1_firestorm, ntt_kyber_123_4567_opt_m1_firestorm,0,1) MAKE_TEST_FWD(asm_123_4567_scalar_load_opt_m1_firestorm, ntt_kyber_123_4567_scalar_load_opt_m1_firestorm,0,1) @@ -391,9 +394,9 @@ MAKE_TEST_FWD(asm_123_4567_scalar_store_opt_m1_icestorm, ntt_kyber_123_4567_scal /* MAKE_TEST_FWD(asm_1234_567_opt_m1_icestorm, ntt_kyber_1234_567_opt_m1_icestorm,0,1) */ /* MAKE_TEST_FWD(asm_1234_567_manual_st4_opt_m1_icestorm, ntt_kyber_1234_567_manual_st4_opt_m1_icestorm,0,1) */ // other -MAKE_TEST_FWD(neonntt, 0, ntt, ntt_ct,0,1) +MAKE_TEST_FWD(neonntt, 0, ntt, ntt_ct,0,1,1) MAKE_TEST_FWD(pqclean,pqclean_ntt,0,1) -MAKE_TEST_FWD(neonntt_inv, 1, invntt, invntt_gs,0,0) +MAKE_TEST_FWD(neonntt_inv, 1, invntt, invntt_gs,0,0,1) uint64_t t0, t1; uint64_t cycles[TEST_COUNT]; @@ -503,6 +506,11 @@ int main( void ) return (1); 
} + if (test_ntt_asm_vs_neonntt_123_4567_inv() != 0) + { + return (1); + } + if (test_ntt_asm_123_4567_manual_st4_opt_a55() != 0) { return (1); diff --git a/tests/ntt_kyber/manual/intt_kyber_123_4567.s b/tests/ntt_kyber/manual/intt_kyber_123_4567.s index 0646af5..0a16d02 100644 --- a/tests/ntt_kyber/manual/intt_kyber_123_4567.s +++ b/tests/ntt_kyber/manual/intt_kyber_123_4567.s @@ -378,15 +378,14 @@ layer4567_start: // Layer 5 gs_butterfly data0, data1, root0, 2, 3 gs_butterfly data2, data3, root0, 4, 5 + + barrett_reduce data0 + barrett_reduce data2 + // Layer 4 gs_butterfly data0, data2, root0, 0, 1 gs_butterfly data1, data3, root0, 0, 1 - // or Montgomery? - barrett_reduce data0 - barrett_reduce data1 - barrett_reduce data2 - barrett_reduce data3 str_vi data0, inp, (64) str_vo data1, inp, (-64 + 16*1) str_vo data2, inp, (-64 + 16*2) @@ -432,15 +431,14 @@ layer123_start: gs_butterfly data4, data6, root0, 4, 5 gs_butterfly data5, data7, root0, 4, 5 + barrett_reduce data0 + barrett_reduce data4 + gs_butterfly data0, data4, root0, 0, 1 gs_butterfly data1, data5, root0, 0, 1 gs_butterfly data2, data6, root0, 0, 1 gs_butterfly data3, data7, root0, 0, 1 - // barrett_reduce data4 // JUST TEMPORARY for canonical output - // barrett_reduce data5 // JUST TEMPORARY for canonical output - // barrett_reduce data6 // JUST TEMPORARY for canonical output - // barrett_reduce data7 // JUST TEMPORARY for canonical output str_vo data4, in, (4*(512/8)) str_vo data5, in, (5*(512/8)) str_vo data6, in, (6*(512/8)) @@ -448,11 +446,6 @@ layer123_start: mul_ninv data4, data5, data6, data7, data0, data1, data2, data3 - // barrett_reduce data4 // JUST TEMPORARY for canonical output - // barrett_reduce data5 // JUST TEMPORARY for canonical output - // barrett_reduce data6 // JUST TEMPORARY for canonical output - // barrett_reduce data7 // JUST TEMPORARY for canonical output - str_vi data4, in, (16) str_vo data5, in, (-16 + 1*(512/8)) str_vo data6, in, (-16 + 2*(512/8)) From 
741f3c70a378997a2411d0ad47dd80403640b35e Mon Sep 17 00:00:00 2001 From: Amin Abdulrahman Date: Thu, 14 Mar 2024 11:40:38 +0100 Subject: [PATCH 08/18] Adapt test macro syntax --- tests/ntt_kyber/main.c | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/tests/ntt_kyber/main.c b/tests/ntt_kyber/main.c index d017164..de3da27 100644 --- a/tests/ntt_kyber/main.c +++ b/tests/ntt_kyber/main.c @@ -378,24 +378,24 @@ MAKE_TEST_FWD(asm_123_4567_scalar_load_opt_a72, 0, ntt_kyber_123_4567_scalar_loa MAKE_TEST_FWD(asm_123_4567_scalar_load_store_opt_a72, 0, ntt_kyber_123_4567_scalar_load_store_opt_a72, ntt_ct,0,1,1) MAKE_TEST_FWD(asm_123_4567_scalar_store_opt_a72, 0, ntt_kyber_123_4567_scalar_store_opt_a72, ntt_ct,0,1,1) // M1 Firestorm -MAKE_TEST_FWD(asm_123_4567_opt_m1_firestorm, ntt_kyber_123_4567_opt_m1_firestorm,0,1) -MAKE_TEST_FWD(asm_123_4567_scalar_load_opt_m1_firestorm, ntt_kyber_123_4567_scalar_load_opt_m1_firestorm,0,1) +MAKE_TEST_FWD(asm_123_4567_opt_m1_firestorm, 0, ntt_kyber_123_4567_opt_m1_firestorm,ntt_ct,0,1,1) +MAKE_TEST_FWD(asm_123_4567_scalar_load_opt_m1_firestorm, 0, ntt_kyber_123_4567_scalar_load_opt_m1_firestorm,ntt_ct,0,1,1) /* MAKE_TEST_FWD(asm_1234_567_opt_m1_firestorm, ntt_kyber_1234_567_opt_m1_firestorm,0,1) */ -MAKE_TEST_FWD(asm_123_4567_scalar_load_store_opt_m1_firestorm, ntt_kyber_123_4567_scalar_load_store_opt_m1_firestorm,0,1) -MAKE_TEST_FWD(asm_123_4567_manual_st4_opt_m1_firestorm, ntt_kyber_123_4567_manual_st4_opt_m1_firestorm,0,1) -MAKE_TEST_FWD(asm_123_4567_scalar_store_opt_m1_firestorm, ntt_kyber_123_4567_scalar_store_opt_m1_firestorm,0,1) +MAKE_TEST_FWD(asm_123_4567_scalar_load_store_opt_m1_firestorm, 0, ntt_kyber_123_4567_scalar_load_store_opt_m1_firestorm,ntt_ct,0,1,1) +MAKE_TEST_FWD(asm_123_4567_manual_st4_opt_m1_firestorm, 0, ntt_kyber_123_4567_manual_st4_opt_m1_firestorm,ntt_ct,0,1,1) +MAKE_TEST_FWD(asm_123_4567_scalar_store_opt_m1_firestorm, 0, 
ntt_kyber_123_4567_scalar_store_opt_m1_firestorm,ntt_ct,0,1,1) /* MAKE_TEST_FWD(asm_1234_567_manual_st4_opt_m1_firestorm, ntt_kyber_1234_567_manual_st4_opt_m1_firestorm,0,1) */ // M1 Icestorm -MAKE_TEST_FWD(asm_123_4567_manual_st4_opt_m1_icestorm, ntt_kyber_123_4567_manual_st4_opt_m1_icestorm,0,1) -MAKE_TEST_FWD(asm_123_4567_opt_m1_icestorm, ntt_kyber_123_4567_opt_m1_icestorm,0,1) -MAKE_TEST_FWD(asm_123_4567_scalar_load_opt_m1_icestorm, ntt_kyber_123_4567_scalar_load_opt_m1_icestorm,0,1) -MAKE_TEST_FWD(asm_123_4567_scalar_load_store_opt_m1_icestorm, ntt_kyber_123_4567_scalar_load_store_opt_m1_icestorm,0,1) -MAKE_TEST_FWD(asm_123_4567_scalar_store_opt_m1_icestorm, ntt_kyber_123_4567_scalar_store_opt_m1_icestorm,0,1) +MAKE_TEST_FWD(asm_123_4567_manual_st4_opt_m1_icestorm, 0, ntt_kyber_123_4567_manual_st4_opt_m1_icestorm,ntt_ct,0,1,1) +MAKE_TEST_FWD(asm_123_4567_opt_m1_icestorm, 0, ntt_kyber_123_4567_opt_m1_icestorm,ntt_ct,0,1,1) +MAKE_TEST_FWD(asm_123_4567_scalar_load_opt_m1_icestorm, 0, ntt_kyber_123_4567_scalar_load_opt_m1_icestorm,ntt_ct,0,1,1) +MAKE_TEST_FWD(asm_123_4567_scalar_load_store_opt_m1_icestorm, 0, ntt_kyber_123_4567_scalar_load_store_opt_m1_icestorm,ntt_ct,0,1,1) +MAKE_TEST_FWD(asm_123_4567_scalar_store_opt_m1_icestorm, 0, ntt_kyber_123_4567_scalar_store_opt_m1_icestorm,ntt_ct,0,1,1) /* MAKE_TEST_FWD(asm_1234_567_opt_m1_icestorm, ntt_kyber_1234_567_opt_m1_icestorm,0,1) */ /* MAKE_TEST_FWD(asm_1234_567_manual_st4_opt_m1_icestorm, ntt_kyber_1234_567_manual_st4_opt_m1_icestorm,0,1) */ // other MAKE_TEST_FWD(neonntt, 0, ntt, ntt_ct,0,1,1) -MAKE_TEST_FWD(pqclean,pqclean_ntt,0,1) +MAKE_TEST_FWD(pqclean, 0, pqclean_ntt, ntt_ct,0,1,1) MAKE_TEST_FWD(neonntt_inv, 1, invntt, invntt_gs,0,0,1) uint64_t t0, t1; From b8d4016d523ad32a5d0b456f5eda7f9395abbe2c Mon Sep 17 00:00:00 2001 From: Amin Abdulrahman Date: Thu, 14 Mar 2024 11:49:01 +0100 Subject: [PATCH 09/18] Add Kyber invNTT manual_ld4 --- .../intt_kyber_123_4567_manual_ld4.s | 1 + tests/ntt_kyber/main.c | 9 + 
.../manual/intt_kyber_123_4567_manual_ld4.s | 454 ++++++++++++++++++ 3 files changed, 464 insertions(+) create mode 120000 asm/manual/ntt_kyber/intt_kyber_123_4567_manual_ld4.s create mode 100644 tests/ntt_kyber/manual/intt_kyber_123_4567_manual_ld4.s diff --git a/asm/manual/ntt_kyber/intt_kyber_123_4567_manual_ld4.s b/asm/manual/ntt_kyber/intt_kyber_123_4567_manual_ld4.s new file mode 120000 index 0000000..fdef103 --- /dev/null +++ b/asm/manual/ntt_kyber/intt_kyber_123_4567_manual_ld4.s @@ -0,0 +1 @@ +../../../slothy/examples/naive/aarch64/intt_kyber_123_4567_manual_ld4.s \ No newline at end of file diff --git a/tests/ntt_kyber/main.c b/tests/ntt_kyber/main.c index de3da27..ef5833e 100644 --- a/tests/ntt_kyber/main.c +++ b/tests/ntt_kyber/main.c @@ -44,6 +44,7 @@ // base void ntt_kyber_123_4567(int16_t *); void intt_kyber_123_4567(int16_t *); +void intt_kyber_123_4567_manual_ld4(int16_t *); void ntt_kyber_123_4567_scalar_load(int16_t *); void ntt_kyber_123_4567_scalar_load_store(int16_t *); void ntt_kyber_123_4567_scalar_store(int16_t *); @@ -363,6 +364,7 @@ MAKE_TEST_FWD(asm_123_4567_scalar_store, 0, ntt_kyber_123_4567_scalar_store, ntt MAKE_TEST_FWD(asm_1234_567, 0, ntt_kyber_1234_567, ntt_ct,0,1,1) // Clean invNTT MAKE_TEST_FWD(asm_123_4567_inv, 1, intt_kyber_123_4567, invntt_gs,0,0,1) +MAKE_TEST_FWD(asm_123_4567_inv_manual_ld4, 1, intt_kyber_123_4567_manual_ld4, invntt_gs,0,0,1) // Check against neon-ntt for comparability MAKE_TEST_FWD(asm_vs_neonntt_123_4567_inv, 1, intt_kyber_123_4567, invntt,0,1,0) // A55 @@ -434,6 +436,7 @@ MAKE_BENCH(asm_123_4567_scalar_load_store, ntt_kyber_123_4567_scalar_load_store) MAKE_BENCH(asm_123_4567_scalar_store, ntt_kyber_123_4567_scalar_store) MAKE_BENCH(asm_1234_567, ntt_kyber_1234_567) MAKE_BENCH(asm_123_4567_inv, intt_kyber_123_4567) +MAKE_BENCH(asm_123_4567_inv_manual_ld4, intt_kyber_123_4567_manual_ld4) // A55 MAKE_BENCH(asm_123_4567_manual_st4_opt_a55, ntt_kyber_123_4567_manual_st4_opt_a55) 
MAKE_BENCH(asm_123_4567_opt_a55, ntt_kyber_123_4567_opt_a55) @@ -506,6 +509,11 @@ int main( void ) return (1); } + if (test_ntt_asm_123_4567_inv_manual_ld4() != 0) + { + return (1); + } + if (test_ntt_asm_vs_neonntt_123_4567_inv() != 0) { return (1); @@ -602,6 +610,7 @@ int main( void ) bench_ntt_asm_123_4567_scalar_store(); bench_ntt_asm_1234_567(); bench_ntt_asm_123_4567_inv(); + bench_ntt_asm_123_4567_inv_manual_ld4(); /* A55 */ bench_ntt_asm_123_4567_manual_st4_opt_a55(); bench_ntt_asm_123_4567_opt_a55(); diff --git a/tests/ntt_kyber/manual/intt_kyber_123_4567_manual_ld4.s b/tests/ntt_kyber/manual/intt_kyber_123_4567_manual_ld4.s new file mode 100644 index 0000000..478c7d1 --- /dev/null +++ b/tests/ntt_kyber/manual/intt_kyber_123_4567_manual_ld4.s @@ -0,0 +1,454 @@ +/// +/// Copyright (c) 2022 Arm Limited +/// Copyright (c) 2022 Hanno Becker +/// Copyright (c) 2023 Amin Abdulrahman, Matthias Kannwischer +/// SPDX-License-Identifier: MIT +/// +/// Permission is hereby granted, free of charge, to any person obtaining a copy +/// of this software and associated documentation files (the "Software"), to deal +/// in the Software without restriction, including without limitation the rights +/// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +/// copies of the Software, and to permit persons to whom the Software is +/// furnished to do so, subject to the following conditions: +/// +/// The above copyright notice and this permission notice shall be included in all +/// copies or substantial portions of the Software. +/// +/// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +/// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +/// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +/// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +/// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +/// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +/// SOFTWARE. +/// + +// Needed to provide ASM_LOAD directive +#include <hal_env.h> + +// NOTE +// We use a lot of trivial macros to simplify the parsing burden for Slothy +// The macros are not unfolded by Slothy and thus interpreted as instructions, +// which are easier to parse due to e.g. the lack of size specifiers and simpler +// syntax for pre and post increment for loads and stores. +// +// Eventually, NeLight should include a proper parser for AArch64, +// but for initial investigations, the below is enough. + +.macro ldr_vo vec, base, offset + ldr qform_\vec, [\base, #\offset] +.endm + +.macro ldr_vi vec, base, inc + ldr qform_\vec, [\base], #\inc +.endm + +.macro str_vo vec, base, offset + str qform_\vec, [\base, #\offset] +.endm +.macro str_vi vec, base, inc + str qform_\vec, [\base], #\inc +.endm + +.macro vqrdmulh d,a,b + sqrdmulh \d\().8h, \a\().8h, \b\().8h +.endm +.macro vmlsq d,a,b,i + mls \d\().8h, \a\().8h, \b\().h[\i] +.endm +.macro vqrdmulhq d,a,b,i + sqrdmulh \d\().8h, \a\().8h, \b\().h[\i] +.endm +.macro vqdmulhq d,a,b,i + sqdmulh \d\().8h, \a\().8h, \b\().h[\i] +.endm +.macro vmulq d,a,b,i + mul \d\().8h, \a\().8h, \b\().h[\i] +.endm + +.macro mulmodq dst, src, const, idx0, idx1 + vmulq \dst, \src, \const, \idx0 + vqrdmulhq \src, \src, \const, \idx1 + vmlsq \dst, \src, consts, 0 +.endm + +.macro mulmod dst, src, const, const_twisted + mul \dst\().8h, \src\().8h, \const\().8h + vqrdmulh \src, \src, \const_twisted + vmlsq \dst, \src, consts, 0 +.endm + +.macro gs_butterfly a, b, root, idx0, idx1 + sub tmp.8h, \a\().8h, \b\().8h + add \a\().8h, \a\().8h, \b\().8h + mulmodq \b, tmp, \root, \idx0, \idx1 +.endm + +.macro mulmod_v dst, src, const, const_twisted + mul \dst\().8h, \src\().8h, 
\const\().8h + vqrdmulh \src, \src, \const_twisted + vmlsq \dst, \src, consts, 0 +.endm + +.macro gs_butterfly_v a, b, root, root_twisted + sub tmp.8h, \a\().8h, \b\().8h + add \a\().8h, \a\().8h, \b\().8h + mulmod \b, tmp, \root, \root_twisted +.endm + +.macro mul_ninv dst0, dst1, dst2, dst3, src0, src1, src2, src3 + mulmod \dst0, \src0, ninv, ninv_tw + mulmod \dst1, \src1, ninv, ninv_tw + mulmod \dst2, \src2, ninv, ninv_tw + mulmod \dst3, \src3, ninv, ninv_tw +.endm + +.macro barrett_reduce a + vqdmulhq t0, \a, consts, 1 + srshr t0.8h, t0.8h, #11 + vmlsq \a, t0, consts, 0 +.endm + +.macro load_roots_123 + ldr_vi root0, r_ptr0, 32 + ldr_vo root1, r_ptr0, -16 +.endm + +.macro load_next_roots_45 + ldr_vi root0, r_ptr0, 16 +.endm + +.macro load_next_roots_67 + ldr_vi root0, r_ptr1, (6*16) + ldr_vo root0_tw, r_ptr1, (-6*16 + 1*16) + ldr_vo root1, r_ptr1, (-6*16 + 2*16) + ldr_vo root1_tw, r_ptr1, (-6*16 + 3*16) + ldr_vo root2, r_ptr1, (-6*16 + 4*16) + ldr_vo root2_tw, r_ptr1, (-6*16 + 5*16) +.endm + +.macro transpose4 data + trn1 t0.4s, \data\()0\().4s, \data\()1\().4s + trn2 t1.4s, \data\()0\().4s, \data\()1\().4s + trn1 t2.4s, \data\()2\().4s, \data\()3\().4s + trn2 t3.4s, \data\()2\().4s, \data\()3\().4s + + trn2 \data\()2\().2d, t0.2d, t2.2d + trn2 \data\()3\().2d, t1.2d, t3.2d + trn1 \data\()0\().2d, t0.2d, t2.2d + trn1 \data\()1\().2d, t1.2d, t3.2d +.endm + +.macro transpose_single data_out, data_in + trn1 \data_out\()0\().4s, \data_in\()0\().4s, \data_in\()1\().4s + trn2 \data_out\()1\().4s, \data_in\()0\().4s, \data_in\()1\().4s + trn1 \data_out\()2\().4s, \data_in\()2\().4s, \data_in\()3\().4s + trn2 \data_out\()3\().4s, \data_in\()2\().4s, \data_in\()3\().4s +.endm + +.macro save_gprs // slothy:no-unfold + sub sp, sp, #(16*6) + stp x19, x20, [sp, #16*0] + stp x19, x20, [sp, #16*0] + stp x21, x22, [sp, #16*1] + stp x23, x24, [sp, #16*2] + stp x25, x26, [sp, #16*3] + stp x27, x28, [sp, #16*4] + str x29, [sp, #16*5] +.endm + +.macro restore_gprs // 
slothy:no-unfold + ldp x19, x20, [sp, #16*0] + ldp x21, x22, [sp, #16*1] + ldp x23, x24, [sp, #16*2] + ldp x25, x26, [sp, #16*3] + ldp x27, x28, [sp, #16*4] + ldr x29, [sp, #16*5] + add sp, sp, #(16*6) +.endm + +.macro save_vregs // slothy:no-unfold + sub sp, sp, #(16*4) + stp d8, d9, [sp, #16*0] + stp d10, d11, [sp, #16*1] + stp d12, d13, [sp, #16*2] + stp d14, d15, [sp, #16*3] +.endm + +.macro restore_vregs // slothy:no-unfold + ldp d8, d9, [sp, #16*0] + ldp d10, d11, [sp, #16*1] + ldp d12, d13, [sp, #16*2] + ldp d14, d15, [sp, #16*3] + add sp, sp, #(16*4) +.endm + +#define STACK_SIZE 16 +#define STACK0 0 + +.macro restore a, loc // slothy:no-unfold + ldr \a, [sp, #\loc\()] +.endm +.macro save loc, a // slothy:no-unfold + str \a, [sp, #\loc\()] +.endm +.macro push_stack // slothy:no-unfold + save_gprs + save_vregs + sub sp, sp, #STACK_SIZE +.endm + +.macro pop_stack // slothy:no-unfold + add sp, sp, #STACK_SIZE + restore_vregs + restore_gprs +.endm + +.data +.p2align 4 +roots: +#include "intt_kyber_123_45_67_twiddles.s" +.text + + .global intt_kyber_123_4567_manual_ld4 + .global _intt_kyber_123_4567_manual_ld4 + +.p2align 4 +const_addr: .short 3329 + .short 20159 + .short 0 + .short 0 + .short 0 + .short 0 + .short 0 + .short 0 +ninv_addr: .short 512 + .short 512 + .short 512 + .short 512 + .short 512 + .short 512 + .short 512 + .short 512 +ninv_tw_addr: .short 5040 + .short 5040 + .short 5040 + .short 5040 + .short 5040 + .short 5040 + .short 5040 + .short 5040 + +intt_kyber_123_4567_manual_ld4: +_intt_kyber_123_4567_manual_ld4: + push_stack + + in .req x0 + inp .req x1 + count .req x2 + r_ptr0 .req x3 + r_ptr1 .req x4 + xtmp .req x5 + + qform_v0 .req q0 + qform_v1 .req q1 + qform_v2 .req q2 + qform_v3 .req q3 + qform_v4 .req q4 + qform_v5 .req q5 + qform_v6 .req q6 + qform_v7 .req q7 + qform_v8 .req q8 + qform_v9 .req q9 + qform_v10 .req q10 + qform_v11 .req q11 + qform_v12 .req q12 + qform_v13 .req q13 + qform_v14 .req q14 + qform_v15 .req q15 + qform_v16 .req 
q16 + qform_v17 .req q17 + qform_v18 .req q18 + qform_v19 .req q19 + qform_v20 .req q20 + qform_v21 .req q21 + qform_v22 .req q22 + qform_v23 .req q23 + qform_v24 .req q24 + qform_v25 .req q25 + qform_v26 .req q26 + qform_v27 .req q27 + qform_v28 .req q28 + qform_v29 .req q29 + qform_v30 .req q30 + qform_v31 .req q31 + + data0 .req v8 + data1 .req v9 + data2 .req v10 + data3 .req v11 + data4 .req v12 + data5 .req v13 + data6 .req v14 + data7 .req v15 + + x_00 .req x10 + x_01 .req x11 + x_10 .req x12 + x_11 .req x13 + x_20 .req x14 + x_21 .req x15 + x_30 .req x16 + x_31 .req x17 + + xt_00 .req x_00 + xt_01 .req x_20 + xt_10 .req x_10 + xt_11 .req x_30 + xt_20 .req x_01 + xt_21 .req x_21 + xt_30 .req x_11 + xt_31 .req x_31 + + qform_data0 .req q8 + qform_data1 .req q9 + qform_data2 .req q10 + qform_data3 .req q11 + qform_data4 .req q12 + qform_data5 .req q13 + qform_data6 .req q14 + qform_data7 .req q15 + + root0 .req v0 + root1 .req v1 + root2 .req v2 + root0_tw .req v4 + root1_tw .req v5 + root2_tw .req v6 + + consts .req v7 + qform_consts .req q7 + + qform_root0 .req q0 + qform_root1 .req q1 + qform_root2 .req q2 + qform_root0_tw .req q4 + qform_root1_tw .req q5 + qform_root2_tw .req q6 + + tmp .req v24 + t0 .req v25 + t1 .req v26 + t2 .req v27 + t3 .req v28 + + ASM_LOAD(r_ptr0, roots_l34) + ASM_LOAD(r_ptr1, roots_l56) + + ASM_LOAD(xtmp, const_addr) + ld1 {consts.8h}, [xtmp] + + save STACK0, in + + mov inp, in + mov count, #8 + + .p2align 2 +layer4567_start: + ld4 {data0.4S, data1.4S, data2.4S, data3.4S}, [inp] + + load_next_roots_67 + + // Layer 7 + gs_butterfly_v data0, data1, root1, root1_tw + gs_butterfly_v data2, data3, root2, root2_tw + // Layer 6 + gs_butterfly_v data0, data2, root0, root0_tw + gs_butterfly_v data1, data3, root0, root0_tw + + transpose4 data + + load_next_roots_45 + + // Layer 5 + gs_butterfly data0, data1, root0, 2, 3 + gs_butterfly data2, data3, root0, 4, 5 + + barrett_reduce data0 + barrett_reduce data2 + + // Layer 4 + gs_butterfly 
data0, data2, root0, 0, 1 + gs_butterfly data1, data3, root0, 0, 1 + + str_vi data0, inp, (64) + str_vo data1, inp, (-64 + 16*1) + str_vo data2, inp, (-64 + 16*2) + str_vo data3, inp, (-64 + 16*3) + + subs count, count, #1 + cbnz count, layer4567_start + + // --------------------------------------------------------------------- + + ninv .req v29 + ninv_tw .req v30 + + ASM_LOAD(xtmp, ninv_addr) + ld1r {ninv.8h}, [xtmp] + ASM_LOAD(xtmp, ninv_tw_addr) + ld1r {ninv_tw.8h}, [xtmp] + + mov count, #4 + ASM_LOAD(r_ptr0, roots_l012) + load_roots_123 + + .p2align 2 + +layer123_start: + + ldr_vo data0, in, 0 + ldr_vo data1, in, (1*(512/8)) + ldr_vo data2, in, (2*(512/8)) + ldr_vo data3, in, (3*(512/8)) + ldr_vo data4, in, (4*(512/8)) + ldr_vo data5, in, (5*(512/8)) + ldr_vo data6, in, (6*(512/8)) + ldr_vo data7, in, (7*(512/8)) + + gs_butterfly data0, data1, root0, 6, 7 + gs_butterfly data2, data3, root1, 0, 1 + gs_butterfly data4, data5, root1, 2, 3 + gs_butterfly data6, data7, root1, 4, 5 + + gs_butterfly data0, data2, root0, 2, 3 + gs_butterfly data1, data3, root0, 2, 3 + gs_butterfly data4, data6, root0, 4, 5 + gs_butterfly data5, data7, root0, 4, 5 + + barrett_reduce data0 + barrett_reduce data4 + + gs_butterfly data0, data4, root0, 0, 1 + gs_butterfly data1, data5, root0, 0, 1 + gs_butterfly data2, data6, root0, 0, 1 + gs_butterfly data3, data7, root0, 0, 1 + + str_vo data4, in, (4*(512/8)) + str_vo data5, in, (5*(512/8)) + str_vo data6, in, (6*(512/8)) + str_vo data7, in, (7*(512/8)) + + mul_ninv data4, data5, data6, data7, data0, data1, data2, data3 + + str_vi data4, in, (16) + str_vo data5, in, (-16 + 1*(512/8)) + str_vo data6, in, (-16 + 2*(512/8)) + str_vo data7, in, (-16 + 3*(512/8)) + + + subs count, count, #1 + cbnz count, layer123_start + + pop_stack + ret From bdd2a1253c76ef3ce810f1a2b746da55d65ec7e7 Mon Sep 17 00:00:00 2001 From: Amin Abdulrahman Date: Mon, 18 Mar 2024 10:31:18 +0100 Subject: [PATCH 10/18] Fix Kyber invNTT macro syntax --- 
tests/ntt_kyber/manual/intt_kyber_123_4567.s | 26 +++++++++---------- .../manual/intt_kyber_123_4567_manual_ld4.s | 26 +++++++++---------- 2 files changed, 26 insertions(+), 26 deletions(-) diff --git a/tests/ntt_kyber/manual/intt_kyber_123_4567.s b/tests/ntt_kyber/manual/intt_kyber_123_4567.s index 0a16d02..6dda0d2 100644 --- a/tests/ntt_kyber/manual/intt_kyber_123_4567.s +++ b/tests/ntt_kyber/manual/intt_kyber_123_4567.s @@ -128,22 +128,22 @@ .endm .macro transpose4 data - trn1 t0.4s, \data\()0\().4s, \data\()1\().4s - trn2 t1.4s, \data\()0\().4s, \data\()1\().4s - trn1 t2.4s, \data\()2\().4s, \data\()3\().4s - trn2 t3.4s, \data\()2\().4s, \data\()3\().4s - - trn2 \data\()2\().2d, t0.2d, t2.2d - trn2 \data\()3\().2d, t1.2d, t3.2d - trn1 \data\()0\().2d, t0.2d, t2.2d - trn1 \data\()1\().2d, t1.2d, t3.2d + trn1 t0.4s, \data\()0.4s, \data\()1.4s + trn2 t1.4s, \data\()0.4s, \data\()1.4s + trn1 t2.4s, \data\()2.4s, \data\()3.4s + trn2 t3.4s, \data\()2.4s, \data\()3.4s + + trn2 \data\()2.2d, t0.2d, t2.2d + trn2 \data\()3.2d, t1.2d, t3.2d + trn1 \data\()0.2d, t0.2d, t2.2d + trn1 \data\()1.2d, t1.2d, t3.2d .endm .macro transpose_single data_out, data_in - trn1 \data_out\()0\().4s, \data_in\()0\().4s, \data_in\()1\().4s - trn2 \data_out\()1\().4s, \data_in\()0\().4s, \data_in\()1\().4s - trn1 \data_out\()2\().4s, \data_in\()2\().4s, \data_in\()3\().4s - trn2 \data_out\()3\().4s, \data_in\()2\().4s, \data_in\()3\().4s + trn1 \data_out\()0.4s, \data_in\()0.4s, \data_in\()1.4s + trn2 \data_out\()1.4s, \data_in\()0.4s, \data_in\()1.4s + trn1 \data_out\()2.4s, \data_in\()2.4s, \data_in\()3.4s + trn2 \data_out\()3.4s, \data_in\()2.4s, \data_in\()3.4s .endm .macro save_gprs // slothy:no-unfold diff --git a/tests/ntt_kyber/manual/intt_kyber_123_4567_manual_ld4.s b/tests/ntt_kyber/manual/intt_kyber_123_4567_manual_ld4.s index 478c7d1..0285f66 100644 --- a/tests/ntt_kyber/manual/intt_kyber_123_4567_manual_ld4.s +++ b/tests/ntt_kyber/manual/intt_kyber_123_4567_manual_ld4.s @@ 
-128,22 +128,22 @@ .endm .macro transpose4 data - trn1 t0.4s, \data\()0\().4s, \data\()1\().4s - trn2 t1.4s, \data\()0\().4s, \data\()1\().4s - trn1 t2.4s, \data\()2\().4s, \data\()3\().4s - trn2 t3.4s, \data\()2\().4s, \data\()3\().4s - - trn2 \data\()2\().2d, t0.2d, t2.2d - trn2 \data\()3\().2d, t1.2d, t3.2d - trn1 \data\()0\().2d, t0.2d, t2.2d - trn1 \data\()1\().2d, t1.2d, t3.2d + trn1 t0.4s, \data\()0.4s, \data\()1.4s + trn2 t1.4s, \data\()0.4s, \data\()1.4s + trn1 t2.4s, \data\()2.4s, \data\()3.4s + trn2 t3.4s, \data\()2.4s, \data\()3.4s + + trn2 \data\()2.2d, t0.2d, t2.2d + trn2 \data\()3.2d, t1.2d, t3.2d + trn1 \data\()0.2d, t0.2d, t2.2d + trn1 \data\()1.2d, t1.2d, t3.2d .endm .macro transpose_single data_out, data_in - trn1 \data_out\()0\().4s, \data_in\()0\().4s, \data_in\()1\().4s - trn2 \data_out\()1\().4s, \data_in\()0\().4s, \data_in\()1\().4s - trn1 \data_out\()2\().4s, \data_in\()2\().4s, \data_in\()3\().4s - trn2 \data_out\()3\().4s, \data_in\()2\().4s, \data_in\()3\().4s + trn1 \data_out\()0.4s, \data_in\()0.4s, \data_in\()1.4s + trn2 \data_out\()1.4s, \data_in\()0.4s, \data_in\()1.4s + trn1 \data_out\()2.4s, \data_in\()2.4s, \data_in\()3.4s + trn2 \data_out\()3.4s, \data_in\()2.4s, \data_in\()3.4s .endm .macro save_gprs // slothy:no-unfold From ae2e167401ec8d804b59ff4040ccd9233c5da477 Mon Sep 17 00:00:00 2001 From: Amin Abdulrahman Date: Wed, 20 Mar 2024 15:33:26 +0100 Subject: [PATCH 11/18] Optimized Dilithium invNTTs --- ...t_dilithium_1234_5678_manual_ld4_opt_a55.s | 1 + ...t_dilithium_1234_5678_manual_ld4_opt_a72.s | 1 + .../intt_dilithium_1234_5678_opt_a55.s | 1 + .../intt_dilithium_1234_5678_opt_a72.s | 1 + ...ntt_dilithium_1234_5678_opt_m1_firestorm.s | 1 + ...t_dilithium_123_45678_manual_ld4_opt_a55.s | 1 + ...t_dilithium_123_45678_manual_ld4_opt_a72.s | 1 + ...um_123_45678_manual_ld4_opt_m1_firestorm.s | 1 + .../intt_dilithium_123_45678_opt_a55.s | 1 + .../intt_dilithium_123_45678_opt_a72.s | 1 + 
...ntt_dilithium_123_45678_opt_m1_firestorm.s | 1 + tests/ntt_dilithium/main.c | 228 +- ...t_dilithium_1234_5678_manual_ld4_opt_a55.s | 1746 ++++++++++++ ...t_dilithium_1234_5678_manual_ld4_opt_a72.s | 1810 +++++++++++++ .../manual/intt_dilithium_1234_5678_opt_a55.s | 1718 ++++++++++++ .../manual/intt_dilithium_1234_5678_opt_a72.s | 1794 ++++++++++++ ...ntt_dilithium_1234_5678_opt_m1_firestorm.s | 2096 +++++++++++++++ .../manual/intt_dilithium_123_45678.s | 26 +- .../intt_dilithium_123_45678_manual_ld4.s | 26 +- ...t_dilithium_123_45678_manual_ld4_opt_a55.s | 2136 +++++++++++++++ ...t_dilithium_123_45678_manual_ld4_opt_a72.s | 2395 +++++++++++++++++ ...um_123_45678_manual_ld4_opt_m1_firestorm.s | 2292 ++++++++++++++++ .../manual/intt_dilithium_123_45678_opt_a55.s | 2038 ++++++++++++++ .../manual/intt_dilithium_123_45678_opt_a72.s | 2327 ++++++++++++++++ ...ntt_dilithium_123_45678_opt_m1_firestorm.s | 2216 +++++++++++++++ tests/ntt_dilithium/manual/pqclean.h | 2 +- 26 files changed, 22820 insertions(+), 41 deletions(-) create mode 120000 asm/manual/ntt_dilithium/intt_dilithium_1234_5678_manual_ld4_opt_a55.s create mode 120000 asm/manual/ntt_dilithium/intt_dilithium_1234_5678_manual_ld4_opt_a72.s create mode 120000 asm/manual/ntt_dilithium/intt_dilithium_1234_5678_opt_a55.s create mode 120000 asm/manual/ntt_dilithium/intt_dilithium_1234_5678_opt_a72.s create mode 120000 asm/manual/ntt_dilithium/intt_dilithium_1234_5678_opt_m1_firestorm.s create mode 120000 asm/manual/ntt_dilithium/intt_dilithium_123_45678_manual_ld4_opt_a55.s create mode 120000 asm/manual/ntt_dilithium/intt_dilithium_123_45678_manual_ld4_opt_a72.s create mode 120000 asm/manual/ntt_dilithium/intt_dilithium_123_45678_manual_ld4_opt_m1_firestorm.s create mode 120000 asm/manual/ntt_dilithium/intt_dilithium_123_45678_opt_a55.s create mode 120000 asm/manual/ntt_dilithium/intt_dilithium_123_45678_opt_a72.s create mode 120000 asm/manual/ntt_dilithium/intt_dilithium_123_45678_opt_m1_firestorm.s create mode 
100644 tests/ntt_dilithium/manual/intt_dilithium_1234_5678_manual_ld4_opt_a55.s create mode 100644 tests/ntt_dilithium/manual/intt_dilithium_1234_5678_manual_ld4_opt_a72.s create mode 100644 tests/ntt_dilithium/manual/intt_dilithium_1234_5678_opt_a55.s create mode 100644 tests/ntt_dilithium/manual/intt_dilithium_1234_5678_opt_a72.s create mode 100644 tests/ntt_dilithium/manual/intt_dilithium_1234_5678_opt_m1_firestorm.s create mode 100644 tests/ntt_dilithium/manual/intt_dilithium_123_45678_manual_ld4_opt_a55.s create mode 100644 tests/ntt_dilithium/manual/intt_dilithium_123_45678_manual_ld4_opt_a72.s create mode 100644 tests/ntt_dilithium/manual/intt_dilithium_123_45678_manual_ld4_opt_m1_firestorm.s create mode 100644 tests/ntt_dilithium/manual/intt_dilithium_123_45678_opt_a55.s create mode 100644 tests/ntt_dilithium/manual/intt_dilithium_123_45678_opt_a72.s create mode 100644 tests/ntt_dilithium/manual/intt_dilithium_123_45678_opt_m1_firestorm.s diff --git a/asm/manual/ntt_dilithium/intt_dilithium_1234_5678_manual_ld4_opt_a55.s b/asm/manual/ntt_dilithium/intt_dilithium_1234_5678_manual_ld4_opt_a55.s new file mode 120000 index 0000000..120f1ca --- /dev/null +++ b/asm/manual/ntt_dilithium/intt_dilithium_1234_5678_manual_ld4_opt_a55.s @@ -0,0 +1 @@ +../../../slothy/examples/opt/aarch64/intt_dilithium_1234_5678_manual_ld4_opt_a55.s \ No newline at end of file diff --git a/asm/manual/ntt_dilithium/intt_dilithium_1234_5678_manual_ld4_opt_a72.s b/asm/manual/ntt_dilithium/intt_dilithium_1234_5678_manual_ld4_opt_a72.s new file mode 120000 index 0000000..0357df6 --- /dev/null +++ b/asm/manual/ntt_dilithium/intt_dilithium_1234_5678_manual_ld4_opt_a72.s @@ -0,0 +1 @@ +../../../slothy/examples/opt/aarch64/intt_dilithium_1234_5678_manual_ld4_opt_a72.s \ No newline at end of file diff --git a/asm/manual/ntt_dilithium/intt_dilithium_1234_5678_opt_a55.s b/asm/manual/ntt_dilithium/intt_dilithium_1234_5678_opt_a55.s new file mode 120000 index 0000000..724b5c0 --- /dev/null +++ 
b/asm/manual/ntt_dilithium/intt_dilithium_1234_5678_opt_a55.s @@ -0,0 +1 @@ +../../../slothy/examples/opt/aarch64/intt_dilithium_1234_5678_opt_a55.s \ No newline at end of file diff --git a/asm/manual/ntt_dilithium/intt_dilithium_1234_5678_opt_a72.s b/asm/manual/ntt_dilithium/intt_dilithium_1234_5678_opt_a72.s new file mode 120000 index 0000000..51f6b85 --- /dev/null +++ b/asm/manual/ntt_dilithium/intt_dilithium_1234_5678_opt_a72.s @@ -0,0 +1 @@ +../../../slothy/examples/opt/aarch64/intt_dilithium_1234_5678_opt_a72.s \ No newline at end of file diff --git a/asm/manual/ntt_dilithium/intt_dilithium_1234_5678_opt_m1_firestorm.s b/asm/manual/ntt_dilithium/intt_dilithium_1234_5678_opt_m1_firestorm.s new file mode 120000 index 0000000..d020ed9 --- /dev/null +++ b/asm/manual/ntt_dilithium/intt_dilithium_1234_5678_opt_m1_firestorm.s @@ -0,0 +1 @@ +../../../slothy/examples/opt/aarch64/intt_dilithium_1234_5678_opt_m1_firestorm.s \ No newline at end of file diff --git a/asm/manual/ntt_dilithium/intt_dilithium_123_45678_manual_ld4_opt_a55.s b/asm/manual/ntt_dilithium/intt_dilithium_123_45678_manual_ld4_opt_a55.s new file mode 120000 index 0000000..5dd70f4 --- /dev/null +++ b/asm/manual/ntt_dilithium/intt_dilithium_123_45678_manual_ld4_opt_a55.s @@ -0,0 +1 @@ +../../../slothy/examples/opt/aarch64/intt_dilithium_123_45678_manual_ld4_opt_a55.s \ No newline at end of file diff --git a/asm/manual/ntt_dilithium/intt_dilithium_123_45678_manual_ld4_opt_a72.s b/asm/manual/ntt_dilithium/intt_dilithium_123_45678_manual_ld4_opt_a72.s new file mode 120000 index 0000000..edfd946 --- /dev/null +++ b/asm/manual/ntt_dilithium/intt_dilithium_123_45678_manual_ld4_opt_a72.s @@ -0,0 +1 @@ +../../../slothy/examples/opt/aarch64/intt_dilithium_123_45678_manual_ld4_opt_a72.s \ No newline at end of file diff --git a/asm/manual/ntt_dilithium/intt_dilithium_123_45678_manual_ld4_opt_m1_firestorm.s b/asm/manual/ntt_dilithium/intt_dilithium_123_45678_manual_ld4_opt_m1_firestorm.s new file mode 120000 index 
0000000..6962ee7 --- /dev/null +++ b/asm/manual/ntt_dilithium/intt_dilithium_123_45678_manual_ld4_opt_m1_firestorm.s @@ -0,0 +1 @@ +../../../slothy/examples/opt/aarch64/intt_dilithium_123_45678_manual_ld4_opt_m1_firestorm.s \ No newline at end of file diff --git a/asm/manual/ntt_dilithium/intt_dilithium_123_45678_opt_a55.s b/asm/manual/ntt_dilithium/intt_dilithium_123_45678_opt_a55.s new file mode 120000 index 0000000..969f912 --- /dev/null +++ b/asm/manual/ntt_dilithium/intt_dilithium_123_45678_opt_a55.s @@ -0,0 +1 @@ +../../../slothy/examples/opt/aarch64/intt_dilithium_123_45678_opt_a55.s \ No newline at end of file diff --git a/asm/manual/ntt_dilithium/intt_dilithium_123_45678_opt_a72.s b/asm/manual/ntt_dilithium/intt_dilithium_123_45678_opt_a72.s new file mode 120000 index 0000000..a49d823 --- /dev/null +++ b/asm/manual/ntt_dilithium/intt_dilithium_123_45678_opt_a72.s @@ -0,0 +1 @@ +../../../slothy/examples/opt/aarch64/intt_dilithium_123_45678_opt_a72.s \ No newline at end of file diff --git a/asm/manual/ntt_dilithium/intt_dilithium_123_45678_opt_m1_firestorm.s b/asm/manual/ntt_dilithium/intt_dilithium_123_45678_opt_m1_firestorm.s new file mode 120000 index 0000000..7c3f338 --- /dev/null +++ b/asm/manual/ntt_dilithium/intt_dilithium_123_45678_opt_m1_firestorm.s @@ -0,0 +1 @@ +../../../slothy/examples/opt/aarch64/intt_dilithium_123_45678_opt_m1_firestorm.s \ No newline at end of file diff --git a/tests/ntt_dilithium/main.c b/tests/ntt_dilithium/main.c index 13893da..58f1488 100644 --- a/tests/ntt_dilithium/main.c +++ b/tests/ntt_dilithium/main.c @@ -53,22 +53,38 @@ void intt_dilithium_123_45678_manual_ld4(int32_t *); void ntt_dilithium_123_45678_opt_a55(int32_t *); void ntt_dilithium_123_45678_manual_st4_opt_a55(int32_t *); void ntt_dilithium_123_45678_w_scalar_opt_a55(int32_t *); +void intt_dilithium_123_45678_opt_a55(int32_t *); +void intt_dilithium_123_45678_manual_ld4_opt_a55(int32_t *); +void intt_dilithium_1234_5678_opt_a55(int32_t *); +void 
intt_dilithium_1234_5678_manual_ld4_opt_a55(int32_t *); // A72 void ntt_dilithium_123_45678_opt_a72(int32_t *); void ntt_dilithium_123_45678_manual_st4_opt_a72(int32_t *); void ntt_dilithium_1234_5678_opt_a72(int32_t *); +void intt_dilithium_123_45678_opt_a72(int32_t *); +void intt_dilithium_123_45678_manual_ld4_opt_a72(int32_t *); +void intt_dilithium_1234_5678_opt_a72(int32_t *); +void intt_dilithium_1234_5678_manual_ld4_opt_a72(int32_t *); // M1 Firestorm void ntt_dilithium_123_45678_manual_st4_opt_m1_firestorm(int32_t *); void ntt_dilithium_123_45678_opt_m1_firestorm(int32_t *); /* void ntt_dilithium_123_45678_w_scalar_opt_m1_firestorm(int32_t *); */ void ntt_dilithium_1234_5678_opt_m1_firestorm(int32_t *); void ntt_dilithium_1234_5678_manual_st4_opt_m1_firestorm(int32_t *); +void intt_dilithium_123_45678_opt_m1_firestorm(int32_t *); +void intt_dilithium_123_45678_manual_ld4_opt_m1_firestorm(int32_t *); +void intt_dilithium_1234_5678_opt_m1_firestorm(int32_t *); +/* void intt_dilithium_1234_5678_manual_ld4_opt_m1_firestorm(int32_t *); */ // not done yet // M1 Icestorm void ntt_dilithium_123_45678_manual_st4_opt_m1_icestorm(int32_t *); void ntt_dilithium_123_45678_opt_m1_icestorm(int32_t *); void ntt_dilithium_123_45678_w_scalar_opt_m1_icestorm(int32_t *); void ntt_dilithium_1234_5678_opt_m1_icestorm(int32_t *); void ntt_dilithium_1234_5678_manual_st4_opt_m1_icestorm(int32_t *); +/* void intt_dilithium_123_45678_opt_m1_icestorm(int32_t *); +void intt_dilithium_123_45678_manual_ld4_opt_m1_icestorm(int32_t *); +void intt_dilithium_1234_5678_opt_m1_icestorm(int32_t *); +void intt_dilithium_1234_5678_manual_ld4_opt_m1_icestorm(int32_t *); */ // not done yet #define NTT_LAYERS 8 #define NTT_SIZE (1u << NTT_LAYERS) @@ -290,7 +306,7 @@ void buf_bitrev_4( int32_t *src, size_t size ) } } -#define MAKE_TEST(var,inv,func,ref_func,rev4,includes_reduction) \ +#define MAKE_TEST(var,inv,func,ref_func,rev4,includes_reduction,canonical_result)\ int test_ntt_ ## var () \ { \ 
debug_printf( "test ntt_dilithium %-50s ", #func "\0"); \ @@ -299,14 +315,15 @@ int test_ntt_ ## var () \ int32_t src_copy[NTT_SIZE] __attribute__((aligned(16))); \ \ /* Setup input */ \ - /*fill_random_u32( (uint32_t*) src, NTT_SIZE );*/ \ - for(uint32_t i = 0; i< NTT_SIZE; i++){src[i] = (i * i * 137 + 1234) % modulus;} \ + fill_random_u32( (uint32_t*) src, NTT_SIZE ); \ + /* for(uint32_t i = 0; i< NTT_SIZE; i++){src[i] = (i * i * 137 + 1234) % modulus;} */ \ mod_reduce_buf_s32( src, NTT_SIZE, modulus ); \ \ /* Step 1: Reference NTT */ \ memcpy( src_copy, src, sizeof( src ) ); \ (ref_func)( src_copy ); \ - mod_reduce_buf_s32_signed( src_copy, NTT_SIZE, modulus ); \ + if (canonical_result) \ + mod_reduce_buf_s32_signed( src_copy, NTT_SIZE, modulus ); \ \ if( rev4 && !inv ) \ buf_bitrev_4( src_copy, NTT_SIZE ); \ @@ -331,39 +348,65 @@ int test_ntt_ ## var () \ } // base -MAKE_TEST(asm_123_45678,0,ntt_dilithium_123_45678,ntt_u32_C,0,0) -MAKE_TEST(asm_123_45678_w_scalar,0,ntt_dilithium_123_45678_w_scalar,ntt_u32_C,0,0) -MAKE_TEST(asm_123_45678_manual_st4,0,ntt_dilithium_123_45678_manual_st4,ntt_u32_C,0,0) -MAKE_TEST(asm_1234_5678,0,ntt_dilithium_1234_5678,ntt_u32_C,0,0) -MAKE_TEST(asm_1234_5678_manual_st4,0,ntt_dilithium_1234_5678_manual_st4,ntt_u32_C,0,0) -MAKE_TEST(asm_1234_5678_inv,1,intt_dilithium_1234_5678,invntt_u32_tomont_C,0,1) -MAKE_TEST(asm_1234_5678_inv_manual_ld4,1,intt_dilithium_1234_5678_manual_ld4,invntt_u32_tomont_C,0,1) -MAKE_TEST(asm_123_45678_inv,1,intt_dilithium_123_45678,invntt_u32_tomont_C,0,1) -MAKE_TEST(asm_123_45678_inv_manual_ld4,1,intt_dilithium_123_45678_manual_ld4,invntt_u32_tomont_C,0,1) +MAKE_TEST(asm_123_45678,0,ntt_dilithium_123_45678,ntt_u32_C,0,0,1) +MAKE_TEST(asm_123_45678_w_scalar,0,ntt_dilithium_123_45678_w_scalar,ntt_u32_C,0,0,1) +MAKE_TEST(asm_123_45678_manual_st4,0,ntt_dilithium_123_45678_manual_st4,ntt_u32_C,0,0,1) +MAKE_TEST(asm_1234_5678,0,ntt_dilithium_1234_5678,ntt_u32_C,0,0,1) 
+MAKE_TEST(asm_1234_5678_manual_st4,0,ntt_dilithium_1234_5678_manual_st4,ntt_u32_C,0,0,1) +MAKE_TEST(asm_1234_5678_inv,1,intt_dilithium_1234_5678,invntt_u32_tomont_C,0,1,1) +MAKE_TEST(asm_1234_5678_inv_manual_ld4,1,intt_dilithium_1234_5678_manual_ld4,invntt_u32_tomont_C,0,1,1) +MAKE_TEST(asm_123_45678_inv,1,intt_dilithium_123_45678,invntt_u32_tomont_C,0,1,1) +MAKE_TEST(asm_123_45678_inv_manual_ld4,1,intt_dilithium_123_45678_manual_ld4,invntt_u32_tomont_C,0,1,1) + // A55 -MAKE_TEST(asm_123_45678_opt_a55,0,ntt_dilithium_123_45678_opt_a55,ntt_u32_C,0,0) -MAKE_TEST(asm_123_45678_manual_st4_opt_a55,0,ntt_dilithium_123_45678_manual_st4_opt_a55,ntt_u32_C,0,0) -MAKE_TEST(asm_123_45678_w_scalar_opt_a55,0,ntt_dilithium_123_45678_w_scalar_opt_a55,ntt_u32_C,0,0) +MAKE_TEST(asm_123_45678_opt_a55,0,ntt_dilithium_123_45678_opt_a55,ntt_u32_C,0,0,1) +MAKE_TEST(asm_123_45678_manual_st4_opt_a55,0,ntt_dilithium_123_45678_manual_st4_opt_a55,ntt_u32_C,0,0,1) +MAKE_TEST(asm_123_45678_w_scalar_opt_a55,0,ntt_dilithium_123_45678_w_scalar_opt_a55,ntt_u32_C,0,0,1) + +MAKE_TEST(asm_123_45678_inv_opt_a55,1,intt_dilithium_123_45678_opt_a55,invntt_u32_tomont_C,0,1,1) +MAKE_TEST(asm_123_45678_inv_manual_ld4_opt_a55,1,intt_dilithium_123_45678_manual_ld4_opt_a55,invntt_u32_tomont_C,0,1,1) +MAKE_TEST(asm_1234_5678_inv_opt_a55,1,intt_dilithium_1234_5678_opt_a55,invntt_u32_tomont_C,0,1,1) +MAKE_TEST(asm_1234_5678_inv_manual_ld4_opt_a55,1,intt_dilithium_1234_5678_manual_ld4_opt_a55,invntt_u32_tomont_C,0,1,1) + // A72 -MAKE_TEST(asm_123_45678_opt_a72,0,ntt_dilithium_123_45678_opt_a72,ntt_u32_C,0,0) -MAKE_TEST(asm_123_45678_manual_st4_opt_a72,0,ntt_dilithium_123_45678_manual_st4_opt_a72,ntt_u32_C,0,0) -MAKE_TEST(asm_1234_5678_opt_a72,0,ntt_dilithium_1234_5678_opt_a72,ntt_u32_C,0,0) +MAKE_TEST(asm_123_45678_opt_a72,0,ntt_dilithium_123_45678_opt_a72,ntt_u32_C,0,0,1) +MAKE_TEST(asm_123_45678_manual_st4_opt_a72,0,ntt_dilithium_123_45678_manual_st4_opt_a72,ntt_u32_C,0,0,1) 
+MAKE_TEST(asm_1234_5678_opt_a72,0,ntt_dilithium_1234_5678_opt_a72,ntt_u32_C,0,0,1) + +MAKE_TEST(asm_123_45678_inv_opt_a72,1,intt_dilithium_123_45678_opt_a72,invntt_u32_tomont_C,0,1,1) +MAKE_TEST(asm_123_45678_inv_manual_ld4_opt_a72,1,intt_dilithium_123_45678_manual_ld4_opt_a72,invntt_u32_tomont_C,0,1,1) +MAKE_TEST(asm_1234_5678_inv_opt_a72,1,intt_dilithium_1234_5678_opt_a72,invntt_u32_tomont_C,0,1,1) +MAKE_TEST(asm_1234_5678_inv_manual_ld4_opt_a72,1,intt_dilithium_1234_5678_manual_ld4_opt_a72,invntt_u32_tomont_C,0,1,1) + // M1 Firestorm -MAKE_TEST(asm_123_45678_opt_m1_firestorm,0,ntt_dilithium_123_45678_opt_m1_firestorm,ntt_u32_C,0,0) -MAKE_TEST(asm_123_45678_manual_st4_opt_m1_firestorm,0,ntt_dilithium_123_45678_manual_st4_opt_m1_firestorm,ntt_u32_C,0,0) +MAKE_TEST(asm_123_45678_opt_m1_firestorm,0,ntt_dilithium_123_45678_opt_m1_firestorm,ntt_u32_C,0,0,1) +MAKE_TEST(asm_123_45678_manual_st4_opt_m1_firestorm,0,ntt_dilithium_123_45678_manual_st4_opt_m1_firestorm,ntt_u32_C,0,0,1) /* MAKE_TEST(asm_123_45678_w_scalar_opt_m1_firestorm,0,ntt_dilithium_123_45678_w_scalar_opt_m1_firestorm,ntt_u32_C,0,0) */ -MAKE_TEST(asm_1234_5678_opt_m1_firestorm,0,ntt_dilithium_1234_5678_opt_m1_firestorm,ntt_u32_C,0,0) -MAKE_TEST(asm_1234_5678_manual_st4_opt_m1_firestorm,0,ntt_dilithium_1234_5678_manual_st4_opt_m1_firestorm,ntt_u32_C,0,0) +MAKE_TEST(asm_1234_5678_opt_m1_firestorm,0,ntt_dilithium_1234_5678_opt_m1_firestorm,ntt_u32_C,0,0,1) +MAKE_TEST(asm_1234_5678_manual_st4_opt_m1_firestorm,0,ntt_dilithium_1234_5678_manual_st4_opt_m1_firestorm,ntt_u32_C,0,0,1) + +MAKE_TEST(asm_123_45678_inv_opt_m1_firestorm,1,intt_dilithium_123_45678_opt_m1_firestorm,invntt_u32_tomont_C,0,1,1) +MAKE_TEST(asm_123_45678_inv_manual_ld4_opt_m1_firestorm,1,intt_dilithium_123_45678_manual_ld4_opt_m1_firestorm,invntt_u32_tomont_C,0,1,1) +MAKE_TEST(asm_1234_5678_inv_opt_m1_firestorm,1,intt_dilithium_1234_5678_opt_m1_firestorm,invntt_u32_tomont_C,0,1,1) +/* 
MAKE_TEST(asm_1234_5678_inv_manual_ld4_opt_m1_firestorm,1,intt_dilithium_1234_5678_manual_ld4_opt_m1_firestorm,invntt_u32_tomont_C,0,1) */ + // M1 Icestorm -MAKE_TEST(asm_123_45678_opt_m1_icestorm,0,ntt_dilithium_123_45678_opt_m1_icestorm,ntt_u32_C,0,0) -MAKE_TEST(asm_123_45678_manual_st4_opt_m1_icestorm,0,ntt_dilithium_123_45678_manual_st4_opt_m1_icestorm,ntt_u32_C,0,0) -MAKE_TEST(asm_123_45678_w_scalar_opt_m1_icestorm,0,ntt_dilithium_123_45678_w_scalar_opt_m1_icestorm,ntt_u32_C,0,0) -MAKE_TEST(asm_1234_5678_opt_m1_icestorm,0,ntt_dilithium_1234_5678_opt_m1_icestorm,ntt_u32_C,0,0) -MAKE_TEST(asm_1234_5678_manual_st4_opt_m1_icestorm,0,ntt_dilithium_1234_5678_manual_st4_opt_m1_icestorm,ntt_u32_C,0,0) +MAKE_TEST(asm_123_45678_opt_m1_icestorm,0,ntt_dilithium_123_45678_opt_m1_icestorm,ntt_u32_C,0,0,1) +MAKE_TEST(asm_123_45678_manual_st4_opt_m1_icestorm,0,ntt_dilithium_123_45678_manual_st4_opt_m1_icestorm,ntt_u32_C,0,0,1) +MAKE_TEST(asm_123_45678_w_scalar_opt_m1_icestorm,0,ntt_dilithium_123_45678_w_scalar_opt_m1_icestorm,ntt_u32_C,0,0,1) +MAKE_TEST(asm_1234_5678_opt_m1_icestorm,0,ntt_dilithium_1234_5678_opt_m1_icestorm,ntt_u32_C,0,0,1) +MAKE_TEST(asm_1234_5678_manual_st4_opt_m1_icestorm,0,ntt_dilithium_1234_5678_manual_st4_opt_m1_icestorm,ntt_u32_C,0,0,1) + +/* MAKE_TEST(asm_123_45678_inv_opt_m1_icestorm,1,intt_dilithium_123_45678_opt_m1_icestorm,invntt_u32_tomont_C,0,1) +MAKE_TEST(asm_123_45678_inv_manual_ld4_opt_m1_icestorm,1,intt_dilithium_123_45678_manual_ld4_opt_m1_icestorm,invntt_u32_tomont_C,0,1) +MAKE_TEST(asm_1234_5678_inv_opt_m1_icestorm,1,intt_dilithium_1234_5678_opt_m1_icestorm,invntt_u32_tomont_C,0,1) +MAKE_TEST(asm_1234_5678_inv_manual_ld4_opt_m1_icestorm,1,intt_dilithium_1234_5678_manual_ld4_opt_m1_icestorm,invntt_u32_tomont_C,0,1) */ + // Other -MAKE_TEST(neonntt_fwd,0,ntt,ntt_u32_C,0,0) -MAKE_TEST(pqclean_ntt_fwd,0,pqclean_ntt,ntt_u32_C,0,0) -MAKE_TEST(neonntt_inv,1,invntt_tomont,invntt_u32_tomont_C,0,0) +MAKE_TEST(neonntt_fwd,0,ntt,ntt_u32_C,0,0,1) 
+MAKE_TEST(pqclean_ntt_fwd,0,pqclean_ntt,ntt_u32_C,0,0,1) +MAKE_TEST(neonntt_inv,1,invntt_tomont,invntt_u32_tomont_C,0,0,1) +MAKE_TEST(pqclean_ntt_inv,1,pqclean_invntt_tomont,invntt_u32_tomont_C,0,0,1) uint64_t t0, t1; uint64_t cycles[TEST_COUNT]; @@ -404,29 +447,54 @@ MAKE_BENCH(asm_1234_5678_inv,intt_dilithium_1234_5678) MAKE_BENCH(asm_1234_5678_inv_manual_ld4,intt_dilithium_1234_5678_manual_ld4) MAKE_BENCH(asm_123_45678_inv,intt_dilithium_123_45678) MAKE_BENCH(asm_123_45678_inv_manual_ld4,intt_dilithium_123_45678_manual_ld4) + // A55 MAKE_BENCH(asm_123_45678_opt_a55,ntt_dilithium_123_45678_opt_a55) MAKE_BENCH(asm_123_45678_manual_st4_opt_a55,ntt_dilithium_123_45678_manual_st4_opt_a55) MAKE_BENCH(asm_123_45678_w_scalar_opt_a55,ntt_dilithium_123_45678_w_scalar_opt_a55) + +MAKE_BENCH(asm_123_45678_inv_opt_a55,intt_dilithium_123_45678_opt_a55) +MAKE_BENCH(asm_123_45678_inv_manual_ld4_opt_a55,intt_dilithium_123_45678_manual_ld4_opt_a55) +MAKE_BENCH(asm_1234_5678_inv_opt_a55,intt_dilithium_1234_5678_opt_a55) +MAKE_BENCH(asm_1234_5678_inv_manual_ld4_opt_a55,intt_dilithium_1234_5678_manual_ld4_opt_a55) + // A72 MAKE_BENCH(asm_123_45678_opt_a72,ntt_dilithium_123_45678_opt_a72) MAKE_BENCH(asm_123_45678_manual_st4_opt_a72,ntt_dilithium_123_45678_manual_st4_opt_a72) MAKE_BENCH(asm_1234_5678_opt_a72,ntt_dilithium_1234_5678_opt_a72) + +MAKE_BENCH(asm_123_45678_inv_opt_a72,intt_dilithium_123_45678_opt_a72) +MAKE_BENCH(asm_123_45678_inv_manual_ld4_opt_a72,intt_dilithium_123_45678_manual_ld4_opt_a72) +MAKE_BENCH(asm_1234_5678_inv_opt_a72,intt_dilithium_1234_5678_opt_a72) +MAKE_BENCH(asm_1234_5678_inv_manual_ld4_opt_a72,intt_dilithium_1234_5678_manual_ld4_opt_a72) + // M1 Firestorm MAKE_BENCH(asm_123_45678_opt_m1_firestorm,ntt_dilithium_123_45678_opt_m1_firestorm) MAKE_BENCH(asm_123_45678_manual_st4_opt_m1_firestorm,ntt_dilithium_123_45678_manual_st4_opt_m1_firestorm) /* MAKE_BENCH(asm_123_45678_w_scalar_opt_m1_firestorm,ntt_dilithium_123_45678_w_scalar_opt_m1_firestorm) */ 
MAKE_BENCH(asm_1234_5678_opt_m1_firestorm,ntt_dilithium_1234_5678_opt_m1_firestorm) MAKE_BENCH(asm_1234_5678_manual_st4_opt_m1_firestorm,ntt_dilithium_1234_5678_manual_st4_opt_m1_firestorm) + +MAKE_BENCH(asm_123_45678_inv_opt_m1_firestorm,intt_dilithium_123_45678_opt_m1_firestorm) +MAKE_BENCH(asm_123_45678_inv_manual_ld4_opt_m1_firestorm,intt_dilithium_123_45678_manual_ld4_opt_m1_firestorm) +MAKE_BENCH(asm_1234_5678_inv_opt_m1_firestorm,intt_dilithium_1234_5678_opt_m1_firestorm) +/* MAKE_BENCH(asm_1234_5678_inv_manual_ld4_opt_m1_firestorm,intt_dilithium_1234_5678_manual_ld4_opt_m1_firestorm) */ + // M1 Icestorm MAKE_BENCH(asm_123_45678_opt_m1_icestorm,ntt_dilithium_123_45678_opt_m1_icestorm) MAKE_BENCH(asm_123_45678_manual_st4_opt_m1_icestorm,ntt_dilithium_123_45678_manual_st4_opt_m1_icestorm) MAKE_BENCH(asm_123_45678_w_scalar_opt_m1_icestorm,ntt_dilithium_123_45678_w_scalar_opt_m1_icestorm) MAKE_BENCH(asm_1234_5678_opt_m1_icestorm,ntt_dilithium_1234_5678_opt_m1_icestorm) MAKE_BENCH(asm_1234_5678_manual_st4_opt_m1_icestorm,ntt_dilithium_1234_5678_manual_st4_opt_m1_icestorm) + +/* MAKE_BENCH(asm_123_45678_inv_opt_m1_icestorm,intt_dilithium_123_45678_opt_m1_icestorm) +MAKE_BENCH(asm_123_45678_inv_manual_ld4_opt_m1_icestorm,intt_dilithium_123_45678_manual_ld4_opt_m1_icestorm) +MAKE_BENCH(asm_1234_5678_inv_opt_m1_icestorm,intt_dilithium_1234_5678_opt_m1_icestorm) +MAKE_BENCH(asm_1234_5678_inv_manual_ld4_opt_m1_icestorm,intt_dilithium_1234_5678_manual_ld4_opt_m1_icestorm) */ // Other MAKE_BENCH(neonntt_fwd,ntt) MAKE_BENCH(pqclean_ntt_fwd,pqclean_ntt) +MAKE_BENCH(pqclean_ntt_inv,pqclean_invntt_tomont) int main( void ) { @@ -449,25 +517,47 @@ int main( void ) bench_ntt_asm_123_45678_opt_a55(); bench_ntt_asm_123_45678_manual_st4_opt_a55(); bench_ntt_asm_123_45678_w_scalar_opt_a55(); + + bench_ntt_asm_123_45678_inv_opt_a55(); + bench_ntt_asm_123_45678_inv_manual_ld4_opt_a55(); + bench_ntt_asm_1234_5678_inv_opt_a55(); + bench_ntt_asm_1234_5678_inv_manual_ld4_opt_a55(); // 
A72 bench_ntt_asm_123_45678_opt_a72(); bench_ntt_asm_123_45678_manual_st4_opt_a72(); bench_ntt_asm_1234_5678_opt_a72(); + + bench_ntt_asm_123_45678_inv_opt_a72(); + bench_ntt_asm_123_45678_inv_manual_ld4_opt_a72(); + bench_ntt_asm_1234_5678_inv_opt_a72(); + bench_ntt_asm_1234_5678_inv_manual_ld4_opt_a72(); // M1 Firestorm bench_ntt_asm_123_45678_opt_m1_firestorm(); bench_ntt_asm_123_45678_manual_st4_opt_m1_firestorm(); /* bench_ntt_asm_123_45678_w_scalar_opt_m1_firestorm(); */ bench_ntt_asm_1234_5678_opt_m1_firestorm(); bench_ntt_asm_1234_5678_manual_st4_opt_m1_firestorm(); + + bench_ntt_asm_123_45678_inv_opt_m1_firestorm(); + bench_ntt_asm_123_45678_inv_manual_ld4_opt_m1_firestorm(); + bench_ntt_asm_1234_5678_inv_opt_m1_firestorm(); + /* bench_ntt_asm_1234_5678_inv_manual_ld4_opt_m1_firestorm(); */ + // M1 Icestorm bench_ntt_asm_123_45678_opt_m1_icestorm(); bench_ntt_asm_123_45678_manual_st4_opt_m1_icestorm(); bench_ntt_asm_123_45678_w_scalar_opt_m1_icestorm(); bench_ntt_asm_1234_5678_opt_m1_icestorm(); bench_ntt_asm_1234_5678_manual_st4_opt_m1_icestorm(); + + /* bench_ntt_asm_123_45678_inv_opt_m1_icestorm(); + bench_ntt_asm_123_45678_inv_manual_ld4_opt_m1_icestorm(); + bench_ntt_asm_1234_5678_inv_opt_m1_icestorm(); + bench_ntt_asm_1234_5678_inv_manual_ld4_opt_m1_icestorm(); */ // other bench_ntt_neonntt_fwd(); bench_ntt_pqclean_ntt_fwd(); + bench_ntt_pqclean_ntt_inv(); disable_cyclecounter(); @@ -523,6 +613,23 @@ int main( void ) { return 1; } + + if (test_ntt_asm_123_45678_inv_opt_a55() != 0) + { + return 1; + } + if (test_ntt_asm_123_45678_inv_manual_ld4_opt_a55() != 0) + { + return 1; + } + if (test_ntt_asm_1234_5678_inv_opt_a55() != 0) + { + return 1; + } + if (test_ntt_asm_1234_5678_inv_manual_ld4_opt_a55() != 0) + { + return 1; + } // A72 if (test_ntt_asm_123_45678_opt_a72() != 0) { @@ -536,6 +643,24 @@ int main( void ) { return 1; } + + if (test_ntt_asm_123_45678_inv_opt_a72() != 0) + { + return 1; + } + if (test_ntt_asm_123_45678_inv_manual_ld4_opt_a72() 
!= 0) + { + return 1; + } + if (test_ntt_asm_1234_5678_inv_opt_a72() != 0) + { + return 1; + } + if (test_ntt_asm_1234_5678_inv_manual_ld4_opt_a72() != 0) + { + return 1; + } + // M1 Firestorm if (test_ntt_asm_123_45678_opt_m1_firestorm() != 0) { @@ -557,6 +682,24 @@ int main( void ) { return 1; } + + if (test_ntt_asm_123_45678_inv_opt_m1_firestorm() != 0) + { + return 1; + } + if (test_ntt_asm_123_45678_inv_manual_ld4_opt_m1_firestorm() != 0) + { + return 1; + } + if (test_ntt_asm_1234_5678_inv_opt_m1_firestorm() != 0) + { + return 1; + } + // if (test_ntt_asm_1234_5678_inv_manual_ld4_opt_m1_firestorm() != 0) + // { + // return 1; + // } + // M1 Icestorm if (test_ntt_asm_123_45678_opt_m1_icestorm() != 0) { @@ -578,6 +721,23 @@ int main( void ) { return 1; } + + // if (test_ntt_asm_123_45678_inv_opt_m1_icestorm() != 0) + // { + // return 1; + // } + // if (test_ntt_asm_123_45678_inv_manual_ld4_opt_m1_icestorm() != 0) + // { + // return 1; + // } + // if (test_ntt_asm_1234_5678_inv_opt_m1_icestorm() != 0) + // { + // return 1; + // } + // if (test_ntt_asm_1234_5678_inv_manual_ld4_opt_m1_icestorm() != 0) + // { + // return 1; + // } // other if (test_ntt_neonntt_fwd() != 0) { @@ -591,7 +751,11 @@ int main( void ) { return 1; } - + if (test_ntt_pqclean_ntt_inv() != 0) + { + return 1; + } return(0); } + \ No newline at end of file diff --git a/tests/ntt_dilithium/manual/intt_dilithium_1234_5678_manual_ld4_opt_a55.s b/tests/ntt_dilithium/manual/intt_dilithium_1234_5678_manual_ld4_opt_a55.s new file mode 100644 index 0000000..6298588 --- /dev/null +++ b/tests/ntt_dilithium/manual/intt_dilithium_1234_5678_manual_ld4_opt_a55.s @@ -0,0 +1,1746 @@ +/// +/// Copyright (c) 2022 Arm Limited +/// Copyright (c) 2022 Hanno Becker +/// Copyright (c) 2023 Amin Abdulrahman, Matthias Kannwischer +/// SPDX-License-Identifier: MIT +/// +/// Permission is hereby granted, free of charge, to any person obtaining a copy +/// of this software and associated documentation files (the 
"Software"), to deal +/// in the Software without restriction, including without limitation the rights +/// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +/// copies of the Software, and to permit persons to whom the Software is +/// furnished to do so, subject to the following conditions: +/// +/// The above copyright notice and this permission notice shall be included in all +/// copies or substantial portions of the Software. +/// +/// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +/// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +/// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +/// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +/// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +/// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +/// SOFTWARE. +/// + +// Needed to provide ASM_LOAD directive +#include + +// NOTE +// We use a lot of trivial macros to simplify the parsing burden for Slothy. +// The macros are not unfolded by Slothy and are thus interpreted as instructions, +// which are easier to parse due to, e.g., the lack of size specifiers and simpler +// syntax for pre- and post-increment for loads and stores. + +// Eventually, Slothy should include a proper parser for AArch64, +// but for initial investigations, the below is enough. 
+ +.macro ldr_vo vec, base, offset + ldr qform_\vec, [\base, #\offset] +.endm +.macro ldr_vi vec, base, inc + ldr qform_\vec, [\base], #\inc +.endm +.macro str_vo vec, base, offset + str qform_\vec, [\base, #\offset] +.endm +.macro str_vi vec, base, inc + str qform_\vec, [\base], #\inc +.endm +.macro vsub d,a,b + sub \d\().4s, \a\().4s, \b\().4s +.endm +.macro vadd d,a,b + add \d\().4s, \a\().4s, \b\().4s +.endm +.macro vqrdmulh d,a,b + sqrdmulh \d\().4s, \a\().4s, \b\().4s +.endm +.macro vmul d,a,b + mul \d\().4s, \a\().4s, \b\().4s +.endm +.macro vmls d,a,b + mls \d\().4s, \a\().4s, \b\().4s +.endm +.macro vqrdmulhq d,a,b,i + sqrdmulh \d\().4s, \a\().4s, \b\().s[\i] +.endm +.macro vmulq d,a,b,i + mul \d\().4s, \a\().4s, \b\().s[\i] +.endm +.macro vmlsq d,a,b,i + mls \d\().4s, \a\().4s, \b\().s[\i] +.endm +.macro trn1_d d,a,b + trn1 \d\().2d, \a\().2d, \b\().2d +.endm +.macro trn2_d d,a,b + trn2 \d\().2d, \a\().2d, \b\().2d +.endm +.macro trn1_s d,a,b + trn1 \d\().4s, \a\().4s, \b\().4s +.endm +.macro trn2_s d,a,b + trn2 \d\().4s, \a\().4s, \b\().4s +.endm + +.macro mulmodq dst, src, const, idx0, idx1 + vmulq \dst, \src, \const, \idx0 + vqrdmulhq \src, \src, \const, \idx1 + vmls \dst, \src, modulus +.endm + +.macro mulmod dst, src, const, const_twisted + vmul \dst, \src, \const + vqrdmulh \src, \src, \const_twisted + vmls \dst, \src, modulus +.endm + +.macro montg_reduce a + srshr tmp.4S, \a\().4S, #23 + vmls \a, tmp, modulus +.endm + +.macro canonical_reduce a, modulus_half, neg_modulus_half, tmp1, tmp2 + cmge \tmp1\().4s, \neg_modulus_half\().4s, \a\().4s + cmge \tmp2\().4s, \a\().4s, \modulus_half\().4s + sub \tmp2\().4s, \tmp1\().4s, \tmp2\().4s + vmls \a, \tmp2, modulus +.endm + +.macro gs_butterfly a, b, root, idx0, idx1 + vsub tmp, \a, \b + vadd \a, \a, \b + mulmodq \b, tmp, \root, \idx0, \idx1 +.endm + +.macro mulmod_v dst, src, const, const_twisted + vmul \dst, \src, \const + vqrdmulh \src, \src, \const_twisted + vmls \dst, \src, modulus +.endm + +.macro 
gs_butterfly_v a, b, root, root_twisted + vsub tmp, \a, \b + vadd \a, \a, \b + mulmod \b, tmp, \root, \root_twisted +.endm + +.macro mul_ninv dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, src0, src1, src2, src3, src4, src5, src6, src7 + mulmod \dst0, \src0, ninv, ninv_tw + mulmod \dst1, \src1, ninv, ninv_tw + mulmod \dst2, \src2, ninv, ninv_tw + mulmod \dst3, \src3, ninv, ninv_tw + mulmod \dst4, \src4, ninv, ninv_tw + mulmod \dst5, \src5, ninv, ninv_tw + mulmod \dst6, \src6, ninv, ninv_tw + mulmod \dst7, \src7, ninv, ninv_tw +.endm + +.macro load_roots_1234 r_ptr + ldr_vi root0, \r_ptr, (8*16) + ldr_vo root1, \r_ptr, (-8*16 + 1*16) + ldr_vo root2, \r_ptr, (-8*16 + 2*16) + ldr_vo root3, \r_ptr, (-8*16 + 3*16) + ldr_vo root4, \r_ptr, (-8*16 + 4*16) + ldr_vo root5, \r_ptr, (-8*16 + 5*16) + ldr_vo root6, \r_ptr, (-8*16 + 6*16) + ldr_vo root7, \r_ptr, (-8*16 + 7*16) +.endm + +.macro load_next_roots_56 root0, r_ptr0 + ldr_vi \root0, \r_ptr0, 16 +.endm + +.macro load_next_roots_6 root0, r_ptr0 + ldr_vi \root0, \r_ptr0, 8 +.endm + +.macro load_next_roots_78 root0, root0_tw, root1, root1_tw, root2, root2_tw, r_ptr1 + ldr_vi \root0, \r_ptr1, (6*16) + ldr_vo \root0_tw, \r_ptr1, (-6*16 + 1*16) + ldr_vo \root1, \r_ptr1, (-6*16 + 2*16) + ldr_vo \root1_tw, \r_ptr1, (-6*16 + 3*16) + ldr_vo \root2, \r_ptr1, (-6*16 + 4*16) + ldr_vo \root2_tw, \r_ptr1, (-6*16 + 5*16) +.endm + +.macro transpose4 data + trn1_s t0, \data\()0, \data\()1 + trn2_s t1, \data\()0, \data\()1 + trn1_s t2, \data\()2, \data\()3 + trn2_s t3, \data\()2, \data\()3 + + trn2_d \data\()2, t0, t2 + trn2_d \data\()3, t1, t3 + trn1_d \data\()0, t0, t2 + trn1_d \data\()1, t1, t3 +.endm + +.macro save_gprs // slothy:no-unfold + sub sp, sp, #(16*6) + stp x19, x20, [sp, #16*0] + stp x19, x20, [sp, #16*0] + stp x21, x22, [sp, #16*1] + stp x23, x24, [sp, #16*2] + stp x25, x26, [sp, #16*3] + stp x27, x28, [sp, #16*4] + str x29, [sp, #16*5] +.endm + +.macro restore_gprs // slothy:no-unfold + ldp x19, x20, [sp, #16*0] + ldp 
x21, x22, [sp, #16*1] + ldp x23, x24, [sp, #16*2] + ldp x25, x26, [sp, #16*3] + ldp x27, x28, [sp, #16*4] + ldr x29, [sp, #16*5] + add sp, sp, #(16*6) +.endm + +.macro save_vregs // slothy:no-unfold + sub sp, sp, #(16*4) + stp d8, d9, [sp, #16*0] + stp d10, d11, [sp, #16*1] + stp d12, d13, [sp, #16*2] + stp d14, d15, [sp, #16*3] +.endm + +.macro restore_vregs // slothy:no-unfold + ldp d8, d9, [sp, #16*0] + ldp d10, d11, [sp, #16*1] + ldp d12, d13, [sp, #16*2] + ldp d14, d15, [sp, #16*3] + add sp, sp, #(16*4) +.endm + +#define STACK_SIZE 16 +#define STACK0 0 + +.macro restore a, loc // slothy:no-unfold + ldr \a, [sp, #\loc\()] +.endm +.macro save loc, a // slothy:no-unfold + str \a, [sp, #\loc\()] +.endm +.macro push_stack // slothy:no-unfold + save_gprs + save_vregs + sub sp, sp, #STACK_SIZE +.endm + +.macro pop_stack // slothy:no-unfold + add sp, sp, #STACK_SIZE + restore_vregs + restore_gprs +.endm + +.data +.p2align 4 +roots: +#include "intt_dilithium_1234_5678_twiddles.s" +.text + + .global intt_dilithium_1234_5678_manual_ld4_opt_a55 + .global _intt_dilithium_1234_5678_manual_ld4_opt_a55 + +.p2align 4 +modulus_addr: .quad 8380417 +ninv_addr: .quad 16382 +ninv_tw_addr: .quad 4197891 +intt_dilithium_1234_5678_manual_ld4_opt_a55: +_intt_dilithium_1234_5678_manual_ld4_opt_a55: + push_stack + + inp .req x0 + in .req x1 + count .req x2 + r_ptr0 .req x3 + r_ptr1 .req x4 + xtmp .req x5 + + data0 .req v8 + data1 .req v9 + data2 .req v10 + data3 .req v11 + data4 .req v12 + data5 .req v13 + data6 .req v14 + data7 .req v15 + data8 .req v16 + data9 .req v17 + data10 .req v18 + data11 .req v19 + data12 .req v20 + data13 .req v21 + data14 .req v22 + data15 .req v23 + + qform_data0 .req q8 + qform_data1 .req q9 + qform_data2 .req q10 + qform_data3 .req q11 + qform_data4 .req q12 + qform_data5 .req q13 + qform_data6 .req q14 + qform_data7 .req q15 + qform_data8 .req q16 + qform_data9 .req q17 + qform_data10 .req q18 + qform_data11 .req q19 + qform_data12 .req q20 + qform_data13 
.req q21 + qform_data14 .req q22 + qform_data15 .req q23 + + root0 .req v0 + root1 .req v1 + root2 .req v2 + root3 .req v3 + root0_tw .req v4 + root1_tw .req v5 + root2_tw .req v6 + root3_tw .req v7 + + + qform_root0 .req q0 + qform_root1 .req q1 + qform_root2 .req q2 + qform_root3 .req q3 + qform_root0_tw .req q4 + qform_root1_tw .req q5 + qform_root2_tw .req q6 + qform_root3_tw .req q7 + + + tmp .req v24 + qform_tmp .req q24 + t0 .req v25 + t1 .req v26 + t2 .req v27 + t3 .req v28 + + modulus .req v29 + + ASM_LOAD(r_ptr0, roots) + ASM_LOAD(r_ptr1, roots_l45) + + ASM_LOAD(xtmp, modulus_addr) + ld1r {modulus.4s}, [xtmp] + + save STACK0, inp + + mov count, #16 + + .p2align 2 + ldr q13, [x3, #80] // ..........* + // gap // ........... + // gap // ........... + // gap // ........... + ldr q6, [x0, #16] // .*......... + // gap // ........... + // gap // ........... + // gap // ........... + ldr q24, [x0, #0] // *.......... + // gap // ........... + // gap // ........... + // gap // ........... + ldr q14, [x0, #48] // ...*....... + // gap // ........... + // gap // ........... + // gap // ........... + ldr q28, [x3, #32] // .......*... + // gap // ........... + // gap // ........... + // gap // ........... + ldr q1, [x3, #48] // ........*.. + // gap // ........... + // gap // ........... + // gap // ........... + ldr q2, [x0, #32] // ..*........ + // gap // ........... + // gap // ........... + // gap // ........... + ldr q31, [x3], #(6*16) // ....*...... + // gap // ........... + // gap // ........... + // gap // ........... + ldr q26, [x3, #-80] // ......*.... + // gap // ........... + // gap // ........... + // gap // ........... + trn1 v3.4S, v2.4S, v14.4S // .....*..... + // gap // ........... + ldr q15, [x3, #-32] // .........*. + // gap // ........... + + // original source code + // ldr q24, [x0, #0] // ..*........ + // ldr q6, [x0, #16] // .*......... + // ldr q2, [x0, #32] // ......*.... + // ldr q14, [x0, #48] // ...*....... 
+ // ldr q31, [x3], #(6*16) // .......*... + // trn1 v3.4S, v2.4S, v14.4S // .........*. + // ldr q26, [x3, #-80] // ........*.. + // ldr q28, [x3, #-64] // ....*...... + // ldr q1, [x3, #-48] // .....*..... + // ldr q15, [x3, #-32] // ..........* + // ldr q13, [x3, #-16] // *.......... + + sub count, count, #1 +layer5678_start: + trn1 v17.4S, v24.4S, v6.4S // ....*....................................................................... + // gap // ............................................................................ + trn2 v21.4S, v24.4S, v6.4S // .....*...................................................................... + // gap // ............................................................................ + trn2 v14.4S, v2.4S, v14.4S // .......*.................................................................... + // gap // ............................................................................ + trn2 v2.2D, v17.2D, v3.2D // ........*................................................................... + // gap // ............................................................................ + trn1 v17.2D, v17.2D, v3.2D // ..........*................................................................. + // gap // ............................................................................ + trn2 v3.2D, v21.2D, v14.2D // .........*.................................................................. + // gap // ............................................................................ + trn1 v21.2D, v21.2D, v14.2D // ...........*................................................................ + // gap // ............................................................................ + sub v14.4S, v2.4S, v3.4S // .......................*.................................................... + // gap // ............................................................................ 
+ add v2.4S, v2.4S, v3.4S // ........................*................................................... + // gap // ............................................................................ + sub v3.4S, v17.4S, v21.4S // ..................*......................................................... + // gap // ............................................................................ + add v17.4S, v17.4S, v21.4S // ...................*........................................................ + // gap // ............................................................................ + mul v21.4S, v14.4S, v15.4S // .........................*.................................................. + // gap // ............................................................................ + mul v24.4S, v3.4S, v28.4S // ....................*....................................................... + // gap // ............................................................................ + sqrdmulh v3.4S, v3.4S, v1.4S // .....................*...................................................... + // gap // ............................................................................ + sqrdmulh v14.4S, v14.4S, v13.4S // ..........................*................................................. + // gap // ............................................................................ + sub v28.4S, v17.4S, v2.4S // ............................*............................................... + // gap // ............................................................................ + add v17.4S, v17.4S, v2.4S // .............................*.............................................. + // gap // ............................................................................ + mls v24.4S, v3.4S, v29.4S // ......................*..................................................... + // gap // ............................................................................ 
+ mls v21.4S, v14.4S, v29.4S // ...........................*................................................ + // gap // ............................................................................ + mul v14.4S, v28.4S, v31.4S // ..............................*............................................. + // gap // ............................................................................ + sqrdmulh v2.4S, v28.4S, v26.4S // ...............................*............................................ + // gap // ............................................................................ + ldr q3, [x4], #8 // ..............................................*............................. + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + sub v28.4S, v24.4S, v21.4S // .................................*.......................................... + // gap // ............................................................................ + mls v14.4S, v2.4S, v29.4S // ................................*........................................... + // gap // ............................................................................ + add v21.4S, v24.4S, v21.4S // ..................................*......................................... + // gap // ............................................................................ + mul v2.4S, v28.4S, v31.4S // ...................................*........................................ + // gap // ............................................................................ + sqrdmulh v24.4S, v28.4S, v26.4S // ....................................*....................................... + // gap // ............................................................................ 
+ trn1 v28.4S, v17.4S, v21.4S // ......................................*..................................... + // gap // ............................................................................ + trn2 v17.4S, v17.4S, v21.4S // .......................................*.................................... + // gap // ............................................................................ + ldr q21, [x4], #16 // ...............................................*............................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + mls v2.4S, v24.4S, v29.4S // .....................................*...................................... + // gap // ............................................................................ + ldr q24, [x0, #64] // e........................................................................... + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + ldr q6, [x0, #80] // .e.......................................................................... + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + trn1 v31.4S, v14.4S, v2.4S // ........................................*................................... + // gap // ............................................................................ 
+ trn2 v14.4S, v14.4S, v2.4S // .........................................*.................................. + // gap // ............................................................................ + ldr q2, [x0, #96] // ..e......................................................................... + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + trn2 v26.2D, v28.2D, v31.2D // ..........................................*................................. + // gap // ............................................................................ + trn2 v1.2D, v17.2D, v14.2D // ...........................................*................................ + // gap // ............................................................................ + trn1 v28.2D, v28.2D, v31.2D // ............................................*............................... + // gap // ............................................................................ + trn1 v17.2D, v17.2D, v14.2D // .............................................*.............................. + // gap // ............................................................................ + sub v14.4S, v26.4S, v1.4S // .....................................................*...................... + // gap // ............................................................................ + add v31.4S, v28.4S, v17.4S // .................................................*.......................... + // gap // ............................................................................ + sub v17.4S, v28.4S, v17.4S // ................................................*........................... + // gap // ............................................................................ 
+ add v28.4S, v26.4S, v1.4S // ......................................................*..................... + // gap // ............................................................................ + mul v26.4S, v14.4S, v21.S[2] // .......................................................*.................... + // gap // ............................................................................ + mul v1.4S, v17.4S, v21.S[0] // ..................................................*......................... + // gap // ............................................................................ + sqrdmulh v17.4S, v17.4S, v21.S[1] // ...................................................*........................ + // gap // ............................................................................ + sqrdmulh v21.4S, v14.4S, v21.S[3] // ........................................................*................... + // gap // ............................................................................ + sub v14.4S, v31.4S, v28.4S // ..........................................................*................. + // gap // ............................................................................ + add v28.4S, v31.4S, v28.4S // ...........................................................*................ + // gap // ............................................................................ + mls v1.4S, v17.4S, v29.4S // ....................................................*....................... + // gap // ............................................................................ + mls v26.4S, v21.4S, v29.4S // .........................................................*.................. + // gap // ............................................................................ + mul v17.4S, v14.4S, v3.S[0] // ............................................................*............... + // gap // ............................................................................ 
+ sqrdmulh v21.4S, v14.4S, v3.S[1] // .............................................................*.............. + // gap // ............................................................................ + srshr v14.4S, v28.4S, #23 // ....................................................................*....... + // gap // ............................................................................ + sub v31.4S, v1.4S, v26.4S // ...............................................................*............ + // gap // ............................................................................ + add v26.4S, v1.4S, v26.4S // ................................................................*........... + // gap // ............................................................................ + mls v17.4S, v21.4S, v29.4S // ..............................................................*............. + // gap // ............................................................................ + mul v21.4S, v31.4S, v3.S[0] // .................................................................*.......... + // gap // ............................................................................ + sqrdmulh v3.4S, v31.4S, v3.S[1] // ..................................................................*......... + // gap // ............................................................................ + mls v28.4S, v14.4S, v29.4S // .....................................................................*...... + // gap // ............................................................................ + srshr v14.4S, v26.4S, #23 // ......................................................................*..... + // gap // ............................................................................ + str q17, [x0, #32] // ..........................................................................*. + // gap // ............................................................................ 
+ mls v21.4S, v3.4S, v29.4S // ...................................................................*........ + // gap // ............................................................................ + mls v26.4S, v14.4S, v29.4S // .......................................................................*.... + // gap // ............................................................................ + str q28, [x0], #(16*4) // ........................................................................*... + // gap // ............................................................................ + ldr q14, [x0, #48] // ...e........................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + str q26, [x0, #-48] // .........................................................................*.. + // gap // ............................................................................ + ldr q31, [x3], #(6*16) // ............e............................................................... + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + str q21, [x0, #-16] // ...........................................................................* + // gap // ............................................................................ + trn1 v3.4S, v2.4S, v14.4S // ......e..................................................................... + // gap // ............................................................................ 
+ ldr q26, [x3, #-80] // .............e.............................................................. + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + ldr q28, [x3, #-64] // ..............e............................................................. + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + ldr q1, [x3, #-48] // ...............e............................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + ldr q15, [x3, #-32] // ................e........................................................... + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + ldr q13, [x3, #-16] // .................e.......................................................... + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ 
+ + // original source code + // ldr q8, [x0, #(16*0)] // e............................................|..............................e...................................... + // ldr q9, [x0, #(16*1)] // .e...........................................|...............................e..................................... + // ldr q10, [x0, #(16*2)] // ....e........................................|..................................e.................................. + // ldr q11, [x0, #(16*3)] // ...................................e.........|.................................................................e... + // trn1 v25.4s, v8.4s, v9.4s // .............................................*..................................................................... + // trn2 v26.4s, v8.4s, v9.4s // .............................................|*.................................................................... + // trn1 v27.4s, v10.4s, v11.4s // .......................................e.....|..................................................................... + // trn2 v28.4s, v10.4s, v11.4s // .............................................|.*................................................................... + // trn2 v10.2d, v25.2d, v27.2d // .............................................|..*.................................................................. + // trn2 v11.2d, v26.2d, v28.2d // .............................................|....*................................................................ + // trn1 v8.2d, v25.2d, v27.2d // .............................................|...*................................................................. + // trn1 v9.2d, v26.2d, v28.2d // .............................................|.....*............................................................... + // ldr q0, [x3], #(6*16) // .....................................e.......|...................................................................e. 
+ // ldr q4, [x3, #(-6*16 + 1*16)] // ........................................e....|..................................................................... + // ldr q1, [x3, #(-6*16 + 2*16)] // .........................................e...|..................................................................... + // ldr q5, [x3, #(-6*16 + 3*16)] // ..........................................e..|..................................................................... + // ldr q2, [x3, #(-6*16 + 4*16)] // ...........................................e.|..................................................................... + // ldr q6, [x3, #(-6*16 + 5*16)] // ............................................e|..................................................................... + // sub v24.4s, v8.4s, v9.4s // .............................................|........*............................................................ + // add v8.4s, v8.4s, v9.4s // .............................................|.........*........................................................... + // mul v9.4s, v24.4s, v1.4s // .............................................|...........*......................................................... + // sqrdmulh v24.4s, v24.4s, v5.4s // .............................................|............*........................................................ + // mls v9.4s, v24.4s, v29.4s // .............................................|................*.................................................... + // sub v24.4s, v10.4s, v11.4s // .............................................|......*.............................................................. + // add v10.4s, v10.4s, v11.4s // .............................................|.......*............................................................. + // mul v11.4s, v24.4s, v2.4s // .............................................|..........*.......................................................... 
+ // sqrdmulh v24.4s, v24.4s, v6.4s // .............................................|.............*....................................................... + // mls v11.4s, v24.4s, v29.4s // .............................................|.................*................................................... + // sub v24.4s, v8.4s, v10.4s // .............................................|..............*...................................................... + // add v8.4s, v8.4s, v10.4s // .............................................|...............*..................................................... + // mul v10.4s, v24.4s, v0.4s // .............................................|..................*.................................................. + // sqrdmulh v24.4s, v24.4s, v4.4s // .............................................|...................*................................................. + // mls v10.4s, v24.4s, v29.4s // .............................................|......................*.............................................. + // sub v24.4s, v9.4s, v11.4s // .............................................|.....................*............................................... + // add v9.4s, v9.4s, v11.4s // .............................................|.......................*............................................. + // mul v11.4s, v24.4s, v0.4s // .............................................|........................*............................................ + // sqrdmulh v24.4s, v24.4s, v4.4s // .............................................|.........................*........................................... + // mls v11.4s, v24.4s, v29.4s // .............................................|.............................*....................................... + // trn1 v25.4s, v8.4s, v9.4s // .............................................|..........................*.......................................... 
+ // trn2 v26.4s, v8.4s, v9.4s // .............................................|...........................*......................................... + // trn1 v27.4s, v10.4s, v11.4s // ..*..........................................|................................*.................................... + // trn2 v28.4s, v10.4s, v11.4s // ...*.........................................|.................................*................................... + // trn2 v10.2d, v25.2d, v27.2d // .....*.......................................|...................................*................................. + // trn2 v11.2d, v26.2d, v28.2d // ......*......................................|....................................*................................ + // trn1 v8.2d, v25.2d, v27.2d // .......*.....................................|.....................................*............................... + // trn1 v9.2d, v26.2d, v28.2d // ........*....................................|......................................*.............................. + // ldr q1, [x4], #8 // .............................................|....................*................................................ + // ldr q0, [x4], #16 // .............................................|............................*........................................ + // sub v24.4s, v8.4s, v9.4s // ...........*.................................|.........................................*........................... + // add v8.4s, v8.4s, v9.4s // ..........*..................................|........................................*............................ + // mul v9.4s, v24.4s, v0.s[0] // ..............*..............................|............................................*........................ + // sqrdmulh v24.4s, v24.4s, v0.s[1] // ...............*.............................|.............................................*....................... 
+ // mls v9.4s, v24.4s, v29.4s // ...................*.........................|.................................................*................... + // sub v24.4s, v10.4s, v11.4s // .........*...................................|.......................................*............................. + // add v10.4s, v10.4s, v11.4s // ............*................................|..........................................*.......................... + // mul v11.4s, v24.4s, v0.s[2] // .............*...............................|...........................................*......................... + // sqrdmulh v24.4s, v24.4s, v0.s[3] // ................*............................|..............................................*...................... + // mls v11.4s, v24.4s, v29.4s // ....................*........................|..................................................*.................. + // sub v24.4s, v8.4s, v10.4s // .................*...........................|...............................................*..................... + // add v8.4s, v8.4s, v10.4s // ..................*..........................|................................................*.................... + // mul v10.4s, v24.4s, v1.s[0] // .....................*.......................|...................................................*................. + // sqrdmulh v24.4s, v24.4s, v1.s[1] // ......................*......................|....................................................*................ + // mls v10.4s, v24.4s, v29.4s // ..........................*..................|........................................................*............ + // sub v24.4s, v9.4s, v11.4s // ........................*....................|......................................................*.............. + // add v9.4s, v9.4s, v11.4s // .........................*...................|.......................................................*............. 
+ // mul v11.4s, v24.4s, v1.s[0] // ...........................*.................|.........................................................*........... + // sqrdmulh v24.4s, v24.4s, v1.s[1] // ............................*................|..........................................................*.......... + // mls v11.4s, v24.4s, v29.4s // ................................*............|..............................................................*...... + // srshr v24.4S, v8.4S, #23 // .......................*.....................|.....................................................*............... + // mls v8.4s, v24.4s, v29.4s // .............................*...............|...........................................................*......... + // srshr v24.4S, v9.4S, #23 // ..............................*..............|............................................................*........ + // mls v9.4s, v24.4s, v29.4s // .................................*...........|...............................................................*..... + // str q8, [x0], #(16*4) // ..................................*..........|................................................................*.... + // str q9, [x0, #(-16*4 + 1*16)] // ....................................*........|..................................................................*.. + // str q10, [x0, #(-16*4 + 2*16)] // ...............................*.............|.............................................................*....... + // str q11, [x0, #(-16*4 + 3*16)] // ......................................*......|....................................................................* + + sub count, count, #1 + cbnz count, layer5678_start + trn1 v9.4S, v24.4S, v6.4S // *................................................................ + // gap // ................................................................. + trn2 v17.4S, v2.4S, v14.4S // ..*.............................................................. 
+ // gap // ................................................................. + trn2 v21.4S, v24.4S, v6.4S // .*............................................................... + // gap // ................................................................. + trn1 v16.2D, v9.2D, v3.2D // ....*............................................................ + // gap // ................................................................. + trn2 v30.2D, v9.2D, v3.2D // ...*............................................................. + // gap // ................................................................. + trn1 v0.2D, v21.2D, v17.2D // ......*.......................................................... + // gap // ................................................................. + trn2 v24.2D, v21.2D, v17.2D // .....*........................................................... + // gap // ................................................................. + sub v6.4S, v16.4S, v0.4S // .........*....................................................... + // gap // ................................................................. + sub v14.4S, v30.4S, v24.4S // .......*......................................................... + // gap // ................................................................. + add v23.4S, v30.4S, v24.4S // ........*........................................................ + // gap // ................................................................. + sqrdmulh v21.4S, v6.4S, v1.4S // .............*................................................... + // gap // ................................................................. + sqrdmulh v17.4S, v14.4S, v13.4S // ..............*.................................................. + // gap // ................................................................. + mul v2.4S, v14.4S, v15.4S // ...........*..................................................... 
+ // gap // ................................................................. + mul v14.4S, v6.4S, v28.4S // ............*.................................................... + // gap // ................................................................. + add v8.4S, v16.4S, v0.4S // ..........*...................................................... + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + mls v2.4S, v17.4S, v29.4S // ..................*.............................................. + // gap // ................................................................. + mls v14.4S, v21.4S, v29.4S // .................*............................................... + // gap // ................................................................. + sub v17.4S, v8.4S, v23.4S // ...............*................................................. + // gap // ................................................................. + ldr q4, [x4], #8 // .....................*........................................... + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + sub v28.4S, v14.4S, v2.4S // ......................*.......................................... + // gap // ................................................................. + sqrdmulh v16.4S, v17.4S, v26.4S // ....................*............................................ + // gap // ................................................................. + mul v20.4S, v17.4S, v31.4S // ...................*............................................. + // gap // ................................................................. 
+ sqrdmulh v17.4S, v28.4S, v26.4S // ..........................*...................................... + // gap // ................................................................. + mul v3.4S, v28.4S, v31.4S // .........................*....................................... + // gap // ................................................................. + add v24.4S, v14.4S, v2.4S // ........................*........................................ + // gap // ................................................................. + add v28.4S, v8.4S, v23.4S // ................*................................................ + // gap // ................................................................. + mls v20.4S, v16.4S, v29.4S // .......................*......................................... + // gap // ................................................................. + mls v3.4S, v17.4S, v29.4S // ..............................*.................................. + // gap // ................................................................. + trn2 v2.4S, v28.4S, v24.4S // ............................*.................................... + // gap // ................................................................. + ldr q21, [x4], #16 // .............................*................................... + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + trn2 v14.4S, v20.4S, v3.4S // ................................*................................ + // gap // ................................................................. + trn1 v17.4S, v20.4S, v3.4S // ...............................*................................. + // gap // ................................................................. 
+ trn1 v3.4S, v28.4S, v24.4S // ...........................*..................................... + // gap // ................................................................. + trn1 v30.2D, v2.2D, v14.2D // ....................................*............................ + // gap // ................................................................. + trn2 v14.2D, v2.2D, v14.2D // ..................................*.............................. + // gap // ................................................................. + trn2 v28.2D, v3.2D, v17.2D // .................................*............................... + // gap // ................................................................. + trn1 v11.2D, v3.2D, v17.2D // ...................................*............................. + // gap // ................................................................. + sub v16.4S, v28.4S, v14.4S // .....................................*........................... + // gap // ................................................................. + add v15.4S, v28.4S, v14.4S // ........................................*........................ + // gap // ................................................................. + add v19.4S, v11.4S, v30.4S // ......................................*.......................... + // gap // ................................................................. + mul v31.4S, v16.4S, v21.S[2] // .........................................*....................... + // gap // ................................................................. + sub v30.4S, v11.4S, v30.4S // .......................................*......................... + // gap // ................................................................. + sqrdmulh v14.4S, v16.4S, v21.S[3] // ............................................*.................... + // gap // ................................................................. 
+ add v28.4S, v19.4S, v15.4S // ..............................................*.................. + // gap // ................................................................. + sqrdmulh v2.4S, v30.4S, v21.S[1] // ...........................................*..................... + // gap // ................................................................. + mul v6.4S, v30.4S, v21.S[0] // ..........................................*...................... + // gap // ................................................................. + srshr v17.4S, v28.4S, #23 // ...................................................*............. + // gap // ................................................................. + sub v21.4S, v19.4S, v15.4S // .............................................*................... + // gap // ................................................................. + mls v31.4S, v14.4S, v29.4S // ................................................*................ + // gap // ................................................................. + mls v6.4S, v2.4S, v29.4S // ...............................................*................. + // gap // ................................................................. + sqrdmulh v3.4S, v21.4S, v4.S[1] // ..................................................*.............. + // gap // ................................................................. + mul v24.4S, v21.4S, v4.S[0] // .................................................*............... + // gap // ................................................................. + mls v28.4S, v17.4S, v29.4S // .........................................................*....... + // gap // ................................................................. + sub v21.4S, v6.4S, v31.4S // ....................................................*............ + // gap // ................................................................. 
+ add v2.4S, v6.4S, v31.4S // .....................................................*........... + // gap // ................................................................. + mls v24.4S, v3.4S, v29.4S // ......................................................*.......... + // gap // ................................................................. + mul v14.4S, v21.4S, v4.S[0] // .......................................................*......... + // gap // ................................................................. + srshr v17.4S, v2.4S, #23 // ..........................................................*...... + // gap // ................................................................. + sqrdmulh v21.4S, v21.4S, v4.S[1] // ........................................................*........ + // gap // ................................................................. + str q24, [x0, #32] // ...........................................................*..... + // gap // ................................................................. + mls v2.4S, v17.4S, v29.4S // .............................................................*... + // gap // ................................................................. + str q28, [x0], #(16*4) // ..............................................................*.. + // gap // ................................................................. + mls v14.4S, v21.4S, v29.4S // ............................................................*.... + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + str q2, [x0, #-48] // ...............................................................*. + // gap // ................................................................. + // gap // ................................................................. 
+ // gap // ................................................................. + str q14, [x0, #-16] // ................................................................* + // gap // ................................................................. + + // original source code + // trn1 v17.4S, v24.4S, v6.4S // *................................................................ + // trn2 v21.4S, v24.4S, v6.4S // ..*.............................................................. + // trn2 v14.4S, v2.4S, v14.4S // .*............................................................... + // trn2 v2.2D, v17.2D, v3.2D // ....*............................................................ + // trn1 v17.2D, v17.2D, v3.2D // ...*............................................................. + // trn2 v3.2D, v21.2D, v14.2D // ......*.......................................................... + // trn1 v21.2D, v21.2D, v14.2D // .....*........................................................... + // sub v14.4S, v2.4S, v3.4S // ........*........................................................ + // add v2.4S, v2.4S, v3.4S // .........*....................................................... + // sub v3.4S, v17.4S, v21.4S // .......*......................................................... + // add v17.4S, v17.4S, v21.4S // ..............*.................................................. + // mul v21.4S, v14.4S, v15.4S // ............*.................................................... + // mul v24.4S, v3.4S, v28.4S // .............*................................................... + // sqrdmulh v3.4S, v3.4S, v1.4S // ..........*...................................................... + // sqrdmulh v14.4S, v14.4S, v13.4S // ...........*..................................................... + // sub v28.4S, v17.4S, v2.4S // .................*............................................... + // add v17.4S, v17.4S, v2.4S // .........................*....................................... 
+ // mls v24.4S, v3.4S, v29.4S // ................*................................................ + // mls v21.4S, v14.4S, v29.4S // ...............*................................................. + // mul v14.4S, v28.4S, v31.4S // .....................*........................................... + // sqrdmulh v2.4S, v28.4S, v26.4S // ....................*............................................ + // ldr q3, [x4], #8 // ..................*.............................................. + // sub v28.4S, v24.4S, v21.4S // ...................*............................................. + // mls v14.4S, v2.4S, v29.4S // ..........................*...................................... + // add v21.4S, v24.4S, v21.4S // ........................*........................................ + // mul v2.4S, v28.4S, v31.4S // .......................*......................................... + // sqrdmulh v24.4S, v28.4S, v26.4S // ......................*.......................................... + // trn1 v28.4S, v17.4S, v21.4S // ................................*................................ + // trn2 v17.4S, v17.4S, v21.4S // ............................*.................................... + // ldr q21, [x4], #16 // .............................*................................... + // mls v2.4S, v24.4S, v29.4S // ...........................*..................................... + // trn1 v31.4S, v14.4S, v2.4S // ...............................*................................. + // trn2 v14.4S, v14.4S, v2.4S // ..............................*.................................. + // trn2 v26.2D, v28.2D, v31.2D // ...................................*............................. + // trn2 v1.2D, v17.2D, v14.2D // ..................................*.............................. + // trn1 v28.2D, v28.2D, v31.2D // ....................................*............................ 
+ // trn1 v17.2D, v17.2D, v14.2D // .................................*............................... + // sub v14.4S, v26.4S, v1.4S // .....................................*........................... + // add v31.4S, v28.4S, v17.4S // .......................................*......................... + // sub v17.4S, v28.4S, v17.4S // .........................................*....................... + // add v28.4S, v26.4S, v1.4S // ......................................*.......................... + // mul v26.4S, v14.4S, v21.S[2] // ........................................*........................ + // mul v1.4S, v17.4S, v21.S[0] // .............................................*................... + // sqrdmulh v17.4S, v17.4S, v21.S[1] // ............................................*.................... + // sqrdmulh v21.4S, v14.4S, v21.S[3] // ..........................................*...................... + // sub v14.4S, v31.4S, v28.4S // ...............................................*................. + // add v28.4S, v31.4S, v28.4S // ...........................................*..................... + // mls v1.4S, v17.4S, v29.4S // .................................................*............... + // mls v26.4S, v21.4S, v29.4S // ................................................*................ + // mul v17.4S, v14.4S, v3.S[0] // ...................................................*............. + // sqrdmulh v21.4S, v14.4S, v3.S[1] // ..................................................*.............. + // srshr v14.4S, v28.4S, #23 // ..............................................*.................. + // sub v31.4S, v1.4S, v26.4S // .....................................................*........... + // add v26.4S, v1.4S, v26.4S // ......................................................*.......... + // mls v17.4S, v21.4S, v29.4S // .......................................................*......... 
+ // mul v21.4S, v31.4S, v3.S[0] // ........................................................*........ + // sqrdmulh v3.4S, v31.4S, v3.S[1] // ..........................................................*...... + // mls v28.4S, v14.4S, v29.4S // ....................................................*............ + // srshr v14.4S, v26.4S, #23 // .........................................................*....... + // str q17, [x0, #32] // ...........................................................*..... + // mls v21.4S, v3.4S, v29.4S // ..............................................................*.. + // mls v26.4S, v14.4S, v29.4S // ............................................................*.... + // str q28, [x0], #(16*4) // .............................................................*... + // str q26, [x0, #-48] // ...............................................................*. + // str q21, [x0, #-16] // ................................................................* + // Drop the register aliases listed below with .unreq so these names can no longer be used past this point. + .unreq root0_tw + .unreq root1_tw + .unreq root2_tw + .unreq root3_tw + .unreq qform_root0_tw + .unreq qform_root1_tw + .unreq qform_root2_tw + .unreq qform_root3_tw + .unreq t0 + .unreq t1 + // Bind new aliases: root4..root7 name v4..v7 and qform_root4..qform_root7 are the q-register views of the same registers; ninv, ninv_tw, modulus_half and neg_modulus_half name v25, v26, v30 and v31. + root4 .req v4 + root5 .req v5 + root6 .req v6 + root7 .req v7 + qform_root4 .req q4 + qform_root5 .req q5 + qform_root6 .req q6 + qform_root7 .req q7 + ninv .req v25 + ninv_tw .req v26 + modulus_half .req v30 + neg_modulus_half .req v31 + + // Reset state for the next pass: restore is a macro defined outside this chunk (presumably it reloads in from the STACK0 slot, TODO confirm); count is then set to 4. + restore in, STACK0 + mov count, #4 + // Broadcast the 32-bit values at ninv_addr and ninv_tw_addr into all four lanes of ninv and ninv_tw (ld1r). ASM_LOAD is defined outside this chunk; presumably it places the address of the symbol in xtmp, TODO confirm. + ASM_LOAD(xtmp, ninv_addr) + ld1r {ninv.4s}, [xtmp] + ASM_LOAD(xtmp, ninv_tw_addr) + ld1r {ninv_tw.4s}, [xtmp] + // modulus_half = modulus >> 1 in each 32-bit lane (unsigned shift); neg_modulus_half is its lane-wise negation. + ushr modulus_half.4S, modulus.4S, #1 + neg neg_modulus_half.4S, modulus_half.4S + // load_roots_1234 is a macro defined outside this chunk; presumably it loads the twiddle factors for the next layers through r_ptr1, TODO confirm. + load_roots_1234 r_ptr1 + // Align the code that follows to a 4-byte boundary. + .p2align 2 + ldr q15, [x1, #832] //
.............*.......................................................................................................................................................................................................................................................................... + ldr q28, [x1, #768] // ............*........................................................................................................................................................................................................................................................................... + ldr q10, [x1, #640] // ..........*............................................................................................................................................................................................................................................................................. + sub v19.4S, v28.4S, v15.4S // ..............................................*......................................................................................................................................................................................................................................... + ldr q21, [x1, #512] // ........*............................................................................................................................................................................................................................................................................... + ldr q17, [x1, #576] // .........*.............................................................................................................................................................................................................................................................................. 
+ ldr q20, [x1, #704] // ...........*............................................................................................................................................................................................................................................................................ + sub v18.4S, v21.4S, v17.4S // ....................................*................................................................................................................................................................................................................................................... + mul v12.4S, v19.4S, v6.S[2] // ................................................*....................................................................................................................................................................................................................................... + sub v23.4S, v10.4S, v20.4S // .........................................*.............................................................................................................................................................................................................................................. + sqrdmulh v13.4S, v18.4S, v5.S[3] // .......................................*................................................................................................................................................................................................................................................ + mul v27.4S, v18.4S, v5.S[2] // ......................................*................................................................................................................................................................................................................................................. 
+ sqrdmulh v22.4S, v23.4S, v6.S[1] // ............................................*........................................................................................................................................................................................................................................... + mul v11.4S, v23.4S, v6.S[0] // ...........................................*............................................................................................................................................................................................................................................ + ldr q23, [x1, #320] // .....*.................................................................................................................................................................................................................................................................................. + mls v27.4S, v13.4S, v29.4S // ........................................*............................................................................................................................................................................................................................................... + mls v11.4S, v22.4S, v29.4S // .............................................*.......................................................................................................................................................................................................................................... + ldr q18, [x1, #256] // ....*................................................................................................................................................................................................................................................................................... 
+ sqrdmulh v24.4S, v19.4S, v6.S[3] // .................................................*...................................................................................................................................................................................................................................... + sub v22.4S, v27.4S, v11.4S // .................................................................................*...................................................................................................................................................................................................... + add v14.4S, v18.4S, v23.4S // ...........................*............................................................................................................................................................................................................................................................ + sub v23.4S, v18.4S, v23.4S // ..........................*............................................................................................................................................................................................................................................................. + mul v19.4S, v22.4S, v2.S[2] // ...................................................................................*.................................................................................................................................................................................................... + sqrdmulh v22.4S, v22.4S, v2.S[3] // ....................................................................................*................................................................................................................................................................................................... 
+ sqrdmulh v8.4S, v23.4S, v4.S[3] // .............................*.......................................................................................................................................................................................................................................................... + mul v18.4S, v23.4S, v4.S[2] // ............................*........................................................................................................................................................................................................................................................... + ldr q13, [x1, #960] // ...............*........................................................................................................................................................................................................................................................................ + add v16.4S, v28.4S, v15.4S // ...............................................*........................................................................................................................................................................................................................................ + add v23.4S, v21.4S, v17.4S // .....................................*.................................................................................................................................................................................................................................................. + ldr q17, [x1, #896] // ..............*......................................................................................................................................................................................................................................................................... 
+ ldr q28, [x1, #384] // ......*................................................................................................................................................................................................................................................................................. + add v9.4S, v17.4S, v13.4S // ....................................................*................................................................................................................................................................................................................................... + sub v13.4S, v17.4S, v13.4S // ...................................................*.................................................................................................................................................................................................................................... + mls v18.4S, v8.4S, v29.4S // ..............................*......................................................................................................................................................................................................................................................... + sub v15.4S, v16.4S, v9.4S // ......................................................................................*................................................................................................................................................................................................. + mul v21.4S, v13.4S, v7.S[0] // .....................................................*.................................................................................................................................................................................................................................. 
+ sqrdmulh v8.4S, v13.4S, v7.S[1] // ......................................................*................................................................................................................................................................................................................................. + sqrdmulh v17.4S, v15.4S, v3.S[1] // .........................................................................................*.............................................................................................................................................................................................. + mul v15.4S, v15.4S, v3.S[0] // ........................................................................................*............................................................................................................................................................................................... + mls v12.4S, v24.4S, v29.4S // ..................................................*..................................................................................................................................................................................................................................... + mls v21.4S, v8.4S, v29.4S // .......................................................*................................................................................................................................................................................................................................ + add v20.4S, v10.4S, v20.4S // ..........................................*............................................................................................................................................................................................................................................. 
+ mls v15.4S, v17.4S, v29.4S // ..........................................................................................*............................................................................................................................................................................................. + add v27.4S, v27.4S, v11.4S // ..................................................................................*..................................................................................................................................................................................................... + add v24.4S, v12.4S, v21.4S // ............................................................................................*........................................................................................................................................................................................... + ldr q11, [x1, #448] // .......*................................................................................................................................................................................................................................................................................ + add v17.4S, v27.4S, v24.4S // ..........................................................................................................................*............................................................................................................................................................. + sub v10.4S, v27.4S, v24.4S // .........................................................................................................................*.............................................................................................................................................................. 
+ sub v27.4S, v28.4S, v11.4S // ...............................*........................................................................................................................................................................................................................................................ + add v24.4S, v28.4S, v11.4S // ................................*....................................................................................................................................................................................................................................................... + add v13.4S, v16.4S, v9.4S // .......................................................................................*................................................................................................................................................................................................ + sqrdmulh v9.4S, v10.4S, v1.S[1] // ............................................................................................................................*........................................................................................................................................................... + sub v16.4S, v14.4S, v24.4S // ..................................................................*..................................................................................................................................................................................................................... + sqrdmulh v11.4S, v27.4S, v5.S[1] // ..................................*..................................................................................................................................................................................................................................................... 
+ mul v8.4S, v27.4S, v5.S[0] // .................................*...................................................................................................................................................................................................................................................... + mul v27.4S, v10.4S, v1.S[0] // ...........................................................................................................................*............................................................................................................................................................ + sub v10.4S, v23.4S, v20.4S // ............................................................................*........................................................................................................................................................................................................... + mls v8.4S, v11.4S, v29.4S // ...................................*.................................................................................................................................................................................................................................................... + sqrdmulh v11.4S, v16.4S, v2.S[1] // .....................................................................*.................................................................................................................................................................................................................. + mul v28.4S, v16.4S, v2.S[0] // ....................................................................*................................................................................................................................................................................................................... 
+ add v24.4S, v14.4S, v24.4S // ...................................................................*.................................................................................................................................................................................................................... + mls v27.4S, v9.4S, v29.4S // .............................................................................................................................*.......................................................................................................................................................... + sub v9.4S, v18.4S, v8.4S // .......................................................................*................................................................................................................................................................................................................ + ldr q16, [x1, #128] // ..*..................................................................................................................................................................................................................................................................................... + sub v14.4S, v12.4S, v21.4S // ...........................................................................................*............................................................................................................................................................................................ + ldr q12, [x1, #192] // ...*.................................................................................................................................................................................................................................................................................... 
+ add v21.4S, v23.4S, v20.4S // .............................................................................*.......................................................................................................................................................................................................... + sqrdmulh v23.4S, v10.4S, v2.S[3] // ...............................................................................*........................................................................................................................................................................................................ + sub v20.4S, v16.4S, v12.4S // .....................*.................................................................................................................................................................................................................................................................. + add v16.4S, v16.4S, v12.4S // ......................*................................................................................................................................................................................................................................................................. + mls v19.4S, v22.4S, v29.4S // .....................................................................................*.................................................................................................................................................................................................. + mul v22.4S, v9.4S, v2.S[0] // .........................................................................*.............................................................................................................................................................................................................. 
+ sub v12.4S, v21.4S, v13.4S // ....................................................................................................................*................................................................................................................................................................... + add v18.4S, v18.4S, v8.4S // ........................................................................*............................................................................................................................................................................................................... + mul v8.4S, v20.4S, v4.S[0] // .......................*................................................................................................................................................................................................................................................................ + sqrdmulh v20.4S, v20.4S, v4.S[1] // ........................*............................................................................................................................................................................................................................................................... + add v13.4S, v21.4S, v13.4S // .....................................................................................................................*.................................................................................................................................................................. + sqrdmulh v21.4S, v9.4S, v2.S[1] // ..........................................................................*............................................................................................................................................................................................................. 
+ sqrdmulh v9.4S, v12.4S, v1.S[1] // .......................................................................................................................*................................................................................................................................................................ + mls v8.4S, v20.4S, v29.4S // .........................*.............................................................................................................................................................................................................................................................. + mul v20.4S, v12.4S, v1.S[0] // ......................................................................................................................*................................................................................................................................................................. + ldr q12, [x1, #0] // *....................................................................................................................................................................................................................................................................................... + mls v28.4S, v11.4S, v29.4S // ......................................................................*................................................................................................................................................................................................................. + ldr q11, [x1, #64] // .*...................................................................................................................................................................................................................................................................................... 
+ mls v20.4S, v9.4S, v29.4S // ........................................................................................................................*............................................................................................................................................................... + sub v9.4S, v12.4S, v11.4S // ................*....................................................................................................................................................................................................................................................................... + mul v10.4S, v10.4S, v2.S[2] // ..............................................................................*......................................................................................................................................................................................................... + add v11.4S, v12.4S, v11.4S // .................*...................................................................................................................................................................................................................................................................... + sqrdmulh v12.4S, v9.4S, v3.S[3] // ...................*.................................................................................................................................................................................................................................................................... + mul v9.4S, v9.4S, v3.S[2] // ..................*..................................................................................................................................................................................................................................................................... 
+ mls v10.4S, v23.4S, v29.4S // ................................................................................*....................................................................................................................................................................................................... + add v23.4S, v11.4S, v16.4S // .........................................................*.............................................................................................................................................................................................................................. + mls v22.4S, v21.4S, v29.4S // ...........................................................................*............................................................................................................................................................................................................ + mls v9.4S, v12.4S, v29.4S // ....................*................................................................................................................................................................................................................................................................... + sub v21.4S, v23.4S, v24.4S // ................................................................................................*....................................................................................................................................................................................... + add v24.4S, v23.4S, v24.4S // .................................................................................................*...................................................................................................................................................................................... 
+ sub v23.4S, v11.4S, v16.4S // ........................................................*............................................................................................................................................................................................................................... + sub v16.4S, v9.4S, v8.4S // .............................................................*.......................................................................................................................................................................................................................... + add v9.4S, v9.4S, v8.4S // ..............................................................*......................................................................................................................................................................................................................... + mul v12.4S, v21.4S, v0.S[2] // ..................................................................................................*..................................................................................................................................................................................... + sqrdmulh v8.4S, v16.4S, v1.S[3] // ................................................................*....................................................................................................................................................................................................................... + mul v16.4S, v16.4S, v1.S[2] // ...............................................................*........................................................................................................................................................................................................................ 
+ add v11.4S, v9.4S, v18.4S // ......................................................................................................*................................................................................................................................................................................. + sub v18.4S, v9.4S, v18.4S // .....................................................................................................*.................................................................................................................................................................................. + sqrdmulh v9.4S, v21.4S, v0.S[3] // ...................................................................................................*.................................................................................................................................................................................... + mls v16.4S, v8.4S, v29.4S // .................................................................*...................................................................................................................................................................................................................... + mul v21.4S, v18.4S, v0.S[2] // .......................................................................................................*................................................................................................................................................................................ + sqrdmulh v8.4S, v18.4S, v0.S[3] // ........................................................................................................*............................................................................................................................................................................... 
+ mls v12.4S, v9.4S, v29.4S // ....................................................................................................*................................................................................................................................................................................... + sub v18.4S, v11.4S, v17.4S // .............................................................................................................................................*.......................................................................................................................................... + add v9.4S, v11.4S, v17.4S // ..............................................................................................................................................*......................................................................................................................................... + mls v21.4S, v8.4S, v29.4S // .........................................................................................................*.............................................................................................................................................................................. + sqrdmulh v11.4S, v23.4S, v1.S[3] // ...........................................................*............................................................................................................................................................................................................................ + mul v8.4S, v23.4S, v1.S[2] // ..........................................................*............................................................................................................................................................................................................................. 
+ mul v23.4S, v18.4S, v0.S[0] // ...............................................................................................................................................*........................................................................................................................................ + sqrdmulh v17.4S, v14.4S, v3.S[1] // ..............................................................................................*......................................................................................................................................................................................... + mul v14.4S, v14.4S, v3.S[0] // .............................................................................................*.......................................................................................................................................................................................... + mls v8.4S, v11.4S, v29.4S // ............................................................*........................................................................................................................................................................................................................... + sqrdmulh v11.4S, v18.4S, v0.S[1] // ................................................................................................................................................*....................................................................................................................................... + add v18.4S, v10.4S, v15.4S // ...............................................................................................................................*........................................................................................................................................................ 
+ mls v14.4S, v17.4S, v29.4S // ...............................................................................................*........................................................................................................................................................................................ + sub v17.4S, v8.4S, v28.4S // ..........................................................................................................*............................................................................................................................................................................. + add v28.4S, v8.4S, v28.4S // ...........................................................................................................*............................................................................................................................................................................ + sub v15.4S, v10.4S, v15.4S // ..............................................................................................................................*......................................................................................................................................................... + add v10.4S, v19.4S, v14.4S // ....................................................................................................................................*................................................................................................................................................... + add v8.4S, v21.4S, v27.4S // ..................................................................................................................................................................*..................................................................................................................... 
+ sub v21.4S, v21.4S, v27.4S // .................................................................................................................................................................*...................................................................................................................... + sub v19.4S, v19.4S, v14.4S // ...................................................................................................................................*.................................................................................................................................................... + mul v14.4S, v15.4S, v1.S[0] // ................................................................................................................................*....................................................................................................................................................... + sqrdmulh v27.4S, v15.4S, v1.S[1] // .................................................................................................................................*...................................................................................................................................................... + mul v15.4S, v21.4S, v0.S[0] // ...................................................................................................................................................................*.................................................................................................................... + sqrdmulh v21.4S, v21.4S, v0.S[1] // ....................................................................................................................................................................*................................................................................................................... 
+ mls v23.4S, v11.4S, v29.4S // .................................................................................................................................................*...................................................................................................................................... + sqrdmulh v11.4S, v19.4S, v1.S[1] // ......................................................................................................................................*................................................................................................................................................. + mls v14.4S, v27.4S, v29.4S // ..................................................................................................................................*..................................................................................................................................................... + mls v15.4S, v21.4S, v29.4S // .....................................................................................................................................................................*.................................................................................................................. + cmge v21.4S, v31.4S, v23.4S // ....................................................................................................................................................................................*................................................................................................... + cmge v27.4S, v23.4S, v30.4S // .....................................................................................................................................................................................*.................................................................................................. 
+ mul v19.4S, v19.4S, v1.S[0] // .....................................................................................................................................*.................................................................................................................................................. + sub v27.4S, v21.4S, v27.4S // ......................................................................................................................................................................................*................................................................................................. + sub count, count, #1 +layer1234_start: + mul v21.4S, v8.4S, v25.4S // .......................................................................................................................................................................................................................................*................................................ + sqrdmulh v8.4S, v8.4S, v26.4S // ........................................................................................................................................................................................................................................*............................................... + mls v19.4S, v11.4S, v29.4S // .......................................................................................................................................*................................................................................................................................................ + sub v11.4S, v28.4S, v18.4S // ..................................................................................................................................................*..................................................................................................................................... 
+ mls v23.4S, v27.4S, v29.4S // .......................................................................................................................................................................................*................................................................................................ + mls v21.4S, v8.4S, v29.4S // .........................................................................................................................................................................................................................................*.............................................. + mul v8.4S, v17.4S, v0.S[2] // ............................................................................................................*........................................................................................................................................................................... + sqrdmulh v27.4S, v17.4S, v0.S[3] // .............................................................................................................*.......................................................................................................................................................................... + add v28.4S, v28.4S, v18.4S // ...................................................................................................................................................*.................................................................................................................................... + cmge v18.4S, v31.4S, v21.4S // ....................................................................................................................................................................................................................................................................*................... 
+ cmge v17.4S, v21.4S, v30.4S // .....................................................................................................................................................................................................................................................................*.................. + mls v8.4S, v27.4S, v29.4S // ..............................................................................................................*......................................................................................................................................................................... + sub v18.4S, v18.4S, v17.4S // ......................................................................................................................................................................................................................................................................*................. + sqrdmulh v17.4S, v11.4S, v0.S[1] // .....................................................................................................................................................*.................................................................................................................................. + cmge v27.4S, v31.4S, v15.4S // ....................................................................................................................................................................................................*................................................................................... + mul v11.4S, v11.4S, v0.S[0] // ....................................................................................................................................................*................................................................................................................................... 
+ str q23, [x1, #576] // .................................................................................................................................................................................................................*...................................................................... + add v23.4S, v16.4S, v22.4S // ................................................................................................................*....................................................................................................................................................................... + sub v22.4S, v16.4S, v22.4S // ...............................................................................................................*........................................................................................................................................................................ + mls v11.4S, v17.4S, v29.4S // ......................................................................................................................................................*................................................................................................................................. + add v16.4S, v12.4S, v20.4S // .............................................................................................................................................................*.......................................................................................................................... + mul v17.4S, v22.4S, v0.S[2] // .................................................................................................................*...................................................................................................................................................................... 
+ sub v12.4S, v12.4S, v20.4S // ............................................................................................................................................................*........................................................................................................................... + add v20.4S, v8.4S, v14.4S // .......................................................................................................................................................................*................................................................................................................ + sub v14.4S, v8.4S, v14.4S // ......................................................................................................................................................................*................................................................................................................. + mls v21.4S, v18.4S, v29.4S // .......................................................................................................................................................................................................................................................................*................ + sqrdmulh v8.4S, v20.4S, v26.4S // ...........................................................................................................................................................................................................................................*............................................ + mul v18.4S, v20.4S, v25.4S // ..........................................................................................................................................................................................................................................*............................................. 
+ sub v20.4S, v24.4S, v13.4S // ........................................................................................................................................*............................................................................................................................................... + add v24.4S, v24.4S, v13.4S // .........................................................................................................................................*.............................................................................................................................................. + mls v18.4S, v8.4S, v29.4S // ............................................................................................................................................................................................................................................*........................................... + mul v8.4S, v20.4S, v0.S[0] // ..........................................................................................................................................*............................................................................................................................................. + sqrdmulh v20.4S, v20.4S, v0.S[1] // ...........................................................................................................................................*............................................................................................................................................ + sqrdmulh v22.4S, v22.4S, v0.S[3] // ..................................................................................................................*..................................................................................................................................................................... 
+ cmge v13.4S, v31.4S, v18.4S // ........................................................................................................................................................................................................................................................................*............... + str q21, [x1, #320] // .....................................................................................................................................................................................................................................................................................*.. + mls v8.4S, v20.4S, v29.4S // ............................................................................................................................................*........................................................................................................................................... + cmge v20.4S, v18.4S, v30.4S // .........................................................................................................................................................................................................................................................................*.............. + mls v17.4S, v22.4S, v29.4S // ...................................................................................................................*.................................................................................................................................................................... + sub v20.4S, v13.4S, v20.4S // ..........................................................................................................................................................................................................................................................................*............. 
+ cmge v21.4S, v31.4S, v8.4S // ................................................................................................................................................................................*....................................................................................................... + cmge v13.4S, v8.4S, v30.4S // .................................................................................................................................................................................*...................................................................................................... + mls v18.4S, v20.4S, v29.4S // ...........................................................................................................................................................................................................................................................................*............ + sub v21.4S, v21.4S, v13.4S // ..................................................................................................................................................................................*..................................................................................................... + sqrdmulh v13.4S, v9.4S, v26.4S // ............................................................................................................................................................................................................................*........................................................... + mul v20.4S, v9.4S, v25.4S // ...........................................................................................................................................................................................................................*............................................................ 
+ str q18, [x1, #384] // ......................................................................................................................................................................................................................................................................................*. + mul v18.4S, v24.4S, v25.4S // ........................................................................................................................................................................................................................*............................................................... + sqrdmulh v22.4S, v24.4S, v26.4S // .........................................................................................................................................................................................................................*.............................................................. + mls v20.4S, v13.4S, v29.4S // .............................................................................................................................................................................................................................*.......................................................... + add v9.4S, v23.4S, v10.4S // ........................................................................................................................................................*............................................................................................................................... + mul v24.4S, v28.4S, v25.4S // ..............................................................................................................................................................................................................................*......................................................... 
+ mls v18.4S, v22.4S, v29.4S // ..........................................................................................................................................................................................................................*............................................................. + mls v8.4S, v21.4S, v29.4S // ...................................................................................................................................................................................*.................................................................................................... + cmge v21.4S, v11.4S, v30.4S // .........................................................................................................................................................................................*.............................................................................................. + cmge v13.4S, v20.4S, v30.4S // .....................................................................................................................................................................................................................................................*.................................. + cmge v22.4S, v15.4S, v30.4S // .....................................................................................................................................................................................................*.................................................................................. + sub v23.4S, v23.4S, v10.4S // .......................................................................................................................................................*................................................................................................................................ 
+ sub v22.4S, v27.4S, v22.4S // ......................................................................................................................................................................................................*................................................................................. + sqrdmulh v10.4S, v12.4S, v0.S[1] // ...............................................................................................................................................................*........................................................................................................................ + cmge v27.4S, v31.4S, v20.4S // ....................................................................................................................................................................................................................................................*................................... + mls v15.4S, v22.4S, v29.4S // .......................................................................................................................................................................................................*................................................................................ + sub v22.4S, v27.4S, v13.4S // ......................................................................................................................................................................................................................................................*................................. + sqrdmulh v28.4S, v28.4S, v26.4S // ...............................................................................................................................................................................................................................*........................................................ 
+ add v27.4S, v17.4S, v19.4S // ............................................................................................................................................................................*........................................................................................................... + mls v20.4S, v22.4S, v29.4S // .......................................................................................................................................................................................................................................................*................................ + sqrdmulh v13.4S, v23.4S, v0.S[1] // ..........................................................................................................................................................*............................................................................................................................. + mls v24.4S, v28.4S, v29.4S // ................................................................................................................................................................................................................................*....................................................... + mul v28.4S, v27.4S, v25.4S // .............................................................................................................................................................................................................................................*.......................................... + str q20, [x1, #64] // .................................................................................................................................................................................................................................................................................*...... 
+ mul v20.4S, v23.4S, v0.S[0] // .........................................................................................................................................................*.............................................................................................................................. + sqrdmulh v22.4S, v27.4S, v26.4S // ..............................................................................................................................................................................................................................................*......................................... + cmge v23.4S, v24.4S, v30.4S // .........................................................................................................................................................................................................................................................*.............................. + cmge v27.4S, v31.4S, v24.4S // ........................................................................................................................................................................................................................................................*............................... + mls v20.4S, v13.4S, v29.4S // ...........................................................................................................................................................*............................................................................................................................ + mls v28.4S, v22.4S, v29.4S // ...............................................................................................................................................................................................................................................*........................................ 
+ sub v13.4S, v27.4S, v23.4S // ..........................................................................................................................................................................................................................................................*............................. + sub v17.4S, v17.4S, v19.4S // ...........................................................................................................................................................................*............................................................................................................ + str q8, [x1, #512] // ................................................................................................................................................................................................................*....................................................................... + cmge v23.4S, v31.4S, v28.4S // ............................................................................................................................................................................................................................................................................*........... + cmge v19.4S, v28.4S, v30.4S // .............................................................................................................................................................................................................................................................................*.......... + sqrdmulh v22.4S, v17.4S, v0.S[1] // ..............................................................................................................................................................................*......................................................................................................... 
+ sub v27.4S, v23.4S, v19.4S // ..............................................................................................................................................................................................................................................................................*......... + mul v8.4S, v12.4S, v0.S[0] // ..............................................................................................................................................................*......................................................................................................................... + mul v12.4S, v14.4S, v0.S[0] // ........................................................................................................................................................................*............................................................................................................... + sqrdmulh v14.4S, v14.4S, v0.S[1] // .........................................................................................................................................................................*.............................................................................................................. + ldr q23, [x1, #144] // ..e..................................................................................................................................................................................................................................................................................... + mls v24.4S, v13.4S, v29.4S // ...........................................................................................................................................................................................................................................................*............................ 
+ mls v12.4S, v14.4S, v29.4S // ..........................................................................................................................................................................*............................................................................................................. + ldr q19, [x1, #208] // ...e.................................................................................................................................................................................................................................................................................... + mls v28.4S, v27.4S, v29.4S // ...............................................................................................................................................................................................................................................................................*........ + cmge v13.4S, v12.4S, v30.4S // .........................................................................................................................................................................................................*.............................................................................. + cmge v14.4S, v31.4S, v12.4S // ........................................................................................................................................................................................................*............................................................................... + cmge v27.4S, v20.4S, v30.4S // .............................................................................................................................................................................................*.......................................................................................... 
+ sub v14.4S, v14.4S, v13.4S // ..........................................................................................................................................................................................................*............................................................................. + cmge v13.4S, v31.4S, v11.4S // ........................................................................................................................................................................................*............................................................................................... + mls v8.4S, v10.4S, v29.4S // ................................................................................................................................................................*....................................................................................................................... + mls v12.4S, v14.4S, v29.4S // ...........................................................................................................................................................................................................*............................................................................ + sub v21.4S, v13.4S, v21.4S // ..........................................................................................................................................................................................*............................................................................................. + str q15, [x1, #832] // .....................................................................................................................................................................................................................*.................................................................. 
+ mul v14.4S, v16.4S, v25.4S // ....................................................................................................................................................................................................................................*................................................... + mls v11.4S, v21.4S, v29.4S // ...........................................................................................................................................................................................*............................................................................................ + str q12, [x1, #896] // ......................................................................................................................................................................................................................*................................................................. + sqrdmulh v13.4S, v16.4S, v26.4S // .....................................................................................................................................................................................................................................*.................................................. + sub v15.4S, v23.4S, v19.4S // .....................e.................................................................................................................................................................................................................................................................. + str q11, [x1, #640] // ..................................................................................................................................................................................................................*..................................................................... 
+ add v11.4S, v23.4S, v19.4S // ......................e................................................................................................................................................................................................................................................................. + ldr q23, [x1, #80] // .e...................................................................................................................................................................................................................................................................................... + str q28, [x1, #448] // .......................................................................................................................................................................................................................................................................................* + cmge v16.4S, v31.4S, v20.4S // ............................................................................................................................................................................................*........................................................................................... + cmge v21.4S, v8.4S, v30.4S // .................................................................................................................................................................................................*...................................................................................... + sub v12.4S, v16.4S, v27.4S // ..............................................................................................................................................................................................*......................................................................................... 
+ ldr q10, [x1, #16] // e....................................................................................................................................................................................................................................................................................... + ldr q27, [x1, #656] // ..........e............................................................................................................................................................................................................................................................................. + sub v19.4S, v10.4S, v23.4S // ................e....................................................................................................................................................................................................................................................................... + mul v16.4S, v15.4S, v4.S[0] // .......................e................................................................................................................................................................................................................................................................ + sqrdmulh v28.4S, v15.4S, v4.S[1] // ........................e............................................................................................................................................................................................................................................................... + mul v15.4S, v19.4S, v3.S[2] // ..................e..................................................................................................................................................................................................................................................................... 
+ sqrdmulh v19.4S, v19.4S, v3.S[3] // ...................e.................................................................................................................................................................................................................................................................... + add v23.4S, v10.4S, v23.4S // .................e...................................................................................................................................................................................................................................................................... + mls v20.4S, v12.4S, v29.4S // ...............................................................................................................................................................................................*........................................................................................ + mls v16.4S, v28.4S, v29.4S // .........................e.............................................................................................................................................................................................................................................................. + mls v15.4S, v19.4S, v29.4S // ....................e................................................................................................................................................................................................................................................................... + sub v19.4S, v23.4S, v11.4S // ........................................................e............................................................................................................................................................................................................................... 
+ str q20, [x1, #704] // ...................................................................................................................................................................................................................*.................................................................... + add v20.4S, v23.4S, v11.4S // .........................................................e.............................................................................................................................................................................................................................. + mul v12.4S, v17.4S, v0.S[0] // .............................................................................................................................................................................*.......................................................................................................... + cmge v28.4S, v31.4S, v8.4S // ................................................................................................................................................................................................*....................................................................................... + mul v10.4S, v9.4S, v25.4S // .................................................................................................................................................................................................................................*...................................................... + sub v11.4S, v28.4S, v21.4S // ..................................................................................................................................................................................................*..................................................................................... 
+ ldr q23, [x1, #720] // ...........e............................................................................................................................................................................................................................................................................ + mls v8.4S, v11.4S, v29.4S // ...................................................................................................................................................................................................*.................................................................................... + add v11.4S, v15.4S, v16.4S // ..............................................................e......................................................................................................................................................................................................................... + sub v28.4S, v15.4S, v16.4S // .............................................................e.......................................................................................................................................................................................................................... + sqrdmulh v16.4S, v9.4S, v26.4S // ..................................................................................................................................................................................................................................*..................................................... + ldr q15, [x1, #912] // ..............e......................................................................................................................................................................................................................................................................... 
+ sub v17.4S, v27.4S, v23.4S // .........................................e.............................................................................................................................................................................................................................................. + mls v10.4S, v16.4S, v29.4S // ...................................................................................................................................................................................................................................*.................................................... + add v16.4S, v27.4S, v23.4S // ..........................................e............................................................................................................................................................................................................................................. + mls v12.4S, v22.4S, v29.4S // ...............................................................................................................................................................................*........................................................................................................ + cmge v21.4S, v18.4S, v30.4S // .................................................................................................................................................................................................................................................*...................................... + cmge v9.4S, v31.4S, v18.4S // ................................................................................................................................................................................................................................................*....................................... 
+ mls v14.4S, v13.4S, v29.4S // ......................................................................................................................................................................................................................................*................................................. + sub v23.4S, v9.4S, v21.4S // ..................................................................................................................................................................................................................................................*..................................... + ldr q22, [x1, #272] // ....e................................................................................................................................................................................................................................................................................... + cmge v13.4S, v31.4S, v14.4S // ................................................................................................................................................................................................................................................................*....................... + cmge v21.4S, v14.4S, v30.4S // .................................................................................................................................................................................................................................................................*...................... + sqrdmulh v27.4S, v19.4S, v1.S[3] // ...........................................................e............................................................................................................................................................................................................................ 
+ sub v9.4S, v13.4S, v21.4S // ..................................................................................................................................................................................................................................................................*..................... + ldr q13, [x1, #784] // ............e........................................................................................................................................................................................................................................................................... + mls v14.4S, v9.4S, v29.4S // ...................................................................................................................................................................................................................................................................*.................... + mls v18.4S, v23.4S, v29.4S // ...................................................................................................................................................................................................................................................*.................................... + ldr q23, [x1, #848] // .............e.......................................................................................................................................................................................................................................................................... + str q14, [x1, #256] // ....................................................................................................................................................................................................................................................................................*... 
+ cmge v14.4S, v12.4S, v30.4S // .............................................................................................................................................................................................................*.......................................................................... + str q18, [x1], #(16) // ................................................................................................................................................................................................................................................................................*....... + cmge v18.4S, v31.4S, v10.4S // ............................................................................................................................................................................................................................................................*........................... + str q24, [x1, #112] // ..................................................................................................................................................................................................................................................................................*..... + add v24.4S, v13.4S, v23.4S // ...............................................e........................................................................................................................................................................................................................................ + sub v13.4S, v13.4S, v23.4S // ..............................................e......................................................................................................................................................................................................................................... 
+ mul v23.4S, v19.4S, v1.S[2] // ..........................................................e............................................................................................................................................................................................................................. + cmge v9.4S, v31.4S, v12.4S // ............................................................................................................................................................................................................*........................................................................... + ldr q19, [x1, #512] // ........e............................................................................................................................................................................................................................................................................... + mls v23.4S, v27.4S, v29.4S // ............................................................e........................................................................................................................................................................................................................... + ldr q27, [x1, #576] // .........e.............................................................................................................................................................................................................................................................................. + sub v9.4S, v9.4S, v14.4S // ..............................................................................................................................................................................................................*......................................................................... 
+ str q8, [x1, #752] // ....................................................................................................................................................................................................................*................................................................... + add v14.4S, v19.4S, v27.4S // .....................................e.................................................................................................................................................................................................................................................. + ldr q8, [x1, #960] // ...............e........................................................................................................................................................................................................................................................................ + sub v21.4S, v19.4S, v27.4S // ....................................e................................................................................................................................................................................................................................................... + sqrdmulh v19.4S, v13.4S, v6.S[3] // .................................................e...................................................................................................................................................................................................................................... + sub v27.4S, v15.4S, v8.4S // ...................................................e.................................................................................................................................................................................................................................... 
+ mul v13.4S, v13.4S, v6.S[2] // ................................................e....................................................................................................................................................................................................................................... + add v8.4S, v15.4S, v8.4S // ....................................................e................................................................................................................................................................................................................................... + mul v15.4S, v27.4S, v7.S[0] // .....................................................e.................................................................................................................................................................................................................................. + sqrdmulh v27.4S, v27.4S, v7.S[1] // ......................................................e................................................................................................................................................................................................................................. + mls v13.4S, v19.4S, v29.4S // ..................................................e..................................................................................................................................................................................................................................... + mul v19.4S, v21.4S, v5.S[2] // ......................................e................................................................................................................................................................................................................................................. 
+ sqrdmulh v21.4S, v21.4S, v5.S[3] // .......................................e................................................................................................................................................................................................................................................ + mls v15.4S, v27.4S, v29.4S // .......................................................e................................................................................................................................................................................................................................ + mls v12.4S, v9.4S, v29.4S // ...............................................................................................................................................................................................................*........................................................................ + cmge v9.4S, v10.4S, v30.4S // .............................................................................................................................................................................................................................................................*.......................... + mls v19.4S, v21.4S, v29.4S // ........................................e............................................................................................................................................................................................................................................... + sub v27.4S, v13.4S, v15.4S // ...........................................................................................e............................................................................................................................................................................................ 
+ add v13.4S, v13.4S, v15.4S // ............................................................................................e........................................................................................................................................................................................... + sub v18.4S, v18.4S, v9.4S // ..............................................................................................................................................................................................................................................................*......................... + mul v21.4S, v27.4S, v3.S[0] // .............................................................................................e.......................................................................................................................................................................................... + sqrdmulh v9.4S, v27.4S, v3.S[1] // ..............................................................................................e......................................................................................................................................................................................... + mls v10.4S, v18.4S, v29.4S // ...............................................................................................................................................................................................................................................................*........................ + mul v18.4S, v17.4S, v6.S[0] // ...........................................e............................................................................................................................................................................................................................................ 
+ sqrdmulh v27.4S, v17.4S, v6.S[1] // ............................................e........................................................................................................................................................................................................................................... + mls v21.4S, v9.4S, v29.4S // ...............................................................................................e........................................................................................................................................................................................ + sub v17.4S, v14.4S, v16.4S // ............................................................................e........................................................................................................................................................................................................... + sqrdmulh v15.4S, v28.4S, v1.S[3] // ................................................................e....................................................................................................................................................................................................................... + mls v18.4S, v27.4S, v29.4S // .............................................e.......................................................................................................................................................................................................................................... + sqrdmulh v9.4S, v17.4S, v2.S[3] // ...............................................................................e........................................................................................................................................................................................................ 
+ ldr q27, [x1, #320] // .....e.................................................................................................................................................................................................................................................................................. + add v16.4S, v14.4S, v16.4S // .............................................................................e.......................................................................................................................................................................................................... + str q10, [x1, #176] // ...................................................................................................................................................................................................................................................................................*.... + sub v14.4S, v22.4S, v27.4S // ..........................e............................................................................................................................................................................................................................................................. + str q12, [x1, #944] // .......................................................................................................................................................................................................................*................................................................ + add v12.4S, v19.4S, v18.4S // ..................................................................................e..................................................................................................................................................................................................... 
+ mul v10.4S, v14.4S, v4.S[2] // ............................e........................................................................................................................................................................................................................................................... + sqrdmulh v14.4S, v14.4S, v4.S[3] // .............................e.......................................................................................................................................................................................................................................................... + add v22.4S, v22.4S, v27.4S // ...........................e............................................................................................................................................................................................................................................................ + sub v27.4S, v12.4S, v13.4S // .........................................................................................................................e.............................................................................................................................................................. + add v12.4S, v12.4S, v13.4S // ..........................................................................................................................e............................................................................................................................................................. + mls v10.4S, v14.4S, v29.4S // ..............................e......................................................................................................................................................................................................................................................... 
+ sqrdmulh v13.4S, v27.4S, v1.S[1] // ............................................................................................................................e........................................................................................................................................................... + mul v14.4S, v27.4S, v1.S[0] // ...........................................................................................................................e............................................................................................................................................................ + add v27.4S, v24.4S, v8.4S // .......................................................................................e................................................................................................................................................................................................ + sub v8.4S, v24.4S, v8.4S // ......................................................................................e................................................................................................................................................................................................. + ldr q24, [x1, #448] // .......e................................................................................................................................................................................................................................................................................ + mls v14.4S, v13.4S, v29.4S // .............................................................................................................................e.......................................................................................................................................................... 
+ add v13.4S, v16.4S, v27.4S // .....................................................................................................................e.................................................................................................................................................................. + sub v27.4S, v16.4S, v27.4S // ....................................................................................................................e................................................................................................................................................................... + mul v16.4S, v28.4S, v1.S[2] // ...............................................................e........................................................................................................................................................................................................................ + sub v28.4S, v19.4S, v18.4S // .................................................................................e...................................................................................................................................................................................................... + mul v18.4S, v8.4S, v3.S[0] // ........................................................................................e............................................................................................................................................................................................... + sqrdmulh v19.4S, v8.4S, v3.S[1] // .........................................................................................e.............................................................................................................................................................................................. 
+ ldr q8, [x1, #384] // ......e................................................................................................................................................................................................................................................................................. + mls v16.4S, v15.4S, v29.4S // .................................................................e...................................................................................................................................................................................................................... + mul v15.4S, v28.4S, v2.S[2] // ...................................................................................e.................................................................................................................................................................................................... + mls v18.4S, v19.4S, v29.4S // ..........................................................................................e............................................................................................................................................................................................. + sub v19.4S, v8.4S, v24.4S // ...............................e........................................................................................................................................................................................................................................................ + sqrdmulh v28.4S, v28.4S, v2.S[3] // ....................................................................................e................................................................................................................................................................................................... 
+ add v24.4S, v8.4S, v24.4S // ................................e....................................................................................................................................................................................................................................................... + mul v8.4S, v19.4S, v5.S[0] // .................................e...................................................................................................................................................................................................................................................... + sqrdmulh v19.4S, v19.4S, v5.S[1] // ..................................e..................................................................................................................................................................................................................................................... + mls v15.4S, v28.4S, v29.4S // .....................................................................................e.................................................................................................................................................................................................. + sub v28.4S, v22.4S, v24.4S // ..................................................................e..................................................................................................................................................................................................................... + mul v17.4S, v17.4S, v2.S[2] // ..............................................................................e......................................................................................................................................................................................................... 
+ mls v8.4S, v19.4S, v29.4S // ...................................e.................................................................................................................................................................................................................................................... + mul v19.4S, v28.4S, v2.S[0] // ....................................................................e................................................................................................................................................................................................................... + sqrdmulh v28.4S, v28.4S, v2.S[1] // .....................................................................e.................................................................................................................................................................................................................. + mls v17.4S, v9.4S, v29.4S // ................................................................................e....................................................................................................................................................................................................... + add v9.4S, v10.4S, v8.4S // ........................................................................e............................................................................................................................................................................................................... + sub v8.4S, v10.4S, v8.4S // .......................................................................e................................................................................................................................................................................................................ 
+ mls v19.4S, v28.4S, v29.4S // ......................................................................e................................................................................................................................................................................................................. + sub v28.4S, v11.4S, v9.4S // .....................................................................................................e.................................................................................................................................................................................. + sub v10.4S, v17.4S, v18.4S // ..............................................................................................................................e......................................................................................................................................................... + add v11.4S, v11.4S, v9.4S // ......................................................................................................e................................................................................................................................................................................. + mul v9.4S, v28.4S, v0.S[2] // .......................................................................................................e................................................................................................................................................................................ + sqrdmulh v28.4S, v28.4S, v0.S[3] // ........................................................................................................e............................................................................................................................................................................... 
+ add v24.4S, v22.4S, v24.4S // ...................................................................e.................................................................................................................................................................................................................... + add v18.4S, v17.4S, v18.4S // ...............................................................................................................................e........................................................................................................................................................ + mul v22.4S, v8.4S, v2.S[0] // .........................................................................e.............................................................................................................................................................................................................. + mls v9.4S, v28.4S, v29.4S // .........................................................................................................e.............................................................................................................................................................................. + add v28.4S, v23.4S, v19.4S // ...........................................................................................................e............................................................................................................................................................................ + sub v17.4S, v23.4S, v19.4S // ..........................................................................................................e............................................................................................................................................................................. 
+ sub v23.4S, v20.4S, v24.4S // ................................................................................................e....................................................................................................................................................................................... + sub v19.4S, v9.4S, v14.4S // .................................................................................................................................................................e...................................................................................................................... + add v24.4S, v20.4S, v24.4S // .................................................................................................e...................................................................................................................................................................................... + sqrdmulh v20.4S, v8.4S, v2.S[1] // ..........................................................................e............................................................................................................................................................................................................. + add v8.4S, v9.4S, v14.4S // ..................................................................................................................................................................e..................................................................................................................... + mul v14.4S, v10.4S, v1.S[0] // ................................................................................................................................e....................................................................................................................................................... 
+ add v9.4S, v11.4S, v12.4S // ..............................................................................................................................................e......................................................................................................................................... + sub v11.4S, v11.4S, v12.4S // .............................................................................................................................................e.......................................................................................................................................... + mul v12.4S, v23.4S, v0.S[2] // ..................................................................................................e..................................................................................................................................................................................... + mls v22.4S, v20.4S, v29.4S // ...........................................................................e............................................................................................................................................................................................................ + sqrdmulh v10.4S, v10.4S, v1.S[1] // .................................................................................................................................e...................................................................................................................................................... + sqrdmulh v20.4S, v23.4S, v0.S[3] // ...................................................................................................e.................................................................................................................................................................................... 
+ mul v23.4S, v11.4S, v0.S[0] // ...............................................................................................................................................e........................................................................................................................................ + sqrdmulh v11.4S, v11.4S, v0.S[1] // ................................................................................................................................................e....................................................................................................................................... + mls v14.4S, v10.4S, v29.4S // ..................................................................................................................................e..................................................................................................................................................... + add v10.4S, v15.4S, v21.4S // ....................................................................................................................................e................................................................................................................................................... + mls v12.4S, v20.4S, v29.4S // ....................................................................................................e................................................................................................................................................................................... + sub v21.4S, v15.4S, v21.4S // ...................................................................................................................................e.................................................................................................................................................... 
+ mul v20.4S, v27.4S, v1.S[0] // ......................................................................................................................e................................................................................................................................................................. + sqrdmulh v27.4S, v27.4S, v1.S[1] // .......................................................................................................................e................................................................................................................................................................ + mls v23.4S, v11.4S, v29.4S // .................................................................................................................................................e...................................................................................................................................... + sqrdmulh v11.4S, v19.4S, v0.S[1] // ....................................................................................................................................................................e................................................................................................................... + mul v15.4S, v19.4S, v0.S[0] // ...................................................................................................................................................................e.................................................................................................................... + mls v20.4S, v27.4S, v29.4S // ........................................................................................................................e............................................................................................................................................................... 
+ cmge v27.4S, v31.4S, v23.4S // ....................................................................................................................................................................................e................................................................................................... + cmge v19.4S, v23.4S, v30.4S // .....................................................................................................................................................................................e.................................................................................................. + mls v15.4S, v11.4S, v29.4S // .....................................................................................................................................................................e.................................................................................................................. + sub v27.4S, v27.4S, v19.4S // ......................................................................................................................................................................................e................................................................................................. + sqrdmulh v11.4S, v21.4S, v1.S[1] // ......................................................................................................................................e................................................................................................................................................. + mul v19.4S, v21.4S, v1.S[0] // .....................................................................................................................................e.................................................................................................................................................. 
+ + // original source code + // ldr q8, [x1, #0] // ..........................e.......................................................................................................................................................................|...............................................................................................................e........................................................................................ + // ldr q9, [x1, #(1*(512/8))] // .....................e............................................................................................................................................................................|..........................................................................................................e............................................................................................. + // ldr q10, [x1, #(2*(512/8))] // e.................................................................................................................................................................................................|.....................................................................................e.................................................................................................................. + // ldr q11, [x1, #(3*(512/8))] // ...e..............................................................................................................................................................................................|........................................................................................e............................................................................................................... 
+ // ldr q12, [x1, #(4*(512/8))] // ..........................................................e.......................................................................................................................................|...............................................................................................................................................e........................................................ + // ldr q13, [x1, #(5*(512/8))] // ..............................................................................................................e...................................................................................|...................................................................................................................................................................................................e.... + // ldr q14, [x1, #(6*(512/8))] // ......................................................................................................................................e...........................................................|........................................................................................................................................................................................................ + // ldr q15, [x1, #(7*(512/8))] // ..............................................................................................................................e...................................................................|........................................................................................................................................................................................................ 
+ // ldr q16, [x1, #(8*(512/8))] // ............................................................................e.....................................................................................................................|.................................................................................................................................................................e...................................... + // ldr q17, [x1, #(9*(512/8))] // ..............................................................................e...................................................................................................................|...................................................................................................................................................................e.................................... + // ldr q18, [x1, #(10*(512/8))] // ...........................e......................................................................................................................................................................|................................................................................................................e....................................................................................... + // ldr q19, [x1, #(11*(512/8))] // ............................................e.....................................................................................................................................................|.................................................................................................................................e...................................................................... 
+ // ldr q20, [x1, #(12*(512/8))] // ...............................................................e..................................................................................................................................|....................................................................................................................................................e................................................... + // ldr q21, [x1, #(13*(512/8))] // ..................................................................e...............................................................................................................................|.......................................................................................................................................................e................................................ + // ldr q22, [x1, #(14*(512/8))] // .................................................e................................................................................................................................................|......................................................................................................................................e................................................................. + // ldr q23, [x1, #(15*(512/8))] // ..................................................................................e...............................................................................................................|.......................................................................................................................................................................e................................ 
+ // sub v24.4s, v8.4s, v9.4s // ............................e.....................................................................................................................................................................|.................................................................................................................e...................................................................................... + // add v8.4s, v8.4s, v9.4s // .................................e................................................................................................................................................................|......................................................................................................................e................................................................................. + // mul v9.4s, v24.4s, v3.s[2] // ...............................e..................................................................................................................................................................|....................................................................................................................e................................................................................... + // sqrdmulh v24.4s, v24.4s, v3.s[3] // ................................e.................................................................................................................................................................|.....................................................................................................................e.................................................................................. 
+ // mls v9.4s, v24.4s, v29.4s // ....................................e.............................................................................................................................................................|.........................................................................................................................e.............................................................................. + // sub v24.4s, v10.4s, v11.4s // ..................e...............................................................................................................................................................................|.......................................................................................................e................................................................................................ + // add v10.4s, v10.4s, v11.4s // ....................e.............................................................................................................................................................................|.........................................................................................................e.............................................................................................. + // mul v11.4s, v24.4s, v4.s[0] // .............................e....................................................................................................................................................................|..................................................................................................................e..................................................................................... 
+ // sqrdmulh v24.4s, v24.4s, v4.s[1] // ..............................e...................................................................................................................................................................|...................................................................................................................e.................................................................................... + // mls v11.4s, v24.4s, v29.4s // ...................................e..............................................................................................................................................................|........................................................................................................................e............................................................................... + // sub v24.4s, v12.4s, v13.4s // .................................................................................................................e................................................................................|......................................................................................................................................................................................................e. + // add v12.4s, v12.4s, v13.4s // ......................................................................................................................e...........................................................................|........................................................................................................................................................................................................ 
+ // mul v13.4s, v24.4s, v4.s[2] // ....................................................................................................................e.............................................................................|........................................................................................................................................................................................................ + // sqrdmulh v24.4s, v24.4s, v4.s[3] // .....................................................................................................................e............................................................................|........................................................................................................................................................................................................ + // mls v13.4s, v24.4s, v29.4s // .........................................................................................................................e........................................................................|........................................................................................................................................................................................................ + // sub v24.4s, v14.4s, v15.4s // ..........................................................................................................................................e.......................................................|........................................................................................................................................................................................................ 
+ // add v14.4s, v14.4s, v15.4s // ............................................................................................................................................e.....................................................|........................................................................................................................................................................................................ + // mul v15.4s, v24.4s, v5.s[0] // .............................................................................................................................................e....................................................|........................................................................................................................................................................................................ + // sqrdmulh v24.4s, v24.4s, v5.s[1] // ..............................................................................................................................................e...................................................|........................................................................................................................................................................................................ + // mls v15.4s, v24.4s, v29.4s // ..................................................................................................................................................e...............................................|........................................................................................................................................................................................................ 
+ // sub v24.4s, v16.4s, v17.4s // ...................................................................................e..............................................................................................................|........................................................................................................................................................................e............................... + // add v16.4s, v16.4s, v17.4s // .................................................................................e................................................................................................................|......................................................................................................................................................................e................................. + // mul v17.4s, v24.4s, v5.s[2] // ...........................................................................................e......................................................................................................|................................................................................................................................................................................e....................... + // sqrdmulh v24.4s, v24.4s, v5.s[3] // ............................................................................................e.....................................................................................................|.................................................................................................................................................................................e...................... 
+ // mls v17.4s, v24.4s, v29.4s // ................................................................................................e.................................................................................................|.....................................................................................................................................................................................e.................. + // sub v24.4s, v18.4s, v19.4s // ..................................................e...............................................................................................................................................|.......................................................................................................................................e................................................................ + // add v18.4s, v18.4s, v19.4s // ....................................................e.............................................................................................................................................|.........................................................................................................................................e.............................................................. + // mul v19.4s, v24.4s, v6.s[0] // .......................................................................................................e..........................................................................................|............................................................................................................................................................................................e........... 
+ // sqrdmulh v24.4s, v24.4s, v6.s[1] // ........................................................................................................e.........................................................................................|.............................................................................................................................................................................................e.......... + // mls v19.4s, v24.4s, v29.4s // ............................................................................................................e.....................................................................................|.................................................................................................................................................................................................e...... + // sub v24.4s, v20.4s, v21.4s // .........................................................................e........................................................................................................................|..............................................................................................................................................................e......................................... + // add v20.4s, v20.4s, v21.4s // ........................................................................e.........................................................................................................................|.............................................................................................................................................................e.......................................... 
+ // mul v21.4s, v24.4s, v6.s[2] // ......................................................................................e...........................................................................................................|...........................................................................................................................................................................e............................ + // sqrdmulh v24.4s, v24.4s, v6.s[3] // ....................................................................................e.............................................................................................................|.........................................................................................................................................................................e.............................. + // mls v21.4s, v24.4s, v29.4s // ..........................................................................................e.......................................................................................................|...............................................................................................................................................................................e........................ + // sub v24.4s, v22.4s, v23.4s // .....................................................................................e............................................................................................................|..........................................................................................................................................................................e............................. 
+ // add v22.4s, v22.4s, v23.4s // .......................................................................................e..........................................................................................................|............................................................................................................................................................................e........................... + // mul v23.4s, v24.4s, v7.s[0] // ........................................................................................e.........................................................................................................|.............................................................................................................................................................................e.......................... + // sqrdmulh v24.4s, v24.4s, v7.s[1] // .........................................................................................e........................................................................................................|..............................................................................................................................................................................e......................... + // mls v23.4s, v24.4s, v29.4s // .............................................................................................e....................................................................................................|..................................................................................................................................................................................e..................... 
+ // sub v24.4s, v8.4s, v10.4s // .....................................e............................................................................................................................................................|..........................................................................................................................e............................................................................. + // add v8.4s, v8.4s, v10.4s // .......................................e..........................................................................................................................................................|............................................................................................................................e........................................................................... + // mul v10.4s, v24.4s, v1.s[2] // ..........................................................................e.......................................................................................................................|...............................................................................................................................................................e........................................ + // sqrdmulh v24.4s, v24.4s, v1.s[3] // .............................................................e....................................................................................................................................|..................................................................................................................................................e..................................................... 
+ // mls v10.4s, v24.4s, v29.4s // .............................................................................e....................................................................................................................|..................................................................................................................................................................e..................................... + // sub v24.4s, v9.4s, v11.4s // ...............................................e..................................................................................................................................................|....................................................................................................................................e................................................................... + // add v9.4s, v9.4s, v11.4s // ..............................................e...................................................................................................................................................|...................................................................................................................................e.................................................................... + // mul v11.4s, v24.4s, v1.s[2] // ..................................................................................................................................e...............................................................|........................................................................................................................................................................................................ 
+ // sqrdmulh v24.4s, v24.4s, v1.s[3] // ...........................................................................................................e......................................................................................|................................................................................................................................................................................................e....... + // mls v11.4s, v24.4s, v29.4s // .......................................................................................................................................e..........................................................|........................................................................................................................................................................................................ + // sub v24.4s, v12.4s, v14.4s // ................................................................................................................................................e.................................................|........................................................................................................................................................................................................ + // add v12.4s, v12.4s, v14.4s // ..............................................................................................................................................................e...................................|........................................................................................................................................................................................................ 
+ // mul v14.4s, v24.4s, v2.s[0] // ...................................................................................................................................................e..............................................|........................................................................................................................................................................................................ + // sqrdmulh v24.4s, v24.4s, v2.s[1] // ....................................................................................................................................................e.............................................|........................................................................................................................................................................................................ + // mls v14.4s, v24.4s, v29.4s // ........................................................................................................................................................e.........................................|........................................................................................................................................................................................................ + // sub v24.4s, v13.4s, v15.4s // .......................................................................................................................................................e..........................................|........................................................................................................................................................................................................ 
+ // add v13.4s, v13.4s, v15.4s // ......................................................................................................................................................e...........................................|........................................................................................................................................................................................................ + // mul v15.4s, v24.4s, v2.s[0] // ................................................................................................................................................................e.................................|........................................................................................................................................................................................................ + // sqrdmulh v24.4s, v24.4s, v2.s[1] // .......................................................................................................................................................................e..........................|........................................................................................................................................................................................................ + // mls v15.4s, v24.4s, v29.4s // .............................................................................................................................................................................e....................|........................................................................................................................................................................................................ 
+ // sub v24.4s, v16.4s, v18.4s // ..........................................................................................................e.......................................................................................|...............................................................................................................................................................................................e........ + // add v16.4s, v16.4s, v18.4s // ...............................................................................................................e..................................................................................|....................................................................................................................................................................................................e... + // mul v18.4s, v24.4s, v2.s[2] // .................................................................................................................................................e................................................|........................................................................................................................................................................................................ + // sqrdmulh v24.4s, v24.4s, v2.s[3] // .............................................................................................................e....................................................................................|..................................................................................................................................................................................................e..... 
+ // mls v18.4s, v24.4s, v29.4s // .....................................................................................................................................................e............................................|........................................................................................................................................................................................................ + // sub v24.4s, v17.4s, v19.4s // ...................................................................................................................................e..............................................................|........................................................................................................................................................................................................ + // add v17.4s, v17.4s, v19.4s // ...................................................................................................................e..............................................................................|........................................................................................................................................................................................................ + // mul v19.4s, v24.4s, v2.s[2] // ........................................................................................................................................e.........................................................|........................................................................................................................................................................................................ 
+ // sqrdmulh v24.4s, v24.4s, v2.s[3] // ...........................................................................................................................................e......................................................|........................................................................................................................................................................................................ + // mls v19.4s, v24.4s, v29.4s // ...............................................................................................................................................e..................................................|........................................................................................................................................................................................................ + // sub v24.4s, v20.4s, v22.4s // .............................................................................................................................e....................................................................|........................................................................................................................................................................................................ + // add v20.4s, v20.4s, v22.4s // ............................................................................................................................e.....................................................................|........................................................................................................................................................................................................ 
+ // mul v22.4s, v24.4s, v3.s[0] // ....................................................................................................................................e.............................................................|........................................................................................................................................................................................................ + // sqrdmulh v24.4s, v24.4s, v3.s[1] // .....................................................................................................................................e............................................................|........................................................................................................................................................................................................ + // mls v22.4s, v24.4s, v29.4s // .........................................................................................................................................e........................................................|........................................................................................................................................................................................................ + // sub v24.4s, v21.4s, v23.4s // .................................................................................................e................................................................................................|......................................................................................................................................................................................e................. 
+ // add v21.4s, v21.4s, v23.4s // ..................................................................................................e...............................................................................................|.......................................................................................................................................................................................e................ + // mul v23.4s, v24.4s, v3.s[0] // ....................................................................................................e.............................................................................................|.........................................................................................................................................................................................e.............. + // sqrdmulh v24.4s, v24.4s, v3.s[1] // .....................................................................................................e............................................................................................|..........................................................................................................................................................................................e............. + // mls v23.4s, v24.4s, v29.4s // .........................................................................................................e........................................................................................|..............................................................................................................................................................................................e......... 
+ // sub v24.4s, v8.4s, v12.4s // ....................................................................................................................................................................e.............................|........................................................................................................................................................................................................ + // add v8.4s, v8.4s, v12.4s // ......................................................................................................................................................................e...........................|........................................................................................................................................................................................................ + // mul v12.4s, v24.4s, v0.s[2] // ............................................................................................................................................................................e.....................|........................................................................................................................................................................................................ + // sqrdmulh v24.4s, v24.4s, v0.s[3] // ...............................................................................................................................................................................e..................|........................................................................................................................................................................................................ 
+ // mls v12.4s, v24.4s, v29.4s // ....................................................................................................................................................................................e.............|........................................................................................................................................................................................................ + // sub v24.4s, v9.4s, v13.4s // .........................................................................................................................................................e........................................|........................................................................................................................................................................................................ + // add v9.4s, v9.4s, v13.4s // ...........................................................................................................................................................e......................................|........................................................................................................................................................................................................ + // mul v13.4s, v24.4s, v0.s[2] // ............................................................................................................................................................e.....................................|........................................................................................................................................................................................................ 
+ // sqrdmulh v24.4s, v24.4s, v0.s[3] // .............................................................................................................................................................e....................................|........................................................................................................................................................................................................ + // mls v13.4s, v24.4s, v29.4s // .................................................................................................................................................................e................................|........................................................................................................................................................................................................ + // sub v24.4s, v10.4s, v14.4s // ...................................................................................................................................................................e..............................|........................................................................................................................................................................................................ + // add v10.4s, v10.4s, v14.4s // ..................................................................................................................................................................e...............................|........................................................................................................................................................................................................ 
+ // mul v14.4s, v24.4s, v0.s[2] // ..................................................................................................................................................................................................|.....*.................................................................................................................................................................................................. + // sqrdmulh v24.4s, v24.4s, v0.s[3] // ..................................................................................................................................................................................................|......*................................................................................................................................................................................................. + // mls v14.4s, v24.4s, v29.4s // ..................................................................................................................................................................................................|..........*............................................................................................................................................................................................. + // sub v24.4s, v11.4s, v15.4s // ..................................................................................................................................................................................................|.................*...................................................................................................................................................................................... 
+ // add v11.4s, v11.4s, v15.4s // ..................................................................................................................................................................................................|................*....................................................................................................................................................................................... + // mul v15.4s, v24.4s, v0.s[2] // ..................................................................................................................................................................................................|....................*................................................................................................................................................................................... + // sqrdmulh v24.4s, v24.4s, v0.s[3] // ..................................................................................................................................................................................................|................................*....................................................................................................................................................................... + // mls v15.4s, v24.4s, v29.4s // ..................................................................................................................................................................................................|.....................................*.................................................................................................................................................................. 
+ // sub v24.4s, v16.4s, v20.4s // .................................................................................................................................e................................................................|........................................................................................................................................................................................................ + // add v16.4s, v16.4s, v20.4s // ................................................................................................................................e.................................................................|........................................................................................................................................................................................................ + // mul v20.4s, v24.4s, v1.s[0] // ......................................................................................................................................................................................e...........|........................................................................................................................................................................................................ + // sqrdmulh v24.4s, v24.4s, v1.s[1] // .......................................................................................................................................................................................e..........|........................................................................................................................................................................................................ 
+ // mls v20.4s, v24.4s, v29.4s // ...........................................................................................................................................................................................e......|........................................................................................................................................................................................................ + // sub v24.4s, v17.4s, v21.4s // .......................................................................................................................e..........................................................................|........................................................................................................................................................................................................ + // add v17.4s, v17.4s, v21.4s // ........................................................................................................................e.........................................................................|........................................................................................................................................................................................................ + // mul v21.4s, v24.4s, v1.s[0] // ...........................................................................................................................e......................................................................|........................................................................................................................................................................................................ 
+ // sqrdmulh v24.4s, v24.4s, v1.s[1] // ..........................................................................................................................e.......................................................................|........................................................................................................................................................................................................ + // mls v21.4s, v24.4s, v29.4s // ...............................................................................................................................e..................................................................|........................................................................................................................................................................................................ + // sub v24.4s, v18.4s, v22.4s // ..........................................................................................................................................................e.......................................|........................................................................................................................................................................................................ + // add v18.4s, v18.4s, v22.4s // ...............................................................................................................................................................e..................................|........................................................................................................................................................................................................ 
+ // mul v22.4s, v24.4s, v1.s[0] // .........................................................................................................................................................................e........................|........................................................................................................................................................................................................ + // sqrdmulh v24.4s, v24.4s, v1.s[1] // ..............................................................................................................................................................................e...................|........................................................................................................................................................................................................ + // mls v22.4s, v24.4s, v29.4s // ..................................................................................................................................................................................e...............|........................................................................................................................................................................................................ + // sub v24.4s, v19.4s, v23.4s // .....................................................................................................................................................................................e............|........................................................................................................................................................................................................ 
+ // add v19.4s, v19.4s, v23.4s // ...................................................................................................................................................................................e..............|........................................................................................................................................................................................................ + // mul v23.4s, v24.4s, v1.s[0] // .................................................................................................................................................................................................e|........................................................................................................................................................................................................ + // sqrdmulh v24.4s, v24.4s, v1.s[1] // ................................................................................................................................................................................................e.|........................................................................................................................................................................................................ + // mls v23.4s, v24.4s, v29.4s // ..................................................................................................................................................................................................|.*...................................................................................................................................................................................................... 
+ // sub v24.4s, v8.4s, v16.4s // ..................................................................................................................................................................................................|...........................*............................................................................................................................................................................ + // add v8.4s, v8.4s, v16.4s // ..................................................................................................................................................................................................|............................*........................................................................................................................................................................... + // mul v16.4s, v24.4s, v0.s[0] // ..................................................................................................................................................................................................|..............................*......................................................................................................................................................................... + // sqrdmulh v24.4s, v24.4s, v0.s[1] // ..................................................................................................................................................................................................|...............................*........................................................................................................................................................................ 
+ // mls v16.4s, v24.4s, v29.4s // ..................................................................................................................................................................................................|...................................*.................................................................................................................................................................... + // sub v24.4s, v9.4s, v17.4s // ...........................................................................................................................................................................e......................|........................................................................................................................................................................................................ + // add v9.4s, v9.4s, v17.4s // ..........................................................................................................................................................................e.......................|........................................................................................................................................................................................................ + // mul v17.4s, v24.4s, v0.s[0] // ................................................................................................................................................................................e.................|........................................................................................................................................................................................................ 
+ // sqrdmulh v24.4s, v24.4s, v0.s[1] // .................................................................................................................................................................................e................|........................................................................................................................................................................................................ + // mls v17.4s, v24.4s, v29.4s // ........................................................................................................................................................................................e.........|........................................................................................................................................................................................................ + // sub v24.4s, v10.4s, v18.4s // ..................................................................................................................................................................................................|..*..................................................................................................................................................................................................... + // add v10.4s, v10.4s, v18.4s // ..................................................................................................................................................................................................|.......*................................................................................................................................................................................................ 
+ // mul v18.4s, v24.4s, v0.s[0] // ..................................................................................................................................................................................................|..............*......................................................................................................................................................................................... + // sqrdmulh v24.4s, v24.4s, v0.s[1] // ..................................................................................................................................................................................................|............*........................................................................................................................................................................................... + // mls v18.4s, v24.4s, v29.4s // ..................................................................................................................................................................................................|..................*..................................................................................................................................................................................... + // sub v24.4s, v11.4s, v19.4s // ..................................................................................................................................................................................................|........................................................*............................................................................................................................................... 
+ // add v11.4s, v11.4s, v19.4s // ..................................................................................................................................................................................................|.................................................*...................................................................................................................................................... + // mul v19.4s, v24.4s, v0.s[0] // ..................................................................................................................................................................................................|.....................................................................*.................................................................................................................................. + // sqrdmulh v24.4s, v24.4s, v0.s[1] // ..................................................................................................................................................................................................|.................................................................*...................................................................................................................................... + // mls v19.4s, v24.4s, v29.4s // ..................................................................................................................................................................................................|.........................................................................*.............................................................................................................................. 
+ // sub v24.4s, v12.4s, v20.4s // ..................................................................................................................................................................................................|.....................*.................................................................................................................................................................................. + // add v12.4s, v12.4s, v20.4s // ..................................................................................................................................................................................................|...................*.................................................................................................................................................................................... + // mul v20.4s, v24.4s, v0.s[0] // ..................................................................................................................................................................................................|..................................................................................*..................................................................................................................... + // sqrdmulh v24.4s, v24.4s, v0.s[1] // ..................................................................................................................................................................................................|..........................................................*............................................................................................................................................. 
+ // mls v20.4s, v24.4s, v29.4s // ..........*.......................................................................................................................................................................................|...............................................................................................*........................................................................................................ + // sub v24.4s, v13.4s, v21.4s // .....................................................................................................................................................................e............................|........................................................................................................................................................................................................ + // add v13.4s, v13.4s, v21.4s // ........................................................................................................................................................................e.........................|........................................................................................................................................................................................................ + // mul v21.4s, v24.4s, v0.s[0] // ..........................................................................................................................................................................................e.......|........................................................................................................................................................................................................ 
+ // sqrdmulh v24.4s, v24.4s, v0.s[1] // .........................................................................................................................................................................................e........|........................................................................................................................................................................................................ + // mls v21.4s, v24.4s, v29.4s // ..............................................................................................................................................................................................e...|........................................................................................................................................................................................................ + // sub v24.4s, v14.4s, v22.4s // ..................................................................................................................................................................................................|.......................*................................................................................................................................................................................ + // add v14.4s, v14.4s, v22.4s // ..................................................................................................................................................................................................|......................*................................................................................................................................................................................. 
+ // mul v22.4s, v24.4s, v0.s[0] // ..................................................................................................................................................................................................|...................................................................................*.................................................................................................................... + // sqrdmulh v24.4s, v24.4s, v0.s[1] // ..................................................................................................................................................................................................|....................................................................................*................................................................................................................... + // mls v22.4s, v24.4s, v29.4s // ..*...............................................................................................................................................................................................|.......................................................................................*................................................................................................................ + // sub v24.4s, v15.4s, v23.4s // ..................................................................................................................................................................................................|............................................................................*........................................................................................................................... 
+ // add v15.4s, v15.4s, v23.4s // ..................................................................................................................................................................................................|...............................................................*........................................................................................................................................ + // mul v23.4s, v24.4s, v0.s[0] // ........................................*.........................................................................................................................................................|.............................................................................................................................*.......................................................................... + // sqrdmulh v24.4s, v24.4s, v0.s[1] // ..................................................................................................................................................................................................|................................................................................*....................................................................................................................... + // mls v23.4s, v24.4s, v29.4s // .....................................................*............................................................................................................................................|..........................................................................................................................................*............................................................. 
+ // cmge v27.4s, v31.4s, v16.4s // ..................................................................................................................................................................................................|.......................................*................................................................................................................................................................ + // cmge v28.4s, v16.4s, v30.4s // ..................................................................................................................................................................................................|........................................*............................................................................................................................................................... + // sub v28.4s, v27.4s, v28.4s // ..................................................................................................................................................................................................|..........................................*............................................................................................................................................................. + // mls v16.4s, v28.4s, v29.4s // ..................................................................................................................................................................................................|....................................................*................................................................................................................................................... 
+ // cmge v27.4s, v31.4s, v17.4s // ............................................................................................................................................................................................e.....|........................................................................................................................................................................................................ + // cmge v28.4s, v17.4s, v30.4s // .............................................................................................................................................................................................e....|........................................................................................................................................................................................................ + // sub v28.4s, v27.4s, v28.4s // ...............................................................................................................................................................................................e..|........................................................................................................................................................................................................ + // mls v17.4s, v28.4s, v29.4s // ..................................................................................................................................................................................................|...*.................................................................................................................................................................................................... 
+ // cmge v27.4s, v31.4s, v18.4s // .........*........................................................................................................................................................................................|..............................................................................................*......................................................................................................... + // cmge v28.4s, v18.4s, v30.4s // ..................................................................................................................................................................................................|.....................................................*.................................................................................................................................................. + // sub v28.4s, v27.4s, v28.4s // ............*.....................................................................................................................................................................................|.................................................................................................*...................................................................................................... + // mls v18.4s, v28.4s, v29.4s // ...............*..................................................................................................................................................................................|....................................................................................................*................................................................................................... 
+ // cmge v27.4s, v31.4s, v19.4s // .......................*..........................................................................................................................................................................|............................................................................................................*........................................................................................... + // cmge v28.4s, v19.4s, v30.4s // .......*..........................................................................................................................................................................................|............................................................................................*........................................................................................................... + // sub v28.4s, v27.4s, v28.4s // .........................*........................................................................................................................................................................|..............................................................................................................*......................................................................................... + // mls v19.4s, v28.4s, v29.4s // ..................................*...............................................................................................................................................................|.......................................................................................................................*................................................................................ 
+ // cmge v27.4s, v31.4s, v20.4s // .........................................*........................................................................................................................................................|..............................................................................................................................*......................................................................... + // cmge v28.4s, v20.4s, v30.4s // ........................*.........................................................................................................................................................................|.............................................................................................................*.......................................................................................... + // sub v28.4s, v27.4s, v28.4s // ...........................................*......................................................................................................................................................|................................................................................................................................*....................................................................... + // mls v20.4s, v28.4s, v29.4s // .............................................*....................................................................................................................................................|..................................................................................................................................*..................................................................... 
+ // cmge v27.4s, v31.4s, v21.4s // ..................................................................................................................................................................................................|.............*.......................................................................................................................................................................................... + // cmge v28.4s, v21.4s, v30.4s // ..................................................................................................................................................................................................|.......................................................*................................................................................................................................................ + // sub v28.4s, v27.4s, v28.4s // ..................................................................................................................................................................................................|.........................................................*.............................................................................................................................................. + // mls v21.4s, v28.4s, v29.4s // ..................................................................................................................................................................................................|............................................................*........................................................................................................................................... 
+ // cmge v27.4s, v31.4s, v22.4s // ......*...........................................................................................................................................................................................|...........................................................................................*............................................................................................................ + // cmge v28.4s, v22.4s, v30.4s // .....*............................................................................................................................................................................................|..........................................................................................*............................................................................................................. + // sub v28.4s, v27.4s, v28.4s // ........*.........................................................................................................................................................................................|.............................................................................................*.......................................................................................................... + // mls v22.4s, v28.4s, v29.4s // ...........*......................................................................................................................................................................................|................................................................................................*....................................................................................................... 
+ // cmge v27.4s, v31.4s, v23.4s // ...........................................................................*......................................................................................................................|................................................................................................................................................................*....................................... + // cmge v28.4s, v23.4s, v30.4s // ....................................................................*.............................................................................................................................|.........................................................................................................................................................*.............................................. + // sub v28.4s, v27.4s, v28.4s // ...............................................................................*..................................................................................................................|....................................................................................................................................................................*................................... + // mls v23.4s, v28.4s, v29.4s // ..............................................................................................*...................................................................................................|...................................................................................................................................................................................*.................... 
+ // str q16, [x1, #(8*(512/8))] // ..................................................................................................................................................................................................|.............................................................................*.......................................................................................................................... + // str q17, [x1, #(9*(512/8))] // ..................................................................................................................................................................................................|...............*........................................................................................................................................................................................ + // str q18, [x1, #(10*(512/8))] // ...................*..............................................................................................................................................................................|........................................................................................................*............................................................................................... + // str q19, [x1, #(11*(512/8))] // ......................................*...........................................................................................................................................................|...........................................................................................................................*............................................................................ 
+ // str q20, [x1, #(12*(512/8))] // ................................................................................*.................................................................................................................|.....................................................................................................................................................................*.................................. + // str q21, [x1, #(13*(512/8))] // .............*....................................................................................................................................................................................|..................................................................................................*..................................................................................................... + // str q22, [x1, #(14*(512/8))] // ................*.................................................................................................................................................................................|.....................................................................................................*.................................................................................................. 
+ // str q23, [x1, #(15*(512/8))] // ..................................................................................................................*...............................................................................|.......................................................................................................................................................................................................* + // mul v16.4s, v8.4s, v25.4s // ..................................................................................................................................................................................................|..............................................*......................................................................................................................................................... + // sqrdmulh v8.4s, v8.4s, v26.4s // ..................................................................................................................................................................................................|...............................................*........................................................................................................................................................ + // mls v16.4s, v8.4s, v29.4s // ..................................................................................................................................................................................................|...................................................*.................................................................................................................................................... 
+ // mul v17.4s, v9.4s, v25.4s // ..................................................................................................................................................................................................|............................................*........................................................................................................................................................... + // sqrdmulh v9.4s, v9.4s, v26.4s // ..................................................................................................................................................................................................|...........................................*............................................................................................................................................................ + // mls v17.4s, v9.4s, v29.4s // ..................................................................................................................................................................................................|................................................*....................................................................................................................................................... + // mul v18.4s, v10.4s, v25.4s // ..................................................................................................................................................................................................|..................................................*..................................................................................................................................................... 
+ // sqrdmulh v10.4s, v10.4s, v26.4s // ..................................................................................................................................................................................................|..............................................................*......................................................................................................................................... + // mls v18.4s, v10.4s, v29.4s // ..................................................................................................................................................................................................|..................................................................*..................................................................................................................................... + // mul v19.4s, v11.4s, v25.4s // ..........................................*.......................................................................................................................................................|...............................................................................................................................*........................................................................ + // sqrdmulh v11.4s, v11.4s, v26.4s // ................................................*.................................................................................................................................................|.....................................................................................................................................*.................................................................. 
+ // mls v19.4s, v11.4s, v29.4s // ...................................................*..............................................................................................................................................|........................................................................................................................................*............................................................... + // mul v20.4s, v12.4s, v25.4s // ..............*...................................................................................................................................................................................|...................................................................................................*.................................................................................................... + // sqrdmulh v12.4s, v12.4s, v26.4s // .................*................................................................................................................................................................................|......................................................................................................*................................................................................................. + // mls v20.4s, v12.4s, v29.4s // ........................................................*.........................................................................................................................................|.............................................................................................................................................*.......................................................... 
+ // mul v21.4s, v13.4s, v25.4s // ..................................................................................................................................................................................................*........................................................................................................................................................................................................ + // sqrdmulh v13.4s, v13.4s, v26.4s // ..................................................................................................................................................................................................|*....................................................................................................................................................................................................... + // mls v21.4s, v13.4s, v29.4s // ..................................................................................................................................................................................................|....*................................................................................................................................................................................................... + // mul v22.4s, v14.4s, v25.4s // ..................................................................................................................................................................................................|..........................*............................................................................................................................................................................. 
+ // sqrdmulh v14.4s, v14.4s, v26.4s // ..................................................................................................................................................................................................|.........................*.............................................................................................................................................................................. + // mls v22.4s, v14.4s, v29.4s // ..................................................................................................................................................................................................|.............................*.......................................................................................................................................................................... + // mul v23.4s, v15.4s, v25.4s // ..................................................................................................................................................................................................|...................................................................*.................................................................................................................................... + // sqrdmulh v15.4s, v15.4s, v26.4s // ..................................................................................................................................................................................................|......................................................................*................................................................................................................................. 
+ // mls v23.4s, v15.4s, v29.4s // ..................................................................................................................................................................................................|..........................................................................*............................................................................................................................. + // cmge v27.4s, v31.4s, v16.4s // .......................................................*..........................................................................................................................................|............................................................................................................................................*........................................................... + // cmge v28.4s, v16.4s, v30.4s // ......................................................*...........................................................................................................................................|...........................................................................................................................................*............................................................ + // sub v28.4s, v27.4s, v28.4s // .........................................................*........................................................................................................................................|..............................................................................................................................................*......................................................... 
+ // mls v16.4s, v28.4s, v29.4s // .................................................................*................................................................................................................................|......................................................................................................................................................*................................................. + // cmge v27.4s, v31.4s, v17.4s // ..................................................................................................................................................................................................|...........................................................*............................................................................................................................................ + // cmge v28.4s, v17.4s, v30.4s // ..................................................................................................................................................................................................|......................................................*................................................................................................................................................. + // sub v28.4s, v27.4s, v28.4s // ..................................................................................................................................................................................................|.............................................................*.......................................................................................................................................... 
+ // mls v17.4s, v28.4s, v29.4s // ..................................................................................................................................................................................................|................................................................*....................................................................................................................................... + // cmge v27.4s, v31.4s, v18.4s // ..................................................................................................................................................................................................|........................................................................*............................................................................................................................... + // cmge v28.4s, v18.4s, v30.4s // ..................................................................................................................................................................................................|.......................................................................*................................................................................................................................ + // sub v28.4s, v27.4s, v28.4s // ..................................................................................................................................................................................................|...........................................................................*............................................................................................................................ 
+ // mls v18.4s, v28.4s, v29.4s // .*................................................................................................................................................................................................|......................................................................................*................................................................................................................. + // cmge v27.4s, v31.4s, v19.4s // ......................................................................*...........................................................................................................................|...........................................................................................................................................................*............................................ + // cmge v28.4s, v19.4s, v30.4s // ...............................................................................................*..................................................................................................|....................................................................................................................................................................................*................... + // sub v28.4s, v27.4s, v28.4s // ...................................................................................................*..............................................................................................|........................................................................................................................................................................................*............... 
+ // mls v19.4s, v28.4s, v29.4s // ......................................................................................................*...........................................................................................|...........................................................................................................................................................................................*............ + // cmge v27.4s, v31.4s, v20.4s // ...........................................................*......................................................................................................................................|................................................................................................................................................*....................................................... + // cmge v28.4s, v20.4s, v30.4s // ............................................................*.....................................................................................................................................|.................................................................................................................................................*...................................................... + // sub v28.4s, v27.4s, v28.4s // ..............................................................*...................................................................................................................................|...................................................................................................................................................*.................................................... 
+ // mls v20.4s, v28.4s, v29.4s // ................................................................*.................................................................................................................................|.....................................................................................................................................................*.................................................. + // cmge v27.4s, v31.4s, v21.4s // ..................................................................................................................................................................................................|........*............................................................................................................................................................................................... + // cmge v28.4s, v21.4s, v30.4s // ..................................................................................................................................................................................................|.........*.............................................................................................................................................................................................. + // sub v28.4s, v27.4s, v28.4s // ..................................................................................................................................................................................................|...........*............................................................................................................................................................................................ 
+ // mls v21.4s, v28.4s, v29.4s // ..................................................................................................................................................................................................|........................*............................................................................................................................................................................... + // cmge v27.4s, v31.4s, v22.4s // ..................................................................................................................................................................................................|.................................*...................................................................................................................................................................... + // cmge v28.4s, v22.4s, v30.4s // ..................................................................................................................................................................................................|....................................*................................................................................................................................................................... + // sub v28.4s, v27.4s, v28.4s // ..................................................................................................................................................................................................|......................................*................................................................................................................................................................. 
+ // mls v22.4s, v28.4s, v29.4s // ..................................................................................................................................................................................................|.........................................*.............................................................................................................................................................. + // cmge v27.4s, v31.4s, v23.4s // ..................................................................................................................................................................................................|..............................................................................*......................................................................................................................... + // cmge v28.4s, v23.4s, v30.4s // ..................................................................................................................................................................................................|...............................................................................*........................................................................................................................ + // sub v28.4s, v27.4s, v28.4s // ..................................................................................................................................................................................................|.................................................................................*...................................................................................................................... 
+ // mls v23.4s, v28.4s, v29.4s // ....*.............................................................................................................................................................................................|.........................................................................................*.............................................................................................................. + // str q16, [x1], #(16) // .....................................................................*............................................................................................................................|..........................................................................................................................................................*............................................. + // str q17, [x1, #(-16 + 1*(512/8))] // ..................................................................................................................................................................................................|....................................................................*................................................................................................................................... + // str q18, [x1, #(-16 + 2*(512/8))] // .......................................................................*..........................................................................................................................|............................................................................................................................................................*........................................... 
+ // str q19, [x1, #(-16 + 3*(512/8))] // ................................................................................................................*.................................................................................|.....................................................................................................................................................................................................*.. + // str q20, [x1, #(-16 + 4*(512/8))] // ...................................................................*..............................................................................................................................|........................................................................................................................................................*............................................... + // str q21, [x1, #(-16 + 5*(512/8))] // ..................................................................................................................................................................................................|..................................*..................................................................................................................................................................... + // str q22, [x1, #(-16 + 6*(512/8))] // ..................................................................................................................................................................................................|.............................................*.......................................................................................................................................................... 
+ // str q23, [x1, #(-16 + 7*(512/8))] // ......................*...........................................................................................................................................................................|...........................................................................................................*............................................................................................ + + sub count, count, #1 + cbnz count, layer1234_start + sub v21.4S, v16.4S, v22.4S // ...............................................................................................................*........................................................................................................................................................................ + mls v19.4S, v11.4S, v29.4S // .......................................................................................................................................*................................................................................................................................................ + mls v23.4S, v27.4S, v29.4S // .......................................................................................................................................................................................*................................................................................................ + sqrdmulh v11.4S, v21.4S, v0.S[3] // ..................................................................................................................*..................................................................................................................................................................... 
+ mul v27.4S, v21.4S, v0.S[2] // .................................................................................................................*...................................................................................................................................................................... + sub v21.4S, v12.4S, v20.4S // ............................................................................................................................................................*........................................................................................................................... + str q23, [x1, #576] // .................................................................................................................................................................................................................*...................................................................... + add v23.4S, v16.4S, v22.4S // ................................................................................................................*....................................................................................................................................................................... + mls v27.4S, v11.4S, v29.4S // ...................................................................................................................*.................................................................................................................................................................... + add v12.4S, v12.4S, v20.4S // .............................................................................................................................................................*.......................................................................................................................... 
+ add v11.4S, v24.4S, v13.4S // .........................................................................................................................................*.............................................................................................................................................. + sub v22.4S, v24.4S, v13.4S // ........................................................................................................................................*............................................................................................................................................... + add v24.4S, v27.4S, v19.4S // ............................................................................................................................................................................*........................................................................................................... + mul v20.4S, v17.4S, v0.S[2] // ............................................................................................................*........................................................................................................................................................................... + sqrdmulh v13.4S, v22.4S, v0.S[1] // ...........................................................................................................................................*............................................................................................................................................ + sqrdmulh v16.4S, v17.4S, v0.S[3] // .............................................................................................................*.......................................................................................................................................................................... 
+ mul v17.4S, v22.4S, v0.S[0] // ..........................................................................................................................................*............................................................................................................................................. + sqrdmulh v22.4S, v11.4S, v26.4S // .........................................................................................................................................................................................................................*.............................................................. + sub v27.4S, v27.4S, v19.4S // ...........................................................................................................................................................................*............................................................................................................ + mls v20.4S, v16.4S, v29.4S // ..............................................................................................................*......................................................................................................................................................................... + mls v17.4S, v13.4S, v29.4S // ............................................................................................................................................*........................................................................................................................................... + mul v13.4S, v27.4S, v0.S[0] // .............................................................................................................................................................................*.......................................................................................................... 
+ mul v19.4S, v11.4S, v25.4S // ........................................................................................................................................................................................................................*............................................................... + sqrdmulh v16.4S, v27.4S, v0.S[1] // ..............................................................................................................................................................................*......................................................................................................... + cmge v27.4S, v17.4S, v30.4S // .................................................................................................................................................................................*...................................................................................................... + cmge v11.4S, v31.4S, v17.4S // ................................................................................................................................................................................*....................................................................................................... + mls v19.4S, v22.4S, v29.4S // ..........................................................................................................................................................................................................................*............................................................. + sub v27.4S, v11.4S, v27.4S // ..................................................................................................................................................................................*..................................................................................................... 
+ mls v13.4S, v16.4S, v29.4S // ...............................................................................................................................................................................*........................................................................................................ + sqrdmulh v22.4S, v8.4S, v26.4S // ........................................................................................................................................................................................................................................*............................................... + mul v16.4S, v8.4S, v25.4S // .......................................................................................................................................................................................................................................*................................................ + mls v17.4S, v27.4S, v29.4S // ...................................................................................................................................................................................*.................................................................................................... + cmge v27.4S, v13.4S, v30.4S // .............................................................................................................................................................................................................*.......................................................................... + cmge v8.4S, v31.4S, v13.4S // ............................................................................................................................................................................................................*........................................................................... 
+ mls v16.4S, v22.4S, v29.4S // .........................................................................................................................................................................................................................................*.............................................. + sub v8.4S, v8.4S, v27.4S // ..............................................................................................................................................................................................................*......................................................................... + cmge v22.4S, v31.4S, v15.4S // ....................................................................................................................................................................................................*................................................................................... + cmge v11.4S, v15.4S, v30.4S // .....................................................................................................................................................................................................*.................................................................................. + mls v13.4S, v8.4S, v29.4S // ...............................................................................................................................................................................................................*........................................................................ + sub v11.4S, v22.4S, v11.4S // ......................................................................................................................................................................................................*................................................................................. 
+ sub v8.4S, v20.4S, v14.4S // ......................................................................................................................................................................*................................................................................................................. + cmge v27.4S, v16.4S, v30.4S // .....................................................................................................................................................................................................................................................................*.................. + mls v15.4S, v11.4S, v29.4S // .......................................................................................................................................................................................................*................................................................................ + add v14.4S, v20.4S, v14.4S // .......................................................................................................................................................................*................................................................................................................ + sqrdmulh v20.4S, v12.4S, v26.4S // .....................................................................................................................................................................................................................................*.................................................. + cmge v22.4S, v31.4S, v16.4S // ....................................................................................................................................................................................................................................................................*................... 
+ str q17, [x1, #512] // ................................................................................................................................................................................................................*....................................................................... + sub v11.4S, v22.4S, v27.4S // ......................................................................................................................................................................................................................................................................*................. + mul v27.4S, v12.4S, v25.4S // ....................................................................................................................................................................................................................................*................................................... + sqrdmulh v22.4S, v8.4S, v0.S[1] // .........................................................................................................................................................................*.............................................................................................................. + mls v16.4S, v11.4S, v29.4S // .......................................................................................................................................................................................................................................................................*................ + sqrdmulh v12.4S, v14.4S, v26.4S // ...........................................................................................................................................................................................................................................*............................................ 
+ mls v27.4S, v20.4S, v29.4S // ......................................................................................................................................................................................................................................*................................................. + mul v8.4S, v8.4S, v0.S[0] // ........................................................................................................................................................................*............................................................................................................... + str q16, [x1, #320] // .....................................................................................................................................................................................................................................................................................*.. + sqrdmulh v11.4S, v24.4S, v26.4S // ..............................................................................................................................................................................................................................................*......................................... + sub v20.4S, v23.4S, v10.4S // .......................................................................................................................................................*................................................................................................................................ + mls v8.4S, v22.4S, v29.4S // ..........................................................................................................................................................................*............................................................................................................. 
+ mul v17.4S, v24.4S, v25.4S // .............................................................................................................................................................................................................................................*.......................................... + mul v24.4S, v20.4S, v0.S[0] // .........................................................................................................................................................*.............................................................................................................................. + sqrdmulh v20.4S, v20.4S, v0.S[1] // ..........................................................................................................................................................*............................................................................................................................. + mul v22.4S, v14.4S, v25.4S // ..........................................................................................................................................................................................................................................*............................................. + cmge v16.4S, v8.4S, v30.4S // .........................................................................................................................................................................................................*.............................................................................. + cmge v14.4S, v31.4S, v8.4S // ........................................................................................................................................................................................................*............................................................................... 
+ mls v24.4S, v20.4S, v29.4S // ...........................................................................................................................................................*............................................................................................................................ + sub v14.4S, v14.4S, v16.4S // ..........................................................................................................................................................................................................*............................................................................. + mls v22.4S, v12.4S, v29.4S // ............................................................................................................................................................................................................................................*........................................... + sub v16.4S, v28.4S, v18.4S // ..................................................................................................................................................*..................................................................................................................................... + mls v8.4S, v14.4S, v29.4S // ...........................................................................................................................................................................................................*............................................................................ + mls v17.4S, v11.4S, v29.4S // ...............................................................................................................................................................................................................................................*........................................ 
+ mul v12.4S, v16.4S, v0.S[0] // ....................................................................................................................................................*................................................................................................................................... + sqrdmulh v16.4S, v16.4S, v0.S[1] // .....................................................................................................................................................*.................................................................................................................................. + str q8, [x1, #896] // ......................................................................................................................................................................................................................*................................................................. + sqrdmulh v20.4S, v21.4S, v0.S[1] // ...............................................................................................................................................................*........................................................................................................................ + cmge v14.4S, v17.4S, v30.4S // .............................................................................................................................................................................................................................................................................*.......... + mls v12.4S, v16.4S, v29.4S // ......................................................................................................................................................*................................................................................................................................. 
+ cmge v8.4S, v31.4S, v17.4S // ............................................................................................................................................................................................................................................................................*........... + mul v11.4S, v21.4S, v0.S[0] // ..............................................................................................................................................................*......................................................................................................................... + sub v8.4S, v8.4S, v14.4S // ..............................................................................................................................................................................................................................................................................*......... + cmge v14.4S, v31.4S, v12.4S // ........................................................................................................................................................................................*............................................................................................... + cmge v21.4S, v12.4S, v30.4S // .........................................................................................................................................................................................*.............................................................................................. + mls v17.4S, v8.4S, v29.4S // ...............................................................................................................................................................................................................................................................................*........ 
+ sub v16.4S, v14.4S, v21.4S // ..........................................................................................................................................................................................*............................................................................................. + sqrdmulh v14.4S, v9.4S, v26.4S // ............................................................................................................................................................................................................................*........................................................... + mls v11.4S, v20.4S, v29.4S // ................................................................................................................................................................*....................................................................................................................... + cmge v21.4S, v19.4S, v30.4S // .................................................................................................................................................................................................................................................*...................................... + mls v12.4S, v16.4S, v29.4S // ...........................................................................................................................................................................................*............................................................................................ + cmge v16.4S, v31.4S, v19.4S // ................................................................................................................................................................................................................................................*....................................... 
+ cmge v8.4S, v11.4S, v30.4S // .................................................................................................................................................................................................*...................................................................................... + cmge v20.4S, v31.4S, v11.4S // ................................................................................................................................................................................................*....................................................................................... + str q12, [x1, #640] // ..................................................................................................................................................................................................................*..................................................................... + sub v8.4S, v20.4S, v8.4S // ..................................................................................................................................................................................................*..................................................................................... + sub v21.4S, v16.4S, v21.4S // ..................................................................................................................................................................................................................................................*..................................... + cmge v16.4S, v31.4S, v27.4S // ................................................................................................................................................................................................................................................................*....................... 
+ mls v11.4S, v8.4S, v29.4S // ...................................................................................................................................................................................................*.................................................................................... + mls v19.4S, v21.4S, v29.4S // ...................................................................................................................................................................................................................................................*.................................... + cmge v12.4S, v31.4S, v24.4S // ............................................................................................................................................................................................*........................................................................................... + cmge v21.4S, v27.4S, v30.4S // .................................................................................................................................................................................................................................................................*...................... + str q11, [x1, #768] // ....................................................................................................................................................................................................................*................................................................... + sub v16.4S, v16.4S, v21.4S // ..................................................................................................................................................................................................................................................................*..................... 
+ str q19, [x1], #(16) // ................................................................................................................................................................................................................................................................................*....... + cmge v21.4S, v24.4S, v30.4S // .............................................................................................................................................................................................*.......................................................................................... + mls v27.4S, v16.4S, v29.4S // ...................................................................................................................................................................................................................................................................*.................... + cmge v16.4S, v31.4S, v22.4S // ........................................................................................................................................................................................................................................................................*............... + sub v11.4S, v12.4S, v21.4S // ..............................................................................................................................................................................................*......................................................................................... + cmge v21.4S, v22.4S, v30.4S // .........................................................................................................................................................................................................................................................................*.............. 
+ str q27, [x1, #240] // ....................................................................................................................................................................................................................................................................................*... + mls v24.4S, v11.4S, v29.4S // ...............................................................................................................................................................................................*........................................................................................ + mul v11.4S, v9.4S, v25.4S // ...........................................................................................................................................................................................................................*............................................................ + sub v16.4S, v16.4S, v21.4S // ..........................................................................................................................................................................................................................................................................*............. + add v12.4S, v28.4S, v18.4S // ...................................................................................................................................................*.................................................................................................................................... + str q24, [x1, #688] // ...................................................................................................................................................................................................................*.................................................................... 
+ mls v22.4S, v16.4S, v29.4S // ...........................................................................................................................................................................................................................................................................*............ + mul v24.4S, v12.4S, v25.4S // ..............................................................................................................................................................................................................................*......................................................... + mls v11.4S, v14.4S, v29.4S // .............................................................................................................................................................................................................................*.......................................................... + sqrdmulh v14.4S, v12.4S, v26.4S // ...............................................................................................................................................................................................................................*........................................................ + str q22, [x1, #368] // ......................................................................................................................................................................................................................................................................................*. + add v27.4S, v23.4S, v10.4S // ........................................................................................................................................................*............................................................................................................................... 
+ cmge v16.4S, v11.4S, v30.4S // .....................................................................................................................................................................................................................................................*.................................. + mls v24.4S, v14.4S, v29.4S // ................................................................................................................................................................................................................................*....................................................... + mul v22.4S, v27.4S, v25.4S // .................................................................................................................................................................................................................................*...................................................... + sqrdmulh v10.4S, v27.4S, v26.4S // ..................................................................................................................................................................................................................................*..................................................... + cmge v14.4S, v31.4S, v11.4S // ....................................................................................................................................................................................................................................................*................................... + cmge v19.4S, v31.4S, v24.4S // ........................................................................................................................................................................................................................................................*............................... 
+ cmge v21.4S, v24.4S, v30.4S // .........................................................................................................................................................................................................................................................*.............................. + mls v22.4S, v10.4S, v29.4S // ...................................................................................................................................................................................................................................*.................................................... + sub v21.4S, v19.4S, v21.4S // ..........................................................................................................................................................................................................................................................*............................. + sub v28.4S, v14.4S, v16.4S // ......................................................................................................................................................................................................................................................*................................. + str q17, [x1, #432] // .......................................................................................................................................................................................................................................................................................* + mls v24.4S, v21.4S, v29.4S // ...........................................................................................................................................................................................................................................................*............................ 
+ str q13, [x1, #944] // .......................................................................................................................................................................................................................*................................................................ + cmge v23.4S, v22.4S, v30.4S // .............................................................................................................................................................................................................................................................*.......................... + cmge v20.4S, v31.4S, v22.4S // ............................................................................................................................................................................................................................................................*........................... + str q24, [x1, #112] // ..................................................................................................................................................................................................................................................................................*..... + sub v17.4S, v20.4S, v23.4S // ..............................................................................................................................................................................................................................................................*......................... + mls v11.4S, v28.4S, v29.4S // .......................................................................................................................................................................................................................................................*................................ 
+ str q15, [x1, #816] // .....................................................................................................................................................................................................................*.................................................................. + mls v22.4S, v17.4S, v29.4S // ...............................................................................................................................................................................................................................................................*........................ + str q11, [x1, #48] // .................................................................................................................................................................................................................................................................................*...... + str q22, [x1, #176] // ...................................................................................................................................................................................................................................................................................*.... 
+ + pop_stack + ret \ No newline at end of file diff --git a/tests/ntt_dilithium/manual/intt_dilithium_1234_5678_manual_ld4_opt_a72.s b/tests/ntt_dilithium/manual/intt_dilithium_1234_5678_manual_ld4_opt_a72.s new file mode 100644 index 0000000..5c8b133 --- /dev/null +++ b/tests/ntt_dilithium/manual/intt_dilithium_1234_5678_manual_ld4_opt_a72.s @@ -0,0 +1,1810 @@ +/// +/// Copyright (c) 2022 Arm Limited +/// Copyright (c) 2022 Hanno Becker +/// Copyright (c) 2023 Amin Abdulrahman, Matthias Kannwischer +/// SPDX-License-Identifier: MIT +/// +/// Permission is hereby granted, free of charge, to any person obtaining a copy +/// of this software and associated documentation files (the "Software"), to deal +/// in the Software without restriction, including without limitation the rights +/// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +/// copies of the Software, and to permit persons to whom the Software is +/// furnished to do so, subject to the following conditions: +/// +/// The above copyright notice and this permission notice shall be included in all +/// copies or substantial portions of the Software. +/// +/// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +/// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +/// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +/// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +/// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +/// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +/// SOFTWARE. +/// + +// Needed to provide ASM_LOAD directive +#include <hal_env.h> + +// NOTE +// We use a lot of trivial macros to simplify the parsing burden for Slothy +// The macros are not unfolded by Slothy and thus interpreted as instructions, +// which are easier to parse due to e.g. 
the lack of size specifiers and simpler +// syntax for pre and post increment for loads and stores. + +// Eventually, Slothy should include a proper parser for AArch64, +// but for initial investigations, the macros below are enough. + +.macro ldr_vo vec, base, offset + ldr qform_\vec, [\base, #\offset] +.endm +.macro ldr_vi vec, base, inc + ldr qform_\vec, [\base], #\inc +.endm +.macro str_vo vec, base, offset + str qform_\vec, [\base, #\offset] +.endm +.macro str_vi vec, base, inc + str qform_\vec, [\base], #\inc +.endm +.macro vsub d,a,b + sub \d\().4s, \a\().4s, \b\().4s +.endm +.macro vadd d,a,b + add \d\().4s, \a\().4s, \b\().4s +.endm +.macro vqrdmulh d,a,b + sqrdmulh \d\().4s, \a\().4s, \b\().4s +.endm +.macro vmul d,a,b + mul \d\().4s, \a\().4s, \b\().4s +.endm +.macro vmls d,a,b + mls \d\().4s, \a\().4s, \b\().4s +.endm +.macro vqrdmulhq d,a,b,i + sqrdmulh \d\().4s, \a\().4s, \b\().s[\i] +.endm +.macro vmulq d,a,b,i + mul \d\().4s, \a\().4s, \b\().s[\i] +.endm +.macro vmlsq d,a,b,i + mls \d\().4s, \a\().4s, \b\().s[\i] +.endm +.macro trn1_d d,a,b + trn1 \d\().2d, \a\().2d, \b\().2d +.endm +.macro trn2_d d,a,b + trn2 \d\().2d, \a\().2d, \b\().2d +.endm +.macro trn1_s d,a,b + trn1 \d\().4s, \a\().4s, \b\().4s +.endm +.macro trn2_s d,a,b + trn2 \d\().4s, \a\().4s, \b\().4s +.endm + +.macro mulmodq dst, src, const, idx0, idx1 + vmulq \dst, \src, \const, \idx0 // dst = low 32 bits of src * const[idx0] + vqrdmulhq \src, \src, \const, \idx1 // src = sqrdmulh(src, const[idx1]) and overwrites src + vmls \dst, \src, modulus // dst -= src * modulus +.endm + +.macro mulmod dst, src, const, const_twisted + vmul \dst, \src, \const // dst = low 32 bits of src * const + vqrdmulh \src, \src, \const_twisted // src = sqrdmulh(src, const_twisted) and overwrites src + vmls \dst, \src, modulus // dst -= src * modulus +.endm + +.macro montg_reduce a + srshr tmp.4S, \a\().4S, #23 // tmp = a >> 23 with rounding + vmls \a, tmp, modulus // a -= tmp * modulus +.endm + +.macro canonical_reduce a, modulus_half, neg_modulus_half, tmp1, tmp2 + cmge \tmp1\().4s, \neg_modulus_half\().4s, \a\().4s // tmp1 lane = -1 where a <= neg_modulus_half, else 0 + cmge \tmp2\().4s, \a\().4s, \modulus_half\().4s // tmp2 lane = -1 where a >= modulus_half, else 0 + sub \tmp2\().4s, \tmp1\().4s, \tmp2\().4s // tmp2 = tmp1 - tmp2, giving 1, -1 or 0 per lane + vmls \a, \tmp2, modulus // a -= tmp2 * modulus +.endm + +.macro gs_butterfly a, b, root, idx0, idx1 + vsub tmp, 
\a, \b + vadd \a, \a, \b + mulmodq \b, tmp, \root, \idx0, \idx1 +.endm + +.macro mulmod_v dst, src, const, const_twisted + vmul \dst, \src, \const + vqrdmulh \src, \src, \const_twisted + vmls \dst, \src, modulus +.endm + +.macro gs_butterfly_v a, b, root, root_twisted + vsub tmp, \a, \b + vadd \a, \a, \b + mulmod \b, tmp, \root, \root_twisted +.endm + +.macro mul_ninv dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, src0, src1, src2, src3, src4, src5, src6, src7 + mulmod \dst0, \src0, ninv, ninv_tw + mulmod \dst1, \src1, ninv, ninv_tw + mulmod \dst2, \src2, ninv, ninv_tw + mulmod \dst3, \src3, ninv, ninv_tw + mulmod \dst4, \src4, ninv, ninv_tw + mulmod \dst5, \src5, ninv, ninv_tw + mulmod \dst6, \src6, ninv, ninv_tw + mulmod \dst7, \src7, ninv, ninv_tw +.endm + +.macro load_roots_1234 r_ptr + ldr_vi root0, \r_ptr, (8*16) + ldr_vo root1, \r_ptr, (-8*16 + 1*16) + ldr_vo root2, \r_ptr, (-8*16 + 2*16) + ldr_vo root3, \r_ptr, (-8*16 + 3*16) + ldr_vo root4, \r_ptr, (-8*16 + 4*16) + ldr_vo root5, \r_ptr, (-8*16 + 5*16) + ldr_vo root6, \r_ptr, (-8*16 + 6*16) + ldr_vo root7, \r_ptr, (-8*16 + 7*16) +.endm + +.macro load_next_roots_56 root0, r_ptr0 + ldr_vi \root0, \r_ptr0, 16 +.endm + +.macro load_next_roots_6 root0, r_ptr0 + ldr_vi \root0, \r_ptr0, 8 +.endm + +.macro load_next_roots_78 root0, root0_tw, root1, root1_tw, root2, root2_tw, r_ptr1 + ldr_vi \root0, \r_ptr1, (6*16) + ldr_vo \root0_tw, \r_ptr1, (-6*16 + 1*16) + ldr_vo \root1, \r_ptr1, (-6*16 + 2*16) + ldr_vo \root1_tw, \r_ptr1, (-6*16 + 3*16) + ldr_vo \root2, \r_ptr1, (-6*16 + 4*16) + ldr_vo \root2_tw, \r_ptr1, (-6*16 + 5*16) +.endm + +.macro transpose4 data + trn1_s t0, \data\()0, \data\()1 + trn2_s t1, \data\()0, \data\()1 + trn1_s t2, \data\()2, \data\()3 + trn2_s t3, \data\()2, \data\()3 + + trn2_d \data\()2, t0, t2 + trn2_d \data\()3, t1, t3 + trn1_d \data\()0, t0, t2 + trn1_d \data\()1, t1, t3 +.endm + +.macro save_gprs // slothy:no-unfold + sub sp, sp, #(16*6) // reserve six 16-byte slots below the current sp + stp x19, x20, [sp, #16*0] // NOTE(review): the next stp repeats this exact store to the same slot and looks redundant + stp x19, x20, [sp, 
#16*0] + stp x21, x22, [sp, #16*1] + stp x23, x24, [sp, #16*2] + stp x25, x26, [sp, #16*3] + stp x27, x28, [sp, #16*4] + str x29, [sp, #16*5] +.endm + +.macro restore_gprs // slothy:no-unfold + ldp x19, x20, [sp, #16*0] + ldp x21, x22, [sp, #16*1] + ldp x23, x24, [sp, #16*2] + ldp x25, x26, [sp, #16*3] + ldp x27, x28, [sp, #16*4] + ldr x29, [sp, #16*5] + add sp, sp, #(16*6) +.endm + +.macro save_vregs // slothy:no-unfold + sub sp, sp, #(16*4) + stp d8, d9, [sp, #16*0] + stp d10, d11, [sp, #16*1] + stp d12, d13, [sp, #16*2] + stp d14, d15, [sp, #16*3] +.endm + +.macro restore_vregs // slothy:no-unfold + ldp d8, d9, [sp, #16*0] + ldp d10, d11, [sp, #16*1] + ldp d12, d13, [sp, #16*2] + ldp d14, d15, [sp, #16*3] + add sp, sp, #(16*4) +.endm + +#define STACK_SIZE 16 +#define STACK0 0 + +.macro restore a, loc // slothy:no-unfold + ldr \a, [sp, #\loc\()] +.endm +.macro save loc, a // slothy:no-unfold + str \a, [sp, #\loc\()] +.endm +.macro push_stack // slothy:no-unfold + save_gprs + save_vregs + sub sp, sp, #STACK_SIZE +.endm + +.macro pop_stack // slothy:no-unfold + add sp, sp, #STACK_SIZE + restore_vregs + restore_gprs +.endm + +.data +.p2align 4 +roots: +#include "intt_dilithium_1234_5678_twiddles.s" +.text + + .global intt_dilithium_1234_5678_manual_ld4_opt_a72 + .global _intt_dilithium_1234_5678_manual_ld4_opt_a72 + +.p2align 4 +modulus_addr: .quad 8380417 +ninv_addr: .quad 16382 +ninv_tw_addr: .quad 4197891 +intt_dilithium_1234_5678_manual_ld4_opt_a72: +_intt_dilithium_1234_5678_manual_ld4_opt_a72: + push_stack + + inp .req x0 + in .req x1 + count .req x2 + r_ptr0 .req x3 + r_ptr1 .req x4 + xtmp .req x5 + + data0 .req v8 + data1 .req v9 + data2 .req v10 + data3 .req v11 + data4 .req v12 + data5 .req v13 + data6 .req v14 + data7 .req v15 + data8 .req v16 + data9 .req v17 + data10 .req v18 + data11 .req v19 + data12 .req v20 + data13 .req v21 + data14 .req v22 + data15 .req v23 + + qform_data0 .req q8 + qform_data1 .req q9 + qform_data2 .req q10 + qform_data3 .req 
q11 + qform_data4 .req q12 + qform_data5 .req q13 + qform_data6 .req q14 + qform_data7 .req q15 + qform_data8 .req q16 + qform_data9 .req q17 + qform_data10 .req q18 + qform_data11 .req q19 + qform_data12 .req q20 + qform_data13 .req q21 + qform_data14 .req q22 + qform_data15 .req q23 + + root0 .req v0 + root1 .req v1 + root2 .req v2 + root3 .req v3 + root0_tw .req v4 + root1_tw .req v5 + root2_tw .req v6 + root3_tw .req v7 + + + qform_root0 .req q0 + qform_root1 .req q1 + qform_root2 .req q2 + qform_root3 .req q3 + qform_root0_tw .req q4 + qform_root1_tw .req q5 + qform_root2_tw .req q6 + qform_root3_tw .req q7 + + + tmp .req v24 + qform_tmp .req q24 + t0 .req v25 + t1 .req v26 + t2 .req v27 + t3 .req v28 + + modulus .req v29 + + ASM_LOAD(r_ptr0, roots) + ASM_LOAD(r_ptr1, roots_l45) + + ASM_LOAD(xtmp, modulus_addr) + ld1r {modulus.4s}, [xtmp] + + save STACK0, inp + + mov count, #16 + + .p2align 2 + ldr q4, [x0, #48] // .*................................................. + ldr q2, [x0, #32] // *.................................................. + // gap // ................................................... + ldr q22, [x0, #0] // ..*................................................ + ldr q3, [x0, #16] // ...*............................................... + // gap // ................................................... + ldr q30, [x4], #8 // ........................................*.......... + ldr q10, [x3, #80] // ......*............................................ + // gap // ................................................... + ldr q12, [x3, #32] // ....*.............................................. + // gap // ................................................... + // gap // ................................................... + trn2 v14.4S, v2.4S, v4.4S // ...........*....................................... + trn1 v2.4S, v2.4S, v4.4S // ........*.......................................... + ldr q19, [x3, #48] // .............*..................................... 
+ trn2 v0.4S, v22.4S, v3.4S // ..........*........................................ + trn1 v25.4S, v22.4S, v3.4S // .........*......................................... + ldr q7, [x4], #16 // ..........................................*........ + // gap // ................................................... + // gap // ................................................... + // gap // ................................................... + // gap // ................................................... + // gap // ................................................... + // gap // ................................................... + trn2 v5.2D, v0.2D, v14.2D // ...............*................................... + trn2 v11.2D, v25.2D, v2.2D // .................*................................. + // gap // ................................................... + trn1 v28.2D, v25.2D, v2.2D // ............*...................................... + trn1 v3.2D, v0.2D, v14.2D // ..............*.................................... + // gap // ................................................... + // gap // ................................................... + // gap // ................................................... + // gap // ................................................... + sub v15.4S, v11.4S, v5.4S // ....................*.............................. + // gap // ................................................... + // gap // ................................................... + add v8.4S, v28.4S, v3.4S // ...................*............................... + // gap // ................................................... + // gap // ................................................... + sub v2.4S, v28.4S, v3.4S // ................*.................................. + // gap // ................................................... + // gap // ................................................... + sqrdmulh v13.4S, v15.4S, v10.4S // .......................*........................... 
+ ldr q10, [x3, #64] // .....*............................................. + // gap // ................................................... + // gap // ................................................... + // gap // ................................................... + // gap // ................................................... + sqrdmulh v18.4S, v2.4S, v19.4S // .....................*............................. + // gap // ................................................... + // gap // ................................................... + add v28.4S, v11.4S, v5.4S // ......................*............................ + // gap // ................................................... + // gap // ................................................... + mul v2.4S, v2.4S, v12.4S // ..................*................................ + // gap // ................................................... + // gap // ................................................... + // gap // ................................................... + // gap // ................................................... + // gap // ................................................... + mul v15.4S, v15.4S, v10.4S // .........................*......................... + ldr q10, [x3, #16] // .......*........................................... + sub v3.4S, v8.4S, v28.4S // ..........................*........................ + add v31.4S, v8.4S, v28.4S // ........................*.......................... + // gap // ................................................... + // gap // ................................................... + mls v2.4S, v18.4S, v29.4S // ...........................*....................... + ldr q18, [x3], #(6*16) // .............................*..................... + // gap // ................................................... + // gap // ................................................... + // gap // ................................................... 
+ // gap // ................................................... + mls v15.4S, v13.4S, v29.4S // ............................*...................... + // gap // ................................................... + // gap // ................................................... + // gap // ................................................... + // gap // ................................................... + // gap // ................................................... + sqrdmulh v22.4S, v3.4S, v10.4S // ..............................*.................... + // gap // ................................................... + // gap // ................................................... + // gap // ................................................... + // gap // ................................................... + // gap // ................................................... + mul v21.4S, v3.4S, v18.4S // ................................*.................. + // gap // ................................................... + // gap // ................................................... + sub v5.4S, v2.4S, v15.4S // ...............................*................... + // gap // ................................................... + // gap // ................................................... + add v11.4S, v2.4S, v15.4S // ....................................*.............. + // gap // ................................................... + // gap // ................................................... + mls v21.4S, v22.4S, v29.4S // ..................................*................ + // gap // ................................................... + // gap // ................................................... + // gap // ................................................... + // gap // ................................................... + // gap // ................................................... + sqrdmulh v2.4S, v5.4S, v10.4S // .................................*................. 
+ trn1 v0.4S, v31.4S, v11.4S // .......................................*........... + // gap // ................................................... + trn2 v11.4S, v31.4S, v11.4S // ......................................*............ + // gap // ................................................... + // gap // ................................................... + mul v18.4S, v5.4S, v18.4S // ...................................*............... + // gap // ................................................... + // gap // ................................................... + // gap // ................................................... + // gap // ................................................... + // gap // ................................................... + // gap // ................................................... + // gap // ................................................... + // gap // ................................................... + mls v18.4S, v2.4S, v29.4S // .....................................*............. + // gap // ................................................... + // gap // ................................................... + // gap // ................................................... + // gap // ................................................... + // gap // ................................................... + // gap // ................................................... + // gap // ................................................... + // gap // ................................................... + // gap // ................................................... + // gap // ................................................... + // gap // ................................................... + // gap // ................................................... + // gap // ................................................... + // gap // ................................................... 
+ trn2 v2.4S, v21.4S, v18.4S // ...........................................*....... + trn1 v27.4S, v21.4S, v18.4S // .........................................*......... + // gap // ................................................... + // gap // ................................................... + // gap // ................................................... + // gap // ................................................... + // gap // ................................................... + // gap // ................................................... + // gap // ................................................... + // gap // ................................................... + trn2 v15.2D, v11.2D, v2.2D // .............................................*..... + trn2 v24.2D, v0.2D, v27.2D // ............................................*...... + trn1 v13.2D, v0.2D, v27.2D // ..............................................*.... + // gap // ................................................... + // gap // ................................................... + trn1 v27.2D, v11.2D, v2.2D // ...............................................*... + // gap // ................................................... + // gap // ................................................... + // gap // ................................................... + // gap // ................................................... + sub v19.4S, v24.4S, v15.4S // ................................................*.. + // gap // ................................................... + // gap // ................................................... + // gap // ................................................... + // gap // ................................................... + // gap // ................................................... + // gap // ................................................... + // gap // ................................................... 
+ mul v18.4S, v19.4S, v7.S[2] // ..................................................* + add v11.4S, v13.4S, v27.4S // .................................................*. + + // original source code + // ldr q21, [x0, #32] // .*................................................. + // ldr q16, [x0, #48] // *.................................................. + // ldr q25, [x0, #0] // ..*................................................ + // ldr q2, [x0, #16] // ...*............................................... + // ldr q28, [x3, #32] // ......*............................................ + // ldr q4, [x3, #64] // .....................*............................. + // ldr q13, [x3, #80] // .....*............................................. + // ldr q23, [x3, #16] // ..........................*........................ + // trn1 v14.4S, v21.4S, v16.4S // ........*.......................................... + // trn1 v19.4S, v25.4S, v2.4S // ...........*....................................... + // trn2 v9.4S, v25.4S, v2.4S // ..........*........................................ + // trn2 v17.4S, v21.4S, v16.4S // .......*........................................... + // trn1 v12.2D, v19.2D, v14.2D // ...............*................................... + // ldr q22, [x3, #48] // .........*......................................... + // trn1 v20.2D, v9.2D, v17.2D // ................*.................................. + // trn2 v1.2D, v9.2D, v17.2D // .............*..................................... + // sub v17.4S, v12.4S, v20.4S // ...................*............................... + // trn2 v2.2D, v19.2D, v14.2D // ..............*.................................... + // mul v8.4S, v17.4S, v28.4S // ........................*.......................... + // add v7.4S, v12.4S, v20.4S // ..................*................................ + // sub v6.4S, v2.4S, v1.4S // .................*................................. 
+ // sqrdmulh v10.4S, v17.4S, v22.4S // ......................*............................ + // add v5.4S, v2.4S, v1.4S // .......................*........................... + // sqrdmulh v2.4S, v6.4S, v13.4S // ....................*.............................. + // add v31.4S, v7.4S, v5.4S // ............................*...................... + // mul v6.4S, v6.4S, v4.4S // .........................*......................... + // sub v28.4S, v7.4S, v5.4S // ...........................*....................... + // mls v8.4S, v10.4S, v29.4S // .............................*..................... + // mls v6.4S, v2.4S, v29.4S // ...............................*................... + // ldr q2, [x3], #(6*16) // ..............................*.................... + // sqrdmulh v10.4S, v28.4S, v23.4S // ................................*.................. + // sub v16.4S, v8.4S, v6.4S // ..................................*................ + // mul v22.4S, v28.4S, v2.4S // .................................*................. + // sqrdmulh v12.4S, v16.4S, v23.4S // .....................................*............. + // mls v22.4S, v10.4S, v29.4S // ....................................*.............. + // mul v23.4S, v16.4S, v2.4S // ........................................*.......... + // add v2.4S, v8.4S, v6.4S // ...................................*............... + // mls v23.4S, v12.4S, v29.4S // .........................................*......... + // trn2 v19.4S, v31.4S, v2.4S // .......................................*........... + // trn1 v2.4S, v31.4S, v2.4S // ......................................*............ + // ldr q30, [x4], #8 // ....*.............................................. + // trn1 v18.4S, v22.4S, v23.4S // ...........................................*....... + // ldr q7, [x4], #16 // ............*...................................... + // trn2 v14.4S, v22.4S, v23.4S // ..........................................*........ 
+ // trn2 v24.2D, v2.2D, v18.2D // .............................................*..... + // trn2 v15.2D, v19.2D, v14.2D // ............................................*...... + // trn1 v13.2D, v2.2D, v18.2D // ..............................................*.... + // trn1 v27.2D, v19.2D, v14.2D // ...............................................*... + // sub v19.4S, v24.4S, v15.4S // ................................................*.. + // add v11.4S, v13.4S, v27.4S // ..................................................* + // mul v18.4S, v19.4S, v7.S[2] // .................................................*. + + sub count, count, #1 +layer5678_start: + sub v26.4S, v13.4S, v27.4S // ................................................*........................... + ldr q21, [x0, #96] // ..e......................................................................... + ldr q16, [x0, #112] // ...e........................................................................ + ldr q25, [x0, #64] // e........................................................................... + ldr q2, [x0, #80] // .e.......................................................................... + add v8.4S, v24.4S, v15.4S // ......................................................*..................... + sqrdmulh v10.4S, v19.4S, v7.S[3] // ........................................................*................... + ldr q28, [x3, #32] // ..............e............................................................. + ldr q4, [x3, #64] // ................e........................................................... + ldr q13, [x3, #80] // .................e.......................................................... + ldr q23, [x3, #16] // .............e.............................................................. + // gap // ............................................................................ + trn1 v14.4S, v21.4S, v16.4S // ......e..................................................................... 
+ sqrdmulh v22.4S, v26.4S, v7.S[1] // ...................................................*........................ + // gap // ............................................................................ + trn1 v19.4S, v25.4S, v2.4S // ....e....................................................................... + // gap // ............................................................................ + // gap // ............................................................................ + trn2 v9.4S, v25.4S, v2.4S // .....e...................................................................... + mul v26.4S, v26.4S, v7.S[0] // ..................................................*......................... + // gap // ............................................................................ + trn2 v17.4S, v21.4S, v16.4S // .......e.................................................................... + // gap // ............................................................................ + // gap // ............................................................................ + mls v18.4S, v10.4S, v29.4S // .........................................................*.................. + trn1 v12.2D, v19.2D, v14.2D // ..........e................................................................. + // gap // ............................................................................ + sub v24.4S, v11.4S, v8.4S // ..........................................................*................. + // gap // ............................................................................ + // gap // ............................................................................ + mls v26.4S, v22.4S, v29.4S // ....................................................*....................... + ldr q22, [x3, #48] // ...............e............................................................ + trn1 v20.2D, v9.2D, v17.2D // ...........e................................................................ 
+ trn2 v1.2D, v9.2D, v17.2D // .........e.................................................................. + // gap // ............................................................................ + // gap // ............................................................................ + add v3.4S, v11.4S, v8.4S // ...........................................................*................ + mul v0.4S, v24.4S, v30.S[0] // ............................................................*............... + // gap // ............................................................................ + sub v17.4S, v12.4S, v20.4S // ..................e......................................................... + // gap // ............................................................................ + // gap // ............................................................................ + sqrdmulh v15.4S, v24.4S, v30.S[1] // .............................................................*.............. + trn2 v2.2D, v19.2D, v14.2D // ........e................................................................... + // gap // ............................................................................ + srshr v21.4S, v3.4S, #23 // ....................................................................*....... + // gap // ............................................................................ + // gap // ............................................................................ + mul v8.4S, v17.4S, v28.4S // ....................e....................................................... + add v7.4S, v12.4S, v20.4S // ...................e........................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ 
+ sub v6.4S, v2.4S, v1.4S // .......................e.................................................... + sqrdmulh v10.4S, v17.4S, v22.4S // .....................e...................................................... + add v5.4S, v2.4S, v1.4S // ........................e................................................... + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + sqrdmulh v2.4S, v6.4S, v13.4S // ..........................e................................................. + // gap // ............................................................................ + // gap // ............................................................................ + add v31.4S, v7.4S, v5.4S // .............................e.............................................. + // gap // ............................................................................ + // gap // ............................................................................ + mul v6.4S, v6.4S, v4.4S // .........................e.................................................. + sub v28.4S, v7.4S, v5.4S // ............................e............................................... + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ 
+ // gap // ............................................................................ + mls v8.4S, v10.4S, v29.4S // ......................e..................................................... + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + mls v6.4S, v2.4S, v29.4S // ...........................e................................................ + ldr q2, [x3], #(6*16) // ............e............................................................... + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + mls v0.4S, v15.4S, v29.4S // ..............................................................*............. + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + sqrdmulh v10.4S, v28.4S, v23.4S // ...............................e............................................ + // gap // ............................................................................ + // gap // ............................................................................ + sub v16.4S, v8.4S, v6.4S // .................................e.......................................... 
+ // gap // ............................................................................ + // gap // ............................................................................ + mul v22.4S, v28.4S, v2.4S // ..............................e............................................. + // gap // ............................................................................ + // gap // ............................................................................ + str q0, [x0, #32] // ..........................................................................*. + // gap // ............................................................................ + // gap // ............................................................................ + sqrdmulh v12.4S, v16.4S, v23.4S // ....................................e....................................... + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + sub v11.4S, v26.4S, v18.4S // ...............................................................*............ + mls v22.4S, v10.4S, v29.4S // ................................e........................................... + add v10.4S, v26.4S, v18.4S // ................................................................*........... + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ 
+ mul v23.4S, v16.4S, v2.4S // ...................................e........................................ + add v2.4S, v8.4S, v6.4S // ..................................e......................................... + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + mls v23.4S, v12.4S, v29.4S // .....................................e...................................... + // gap // ............................................................................ + // gap // ............................................................................ + trn2 v19.4S, v31.4S, v2.4S // .......................................e.................................... + // gap // ............................................................................ + // gap // ............................................................................ + trn1 v2.4S, v31.4S, v2.4S // ......................................e..................................... + // gap // ............................................................................ + sqrdmulh v8.4S, v11.4S, v30.S[1] // ..................................................................*......... + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + mul v1.4S, v11.4S, v30.S[0] // .................................................................*.......... + // gap // ............................................................................ 
+ ldr q30, [x4], #8 // ..............................................e............................. + trn1 v18.4S, v22.4S, v23.4S // ........................................e................................... + ldr q7, [x4], #16 // ...............................................e............................ + // gap // ............................................................................ + mls v3.4S, v21.4S, v29.4S // .....................................................................*...... + trn2 v14.4S, v22.4S, v23.4S // .........................................e.................................. + // gap // ............................................................................ + srshr v22.4S, v10.4S, #23 // ......................................................................*..... + // gap // ............................................................................ + // gap // ............................................................................ + mls v1.4S, v8.4S, v29.4S // ...................................................................*........ + trn2 v24.2D, v2.2D, v18.2D // ..........................................e................................. + // gap // ............................................................................ + trn2 v15.2D, v19.2D, v14.2D // ...........................................e................................ + // gap // ............................................................................ + // gap // ............................................................................ + mls v10.4S, v22.4S, v29.4S // .......................................................................*.... + trn1 v13.2D, v2.2D, v18.2D // ............................................e............................... + // gap // ............................................................................ 
+ trn1 v27.2D, v19.2D, v14.2D // .............................................e.............................. + str q3, [x0], #(16*4) // ........................................................................*... + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + sub v19.4S, v24.4S, v15.4S // .....................................................e...................... + str q1, [x0, #-16] // ...........................................................................* + // gap // ............................................................................ + // gap // ............................................................................ + add v11.4S, v13.4S, v27.4S // .................................................e.......................... + // gap // ............................................................................ + // gap // ............................................................................ + str q10, [x0, #-48] // .........................................................................*.. + mul v18.4S, v19.4S, v7.S[2] // .......................................................e.................... + // gap // ............................................................................ + + // original source code + // ldr q8, [x0, #(16*0)] // ..e........................................................................|..e....................................................................... + // ldr q9, [x0, #(16*1)] // ...e.......................................................................|...e...................................................................... 
+ // ldr q10, [x0, #(16*2)] // e..........................................................................|e......................................................................... + // ldr q11, [x0, #(16*3)] // .e.........................................................................|.e........................................................................ + // trn1 v25.4s, v8.4s, v9.4s // ............e..............................................................|............e............................................................. + // trn2 v26.4s, v8.4s, v9.4s // .............e.............................................................|.............e............................................................ + // trn1 v27.4s, v10.4s, v11.4s // ..........e................................................................|..........e............................................................... + // trn2 v28.4s, v10.4s, v11.4s // ...............e...........................................................|...............e.......................................................... + // trn2 v10.2d, v25.2d, v27.2d // ...........................e...............................................|...........................e.............................................. + // trn2 v11.2d, v26.2d, v28.2d // ......................e....................................................|......................e................................................... + // trn1 v8.2d, v25.2d, v27.2d // .................e.........................................................|.................e........................................................ + // trn1 v9.2d, v26.2d, v28.2d // .....................e.....................................................|.....................e.................................................... 
+ // ldr q0, [x3], #(6*16) // ........................................e..................................|........................................e................................. + // ldr q4, [x3, #(-6*16 + 1*16)] // .........e.................................................................|.........e................................................................ + // ldr q1, [x3, #(-6*16 + 2*16)] // ......e....................................................................|......e................................................................... + // ldr q5, [x3, #(-6*16 + 3*16)] // ....................e......................................................|....................e..................................................... + // ldr q2, [x3, #(-6*16 + 4*16)] // .......e...................................................................|.......e.................................................................. + // ldr q6, [x3, #(-6*16 + 5*16)] // ........e..................................................................|........e................................................................. + // sub v24.4s, v8.4s, v9.4s // .........................e.................................................|.........................e................................................ + // add v8.4s, v8.4s, v9.4s // ..............................e............................................|..............................e........................................... + // mul v9.4s, v24.4s, v1.4s // .............................e.............................................|.............................e............................................ + // sqrdmulh v24.4s, v24.4s, v5.4s // ................................e..........................................|................................e......................................... 
+ // mls v9.4s, v24.4s, v29.4s // ......................................e....................................|......................................e................................... + // sub v24.4s, v10.4s, v11.4s // ...............................e...........................................|...............................e.......................................... + // add v10.4s, v10.4s, v11.4s // .................................e.........................................|.................................e........................................ + // mul v11.4s, v24.4s, v2.4s // ....................................e......................................|....................................e..................................... + // sqrdmulh v24.4s, v24.4s, v6.4s // ..................................e........................................|..................................e....................................... + // mls v11.4s, v24.4s, v29.4s // .......................................e...................................|.......................................e.................................. + // sub v24.4s, v8.4s, v10.4s // .....................................e.....................................|.....................................e.................................... + // add v8.4s, v8.4s, v10.4s // ...................................e.......................................|...................................e...................................... + // mul v10.4s, v24.4s, v0.4s // ............................................e..............................|............................................e............................. + // sqrdmulh v24.4s, v24.4s, v4.4s // ..........................................e................................|..........................................e............................... 
+ // mls v10.4s, v24.4s, v29.4s // ................................................e..........................|................................................e......................... + // sub v24.4s, v9.4s, v11.4s // ...........................................e...............................|...........................................e.............................. + // add v9.4s, v9.4s, v11.4s // ...................................................e.......................|...................................................e...................... + // mul v11.4s, v24.4s, v0.4s // ..................................................e........................|..................................................e....................... + // sqrdmulh v24.4s, v24.4s, v4.4s // ..............................................e............................|..............................................e........................... + // mls v11.4s, v24.4s, v29.4s // ....................................................e......................|....................................................e..................... + // trn1 v25.4s, v8.4s, v9.4s // ......................................................e....................|......................................................e................... + // trn2 v26.4s, v8.4s, v9.4s // .....................................................e.....................|.....................................................e.................... + // trn1 v27.4s, v10.4s, v11.4s // ..........................................................e................|..........................................................e............... + // trn2 v28.4s, v10.4s, v11.4s // .............................................................e.............|.............................................................e............ 
+ // trn2 v10.2d, v25.2d, v27.2d // ................................................................e..........|................................................................e......... + // trn2 v11.2d, v26.2d, v28.2d // .................................................................e.........|.................................................................e........ + // trn1 v8.2d, v25.2d, v27.2d // ...................................................................e.......|...................................................................e...... + // trn1 v9.2d, v26.2d, v28.2d // ....................................................................e......|....................................................................e..... + // ldr q1, [x4], #8 // .........................................................e.................|.........................................................e................ + // ldr q0, [x4], #16 // ...........................................................e...............|...........................................................e.............. + // sub v24.4s, v8.4s, v9.4s // ...........................................................................*.......................................................................... + // add v8.4s, v8.4s, v9.4s // ........................................................................e..|........................................................................e. + // mul v9.4s, v24.4s, v0.s[0] // ..............*............................................................|..............*........................................................... + // sqrdmulh v24.4s, v24.4s, v0.s[1] // ...........*...............................................................|...........*.............................................................. 
+ // mls v9.4s, v24.4s, v29.4s // ...................*.......................................................|...................*...................................................... + // sub v24.4s, v10.4s, v11.4s // ......................................................................e....|......................................................................e... + // add v10.4s, v10.4s, v11.4s // ....*......................................................................|....*..................................................................... + // mul v11.4s, v24.4s, v0.s[2] // ..........................................................................e|.......................................................................... + // sqrdmulh v24.4s, v24.4s, v0.s[3] // .....*.....................................................................|.....*.................................................................... + // mls v11.4s, v24.4s, v29.4s // ................*..........................................................|................*......................................................... + // sub v24.4s, v8.4s, v10.4s // ..................*........................................................|..................*....................................................... + // add v8.4s, v8.4s, v10.4s // .......................*...................................................|.......................*.................................................. + // mul v10.4s, v24.4s, v1.s[0] // ........................*..................................................|........................*................................................. + // sqrdmulh v24.4s, v24.4s, v1.s[1] // ..........................*................................................|..........................*............................................... 
+ // mls v10.4s, v24.4s, v29.4s // .........................................*.................................|.........................................*................................ + // sub v24.4s, v9.4s, v11.4s // ...............................................*...........................|...............................................*.......................... + // add v9.4s, v9.4s, v11.4s // .................................................*.........................|.................................................*........................ + // mul v11.4s, v24.4s, v1.s[0] // ........................................................*..................|........................................................*................. + // sqrdmulh v24.4s, v24.4s, v1.s[1] // .......................................................*...................|.......................................................*.................. + // mls v11.4s, v24.4s, v29.4s // ...............................................................*...........|...............................................................*.......... + // srshr v24.4S, v8.4S, #23 // ............................*..............................................|............................*............................................. + // mls v8.4s, v24.4s, v29.4s // ............................................................*..............|............................................................*............. + // srshr v24.4S, v9.4S, #23 // ..............................................................*............|..............................................................*........... + // mls v9.4s, v24.4s, v29.4s // ..................................................................*........|..................................................................*....... 
+ // str q8, [x0], #(16*4) // .....................................................................*.....|.....................................................................*.... + // str q9, [x0, #(-16*4 + 1*16)] // .........................................................................*.|.........................................................................* + // str q10, [x0, #(-16*4 + 2*16)] // .............................................*.............................|.............................................*............................ + // str q11, [x0, #(-16*4 + 3*16)] // .......................................................................*...|.......................................................................*.. + + sub count, count, #1 + cbnz count, layer5678_start + sub v25.4S, v13.4S, v27.4S // *........................ + sqrdmulh v14.4S, v19.4S, v7.S[3] // ..*...................... + // gap // ......................... + // gap // ......................... + // gap // ......................... + // gap // ......................... + add v15.4S, v24.4S, v15.4S // .*....................... + // gap // ......................... + // gap // ......................... + sqrdmulh v17.4S, v25.4S, v7.S[1] // ...*..................... + // gap // ......................... + // gap // ......................... + // gap // ......................... + // gap // ......................... + // gap // ......................... + mul v26.4S, v25.4S, v7.S[0] // ....*.................... + sub v5.4S, v11.4S, v15.4S // ......*.................. + // gap // ......................... + add v24.4S, v11.4S, v15.4S // ........*................ + // gap // ......................... + // gap // ......................... + // gap // ......................... + mls v18.4S, v14.4S, v29.4S // .....*................... + // gap // ......................... + // gap // ......................... + // gap // ......................... 
+ // gap // ......................... + mls v26.4S, v17.4S, v29.4S // .......*................. + srshr v15.4S, v24.4S, #23 // ...........*............. + // gap // ......................... + // gap // ......................... + // gap // ......................... + // gap // ......................... + mul v13.4S, v5.4S, v30.S[0] // .........*............... + // gap // ......................... + // gap // ......................... + // gap // ......................... + // gap // ......................... + // gap // ......................... + sqrdmulh v23.4S, v5.4S, v30.S[1] // ..........*.............. + // gap // ......................... + // gap // ......................... + sub v20.4S, v26.4S, v18.4S // ..............*.......... + // gap // ......................... + // gap // ......................... + mls v24.4S, v15.4S, v29.4S // ..................*...... + add v8.4S, v26.4S, v18.4S // ...............*......... + // gap // ......................... + // gap // ......................... + // gap // ......................... + // gap // ......................... + sqrdmulh v18.4S, v20.4S, v30.S[1] // ................*........ + // gap // ......................... + // gap // ......................... + srshr v21.4S, v8.4S, #23 // ...................*..... + // gap // ......................... + // gap // ......................... + mul v9.4S, v20.4S, v30.S[0] // .................*....... + // gap // ......................... + // gap // ......................... + str q24, [x0], #(16*4) // ......................*.. + // gap // ......................... + // gap // ......................... + mls v13.4S, v23.4S, v29.4S // ............*............ + // gap // ......................... + // gap // ......................... + // gap // ......................... + // gap // ......................... + // gap // ......................... + mls v9.4S, v18.4S, v29.4S // ....................*.... + // gap // ......................... 
+ // gap // ......................... + // gap // ......................... + // gap // ......................... + // gap // ......................... + mls v8.4S, v21.4S, v29.4S // .....................*... + // gap // ......................... + // gap // ......................... + str q13, [x0, #-32] // .............*........... + // gap // ......................... + // gap // ......................... + // gap // ......................... + // gap // ......................... + // gap // ......................... + str q9, [x0, #-16] // .......................*. + // gap // ......................... + // gap // ......................... + // gap // ......................... + // gap // ......................... + // gap // ......................... + str q8, [x0, #-48] // ........................* + // gap // ......................... + // gap // ......................... + + // original source code + // sub v26.4S, v13.4S, v27.4S // *........................ + // add v8.4S, v24.4S, v15.4S // ..*...................... + // sqrdmulh v10.4S, v19.4S, v7.S[3] // .*....................... + // sqrdmulh v22.4S, v26.4S, v7.S[1] // ...*..................... + // mul v26.4S, v26.4S, v7.S[0] // ....*.................... + // mls v18.4S, v10.4S, v29.4S // .......*................. + // sub v24.4S, v11.4S, v8.4S // .....*................... + // mls v26.4S, v22.4S, v29.4S // ........*................ + // add v3.4S, v11.4S, v8.4S // ......*.................. + // mul v0.4S, v24.4S, v30.S[0] // ..........*.............. + // sqrdmulh v15.4S, v24.4S, v30.S[1] // ...........*............. + // srshr v21.4S, v3.4S, #23 // .........*............... + // mls v0.4S, v15.4S, v29.4S // ...................*..... + // str q0, [x0, #32] // ......................*.. + // sub v11.4S, v26.4S, v18.4S // ............*............ + // add v10.4S, v26.4S, v18.4S // ..............*.......... + // sqrdmulh v8.4S, v11.4S, v30.S[1] // ...............*......... 
+ // mul v1.4S, v11.4S, v30.S[0] // .................*....... + // mls v3.4S, v21.4S, v29.4S // .............*........... + // srshr v22.4S, v10.4S, #23 // ................*........ + // mls v1.4S, v8.4S, v29.4S // ....................*.... + // mls v10.4S, v22.4S, v29.4S // .....................*... + // str q3, [x0], #(16*4) // ..................*...... + // str q1, [x0, #-16] // .......................*. + // str q10, [x0, #-48] // ........................* + + + .unreq root0_tw + .unreq root1_tw + .unreq root2_tw + .unreq root3_tw + .unreq qform_root0_tw + .unreq qform_root1_tw + .unreq qform_root2_tw + .unreq qform_root3_tw + .unreq t0 + .unreq t1 + + root4 .req v4 + root5 .req v5 + root6 .req v6 + root7 .req v7 + qform_root4 .req q4 + qform_root5 .req q5 + qform_root6 .req q6 + qform_root7 .req q7 + ninv .req v25 + ninv_tw .req v26 + modulus_half .req v30 + neg_modulus_half .req v31 + + + restore in, STACK0 + mov count, #4 + + ASM_LOAD(xtmp, ninv_addr) + ld1r {ninv.4s}, [xtmp] + ASM_LOAD(xtmp, ninv_tw_addr) + ld1r {ninv_tw.4s}, [xtmp] + + ushr modulus_half.4S, modulus.4S, #1 + neg neg_modulus_half.4S, modulus_half.4S + + load_roots_1234 r_ptr1 + + .p2align 2 + ldr q18, [x1, #640] // ..........*............................................................................................................................................................................................................................................................................. + ldr q10, [x1, #704] // ...........*............................................................................................................................................................................................................................................................................ 
+ ldr q22, [x1, #384] // ......*................................................................................................................................................................................................................................................................................. + ldr q27, [x1, #576] // .........*.............................................................................................................................................................................................................................................................................. + ldr q13, [x1, #448] // .......*................................................................................................................................................................................................................................................................................ + ldr q15, [x1, #192] // ...*.................................................................................................................................................................................................................................................................................... + ldr q24, [x1, #0] // *....................................................................................................................................................................................................................................................................................... + ldr q8, [x1, #64] // .*...................................................................................................................................................................................................................................................................................... 
+ sub v11.4S, v18.4S, v10.4S // .........................................*.............................................................................................................................................................................................................................................. + add v18.4S, v18.4S, v10.4S // ..........................................*............................................................................................................................................................................................................................................. + ldr q10, [x1, #320] // .....*.................................................................................................................................................................................................................................................................................. + ldr q23, [x1, #512] // ........*............................................................................................................................................................................................................................................................................... + sub v9.4S, v22.4S, v13.4S // ...............................*........................................................................................................................................................................................................................................................ + add v16.4S, v22.4S, v13.4S // ................................*....................................................................................................................................................................................................................................................... 
+ ldr q20, [x1, #256] // ....*................................................................................................................................................................................................................................................................................... + ldr q22, [x1, #128] // ..*..................................................................................................................................................................................................................................................................................... + sqrdmulh v17.4S, v11.4S, v6.S[1] // ............................................*........................................................................................................................................................................................................................................... + sub v28.4S, v24.4S, v8.4S // ................*....................................................................................................................................................................................................................................................................... + add v24.4S, v24.4S, v8.4S // .................*...................................................................................................................................................................................................................................................................... + mul v19.4S, v11.4S, v6.S[0] // ...........................................*............................................................................................................................................................................................................................................ 
+ sub v8.4S, v23.4S, v27.4S // ....................................*................................................................................................................................................................................................................................................... + add v21.4S, v23.4S, v27.4S // .....................................*.................................................................................................................................................................................................................................................. + add v27.4S, v22.4S, v15.4S // ......................*................................................................................................................................................................................................................................................................. + sqrdmulh v11.4S, v9.4S, v5.S[1] // ..................................*..................................................................................................................................................................................................................................................... + sub v12.4S, v22.4S, v15.4S // .....................*.................................................................................................................................................................................................................................................................. + mul v13.4S, v9.4S, v5.S[0] // .................................*...................................................................................................................................................................................................................................................... 
+ sub v23.4S, v20.4S, v10.4S // ..........................*............................................................................................................................................................................................................................................................. + add v15.4S, v20.4S, v10.4S // ...........................*............................................................................................................................................................................................................................................................ + sqrdmulh v20.4S, v23.4S, v4.S[3] // .............................*.......................................................................................................................................................................................................................................................... + add v10.4S, v24.4S, v27.4S // .........................................................*.............................................................................................................................................................................................................................. + sub v24.4S, v24.4S, v27.4S // ........................................................*............................................................................................................................................................................................................................... + mul v23.4S, v23.4S, v4.S[2] // ............................*........................................................................................................................................................................................................................................................... 
+ add v27.4S, v15.4S, v16.4S // ...................................................................*.................................................................................................................................................................................................................... + sub v22.4S, v15.4S, v16.4S // ..................................................................*..................................................................................................................................................................................................................... + mls v19.4S, v17.4S, v29.4S // .............................................*.......................................................................................................................................................................................................................................... + mls v13.4S, v11.4S, v29.4S // ...................................*.................................................................................................................................................................................................................................................... + sqrdmulh v11.4S, v8.4S, v5.S[3] // .......................................*................................................................................................................................................................................................................................................ + mul v8.4S, v8.4S, v5.S[2] // ......................................*................................................................................................................................................................................................................................................. 
+ mul v14.4S, v22.4S, v2.S[0] // ....................................................................*................................................................................................................................................................................................................... + sqrdmulh v15.4S, v22.4S, v2.S[1] // .....................................................................*.................................................................................................................................................................................................................. + sqrdmulh v17.4S, v12.4S, v4.S[1] // ........................*............................................................................................................................................................................................................................................................... + mul v22.4S, v12.4S, v4.S[0] // .......................*................................................................................................................................................................................................................................................................ + sub v12.4S, v10.4S, v27.4S // ................................................................................................*....................................................................................................................................................................................... + add v27.4S, v10.4S, v27.4S // .................................................................................................*...................................................................................................................................................................................... 
+ sqrdmulh v16.4S, v28.4S, v3.S[3] // ...................*.................................................................................................................................................................................................................................................................... + mul v9.4S, v28.4S, v3.S[2] // ..................*..................................................................................................................................................................................................................................................................... + add v28.4S, v21.4S, v18.4S // .............................................................................*.......................................................................................................................................................................................................... + sub v21.4S, v21.4S, v18.4S // ............................................................................*........................................................................................................................................................................................................... + mls v23.4S, v20.4S, v29.4S // ..............................*......................................................................................................................................................................................................................................................... + sqrdmulh v18.4S, v24.4S, v1.S[3] // ...........................................................*............................................................................................................................................................................................................................ 
+ mul v10.4S, v24.4S, v1.S[2] // ..........................................................*............................................................................................................................................................................................................................. + ldr q24, [x1, #768] // ............*........................................................................................................................................................................................................................................................................... + sub v20.4S, v23.4S, v13.4S // .......................................................................*................................................................................................................................................................................................................ + add v13.4S, v23.4S, v13.4S // ........................................................................*............................................................................................................................................................................................................... + mls v8.4S, v11.4S, v29.4S // ........................................*............................................................................................................................................................................................................................................... + ldr q11, [x1, #896] // ..............*......................................................................................................................................................................................................................................................................... 
+ ldr q23, [x1, #832] // .............*.......................................................................................................................................................................................................................................................................... + mls v22.4S, v17.4S, v29.4S // .........................*.............................................................................................................................................................................................................................................................. + add v17.4S, v8.4S, v19.4S // ..................................................................................*..................................................................................................................................................................................................... + sub v19.4S, v8.4S, v19.4S // .................................................................................*...................................................................................................................................................................................................... + ldr q8, [x1, #960] // ...............*........................................................................................................................................................................................................................................................................ + mls v9.4S, v16.4S, v29.4S // ....................*................................................................................................................................................................................................................................................................... 
+ mls v14.4S, v15.4S, v29.4S // ......................................................................*................................................................................................................................................................................................................. + add v15.4S, v24.4S, v23.4S // ...............................................*........................................................................................................................................................................................................................................ + sub v23.4S, v24.4S, v23.4S // ..............................................*......................................................................................................................................................................................................................................... + mls v10.4S, v18.4S, v29.4S // ............................................................*........................................................................................................................................................................................................................... + sub v16.4S, v11.4S, v8.4S // ...................................................*.................................................................................................................................................................................................................................... + add v8.4S, v11.4S, v8.4S // ....................................................*................................................................................................................................................................................................................................... 
+ sqrdmulh v24.4S, v23.4S, v6.S[3] // .................................................*...................................................................................................................................................................................................................................... + add v18.4S, v9.4S, v22.4S // ..............................................................*......................................................................................................................................................................................................................... + sub v11.4S, v9.4S, v22.4S // .............................................................*.......................................................................................................................................................................................................................... + mul v22.4S, v23.4S, v6.S[2] // ................................................*....................................................................................................................................................................................................................................... + sqrdmulh v9.4S, v16.4S, v7.S[1] // ......................................................*................................................................................................................................................................................................................................. + mls v22.4S, v24.4S, v29.4S // ..................................................*..................................................................................................................................................................................................................................... 
+ mul v16.4S, v16.4S, v7.S[0] // .....................................................*.................................................................................................................................................................................................................................. + mls v16.4S, v9.4S, v29.4S // .......................................................*................................................................................................................................................................................................................................ + sqrdmulh v24.4S, v12.4S, v0.S[3] // ...................................................................................................*.................................................................................................................................................................................... + mul v12.4S, v12.4S, v0.S[2] // ..................................................................................................*..................................................................................................................................................................................... + add v23.4S, v22.4S, v16.4S // ............................................................................................*........................................................................................................................................................................................... + sub v9.4S, v22.4S, v16.4S // ...........................................................................................*............................................................................................................................................................................................ 
+ sqrdmulh v16.4S, v19.4S, v2.S[3] // ....................................................................................*................................................................................................................................................................................................... + mul v19.4S, v19.4S, v2.S[2] // ...................................................................................*.................................................................................................................................................................................................... + add v22.4S, v17.4S, v23.4S // ..........................................................................................................................*............................................................................................................................................................. + sub v23.4S, v17.4S, v23.4S // .........................................................................................................................*.............................................................................................................................................................. + mls v12.4S, v24.4S, v29.4S // ....................................................................................................*................................................................................................................................................................................... + add v24.4S, v15.4S, v8.4S // .......................................................................................*................................................................................................................................................................................................ 
+ sub v17.4S, v18.4S, v13.4S // .....................................................................................................*.................................................................................................................................................................................. + add v13.4S, v18.4S, v13.4S // ......................................................................................................*................................................................................................................................................................................. + mls v19.4S, v16.4S, v29.4S // .....................................................................................*.................................................................................................................................................................................................. + add v18.4S, v28.4S, v24.4S // .....................................................................................................................*.................................................................................................................................................................. + sub v16.4S, v28.4S, v24.4S // ....................................................................................................................*................................................................................................................................................................... + mul v24.4S, v9.4S, v3.S[0] // .............................................................................................*.......................................................................................................................................................................................... 
+ sub v28.4S, v15.4S, v8.4S // ......................................................................................*................................................................................................................................................................................................. + sqrdmulh v8.4S, v9.4S, v3.S[1] // ..............................................................................................*......................................................................................................................................................................................... + add v9.4S, v13.4S, v22.4S // ..............................................................................................................................................*......................................................................................................................................... + sub v22.4S, v13.4S, v22.4S // .............................................................................................................................................*.......................................................................................................................................... + mul v13.4S, v28.4S, v3.S[0] // ........................................................................................*............................................................................................................................................................................................... + sqrdmulh v28.4S, v28.4S, v3.S[1] // .........................................................................................*.............................................................................................................................................................................................. 
+ mls v24.4S, v8.4S, v29.4S // ...............................................................................................*........................................................................................................................................................................................ + sqrdmulh v8.4S, v21.4S, v2.S[3] // ...............................................................................*........................................................................................................................................................................................................ + mul v15.4S, v21.4S, v2.S[2] // ..............................................................................*......................................................................................................................................................................................................... + sqrdmulh v21.4S, v11.4S, v1.S[3] // ................................................................*....................................................................................................................................................................................................................... + mls v15.4S, v8.4S, v29.4S // ................................................................................*....................................................................................................................................................................................................... + add v8.4S, v27.4S, v18.4S // .........................................................................................................................................*.............................................................................................................................................. 
+ sub v18.4S, v27.4S, v18.4S // ........................................................................................................................................*............................................................................................................................................... + mul v27.4S, v11.4S, v1.S[2] // ...............................................................*........................................................................................................................................................................................................................ + mul v11.4S, v17.4S, v0.S[2] // .......................................................................................................*................................................................................................................................................................................ + sqrdmulh v17.4S, v17.4S, v0.S[3] // ........................................................................................................*............................................................................................................................................................................... + mls v13.4S, v28.4S, v29.4S // ..........................................................................................*............................................................................................................................................................................................. + mls v27.4S, v21.4S, v29.4S // .................................................................*...................................................................................................................................................................................................................... 
+ sqrdmulh v21.4S, v20.4S, v2.S[1] // ..........................................................................*............................................................................................................................................................................................................. + add v28.4S, v15.4S, v13.4S // ...............................................................................................................................*........................................................................................................................................................ + sub v13.4S, v15.4S, v13.4S // ..............................................................................................................................*......................................................................................................................................................... + mul v15.4S, v22.4S, v0.S[0] // ...............................................................................................................................................*........................................................................................................................................ + sqrdmulh v22.4S, v22.4S, v0.S[1] // ................................................................................................................................................*....................................................................................................................................... + mls v11.4S, v17.4S, v29.4S // .........................................................................................................*.............................................................................................................................................................................. 
+ sub v17.4S, v19.4S, v24.4S // ...................................................................................................................................*.................................................................................................................................................... + add v19.4S, v19.4S, v24.4S // ....................................................................................................................................*................................................................................................................................................... + sqrdmulh v24.4S, v16.4S, v1.S[1] // .......................................................................................................................*................................................................................................................................................................ + mls v15.4S, v22.4S, v29.4S // .................................................................................................................................................*...................................................................................................................................... + mul v16.4S, v16.4S, v1.S[0] // ......................................................................................................................*................................................................................................................................................................. + mls v16.4S, v24.4S, v29.4S // ........................................................................................................................*............................................................................................................................................................... 
+ mul v24.4S, v20.4S, v2.S[0] // .........................................................................*.............................................................................................................................................................................................................. + cmge v20.4S, v31.4S, v15.4S // ....................................................................................................................................................................................*................................................................................................... + mls v24.4S, v21.4S, v29.4S // ...........................................................................*............................................................................................................................................................................................................ + cmge v21.4S, v15.4S, v30.4S // .....................................................................................................................................................................................*.................................................................................................. + mul v22.4S, v13.4S, v1.S[0] // ................................................................................................................................*....................................................................................................................................................... + sub v21.4S, v20.4S, v21.4S // ......................................................................................................................................................................................*................................................................................................. 
+ sqrdmulh v20.4S, v13.4S, v1.S[1] // .................................................................................................................................*...................................................................................................................................................... + sub v13.4S, v27.4S, v24.4S // ...............................................................................................................*........................................................................................................................................................................ + add v27.4S, v27.4S, v24.4S // ................................................................................................................*....................................................................................................................................................................... + sqrdmulh v24.4S, v23.4S, v1.S[1] // ............................................................................................................................*........................................................................................................................................................... + mls v15.4S, v21.4S, v29.4S // .......................................................................................................................................................................................*................................................................................................ + sub v21.4S, v10.4S, v14.4S // ..........................................................................................................*............................................................................................................................................................................. 
+ mls v22.4S, v20.4S, v29.4S // ..................................................................................................................................*..................................................................................................................................................... + sqrdmulh v20.4S, v13.4S, v0.S[3] // ..................................................................................................................*..................................................................................................................................................................... + str q15, [x1, #576] // .................................................................................................................................................................................................................*...................................................................... + mul v15.4S, v23.4S, v1.S[0] // ...........................................................................................................................*............................................................................................................................................................ + mul v23.4S, v13.4S, v0.S[2] // .................................................................................................................*...................................................................................................................................................................... + mls v23.4S, v20.4S, v29.4S // ...................................................................................................................*.................................................................................................................................................................... 
+ sub count, count, #1 +layer1234_start: + add v10.4S, v10.4S, v14.4S // ...........................................................................................................*............................................................................................................................................................................ + mls v15.4S, v24.4S, v29.4S // .............................................................................................................................*.......................................................................................................................................................... + sub v20.4S, v12.4S, v16.4S // ............................................................................................................................................................*........................................................................................................................... + add v12.4S, v12.4S, v16.4S // .............................................................................................................................................................*.......................................................................................................................... + sqrdmulh v14.4S, v17.4S, v1.S[1] // ......................................................................................................................................*................................................................................................................................................. + sub v24.4S, v10.4S, v28.4S // ..................................................................................................................................................*..................................................................................................................................... 
+ add v10.4S, v10.4S, v28.4S // ...................................................................................................................................................*.................................................................................................................................... + mul v17.4S, v17.4S, v1.S[0] // .....................................................................................................................................*.................................................................................................................................................. + add v13.4S, v11.4S, v15.4S // ..................................................................................................................................................................*..................................................................................................................... + sub v15.4S, v11.4S, v15.4S // .................................................................................................................................................................*...................................................................................................................... + mul v28.4S, v21.4S, v0.S[2] // ............................................................................................................*........................................................................................................................................................................... + add v11.4S, v27.4S, v19.4S // ........................................................................................................................................................*............................................................................................................................... 
+ sqrdmulh v21.4S, v21.4S, v0.S[3] // .............................................................................................................*.......................................................................................................................................................................... + sub v27.4S, v27.4S, v19.4S // .......................................................................................................................................................*................................................................................................................................ + mul v16.4S, v8.4S, v25.4S // ........................................................................................................................................................................................................................*............................................................... + sqrdmulh v8.4S, v8.4S, v26.4S // .........................................................................................................................................................................................................................*.............................................................. + sqrdmulh v19.4S, v18.4S, v0.S[1] // ...........................................................................................................................................*............................................................................................................................................ + mul v18.4S, v18.4S, v0.S[0] // ..........................................................................................................................................*............................................................................................................................................. 
+ mls v28.4S, v21.4S, v29.4S // ..............................................................................................................*......................................................................................................................................................................... + mls v17.4S, v14.4S, v29.4S // .......................................................................................................................................*................................................................................................................................................ + mls v18.4S, v19.4S, v29.4S // ............................................................................................................................................*........................................................................................................................................... + sub v19.4S, v28.4S, v22.4S // ......................................................................................................................................................................*................................................................................................................. + add v14.4S, v28.4S, v22.4S // .......................................................................................................................................................................*................................................................................................................ + sqrdmulh v21.4S, v10.4S, v26.4S // ...............................................................................................................................................................................................................................*........................................................ 
+ mul v10.4S, v10.4S, v25.4S // ..............................................................................................................................................................................................................................*......................................................... + cmge v28.4S, v31.4S, v18.4S // ................................................................................................................................................................................*....................................................................................................... + mls v16.4S, v8.4S, v29.4S // ..........................................................................................................................................................................................................................*............................................................. + cmge v8.4S, v18.4S, v30.4S // .................................................................................................................................................................................*...................................................................................................... + sub v8.4S, v28.4S, v8.4S // ..................................................................................................................................................................................*..................................................................................................... + sqrdmulh v22.4S, v27.4S, v0.S[1] // ..........................................................................................................................................................*............................................................................................................................. 
+ mls v10.4S, v21.4S, v29.4S // ................................................................................................................................................................................................................................*....................................................... + mls v18.4S, v8.4S, v29.4S // ...................................................................................................................................................................................*.................................................................................................... + mul v21.4S, v20.4S, v0.S[0] // ..............................................................................................................................................................*......................................................................................................................... + cmge v28.4S, v31.4S, v10.4S // ........................................................................................................................................................................................................................................................*............................... + sqrdmulh v20.4S, v20.4S, v0.S[1] // ...............................................................................................................................................................*........................................................................................................................ + cmge v8.4S, v10.4S, v30.4S // .........................................................................................................................................................................................................................................................*.............................. 
+ str q18, [x1, #512] // ................................................................................................................................................................................................................*....................................................................... + mul v18.4S, v15.4S, v0.S[0] // ...................................................................................................................................................................*.................................................................................................................... + sub v28.4S, v28.4S, v8.4S // ..........................................................................................................................................................................................................................................................*............................. + sqrdmulh v15.4S, v15.4S, v0.S[1] // ....................................................................................................................................................................*................................................................................................................... + mul v8.4S, v19.4S, v0.S[0] // ........................................................................................................................................................................*............................................................................................................... + sqrdmulh v19.4S, v19.4S, v0.S[1] // .........................................................................................................................................................................*.............................................................................................................. 
+ mls v10.4S, v28.4S, v29.4S // ...........................................................................................................................................................................................................................................................*............................ + cmge v28.4S, v31.4S, v16.4S // ................................................................................................................................................................................................................................................*....................................... + mls v18.4S, v15.4S, v29.4S // .....................................................................................................................................................................*.................................................................................................................. + cmge v15.4S, v16.4S, v30.4S // .................................................................................................................................................................................................................................................*...................................... + mls v21.4S, v20.4S, v29.4S // ................................................................................................................................................................*....................................................................................................................... + str q10, [x1, #128] // ..................................................................................................................................................................................................................................................................................*..... 
+ mul v10.4S, v27.4S, v0.S[0] // .........................................................................................................................................................*.............................................................................................................................. + sub v27.4S, v28.4S, v15.4S // ..................................................................................................................................................................................................................................................*..................................... + cmge v20.4S, v18.4S, v30.4S // .....................................................................................................................................................................................................*.................................................................................. + mls v10.4S, v22.4S, v29.4S // ...........................................................................................................................................................*............................................................................................................................ + cmge v22.4S, v31.4S, v18.4S // ....................................................................................................................................................................................................*................................................................................... + cmge v28.4S, v21.4S, v30.4S // .................................................................................................................................................................................................*...................................................................................... 
+ mls v16.4S, v27.4S, v29.4S // ...................................................................................................................................................................................................................................................*.................................... + cmge v15.4S, v31.4S, v21.4S // ................................................................................................................................................................................................*....................................................................................... + sub v27.4S, v15.4S, v28.4S // ..................................................................................................................................................................................................*..................................................................................... + str q16, [x1], #(16) // ................................................................................................................................................................................................................................................................................*....... + sqrdmulh v15.4S, v11.4S, v26.4S // ..................................................................................................................................................................................................................................*..................................................... + sub v16.4S, v22.4S, v20.4S // ......................................................................................................................................................................................................*................................................................................. 
+ mls v8.4S, v19.4S, v29.4S // ..........................................................................................................................................................................*............................................................................................................. + mul v19.4S, v11.4S, v25.4S // .................................................................................................................................................................................................................................*...................................................... + mls v18.4S, v16.4S, v29.4S // .......................................................................................................................................................................................................*................................................................................ + sqrdmulh v28.4S, v24.4S, v0.S[1] // .....................................................................................................................................................*.................................................................................................................................. + mul v22.4S, v14.4S, v25.4S // ..........................................................................................................................................................................................................................................*............................................. + str q18, [x1, #816] // .....................................................................................................................................................................................................................*.................................................................. 
+ sqrdmulh v18.4S, v14.4S, v26.4S // ...........................................................................................................................................................................................................................................*............................................ + cmge v14.4S, v31.4S, v10.4S // ............................................................................................................................................................................................*........................................................................................... + mul v24.4S, v24.4S, v0.S[0] // ....................................................................................................................................................*................................................................................................................................... + mls v21.4S, v27.4S, v29.4S // ...................................................................................................................................................................................................*.................................................................................... + mls v24.4S, v28.4S, v29.4S // ......................................................................................................................................................*................................................................................................................................. + cmge v28.4S, v10.4S, v30.4S // .............................................................................................................................................................................................*.......................................................................................... 
+ mls v19.4S, v15.4S, v29.4S // ...................................................................................................................................................................................................................................*.................................................... + sub v15.4S, v14.4S, v28.4S // ..............................................................................................................................................................................................*......................................................................................... + str q21, [x1, #752] // ....................................................................................................................................................................................................................*................................................................... + mls v22.4S, v18.4S, v29.4S // ............................................................................................................................................................................................................................................*........................................... + sub v18.4S, v23.4S, v17.4S // ...........................................................................................................................................................................*............................................................................................................ + cmge v14.4S, v31.4S, v8.4S // ........................................................................................................................................................................................................*............................................................................... 
+ mls v10.4S, v15.4S, v29.4S // ...............................................................................................................................................................................................*........................................................................................ + add v15.4S, v23.4S, v17.4S // ............................................................................................................................................................................*........................................................................................................... + sqrdmulh v21.4S, v9.4S, v26.4S // ............................................................................................................................................................................................................................*........................................................... + mul v17.4S, v9.4S, v25.4S // ...........................................................................................................................................................................................................................*............................................................ + cmge v9.4S, v8.4S, v30.4S // .........................................................................................................................................................................................................*.............................................................................. + str q10, [x1, #688] // ...................................................................................................................................................................................................................*.................................................................... 
+ mls v17.4S, v21.4S, v29.4S // .............................................................................................................................................................................................................................*.......................................................... + sub v28.4S, v14.4S, v9.4S // ..........................................................................................................................................................................................................*............................................................................. + ldr q9, [x1, #64] // .e...................................................................................................................................................................................................................................................................................... + cmge v20.4S, v24.4S, v30.4S // .........................................................................................................................................................................................*.............................................................................................. + ldr q27, [x1, #0] // e....................................................................................................................................................................................................................................................................................... + cmge v16.4S, v31.4S, v24.4S // ........................................................................................................................................................................................*............................................................................................... 
+ mul v21.4S, v13.4S, v25.4S // .......................................................................................................................................................................................................................................*................................................ + cmge v11.4S, v31.4S, v19.4S // ............................................................................................................................................................................................................................................................*........................... + sqrdmulh v13.4S, v13.4S, v26.4S // ........................................................................................................................................................................................................................................*............................................... + cmge v10.4S, v19.4S, v30.4S // .............................................................................................................................................................................................................................................................*.......................... + cmge v23.4S, v31.4S, v17.4S // ....................................................................................................................................................................................................................................................*................................... + ldr q14, [x1, #704] // ...........e............................................................................................................................................................................................................................................................................ 
+ mls v8.4S, v28.4S, v29.4S // ...........................................................................................................................................................................................................*............................................................................ + cmge v28.4S, v17.4S, v30.4S // .....................................................................................................................................................................................................................................................*.................................. + sub v16.4S, v16.4S, v20.4S // ..........................................................................................................................................................................................*............................................................................................. + sqrdmulh v20.4S, v15.4S, v26.4S // ..............................................................................................................................................................................................................................................*......................................... + sub v11.4S, v11.4S, v10.4S // ..............................................................................................................................................................................................................................................................*......................... + sub v28.4S, v23.4S, v28.4S // ......................................................................................................................................................................................................................................................*................................. 
+ mls v21.4S, v13.4S, v29.4S // .........................................................................................................................................................................................................................................*.............................................. + cmge v13.4S, v22.4S, v30.4S // .........................................................................................................................................................................................................................................................................*.............. + str q8, [x1, #880] // ......................................................................................................................................................................................................................*................................................................. + cmge v10.4S, v31.4S, v22.4S // ........................................................................................................................................................................................................................................................................*............... + mls v17.4S, v28.4S, v29.4S // .......................................................................................................................................................................................................................................................*................................ + sub v28.4S, v27.4S, v9.4S // ................e....................................................................................................................................................................................................................................................................... 
+ mul v15.4S, v15.4S, v25.4S // .............................................................................................................................................................................................................................................*.......................................... + sub v23.4S, v10.4S, v13.4S // ..........................................................................................................................................................................................................................................................................*............. + mls v15.4S, v20.4S, v29.4S // ...............................................................................................................................................................................................................................................*........................................ + cmge v8.4S, v21.4S, v30.4S // .....................................................................................................................................................................................................................................................................*.................. + sqrdmulh v10.4S, v12.4S, v26.4S // .....................................................................................................................................................................................................................................*.................................................. + add v20.4S, v27.4S, v9.4S // .................e...................................................................................................................................................................................................................................................................... 
+ ldr q27, [x1, #192] // ...e.................................................................................................................................................................................................................................................................................... + ldr q9, [x1, #128] // ..e..................................................................................................................................................................................................................................................................................... + str q17, [x1, #48] // .................................................................................................................................................................................................................................................................................*...... + cmge v17.4S, v31.4S, v15.4S // ............................................................................................................................................................................................................................................................................*........... + mul v12.4S, v12.4S, v25.4S // ....................................................................................................................................................................................................................................*................................................... + mls v24.4S, v16.4S, v29.4S // ...........................................................................................................................................................................................*............................................................................................ 
+ cmge v13.4S, v31.4S, v21.4S // ....................................................................................................................................................................................................................................................................*................... + mls v12.4S, v10.4S, v29.4S // ......................................................................................................................................................................................................................................*................................................. + ldr q10, [x1, #640] // ..........e............................................................................................................................................................................................................................................................................. + sub v16.4S, v9.4S, v27.4S // .....................e.................................................................................................................................................................................................................................................................. + add v9.4S, v9.4S, v27.4S // ......................e................................................................................................................................................................................................................................................................. + sub v13.4S, v13.4S, v8.4S // ......................................................................................................................................................................................................................................................................*................. 
+ mls v22.4S, v23.4S, v29.4S // ...........................................................................................................................................................................................................................................................................*............ + str q24, [x1, #624] // ..................................................................................................................................................................................................................*..................................................................... + cmge v27.4S, v15.4S, v30.4S // .............................................................................................................................................................................................................................................................................*.......... + sqrdmulh v24.4S, v18.4S, v0.S[1] // ..............................................................................................................................................................................*......................................................................................................... + cmge v8.4S, v31.4S, v12.4S // ................................................................................................................................................................................................................................................................*....................... + mls v19.4S, v11.4S, v29.4S // ...............................................................................................................................................................................................................................................................*........................ 
+ cmge v23.4S, v12.4S, v30.4S // .................................................................................................................................................................................................................................................................*...................... + str q22, [x1, #368] // ......................................................................................................................................................................................................................................................................................*. + sub v17.4S, v17.4S, v27.4S // ..............................................................................................................................................................................................................................................................................*......... + sub v11.4S, v10.4S, v14.4S // .........................................e.............................................................................................................................................................................................................................................. + mul v27.4S, v18.4S, v0.S[0] // .............................................................................................................................................................................*.......................................................................................................... + sub v8.4S, v8.4S, v23.4S // ..................................................................................................................................................................................................................................................................*..................... 
+ ldr q23, [x1, #320] // .....e.................................................................................................................................................................................................................................................................................. + mls v27.4S, v24.4S, v29.4S // ...............................................................................................................................................................................*........................................................................................................ + mls v12.4S, v8.4S, v29.4S // ...................................................................................................................................................................................................................................................................*.................... + add v18.4S, v10.4S, v14.4S // ..........................................e............................................................................................................................................................................................................................................. + str q19, [x1, #176] // ...................................................................................................................................................................................................................................................................................*.... + ldr q19, [x1, #384] // ......e................................................................................................................................................................................................................................................................................. 
+ cmge v10.4S, v31.4S, v27.4S // ............................................................................................................................................................................................................*........................................................................... + mls v15.4S, v17.4S, v29.4S // ...............................................................................................................................................................................................................................................................................*........ + cmge v17.4S, v27.4S, v30.4S // .............................................................................................................................................................................................................*.......................................................................... + ldr q22, [x1, #448] // .......e................................................................................................................................................................................................................................................................................ + sub v24.4S, v20.4S, v9.4S // ........................................................e............................................................................................................................................................................................................................... + mls v21.4S, v13.4S, v29.4S // .......................................................................................................................................................................................................................................................................*................ 
+ ldr q13, [x1, #512] // ........e............................................................................................................................................................................................................................................................................... + add v14.4S, v20.4S, v9.4S // .........................................................e.............................................................................................................................................................................................................................. + str q12, [x1, #240] // ....................................................................................................................................................................................................................................................................................*... + sub v12.4S, v10.4S, v17.4S // ..............................................................................................................................................................................................................*......................................................................... + sqrdmulh v8.4S, v11.4S, v6.S[1] // ............................................e........................................................................................................................................................................................................................................... + add v10.4S, v19.4S, v22.4S // ................................e....................................................................................................................................................................................................................................................... 
+ sub v17.4S, v19.4S, v22.4S // ...............................e........................................................................................................................................................................................................................................................ + str q15, [x1, #432] // .......................................................................................................................................................................................................................................................................................* + ldr q22, [x1, #256] // ....e................................................................................................................................................................................................................................................................................... + mls v27.4S, v12.4S, v29.4S // ...............................................................................................................................................................................................................*........................................................................ + str q21, [x1, #304] // .....................................................................................................................................................................................................................................................................................*.. + mul v19.4S, v11.4S, v6.S[0] // ...........................................e............................................................................................................................................................................................................................................ 
+ sub v15.4S, v22.4S, v23.4S // ..........................e............................................................................................................................................................................................................................................................. + mul v12.4S, v28.4S, v3.S[2] // ..................e..................................................................................................................................................................................................................................................................... + str q27, [x1, #944] // .......................................................................................................................................................................................................................*................................................................ + sqrdmulh v28.4S, v28.4S, v3.S[3] // ...................e.................................................................................................................................................................................................................................................................... + sqrdmulh v9.4S, v16.4S, v4.S[1] // ........................e............................................................................................................................................................................................................................................................... + mul v27.4S, v16.4S, v4.S[0] // .......................e................................................................................................................................................................................................................................................................ 
+ mls v12.4S, v28.4S, v29.4S // ....................e................................................................................................................................................................................................................................................................... + add v11.4S, v22.4S, v23.4S // ...........................e............................................................................................................................................................................................................................................................ + ldr q16, [x1, #576] // .........e.............................................................................................................................................................................................................................................................................. + mls v27.4S, v9.4S, v29.4S // .........................e.............................................................................................................................................................................................................................................................. + add v22.4S, v13.4S, v16.4S // .....................................e.................................................................................................................................................................................................................................................. + mls v19.4S, v8.4S, v29.4S // .............................................e.......................................................................................................................................................................................................................................... 
+ sub v28.4S, v13.4S, v16.4S // ....................................e................................................................................................................................................................................................................................................... + sqrdmulh v8.4S, v17.4S, v5.S[1] // ..................................e..................................................................................................................................................................................................................................................... + sub v23.4S, v11.4S, v10.4S // ..................................................................e..................................................................................................................................................................................................................... + add v20.4S, v12.4S, v27.4S // ..............................................................e......................................................................................................................................................................................................................... + sub v12.4S, v12.4S, v27.4S // .............................................................e.......................................................................................................................................................................................................................... + sqrdmulh v21.4S, v15.4S, v4.S[3] // .............................e.......................................................................................................................................................................................................................................................... 
+ mul v9.4S, v15.4S, v4.S[2] // ............................e........................................................................................................................................................................................................................................................... + add v10.4S, v11.4S, v10.4S // ...................................................................e.................................................................................................................................................................................................................... + sqrdmulh v15.4S, v28.4S, v5.S[3] // .......................................e................................................................................................................................................................................................................................................ + add v11.4S, v14.4S, v10.4S // .................................................................................................e...................................................................................................................................................................................... + sub v14.4S, v14.4S, v10.4S // ................................................................................................e....................................................................................................................................................................................... + mul v10.4S, v17.4S, v5.S[0] // .................................e...................................................................................................................................................................................................................................................... 
+ mls v9.4S, v21.4S, v29.4S // ..............................e......................................................................................................................................................................................................................................................... + mls v10.4S, v8.4S, v29.4S // ...................................e.................................................................................................................................................................................................................................................... + mul v17.4S, v28.4S, v5.S[2] // ......................................e................................................................................................................................................................................................................................................. + sqrdmulh v13.4S, v12.4S, v1.S[3] // ................................................................e....................................................................................................................................................................................................................... + add v28.4S, v22.4S, v18.4S // .............................................................................e.......................................................................................................................................................................................................... + mls v17.4S, v15.4S, v29.4S // ........................................e............................................................................................................................................................................................................................................... 
+ sub v8.4S, v9.4S, v10.4S // .......................................................................e................................................................................................................................................................................................................ + mul v15.4S, v12.4S, v1.S[2] // ...............................................................e........................................................................................................................................................................................................................ + sqrdmulh v21.4S, v23.4S, v2.S[1] // .....................................................................e.................................................................................................................................................................................................................. + sqrdmulh v16.4S, v14.4S, v0.S[3] // ...................................................................................................e.................................................................................................................................................................................... + mul v12.4S, v14.4S, v0.S[2] // ..................................................................................................e..................................................................................................................................................................................... + sub v22.4S, v22.4S, v18.4S // ............................................................................e........................................................................................................................................................................................................... 
+ mul v14.4S, v23.4S, v2.S[0] // ....................................................................e................................................................................................................................................................................................................... + add v23.4S, v9.4S, v10.4S // ........................................................................e............................................................................................................................................................................................................... + ldr q10, [x1, #960] // ...............e........................................................................................................................................................................................................................................................................ + ldr q9, [x1, #832] // .............e.......................................................................................................................................................................................................................................................................... + mls v14.4S, v21.4S, v29.4S // ......................................................................e................................................................................................................................................................................................................. + ldr q21, [x1, #768] // ............e........................................................................................................................................................................................................................................................................... 
+ mls v12.4S, v16.4S, v29.4S // ....................................................................................................e................................................................................................................................................................................... + mls v15.4S, v13.4S, v29.4S // .................................................................e...................................................................................................................................................................................................................... + sub v27.4S, v21.4S, v9.4S // ..............................................e......................................................................................................................................................................................................................................... + sqrdmulh v16.4S, v8.4S, v2.S[1] // ..........................................................................e............................................................................................................................................................................................................. + mul v13.4S, v8.4S, v2.S[0] // .........................................................................e.............................................................................................................................................................................................................. + sqrdmulh v18.4S, v27.4S, v6.S[3] // .................................................e...................................................................................................................................................................................................................................... 
+ ldr q8, [x1, #896] // ..............e......................................................................................................................................................................................................................................................................... + mls v13.4S, v16.4S, v29.4S // ...........................................................................e............................................................................................................................................................................................................ + add v16.4S, v21.4S, v9.4S // ...............................................e........................................................................................................................................................................................................................................ + mul v27.4S, v27.4S, v6.S[2] // ................................................e....................................................................................................................................................................................................................................... + sub v21.4S, v8.4S, v10.4S // ...................................................e.................................................................................................................................................................................................................................... + add v8.4S, v8.4S, v10.4S // ....................................................e................................................................................................................................................................................................................................... 
+ mls v27.4S, v18.4S, v29.4S // ..................................................e..................................................................................................................................................................................................................................... + add v18.4S, v16.4S, v8.4S // .......................................................................................e................................................................................................................................................................................................ + sqrdmulh v9.4S, v22.4S, v2.S[3] // ...............................................................................e........................................................................................................................................................................................................ + sub v10.4S, v16.4S, v8.4S // ......................................................................................e................................................................................................................................................................................................. + mul v8.4S, v22.4S, v2.S[2] // ..............................................................................e......................................................................................................................................................................................................... + sub v16.4S, v28.4S, v18.4S // ....................................................................................................................e................................................................................................................................................................... 
+ add v18.4S, v28.4S, v18.4S // .....................................................................................................................e.................................................................................................................................................................. + sqrdmulh v28.4S, v21.4S, v7.S[1] // ......................................................e................................................................................................................................................................................................................................. + mul v21.4S, v21.4S, v7.S[0] // .....................................................e.................................................................................................................................................................................................................................. + sub v22.4S, v20.4S, v23.4S // .....................................................................................................e.................................................................................................................................................................................. + add v20.4S, v20.4S, v23.4S // ......................................................................................................e................................................................................................................................................................................. + add v23.4S, v17.4S, v19.4S // ..................................................................................e..................................................................................................................................................................................................... 
+ mls v21.4S, v28.4S, v29.4S // .......................................................e................................................................................................................................................................................................................................ + sub v17.4S, v17.4S, v19.4S // .................................................................................e...................................................................................................................................................................................................... + sqrdmulh v19.4S, v10.4S, v3.S[1] // .........................................................................................e.............................................................................................................................................................................................. + mls v8.4S, v9.4S, v29.4S // ................................................................................e....................................................................................................................................................................................................... + add v28.4S, v27.4S, v21.4S // ............................................................................................e........................................................................................................................................................................................... + sub v27.4S, v27.4S, v21.4S // ...........................................................................................e............................................................................................................................................................................................ 
+ mul v10.4S, v10.4S, v3.S[0] // ........................................................................................e............................................................................................................................................................................................... + sub v21.4S, v15.4S, v13.4S // ...............................................................................................................e........................................................................................................................................................................ + mls v10.4S, v19.4S, v29.4S // ..........................................................................................e............................................................................................................................................................................................. + add v19.4S, v23.4S, v28.4S // ..........................................................................................................................e............................................................................................................................................................. + sub v23.4S, v23.4S, v28.4S // .........................................................................................................................e.............................................................................................................................................................. + sqrdmulh v28.4S, v27.4S, v3.S[1] // ..............................................................................................e......................................................................................................................................................................................... 
+ add v9.4S, v20.4S, v19.4S // ..............................................................................................................................................e......................................................................................................................................... + sub v19.4S, v20.4S, v19.4S // .............................................................................................................................................e.......................................................................................................................................... + mul v20.4S, v27.4S, v3.S[0] // .............................................................................................e.......................................................................................................................................................................................... + add v27.4S, v15.4S, v13.4S // ................................................................................................................e....................................................................................................................................................................... + sqrdmulh v15.4S, v16.4S, v1.S[1] // .......................................................................................................................e................................................................................................................................................................ + mul v16.4S, v16.4S, v1.S[0] // ......................................................................................................................e................................................................................................................................................................. 
+ sub v13.4S, v8.4S, v10.4S // ..............................................................................................................................e......................................................................................................................................................... + mls v20.4S, v28.4S, v29.4S // ...............................................................................................e........................................................................................................................................................................................ + add v28.4S, v8.4S, v10.4S // ...............................................................................................................................e........................................................................................................................................................ + add v8.4S, v11.4S, v18.4S // .........................................................................................................................................e.............................................................................................................................................. + sub v18.4S, v11.4S, v18.4S // ........................................................................................................................................e............................................................................................................................................... + mul v10.4S, v24.4S, v1.S[2] // ..........................................................e............................................................................................................................................................................................................................. 
+ mul v11.4S, v19.4S, v0.S[0] // ...............................................................................................................................................e........................................................................................................................................ + sqrdmulh v19.4S, v19.4S, v0.S[1] // ................................................................................................................................................e....................................................................................................................................... + mls v16.4S, v15.4S, v29.4S // ........................................................................................................................e............................................................................................................................................................... + sqrdmulh v15.4S, v24.4S, v1.S[3] // ...........................................................e............................................................................................................................................................................................................................ + mls v11.4S, v19.4S, v29.4S // .................................................................................................................................................e...................................................................................................................................... + sqrdmulh v19.4S, v17.4S, v2.S[3] // ....................................................................................e................................................................................................................................................................................................... 
+ mls v10.4S, v15.4S, v29.4S // ............................................................e........................................................................................................................................................................................................................... + cmge v24.4S, v31.4S, v11.4S // ....................................................................................................................................................................................e................................................................................................... + mul v15.4S, v17.4S, v2.S[2] // ...................................................................................e.................................................................................................................................................................................................... + cmge v17.4S, v11.4S, v30.4S // .....................................................................................................................................................................................e.................................................................................................. + mls v15.4S, v19.4S, v29.4S // .....................................................................................e.................................................................................................................................................................................................. + sub v17.4S, v24.4S, v17.4S // ......................................................................................................................................................................................e................................................................................................. 
+ sqrdmulh v24.4S, v22.4S, v0.S[3] // ........................................................................................................e............................................................................................................................................................................... + mls v11.4S, v17.4S, v29.4S // .......................................................................................................................................................................................e................................................................................................ + add v19.4S, v15.4S, v20.4S // ....................................................................................................................................e................................................................................................................................................... + sub v17.4S, v15.4S, v20.4S // ...................................................................................................................................e.................................................................................................................................................... + mul v15.4S, v23.4S, v1.S[0] // ...........................................................................................................................e............................................................................................................................................................ + sqrdmulh v20.4S, v21.4S, v0.S[3] // ..................................................................................................................e..................................................................................................................................................................... 
+ str q11, [x1, #576] // .................................................................................................................................................................................................................e...................................................................... + mul v11.4S, v22.4S, v0.S[2] // .......................................................................................................e................................................................................................................................................................................ + mul v22.4S, v13.4S, v1.S[0] // ................................................................................................................................e....................................................................................................................................................... + sqrdmulh v13.4S, v13.4S, v1.S[1] // .................................................................................................................................e...................................................................................................................................................... + mls v11.4S, v24.4S, v29.4S // .........................................................................................................e.............................................................................................................................................................................. + sqrdmulh v24.4S, v23.4S, v1.S[1] // ............................................................................................................................e........................................................................................................................................................... 
+ mul v23.4S, v21.4S, v0.S[2] // .................................................................................................................e...................................................................................................................................................................... + mls v23.4S, v20.4S, v29.4S // ...................................................................................................................e.................................................................................................................................................................... + mls v22.4S, v13.4S, v29.4S // ..................................................................................................................................e..................................................................................................................................................... + sub v21.4S, v10.4S, v14.4S // ..........................................................................................................e............................................................................................................................................................................. + + // original source code + // ldr q8, [x1, #0] // ..e...............................................................................................................................................................................................|.......................................................................................e............................................................................ 
+ // ldr q9, [x1, #(1*(512/8))] // e.................................................................................................................................................................................................|.....................................................................................e.............................................................................. + // ldr q10, [x1, #(2*(512/8))] // .............................e....................................................................................................................................................................|..................................................................................................................e................................................. + // ldr q11, [x1, #(3*(512/8))] // ............................e.....................................................................................................................................................................|.................................................................................................................e.................................................. + // ldr q12, [x1, #(4*(512/8))] // ........................................................................e.........................................................................................................................|.............................................................................................................................................................e...... + // ldr q13, [x1, #(5*(512/8))] // ....................................................e.............................................................................................................................................|.........................................................................................................................................e.......................... 
+ // ldr q14, [x1, #(6*(512/8))] // .........................................................e........................................................................................................................................|..............................................................................................................................................e..................... + // ldr q15, [x1, #(7*(512/8))] // .............................................................e....................................................................................................................................|..................................................................................................................................................e................. + // ldr q16, [x1, #(8*(512/8))] // ................................................................e.................................................................................................................................|.....................................................................................................................................................e.............. + // ldr q17, [x1, #(9*(512/8))] // ....................................................................................e.............................................................................................................|.................................................................................................................................................................... + // ldr q18, [x1, #(10*(512/8))] // ....................................e.............................................................................................................................................................|.........................................................................................................................e.......................................... 
+ // ldr q19, [x1, #(11*(512/8))] // .........e........................................................................................................................................................................................|..............................................................................................e..................................................................... + // ldr q20, [x1, #(12*(512/8))] // .....................................................................................................................e............................................................................|.................................................................................................................................................................... + // ldr q21, [x1, #(13*(512/8))] // ...................................................................................................................e..............................................................................|.................................................................................................................................................................... + // ldr q22, [x1, #(14*(512/8))] // ............................................................................................................................e.....................................................................|.................................................................................................................................................................... + // ldr q23, [x1, #(15*(512/8))] // ..................................................................................................................e...............................................................................|.................................................................................................................................................................... 
+ // sub v24.4s, v8.4s, v9.4s // .....................e............................................................................................................................................................................|..........................................................................................................e......................................................... + // add v8.4s, v8.4s, v9.4s // ...........................e......................................................................................................................................................................|................................................................................................................e................................................... + // mul v9.4s, v24.4s, v3.s[2] // .............................................................................e....................................................................................................................|..................................................................................................................................................................e. + // sqrdmulh v24.4s, v24.4s, v3.s[3] // ...............................................................................e..................................................................................................................|.................................................................................................................................................................... + // mls v9.4s, v24.4s, v29.4s // ..................................................................................e...............................................................................................................|.................................................................................................................................................................... 
+ // sub v24.4s, v10.4s, v11.4s // .....................................e............................................................................................................................................................|..........................................................................................................................e......................................... + // add v10.4s, v10.4s, v11.4s // ......................................e...........................................................................................................................................................|...........................................................................................................................e........................................ + // mul v11.4s, v24.4s, v4.s[0] // .................................................................................e................................................................................................................|.................................................................................................................................................................... + // sqrdmulh v24.4s, v24.4s, v4.s[1] // ................................................................................e.................................................................................................................|.................................................................................................................................................................... + // mls v11.4s, v24.4s, v29.4s // .....................................................................................e............................................................................................................|.................................................................................................................................................................... 
+ // sub v24.4s, v12.4s, v13.4s // ............................................................................e.....................................................................................................................|.................................................................................................................................................................e.. + // add v12.4s, v12.4s, v13.4s // ...................................................................................e..............................................................................................................|.................................................................................................................................................................... + // mul v13.4s, v24.4s, v4.s[2] // ..............................................................................................e...................................................................................................|.................................................................................................................................................................... + // sqrdmulh v24.4s, v24.4s, v4.s[3] // .............................................................................................e....................................................................................................|.................................................................................................................................................................... + // mls v13.4s, v24.4s, v29.4s // ....................................................................................................e.............................................................................................|.................................................................................................................................................................... 
+ // sub v24.4s, v14.4s, v15.4s // ......................................................................e...........................................................................................................................|...........................................................................................................................................................e........ + // add v14.4s, v14.4s, v15.4s // .....................................................................e............................................................................................................................|..........................................................................................................................................................e......... + // mul v15.4s, v24.4s, v5.s[0] // ...................................................................................................e..............................................................................................|.................................................................................................................................................................... + // sqrdmulh v24.4s, v24.4s, v5.s[1] // .........................................................................................e........................................................................................................|.................................................................................................................................................................... + // mls v15.4s, v24.4s, v29.4s // .....................................................................................................e............................................................................................|.................................................................................................................................................................... 
+ // sub v24.4s, v16.4s, v17.4s // ........................................................................................e.........................................................................................................|.................................................................................................................................................................... + // add v16.4s, v16.4s, v17.4s // ......................................................................................e...........................................................................................................|.................................................................................................................................................................... + // mul v17.4s, v24.4s, v5.s[2] // ......................................................................................................e...........................................................................................|.................................................................................................................................................................... + // sqrdmulh v24.4s, v24.4s, v5.s[3] // ................................................................................................e.................................................................................................|.................................................................................................................................................................... + // mls v17.4s, v24.4s, v29.4s // .........................................................................................................e........................................................................................|.................................................................................................................................................................... 
+ // sub v24.4s, v18.4s, v19.4s // .................................................e................................................................................................................................................|......................................................................................................................................e............................. + // add v18.4s, v18.4s, v19.4s // .......................................................e..........................................................................................................................................|............................................................................................................................................e....................... + // mul v19.4s, v24.4s, v6.s[0] // ...........................................................................e......................................................................................................................|................................................................................................................................................................e... + // sqrdmulh v24.4s, v24.4s, v6.s[1] // ....................................................................e.............................................................................................................................|.........................................................................................................................................................e.......... + // mls v19.4s, v24.4s, v29.4s // .......................................................................................e..........................................................................................................|.................................................................................................................................................................... 
+ // sub v24.4s, v20.4s, v21.4s // ........................................................................................................................e.........................................................................|.................................................................................................................................................................... + // add v20.4s, v20.4s, v21.4s // ..............................................................................................................................e...................................................................|.................................................................................................................................................................... + // mul v21.4s, v24.4s, v6.s[2] // ...............................................................................................................................e..................................................................|.................................................................................................................................................................... + // sqrdmulh v24.4s, v24.4s, v6.s[3] // ...........................................................................................................................e......................................................................|.................................................................................................................................................................... + // mls v21.4s, v24.4s, v29.4s // ..................................................................................................................................e...............................................................|.................................................................................................................................................................... 
+ // sub v24.4s, v22.4s, v23.4s // ................................................................................................................................e.................................................................|.................................................................................................................................................................... + // add v22.4s, v22.4s, v23.4s // .................................................................................................................................e................................................................|.................................................................................................................................................................... + // mul v23.4s, v24.4s, v7.s[0] // ..........................................................................................................................................e.......................................................|.................................................................................................................................................................... + // sqrdmulh v24.4s, v24.4s, v7.s[1] // .........................................................................................................................................e........................................................|.................................................................................................................................................................... + // mls v23.4s, v24.4s, v29.4s // ..............................................................................................................................................e...................................................|.................................................................................................................................................................... 
+ // sub v24.4s, v8.4s, v10.4s // ..............................................................e...................................................................................................................................|...................................................................................................................................................e................ + // add v8.4s, v8.4s, v10.4s // .................................................................e................................................................................................................................|......................................................................................................................................................e............. + // mul v10.4s, v24.4s, v1.s[2] // .....................................................................................................................................................................e............................|.................................................................................................................................................................... + // sqrdmulh v24.4s, v24.4s, v1.s[3] // .........................................................................................................................................................................e........................|.................................................................................................................................................................... + // mls v10.4s, v24.4s, v29.4s // ............................................................................................................................................................................e.....................|.................................................................................................................................................................... 
+ // sub v24.4s, v9.4s, v11.4s // ............................................................................................e.....................................................................................................|.................................................................................................................................................................... + // add v9.4s, v9.4s, v11.4s // ...........................................................................................e......................................................................................................|.................................................................................................................................................................... + // mul v11.4s, v24.4s, v1.s[2] // ...........................................................................................................e......................................................................................|.................................................................................................................................................................... + // sqrdmulh v24.4s, v24.4s, v1.s[3] // .......................................................................................................e..........................................................................................|.................................................................................................................................................................... + // mls v11.4s, v24.4s, v29.4s // .......................................................................................................................e..........................................................................|.................................................................................................................................................................... 
+ // sub v24.4s, v12.4s, v14.4s // ..........................................................................................e.......................................................................................................|.................................................................................................................................................................... + // add v12.4s, v12.4s, v14.4s // ...............................................................................................e..................................................................................................|.................................................................................................................................................................... + // mul v14.4s, v24.4s, v2.s[0] // ................................................................................................................e.................................................................................|.................................................................................................................................................................... + // sqrdmulh v24.4s, v24.4s, v2.s[1] // ............................................................................................................e.....................................................................................|.................................................................................................................................................................... + // mls v14.4s, v24.4s, v29.4s // ....................................................................................................................e.............................................................................|.................................................................................................................................................................... 
+ // sub v24.4s, v13.4s, v15.4s // ..........................................................................................................e.......................................................................................|.................................................................................................................................................................... + // add v13.4s, v13.4s, v15.4s // .................................................................................................................e................................................................................|.................................................................................................................................................................... + // mul v15.4s, v24.4s, v2.s[0] // ..........................................................................................................................e.......................................................................|.................................................................................................................................................................... + // sqrdmulh v24.4s, v24.4s, v2.s[1] // .........................................................................................................................e........................................................................|.................................................................................................................................................................... + // mls v15.4s, v24.4s, v29.4s // .............................................................................................................................e....................................................................|.................................................................................................................................................................... 
+ // sub v24.4s, v16.4s, v18.4s // ...............................................................................................................e..................................................................................|.................................................................................................................................................................... + // add v16.4s, v16.4s, v18.4s // ........................................................................................................e.........................................................................................|.................................................................................................................................................................... + // mul v18.4s, v24.4s, v2.s[2] // ......................................................................................................................................e...........................................................|.................................................................................................................................................................... + // sqrdmulh v24.4s, v24.4s, v2.s[3] // ....................................................................................................................................e.............................................................|.................................................................................................................................................................... + // mls v18.4s, v24.4s, v29.4s // .................................................................................................................................................e................................................|.................................................................................................................................................................... 
+ // sub v24.4s, v17.4s, v19.4s // ...............................................................................................................................................e..................................................|.................................................................................................................................................................... + // add v17.4s, v17.4s, v19.4s // .............................................................................................................................................e....................................................|.................................................................................................................................................................... + // mul v19.4s, v24.4s, v2.s[2] // ..............................................................................................................................................................................e...................|.................................................................................................................................................................... + // sqrdmulh v24.4s, v24.4s, v2.s[3] // ...........................................................................................................................................................................e......................|.................................................................................................................................................................... + // mls v19.4s, v24.4s, v29.4s // ................................................................................................................................................................................e.................|.................................................................................................................................................................... 
+ // sub v24.4s, v20.4s, v22.4s // .....................................................................................................................................e............................................................|.................................................................................................................................................................... + // add v20.4s, v20.4s, v22.4s // ...................................................................................................................................e..............................................................|.................................................................................................................................................................... + // mul v22.4s, v24.4s, v3.s[0] // ....................................................................................................................................................e.............................................|.................................................................................................................................................................... + // sqrdmulh v24.4s, v24.4s, v3.s[1] // ................................................................................................................................................e.................................................|.................................................................................................................................................................... + // mls v22.4s, v24.4s, v29.4s // ......................................................................................................................................................e...........................................|.................................................................................................................................................................... 
+ // sub v24.4s, v21.4s, v23.4s // ...................................................................................................................................................e..............................................|.................................................................................................................................................................... + // add v21.4s, v21.4s, v23.4s // ..................................................................................................................................................e...............................................|.................................................................................................................................................................... + // mul v23.4s, v24.4s, v3.s[0] // ............................................................................................................................................................e.....................................|.................................................................................................................................................................... + // sqrdmulh v24.4s, v24.4s, v3.s[1] // .........................................................................................................................................................e........................................|.................................................................................................................................................................... + // mls v23.4s, v24.4s, v29.4s // .................................................................................................................................................................e................................|.................................................................................................................................................................... 
+ // sub v24.4s, v8.4s, v12.4s // ..................................................................................................e...............................................................................................|.................................................................................................................................................................... + // add v8.4s, v8.4s, v12.4s // .................................................................................................e................................................................................................|.................................................................................................................................................................... + // mul v12.4s, v24.4s, v0.s[2] // ..............................................................................................................e...................................................................................|.................................................................................................................................................................... + // sqrdmulh v24.4s, v24.4s, v0.s[3] // .............................................................................................................e....................................................................................|.................................................................................................................................................................... + // mls v12.4s, v24.4s, v29.4s // ......................................................................................................................e...........................................................................|.................................................................................................................................................................... 
+ // sub v24.4s, v9.4s, v13.4s // ...........................................................................................................................................e......................................................|.................................................................................................................................................................... + // add v9.4s, v9.4s, v13.4s // ............................................................................................................................................e.....................................................|.................................................................................................................................................................... + // mul v13.4s, v24.4s, v0.s[2] // .........................................................................................................................................................................................e........|.................................................................................................................................................................... + // sqrdmulh v24.4s, v24.4s, v0.s[3] // ..................................................................................................................................................................................e...............|.................................................................................................................................................................... + // mls v13.4s, v24.4s, v29.4s // ............................................................................................................................................................................................e.....|.................................................................................................................................................................... 
+ // sub v24.4s, v10.4s, v14.4s // .................................................................................................................................................................................................e|.................................................................................................................................................................... + // add v10.4s, v10.4s, v14.4s // ..................................................................................................................................................................................................*.................................................................................................................................................................... + // mul v14.4s, v24.4s, v0.s[2] // ..................................................................................................................................................................................................|.........*.......................................................................................................................................................... + // sqrdmulh v24.4s, v24.4s, v0.s[3] // ..................................................................................................................................................................................................|...........*........................................................................................................................................................ + // mls v14.4s, v24.4s, v29.4s // ..................................................................................................................................................................................................|.................*.................................................................................................................................................. 
+ // sub v24.4s, v11.4s, v15.4s // .....................................................................................................................................................e............................................|.................................................................................................................................................................... + // add v11.4s, v11.4s, v15.4s // .............................................................................................................................................................e....................................|.................................................................................................................................................................... + // mul v15.4s, v24.4s, v0.s[2] // ..............................................................................................................................................................................................e...|.................................................................................................................................................................... + // sqrdmulh v24.4s, v24.4s, v0.s[3] // .......................................................................................................................................................................................e..........|.................................................................................................................................................................... + // mls v15.4s, v24.4s, v29.4s // ...............................................................................................................................................................................................e..|.................................................................................................................................................................... 
+ // sub v24.4s, v16.4s, v20.4s // .......................................................................................................................................e..........................................................|.................................................................................................................................................................... + // add v16.4s, v16.4s, v20.4s // ........................................................................................................................................e.........................................................|.................................................................................................................................................................... + // mul v20.4s, v24.4s, v1.s[0] // ...............................................................................................................................................................e..................................|.................................................................................................................................................................... + // sqrdmulh v24.4s, v24.4s, v1.s[1] // ..............................................................................................................................................................e...................................|.................................................................................................................................................................... + // mls v20.4s, v24.4s, v29.4s // ........................................................................................................................................................................e.........................|.................................................................................................................................................................... 
+ // sub v24.4s, v17.4s, v21.4s // ........................................................................................................................................................e.........................................|.................................................................................................................................................................... + // add v17.4s, v17.4s, v21.4s // .......................................................................................................................................................e..........................................|.................................................................................................................................................................... + // mul v21.4s, v24.4s, v1.s[0] // ......................................................................................................................................................................................e...........|.................................................................................................................................................................... + // sqrdmulh v24.4s, v24.4s, v1.s[1] // .............................................................................................................................................................................................e....|.................................................................................................................................................................... + // mls v21.4s, v24.4s, v29.4s // ..................................................................................................................................................................................................|*................................................................................................................................................................... 
+ // sub v24.4s, v18.4s, v22.4s // ................................................................................................................................................................e.................................|.................................................................................................................................................................... + // add v18.4s, v18.4s, v22.4s // ..................................................................................................................................................................e...............................|.................................................................................................................................................................... + // mul v22.4s, v24.4s, v1.s[0] // ..........................................................................................................................................................................................e.......|.................................................................................................................................................................... + // sqrdmulh v24.4s, v24.4s, v1.s[1] // ...........................................................................................................................................................................................e......|.................................................................................................................................................................... + // mls v22.4s, v24.4s, v29.4s // ................................................................................................................................................................................................e.|.................................................................................................................................................................... 
+ // sub v24.4s, v19.4s, v23.4s // .....................................................................................................................................................................................e............|.................................................................................................................................................................... + // add v19.4s, v19.4s, v23.4s // ....................................................................................................................................................................................e.............|.................................................................................................................................................................... + // mul v23.4s, v24.4s, v1.s[0] // ..................................................................................................................................................................................................|......*............................................................................................................................................................. + // sqrdmulh v24.4s, v24.4s, v1.s[1] // ..................................................................................................................................................................................................|...*................................................................................................................................................................ + // mls v23.4s, v24.4s, v29.4s // ..................................................................................................................................................................................................|..................*................................................................................................................................................. 
+ // sub v24.4s, v8.4s, v16.4s // ....................................................................................................................................................................e.............................|.................................................................................................................................................................... + // add v8.4s, v8.4s, v16.4s // ...................................................................................................................................................................e..............................|.................................................................................................................................................................... + // mul v16.4s, v24.4s, v0.s[0] // ..................................................................................................................................................................................................|................*................................................................................................................................................... + // sqrdmulh v24.4s, v24.4s, v0.s[1] // ..................................................................................................................................................................................................|...............*.................................................................................................................................................... + // mls v16.4s, v24.4s, v29.4s // ..................................................................................................................................................................................................|...................*................................................................................................................................................ 
+ // sub v24.4s, v9.4s, v17.4s // ...........................................................................................................................................................e......................................|.................................................................................................................................................................... + // add v9.4s, v9.4s, v17.4s // ..........................................................................................................................................................e.......................................|.................................................................................................................................................................... + // mul v17.4s, v24.4s, v0.s[0] // ......................................................................................................................................................................e...........................|.................................................................................................................................................................... + // sqrdmulh v24.4s, v24.4s, v0.s[1] // .......................................................................................................................................................................e..........................|.................................................................................................................................................................... + // mls v17.4s, v24.4s, v29.4s // ..........................................................................................................................................................................e.......................|.................................................................................................................................................................... 
+ // sub v24.4s, v10.4s, v18.4s // ..................................................................................................................................................................................................|....*............................................................................................................................................................... + // add v10.4s, v10.4s, v18.4s // ..................................................................................................................................................................................................|.....*.............................................................................................................................................................. + // mul v18.4s, v24.4s, v0.s[0] // ..................................................................................................................................................................................................|...................................................................*................................................................................................ + // sqrdmulh v24.4s, v24.4s, v0.s[1] // ..................................................................................................................................................................................................|..............................................................*..................................................................................................... + // mls v18.4s, v24.4s, v29.4s // ..................................................................................................................................................................................................|.....................................................................*.............................................................................................. 
+ // sub v24.4s, v11.4s, v19.4s // ..................................................................................................................................................................................................|............*....................................................................................................................................................... + // add v11.4s, v11.4s, v19.4s // ..................................................................................................................................................................................................|..........*......................................................................................................................................................... + // mul v19.4s, v24.4s, v0.s[0] // ..................................................................................................................................................................................................|...............................................*.................................................................................................................... + // sqrdmulh v24.4s, v24.4s, v0.s[1] // ..................................................................................................................................................................................................|............................*....................................................................................................................................... + // mls v19.4s, v24.4s, v29.4s // ..................................................................................................................................................................................................|..................................................*................................................................................................................. 
+ // sub v24.4s, v12.4s, v20.4s // ..................................................................................................................................................................................................|.*.................................................................................................................................................................. + // add v12.4s, v12.4s, v20.4s // ..................................................................................................................................................................................................|..*................................................................................................................................................................. + // mul v20.4s, v24.4s, v0.s[0] // ..................................................................................................................................................................................................|...............................*.................................................................................................................................... + // sqrdmulh v24.4s, v24.4s, v0.s[1] // ..................................................................................................................................................................................................|.................................*.................................................................................................................................. + // mls v20.4s, v24.4s, v29.4s // ..................................................................................................................................................................................................|.............................................*...................................................................................................................... 
+ // sub v24.4s, v13.4s, v21.4s // ..................................................................................................................................................................................................|........*........................................................................................................................................................... + // add v13.4s, v13.4s, v21.4s // ..................................................................................................................................................................................................|.......*............................................................................................................................................................ + // mul v21.4s, v24.4s, v0.s[0] // ..................................................................................................................................................................................................|....................................*............................................................................................................................... + // sqrdmulh v24.4s, v24.4s, v0.s[1] // ..................................................................................................................................................................................................|......................................*............................................................................................................................. + // mls v21.4s, v24.4s, v29.4s // ..................................................................................................................................................................................................|...........................................*........................................................................................................................ 
+ // sub v24.4s, v14.4s, v22.4s // ..................................................................................................................................................................................................|....................*............................................................................................................................................... + // add v14.4s, v14.4s, v22.4s // ..................................................................................................................................................................................................|.....................*.............................................................................................................................................. + // mul v22.4s, v24.4s, v0.s[0] // ..................................................................................................................................................................................................|.......................................*............................................................................................................................ + // sqrdmulh v24.4s, v24.4s, v0.s[1] // ..................................................................................................................................................................................................|........................................*........................................................................................................................... + // mls v22.4s, v24.4s, v29.4s // ..................................................................................................................................................................................................|...........................................................*........................................................................................................ 
+ // sub v24.4s, v15.4s, v23.4s // ..................................................................................................................................................................................................|...........................................................................*........................................................................................ + // add v15.4s, v15.4s, v23.4s // ..................................................................................................................................................................................................|..............................................................................*..................................................................................... + // mul v23.4s, v24.4s, v0.s[0] // ..................................................*...............................................................................................................................................|.......................................................................................................................................*............................ + // sqrdmulh v24.4s, v24.4s, v0.s[1] // ...........................................*......................................................................................................................................................|................................................................................................................................*................................... + // mls v23.4s, v24.4s, v29.4s // .....................................................*............................................................................................................................................|..........................................................................................................................................*......................... 
+ // cmge v27.4s, v31.4s, v16.4s // ..................................................................................................................................................................................................|........................*........................................................................................................................................... + // cmge v28.4s, v16.4s, v30.4s // ..................................................................................................................................................................................................|..........................*......................................................................................................................................... + // sub v28.4s, v27.4s, v28.4s // ..................................................................................................................................................................................................|...........................*........................................................................................................................................ + // mls v16.4s, v28.4s, v29.4s // ..................................................................................................................................................................................................|..............................*..................................................................................................................................... + // cmge v27.4s, v31.4s, v17.4s // .............................................................................................................................................................................e....................|.................................................................................................................................................................... 
+ // cmge v28.4s, v17.4s, v30.4s // ...............................................................................................................................................................................e..................|.................................................................................................................................................................... + // sub v28.4s, v27.4s, v28.4s // .................................................................................................................................................................................e................|.................................................................................................................................................................... + // mls v17.4s, v28.4s, v29.4s // ...................................................................................................................................................................................e..............|.................................................................................................................................................................... + // cmge v27.4s, v31.4s, v18.4s // ...*..............................................................................................................................................................................................|........................................................................................*........................................................................... + // cmge v28.4s, v18.4s, v30.4s // .*................................................................................................................................................................................................|......................................................................................*............................................................................. 
+ // sub v28.4s, v27.4s, v28.4s // ............*.....................................................................................................................................................................................|.................................................................................................*.................................................................. + // mls v18.4s, v28.4s, v29.4s // .................................*................................................................................................................................................................|......................................................................................................................*............................................. + // cmge v27.4s, v31.4s, v19.4s // ..................................................................................................................................................................................................|..................................................................*................................................................................................. + // cmge v28.4s, v19.4s, v30.4s // ..................................................................................................................................................................................................|......................................................................*............................................................................................. + // sub v28.4s, v27.4s, v28.4s // ..................................................................................................................................................................................................|........................................................................*........................................................................................... 
+ // mls v19.4s, v28.4s, v29.4s // ..................................................................................................................................................................................................|.............................................................................*...................................................................................... + // cmge v27.4s, v31.4s, v20.4s // ..................................................................................................................................................................................................|......................................................*............................................................................................................. + // cmge v28.4s, v20.4s, v30.4s // ..................................................................................................................................................................................................|....................................................*............................................................................................................... + // sub v28.4s, v27.4s, v28.4s // ..................................................................................................................................................................................................|.......................................................*............................................................................................................ + // mls v20.4s, v28.4s, v29.4s // ..................................................................................................................................................................................................|....................................................................*............................................................................................... 
+ // cmge v27.4s, v31.4s, v21.4s // ..................................................................................................................................................................................................|...................................................*................................................................................................................ + // cmge v28.4s, v21.4s, v30.4s // ..................................................................................................................................................................................................|.................................................*.................................................................................................................. + // sub v28.4s, v27.4s, v28.4s // ..................................................................................................................................................................................................|..........................................................*......................................................................................................... + // mls v21.4s, v28.4s, v29.4s // ..................................................................................................................................................................................................|.............................................................*...................................................................................................... + // cmge v27.4s, v31.4s, v22.4s // ..................................................................................................................................................................................................|............................................................................*....................................................................................... 
+ // cmge v28.4s, v22.4s, v30.4s // ..................................................................................................................................................................................................|.................................................................................*.................................................................................. + // sub v28.4s, v27.4s, v28.4s // ..................................................................................................................................................................................................|....................................................................................*............................................................................... + // mls v22.4s, v28.4s, v29.4s // ..........*.......................................................................................................................................................................................|...............................................................................................*.................................................................... + // cmge v27.4s, v31.4s, v23.4s // ..........................................................*.......................................................................................................................................|...............................................................................................................................................*.................... + // cmge v28.4s, v23.4s, v30.4s // ............................................................*.....................................................................................................................................|.................................................................................................................................................*.................. 
+ // sub v28.4s, v27.4s, v28.4s // ...................................................................*..............................................................................................................................|........................................................................................................................................................*........... + // mls v23.4s, v28.4s, v29.4s // .........................................................................*........................................................................................................................|..............................................................................................................................................................*..... + // str q16, [x1, #(8*(512/8))] // ..................................................................................................................................................................................................|...................................*................................................................................................................................ + // str q17, [x1, #(9*(512/8))] // ........................................................................................................................................................................................e.........|.................................................................................................................................................................... + // str q18, [x1, #(10*(512/8))] // .........................................*........................................................................................................................................................|..............................................................................................................................*..................................... 
+ // str q19, [x1, #(11*(512/8))] // ..................................................................................................................................................................................................|..................................................................................*................................................................................. + // str q20, [x1, #(12*(512/8))] // ..................................................................................................................................................................................................|.........................................................................*.......................................................................................... + // str q21, [x1, #(13*(512/8))] // ..................................................................................................................................................................................................|................................................................*................................................................................................... + // str q22, [x1, #(14*(512/8))] // ..................*...............................................................................................................................................................................|.......................................................................................................*............................................................ 
+ // str q23, [x1, #(15*(512/8))] // ..............................................................................*...................................................................................................................|...................................................................................................................................................................* + // mul v16.4s, v8.4s, v25.4s // ..................................................................................................................................................................................................|.............*...................................................................................................................................................... + // sqrdmulh v8.4s, v8.4s, v26.4s // ..................................................................................................................................................................................................|..............*..................................................................................................................................................... + // mls v16.4s, v8.4s, v29.4s // ..................................................................................................................................................................................................|.........................*.......................................................................................................................................... + // mul v17.4s, v9.4s, v25.4s // ..................................................................................................................................................................................................|................................................................................*................................................................................... 
+ // sqrdmulh v9.4s, v9.4s, v26.4s // ..................................................................................................................................................................................................|...............................................................................*.................................................................................... + // mls v17.4s, v9.4s, v29.4s // ..................................................................................................................................................................................................|...................................................................................*................................................................................ + // mul v18.4s, v10.4s, v25.4s // ..................................................................................................................................................................................................|.......................*............................................................................................................................................ + // sqrdmulh v10.4s, v10.4s, v26.4s // ..................................................................................................................................................................................................|......................*............................................................................................................................................. + // mls v18.4s, v10.4s, v29.4s // ..................................................................................................................................................................................................|.............................*...................................................................................................................................... 
+ // mul v19.4s, v11.4s, v25.4s // ..................................................................................................................................................................................................|............................................................*....................................................................................................... + // sqrdmulh v11.4s, v11.4s, v26.4s // ..................................................................................................................................................................................................|.........................................................*.......................................................................................................... + // mls v19.4s, v11.4s, v29.4s // ..................................................................................................................................................................................................|.......................................................................*............................................................................................ + // mul v20.4s, v12.4s, v25.4s // ................................*.................................................................................................................................................................|.....................................................................................................................*.............................................. + // sqrdmulh v12.4s, v12.4s, v26.4s // ..........................*.......................................................................................................................................................................|...............................................................................................................*.................................................... 
+ // mls v20.4s, v12.4s, v29.4s // ...................................*..............................................................................................................................................................|........................................................................................................................*........................................... + // mul v21.4s, v13.4s, v25.4s // ....*.............................................................................................................................................................................................|.........................................................................................*.......................................................................... + // sqrdmulh v13.4s, v13.4s, v26.4s // ......*...........................................................................................................................................................................................|...........................................................................................*........................................................................ + // mls v21.4s, v13.4s, v29.4s // ................*.................................................................................................................................................................................|.....................................................................................................*.............................................................. + // mul v22.4s, v14.4s, v25.4s // ..................................................................................................................................................................................................|...............................................................*.................................................................................................... 
+ // sqrdmulh v14.4s, v14.4s, v26.4s // ..................................................................................................................................................................................................|.................................................................*.................................................................................................. + // mls v22.4s, v14.4s, v29.4s // ..................................................................................................................................................................................................|..........................................................................*......................................................................................... + // mul v23.4s, v15.4s, v25.4s // ......................*...........................................................................................................................................................................|...........................................................................................................*........................................................ + // sqrdmulh v15.4s, v15.4s, v26.4s // .............*....................................................................................................................................................................................|..................................................................................................*................................................................. + // mls v23.4s, v15.4s, v29.4s // ........................*.........................................................................................................................................................................|.............................................................................................................*...................................................... 
+ // cmge v27.4s, v31.4s, v16.4s // ..................................................................................................................................................................................................|..........................................*......................................................................................................................... + // cmge v28.4s, v16.4s, v30.4s // ..................................................................................................................................................................................................|............................................*....................................................................................................................... + // sub v28.4s, v27.4s, v28.4s // ..................................................................................................................................................................................................|................................................*................................................................................................................... + // mls v16.4s, v28.4s, v29.4s // ..................................................................................................................................................................................................|.....................................................*.............................................................................................................. + // cmge v27.4s, v31.4s, v17.4s // ........*.........................................................................................................................................................................................|.............................................................................................*...................................................................... 
+ // cmge v28.4s, v17.4s, v30.4s // ...........*......................................................................................................................................................................................|................................................................................................*................................................................... + // sub v28.4s, v27.4s, v28.4s // ...............*..................................................................................................................................................................................|....................................................................................................*............................................................... + // mls v17.4s, v28.4s, v29.4s // ....................*.............................................................................................................................................................................|.........................................................................................................*.......................................................... + // cmge v27.4s, v31.4s, v18.4s // ..................................................................................................................................................................................................|................................*................................................................................................................................... + // cmge v28.4s, v18.4s, v30.4s // ..................................................................................................................................................................................................|..................................*................................................................................................................................. 
+ // sub v28.4s, v27.4s, v28.4s // ..................................................................................................................................................................................................|.....................................*.............................................................................................................................. + // mls v18.4s, v28.4s, v29.4s // ..................................................................................................................................................................................................|.........................................*.......................................................................................................................... + // cmge v27.4s, v31.4s, v19.4s // .....*............................................................................................................................................................................................|..........................................................................................*......................................................................... + // cmge v28.4s, v19.4s, v30.4s // .......*..........................................................................................................................................................................................|............................................................................................*....................................................................... + // sub v28.4s, v27.4s, v28.4s // ..............*...................................................................................................................................................................................|...................................................................................................*................................................................ 
+ // mls v19.4s, v28.4s, v29.4s // .............................................*....................................................................................................................................................|..................................................................................................................................*................................. + // cmge v27.4s, v31.4s, v20.4s // ............................................*.....................................................................................................................................................|.................................................................................................................................*.................................. + // cmge v28.4s, v20.4s, v30.4s // ..............................................*...................................................................................................................................................|...................................................................................................................................*................................ + // sub v28.4s, v27.4s, v28.4s // ...................................................*..............................................................................................................................................|........................................................................................................................................*........................... + // mls v20.4s, v28.4s, v29.4s // ......................................................*...........................................................................................................................................|...........................................................................................................................................*........................ 
+ // cmge v27.4s, v31.4s, v21.4s // ..................................*...............................................................................................................................................................|.......................................................................................................................*............................................ + // cmge v28.4s, v21.4s, v30.4s // .........................*........................................................................................................................................................................|..............................................................................................................*..................................................... + // sub v28.4s, v27.4s, v28.4s // .......................................*..........................................................................................................................................................|............................................................................................................................*....................................... + // mls v21.4s, v28.4s, v29.4s // ...............................................................*..................................................................................................................................|....................................................................................................................................................*............... + // cmge v27.4s, v31.4s, v22.4s // ...................*..............................................................................................................................................................................|........................................................................................................*........................................................... 
+ // cmge v28.4s, v22.4s, v30.4s // .................*................................................................................................................................................................................|......................................................................................................*............................................................. + // sub v28.4s, v27.4s, v28.4s // .......................*..........................................................................................................................................................................|............................................................................................................*....................................................... + // mls v22.4s, v28.4s, v29.4s // ........................................*.........................................................................................................................................................|.............................................................................................................................*...................................... + // cmge v27.4s, v31.4s, v23.4s // ...............................*..................................................................................................................................................................|....................................................................................................................*............................................... + // cmge v28.4s, v23.4s, v30.4s // ..........................................*.......................................................................................................................................................|...............................................................................................................................*.................................... 
+ // sub v28.4s, v27.4s, v28.4s // ................................................*.................................................................................................................................................|.....................................................................................................................................*.............................. + // mls v23.4s, v28.4s, v29.4s // ...........................................................*......................................................................................................................................|................................................................................................................................................*................... + // str q16, [x1], #(16) // ..................................................................................................................................................................................................|........................................................*........................................................................................................... + // str q17, [x1, #(-16 + 1*(512/8))] // ..............................*...................................................................................................................................................................|...................................................................................................................*................................................ + // str q18, [x1, #(-16 + 2*(512/8))] // ..................................................................................................................................................................................................|..............................................*..................................................................................................................... 
+ // str q19, [x1, #(-16 + 3*(512/8))] // ........................................................*.........................................................................................................................................|.............................................................................................................................................*...................... + // str q20, [x1, #(-16 + 4*(512/8))] // ..................................................................*...............................................................................................................................|.......................................................................................................................................................*............ + // str q21, [x1, #(-16 + 5*(512/8))] // ..........................................................................*.......................................................................................................................|...............................................................................................................................................................*.... + // str q22, [x1, #(-16 + 6*(512/8))] // ...............................................*..................................................................................................................................................|....................................................................................................................................*............................... 
+ // str q23, [x1, #(-16 + 7*(512/8))] // .......................................................................*..........................................................................................................................|............................................................................................................................................................*....... + + sub count, count, #1 + cbnz count, layer1234_start + mls v15.4S, v24.4S, v29.4S // .............................................................................................................................*.......................................................................................................................................................... + add v24.4S, v10.4S, v14.4S // ...........................................................................................................*............................................................................................................................................................................ + sub v20.4S, v27.4S, v19.4S // .......................................................................................................................................................*................................................................................................................................ + sub v10.4S, v12.4S, v16.4S // ............................................................................................................................................................*........................................................................................................................... 
+ sqrdmulh v14.4S, v18.4S, v0.S[1] // ...........................................................................................................................................*............................................................................................................................................ + add v12.4S, v12.4S, v16.4S // .............................................................................................................................................................*.......................................................................................................................... + mul v16.4S, v18.4S, v0.S[0] // ..........................................................................................................................................*............................................................................................................................................. + sub v18.4S, v11.4S, v15.4S // .................................................................................................................................................................*...................................................................................................................... + add v13.4S, v11.4S, v15.4S // ..................................................................................................................................................................*..................................................................................................................... + sqrdmulh v15.4S, v21.4S, v0.S[3] // .............................................................................................................*.......................................................................................................................................................................... 
+ add v11.4S, v27.4S, v19.4S // ........................................................................................................................................................*............................................................................................................................... + mul v19.4S, v21.4S, v0.S[2] // ............................................................................................................*........................................................................................................................................................................... + mul v27.4S, v20.4S, v0.S[0] // .........................................................................................................................................................*.............................................................................................................................. + mls v19.4S, v15.4S, v29.4S // ..............................................................................................................*......................................................................................................................................................................... + mls v16.4S, v14.4S, v29.4S // ............................................................................................................................................*........................................................................................................................................... + sqrdmulh v15.4S, v10.4S, v0.S[1] // ...............................................................................................................................................................*........................................................................................................................ 
+ sub v21.4S, v19.4S, v22.4S // ......................................................................................................................................................................*................................................................................................................. + add v14.4S, v19.4S, v22.4S // .......................................................................................................................................................................*................................................................................................................ + sqrdmulh v22.4S, v20.4S, v0.S[1] // ..........................................................................................................................................................*............................................................................................................................. + cmge v19.4S, v31.4S, v16.4S // ................................................................................................................................................................................*....................................................................................................... + mul v20.4S, v10.4S, v0.S[0] // ..............................................................................................................................................................*......................................................................................................................... + cmge v10.4S, v16.4S, v30.4S // .................................................................................................................................................................................*...................................................................................................... 
+ mls v20.4S, v15.4S, v29.4S // ................................................................................................................................................................*....................................................................................................................... + sub v15.4S, v24.4S, v28.4S // ..................................................................................................................................................*..................................................................................................................................... + sub v10.4S, v19.4S, v10.4S // ..................................................................................................................................................................................*..................................................................................................... + mul v19.4S, v17.4S, v1.S[0] // .....................................................................................................................................*.................................................................................................................................................. + mls v16.4S, v10.4S, v29.4S // ...................................................................................................................................................................................*.................................................................................................... + add v10.4S, v24.4S, v28.4S // ...................................................................................................................................................*.................................................................................................................................... 
+ str q16, [x1, #512] // ................................................................................................................................................................................................................*....................................................................... + sqrdmulh v24.4S, v17.4S, v1.S[1] // ......................................................................................................................................*................................................................................................................................................. + cmge v16.4S, v20.4S, v30.4S // .................................................................................................................................................................................................*...................................................................................... + cmge v17.4S, v31.4S, v20.4S // ................................................................................................................................................................................................*....................................................................................... + mls v27.4S, v22.4S, v29.4S // ...........................................................................................................................................................*............................................................................................................................ + sub v22.4S, v17.4S, v16.4S // ..................................................................................................................................................................................................*..................................................................................... 
+ sqrdmulh v17.4S, v18.4S, v0.S[1] // ....................................................................................................................................................................*................................................................................................................... + mul v28.4S, v18.4S, v0.S[0] // ...................................................................................................................................................................*.................................................................................................................... + cmge v18.4S, v31.4S, v27.4S // ............................................................................................................................................................................................*........................................................................................... + mls v20.4S, v22.4S, v29.4S // ...................................................................................................................................................................................................*.................................................................................... + cmge v22.4S, v27.4S, v30.4S // .............................................................................................................................................................................................*.......................................................................................... + mls v19.4S, v24.4S, v29.4S // .......................................................................................................................................*................................................................................................................................................ 
+ sub v22.4S, v18.4S, v22.4S // ..............................................................................................................................................................................................*......................................................................................... + sqrdmulh v24.4S, v15.4S, v0.S[1] // .....................................................................................................................................................*.................................................................................................................................. + str q20, [x1, #768] // ....................................................................................................................................................................................................................*................................................................... + mul v20.4S, v15.4S, v0.S[0] // ....................................................................................................................................................*................................................................................................................................... + mul v16.4S, v8.4S, v25.4S // ........................................................................................................................................................................................................................*............................................................... + sqrdmulh v15.4S, v8.4S, v26.4S // .........................................................................................................................................................................................................................*.............................................................. 
+ sqrdmulh v8.4S, v10.4S, v26.4S // ...............................................................................................................................................................................................................................*........................................................ + mul v10.4S, v10.4S, v25.4S // ..............................................................................................................................................................................................................................*......................................................... + mul v18.4S, v21.4S, v0.S[0] // ........................................................................................................................................................................*............................................................................................................... + sqrdmulh v21.4S, v21.4S, v0.S[1] // .........................................................................................................................................................................*.............................................................................................................. + mls v20.4S, v24.4S, v29.4S // ......................................................................................................................................................*................................................................................................................................. + mls v27.4S, v22.4S, v29.4S // ...............................................................................................................................................................................................*........................................................................................ 
+ mls v16.4S, v15.4S, v29.4S // ..........................................................................................................................................................................................................................*............................................................. + add v15.4S, v23.4S, v19.4S // ............................................................................................................................................................................*........................................................................................................... + cmge v22.4S, v20.4S, v30.4S // .........................................................................................................................................................................................*.............................................................................................. + mls v28.4S, v17.4S, v29.4S // .....................................................................................................................................................................*.................................................................................................................. + str q27, [x1, #704] // ...................................................................................................................................................................................................................*.................................................................... + sub v27.4S, v23.4S, v19.4S // ...........................................................................................................................................................................*............................................................................................................ 
+ mls v18.4S, v21.4S, v29.4S // ..........................................................................................................................................................................*............................................................................................................. + cmge v24.4S, v31.4S, v20.4S // ........................................................................................................................................................................................*............................................................................................... + mls v10.4S, v8.4S, v29.4S // ................................................................................................................................................................................................................................*....................................................... + cmge v8.4S, v28.4S, v30.4S // .....................................................................................................................................................................................................*.................................................................................. + cmge v23.4S, v31.4S, v28.4S // ....................................................................................................................................................................................................*................................................................................... + sub v22.4S, v24.4S, v22.4S // ..........................................................................................................................................................................................*............................................................................................. 
+ mul v17.4S, v9.4S, v25.4S // ...........................................................................................................................................................................................................................*............................................................ + cmge v24.4S, v31.4S, v16.4S // ................................................................................................................................................................................................................................................*....................................... + sqrdmulh v9.4S, v9.4S, v26.4S // ............................................................................................................................................................................................................................*........................................................... + cmge v19.4S, v16.4S, v30.4S // .................................................................................................................................................................................................................................................*...................................... + sub v8.4S, v23.4S, v8.4S // ......................................................................................................................................................................................................*................................................................................. + mls v20.4S, v22.4S, v29.4S // ...........................................................................................................................................................................................*............................................................................................ 
+ cmge v22.4S, v18.4S, v30.4S // .........................................................................................................................................................................................................*.............................................................................. + sub v24.4S, v24.4S, v19.4S // ..................................................................................................................................................................................................................................................*..................................... + mls v28.4S, v8.4S, v29.4S // .......................................................................................................................................................................................................*................................................................................ + cmge v8.4S, v31.4S, v18.4S // ........................................................................................................................................................................................................*............................................................................... + cmge v23.4S, v10.4S, v30.4S // .........................................................................................................................................................................................................................................................*.............................. + mls v16.4S, v24.4S, v29.4S // ...................................................................................................................................................................................................................................................*.................................... 
+ cmge v24.4S, v31.4S, v10.4S // ........................................................................................................................................................................................................................................................*............................... + str q20, [x1, #640] // ..................................................................................................................................................................................................................*..................................................................... + sub v20.4S, v8.4S, v22.4S // ..........................................................................................................................................................................................................*............................................................................. + mul v19.4S, v11.4S, v25.4S // .................................................................................................................................................................................................................................*...................................................... + str q28, [x1, #832] // .....................................................................................................................................................................................................................*.................................................................. + sub v8.4S, v24.4S, v23.4S // ..........................................................................................................................................................................................................................................................*............................. 
+ sqrdmulh v22.4S, v11.4S, v26.4S // ..................................................................................................................................................................................................................................*..................................................... + str q16, [x1], #(16) // ................................................................................................................................................................................................................................................................................*....... + sqrdmulh v16.4S, v27.4S, v0.S[1] // ..............................................................................................................................................................................*......................................................................................................... + mls v19.4S, v22.4S, v29.4S // ...................................................................................................................................................................................................................................*.................................................... + mls v10.4S, v8.4S, v29.4S // ...........................................................................................................................................................................................................................................................*............................ + sqrdmulh v23.4S, v14.4S, v26.4S // ...........................................................................................................................................................................................................................................*............................................ 
+ cmge v8.4S, v19.4S, v30.4S // .............................................................................................................................................................................................................................................................*.......................... + mul v22.4S, v14.4S, v25.4S // ..........................................................................................................................................................................................................................................*............................................. + cmge v14.4S, v31.4S, v19.4S // ............................................................................................................................................................................................................................................................*........................... + str q10, [x1, #112] // ..................................................................................................................................................................................................................................................................................*..... + sqrdmulh v28.4S, v13.4S, v26.4S // ........................................................................................................................................................................................................................................*............................................... + mul v21.4S, v13.4S, v25.4S // .......................................................................................................................................................................................................................................*................................................ 
+ mls v17.4S, v9.4S, v29.4S // .............................................................................................................................................................................................................................*.......................................................... + mls v18.4S, v20.4S, v29.4S // ...........................................................................................................................................................................................................*............................................................................ + sqrdmulh v9.4S, v12.4S, v26.4S // .....................................................................................................................................................................................................................................*.................................................. + cmge v24.4S, v31.4S, v17.4S // ....................................................................................................................................................................................................................................................*................................... + mul v20.4S, v12.4S, v25.4S // ....................................................................................................................................................................................................................................*................................................... + str q18, [x1, #880] // ......................................................................................................................................................................................................................*................................................................. 
+ cmge v18.4S, v17.4S, v30.4S // .....................................................................................................................................................................................................................................................*.................................. + sqrdmulh v12.4S, v15.4S, v26.4S // ..............................................................................................................................................................................................................................................*......................................... + mul v15.4S, v15.4S, v25.4S // .............................................................................................................................................................................................................................................*.......................................... + sub v24.4S, v24.4S, v18.4S // ......................................................................................................................................................................................................................................................*................................. + mul v27.4S, v27.4S, v0.S[0] // .............................................................................................................................................................................*.......................................................................................................... + mls v27.4S, v16.4S, v29.4S // ...............................................................................................................................................................................*........................................................................................................ 
+ mls v20.4S, v9.4S, v29.4S // ......................................................................................................................................................................................................................................*................................................. + sub v9.4S, v14.4S, v8.4S // ..............................................................................................................................................................................................................................................................*......................... + mls v21.4S, v28.4S, v29.4S // .........................................................................................................................................................................................................................................*.............................................. + cmge v14.4S, v31.4S, v27.4S // ............................................................................................................................................................................................................*........................................................................... + mls v17.4S, v24.4S, v29.4S // .......................................................................................................................................................................................................................................................*................................ + cmge v10.4S, v27.4S, v30.4S // .............................................................................................................................................................................................................*.......................................................................... 
+ cmge v28.4S, v20.4S, v30.4S // .................................................................................................................................................................................................................................................................*...................... + str q17, [x1, #48] // .................................................................................................................................................................................................................................................................................*...... + cmge v17.4S, v31.4S, v20.4S // ................................................................................................................................................................................................................................................................*....................... + mls v15.4S, v12.4S, v29.4S // ...............................................................................................................................................................................................................................................*........................................ + cmge v16.4S, v31.4S, v21.4S // ....................................................................................................................................................................................................................................................................*................... + cmge v12.4S, v21.4S, v30.4S // .....................................................................................................................................................................................................................................................................*.................. 
+ sub v28.4S, v17.4S, v28.4S // ..................................................................................................................................................................................................................................................................*..................... + mls v22.4S, v23.4S, v29.4S // ............................................................................................................................................................................................................................................*........................................... + sub v10.4S, v14.4S, v10.4S // ..............................................................................................................................................................................................................*......................................................................... + mls v19.4S, v9.4S, v29.4S // ...............................................................................................................................................................................................................................................................*........................ + sub v14.4S, v16.4S, v12.4S // ......................................................................................................................................................................................................................................................................*................. + cmge v17.4S, v15.4S, v30.4S // .............................................................................................................................................................................................................................................................................*.......... 
+ mls v20.4S, v28.4S, v29.4S // ...................................................................................................................................................................................................................................................................*.................... + cmge v16.4S, v31.4S, v15.4S // ............................................................................................................................................................................................................................................................................*........... + cmge v28.4S, v22.4S, v30.4S // .........................................................................................................................................................................................................................................................................*.............. + mls v21.4S, v14.4S, v29.4S // .......................................................................................................................................................................................................................................................................*................ + cmge v14.4S, v31.4S, v22.4S // ........................................................................................................................................................................................................................................................................*............... + str q19, [x1, #176] // ...................................................................................................................................................................................................................................................................................*.... 
+ sub v19.4S, v16.4S, v17.4S // ..............................................................................................................................................................................................................................................................................*......... + mls v27.4S, v10.4S, v29.4S // ...............................................................................................................................................................................................................*........................................................................ + str q20, [x1, #240] // ....................................................................................................................................................................................................................................................................................*... + sub v14.4S, v14.4S, v28.4S // ..........................................................................................................................................................................................................................................................................*............. + mls v15.4S, v19.4S, v29.4S // ...............................................................................................................................................................................................................................................................................*........ + str q21, [x1, #304] // .....................................................................................................................................................................................................................................................................................*.. 
+ mls v22.4S, v14.4S, v29.4S // ...........................................................................................................................................................................................................................................................................*............ + str q27, [x1, #944] // .......................................................................................................................................................................................................................*................................................................ + str q15, [x1, #432] // .......................................................................................................................................................................................................................................................................................* + str q22, [x1, #368] // ......................................................................................................................................................................................................................................................................................*. 
+ + pop_stack + ret \ No newline at end of file diff --git a/tests/ntt_dilithium/manual/intt_dilithium_1234_5678_opt_a55.s b/tests/ntt_dilithium/manual/intt_dilithium_1234_5678_opt_a55.s new file mode 100644 index 0000000..e1ba3ba --- /dev/null +++ b/tests/ntt_dilithium/manual/intt_dilithium_1234_5678_opt_a55.s @@ -0,0 +1,1718 @@ +/// +/// Copyright (c) 2022 Arm Limited +/// Copyright (c) 2022 Hanno Becker +/// Copyright (c) 2023 Amin Abdulrahman, Matthias Kannwischer +/// SPDX-License-Identifier: MIT +/// +/// Permission is hereby granted, free of charge, to any person obtaining a copy +/// of this software and associated documentation files (the "Software"), to deal +/// in the Software without restriction, including without limitation the rights +/// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +/// copies of the Software, and to permit persons to whom the Software is +/// furnished to do so, subject to the following conditions: +/// +/// The above copyright notice and this permission notice shall be included in all +/// copies or substantial portions of the Software. +/// +/// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +/// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +/// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +/// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +/// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +/// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +/// SOFTWARE. +/// + +// Needed to provide ASM_LOAD directive +#include <hal_env.h> + +// NOTE +// We use a lot of trivial macros to simplify the parsing burden for Slothy +// The macros are not unfolded by Slothy and thus interpreted as instructions, +// which are easier to parse due to e.g. the lack of size specifiers and simpler +// syntax for pre and post increment for loads and stores.
+ +// Eventually, Slothy should include a proper parser for AArch64, +// but for initial investigations, the below is enough. + +.macro ldr_vo vec, base, offset + ldr qform_\vec, [\base, #\offset] +.endm +.macro ldr_vi vec, base, inc + ldr qform_\vec, [\base], #\inc +.endm +.macro str_vo vec, base, offset + str qform_\vec, [\base, #\offset] +.endm +.macro str_vi vec, base, inc + str qform_\vec, [\base], #\inc +.endm +.macro vsub d,a,b + sub \d\().4s, \a\().4s, \b\().4s +.endm +.macro vadd d,a,b + add \d\().4s, \a\().4s, \b\().4s +.endm +.macro vqrdmulh d,a,b + sqrdmulh \d\().4s, \a\().4s, \b\().4s +.endm +.macro vmul d,a,b + mul \d\().4s, \a\().4s, \b\().4s +.endm +.macro vmls d,a,b + mls \d\().4s, \a\().4s, \b\().4s +.endm +.macro vqrdmulhq d,a,b,i + sqrdmulh \d\().4s, \a\().4s, \b\().s[\i] +.endm +.macro vmulq d,a,b,i + mul \d\().4s, \a\().4s, \b\().s[\i] +.endm +.macro vmlsq d,a,b,i + mls \d\().4s, \a\().4s, \b\().s[\i] +.endm +.macro trn1_d d,a,b + trn1 \d\().2d, \a\().2d, \b\().2d +.endm +.macro trn2_d d,a,b + trn2 \d\().2d, \a\().2d, \b\().2d +.endm +.macro trn1_s d,a,b + trn1 \d\().4s, \a\().4s, \b\().4s +.endm +.macro trn2_s d,a,b + trn2 \d\().4s, \a\().4s, \b\().4s +.endm + +.macro mulmodq dst, src, const, idx0, idx1 + vmulq \dst, \src, \const, \idx0 + vqrdmulhq \src, \src, \const, \idx1 + vmls \dst, \src, modulus +.endm + +.macro mulmod dst, src, const, const_twisted + vmul \dst, \src, \const + vqrdmulh \src, \src, \const_twisted + vmls \dst, \src, modulus +.endm + +.macro montg_reduce a + srshr tmp.4S, \a\().4S, #23 + vmls \a, tmp, modulus +.endm + +.macro canonical_reduce a, modulus_half, neg_modulus_half, tmp1, tmp2 + cmge \tmp1\().4s, \neg_modulus_half\().4s, \a\().4s + cmge \tmp2\().4s, \a\().4s, \modulus_half\().4s + sub \tmp2\().4s, \tmp1\().4s, \tmp2\().4s + vmls \a, \tmp2, modulus +.endm + +.macro gs_butterfly a, b, root, idx0, idx1 + vsub tmp, \a, \b + vadd \a, \a, \b + mulmodq \b, tmp, \root, \idx0, \idx1 +.endm + +.macro mulmod_v dst, src,
const, const_twisted + vmul \dst, \src, \const + vqrdmulh \src, \src, \const_twisted + vmls \dst, \src, modulus +.endm + +.macro gs_butterfly_v a, b, root, root_twisted + vsub tmp, \a, \b + vadd \a, \a, \b + mulmod \b, tmp, \root, \root_twisted +.endm + +.macro mul_ninv dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, src0, src1, src2, src3, src4, src5, src6, src7 + mulmod \dst0, \src0, ninv, ninv_tw + mulmod \dst1, \src1, ninv, ninv_tw + mulmod \dst2, \src2, ninv, ninv_tw + mulmod \dst3, \src3, ninv, ninv_tw + mulmod \dst4, \src4, ninv, ninv_tw + mulmod \dst5, \src5, ninv, ninv_tw + mulmod \dst6, \src6, ninv, ninv_tw + mulmod \dst7, \src7, ninv, ninv_tw +.endm + +.macro load_roots_1234 r_ptr + ldr_vi root0, \r_ptr, (8*16) + ldr_vo root1, \r_ptr, (-8*16 + 1*16) + ldr_vo root2, \r_ptr, (-8*16 + 2*16) + ldr_vo root3, \r_ptr, (-8*16 + 3*16) + ldr_vo root4, \r_ptr, (-8*16 + 4*16) + ldr_vo root5, \r_ptr, (-8*16 + 5*16) + ldr_vo root6, \r_ptr, (-8*16 + 6*16) + ldr_vo root7, \r_ptr, (-8*16 + 7*16) +.endm + +.macro load_next_roots_56 root0, r_ptr0 + ldr_vi \root0, \r_ptr0, 16 +.endm + +.macro load_next_roots_6 root0, r_ptr0 + ldr_vi \root0, \r_ptr0, 8 +.endm + +.macro load_next_roots_78 root0, root0_tw, root1, root1_tw, root2, root2_tw, r_ptr1 + ldr_vi \root0, \r_ptr1, (6*16) + ldr_vo \root0_tw, \r_ptr1, (-6*16 + 1*16) + ldr_vo \root1, \r_ptr1, (-6*16 + 2*16) + ldr_vo \root1_tw, \r_ptr1, (-6*16 + 3*16) + ldr_vo \root2, \r_ptr1, (-6*16 + 4*16) + ldr_vo \root2_tw, \r_ptr1, (-6*16 + 5*16) +.endm + +.macro transpose4 data + trn1_s t0, \data\()0, \data\()1 + trn2_s t1, \data\()0, \data\()1 + trn1_s t2, \data\()2, \data\()3 + trn2_s t3, \data\()2, \data\()3 + + trn2_d \data\()2, t0, t2 + trn2_d \data\()3, t1, t3 + trn1_d \data\()0, t0, t2 + trn1_d \data\()1, t1, t3 +.endm + +.macro save_gprs // slothy:no-unfold + // Spill x19-x29 into a fresh 6*16-byte stack frame (undone by restore_gprs) + sub sp, sp, #(16*6) + stp x19, x20, [sp, #16*0] + stp x21, x22, [sp, #16*1] + stp x23, x24, [sp, #16*2] + stp x25, x26, [sp, #16*3] + stp
x27, x28, [sp, #16*4] + str x29, [sp, #16*5] +.endm + +.macro restore_gprs // slothy:no-unfold + ldp x19, x20, [sp, #16*0] + ldp x21, x22, [sp, #16*1] + ldp x23, x24, [sp, #16*2] + ldp x25, x26, [sp, #16*3] + ldp x27, x28, [sp, #16*4] + ldr x29, [sp, #16*5] + add sp, sp, #(16*6) +.endm + +.macro save_vregs // slothy:no-unfold + sub sp, sp, #(16*4) + stp d8, d9, [sp, #16*0] + stp d10, d11, [sp, #16*1] + stp d12, d13, [sp, #16*2] + stp d14, d15, [sp, #16*3] +.endm + +.macro restore_vregs // slothy:no-unfold + ldp d8, d9, [sp, #16*0] + ldp d10, d11, [sp, #16*1] + ldp d12, d13, [sp, #16*2] + ldp d14, d15, [sp, #16*3] + add sp, sp, #(16*4) +.endm + +#define STACK_SIZE 16 +#define STACK0 0 + +.macro restore a, loc // slothy:no-unfold + ldr \a, [sp, #\loc\()] +.endm +.macro save loc, a // slothy:no-unfold + str \a, [sp, #\loc\()] +.endm +.macro push_stack // slothy:no-unfold + save_gprs + save_vregs + sub sp, sp, #STACK_SIZE +.endm + +.macro pop_stack // slothy:no-unfold + add sp, sp, #STACK_SIZE + restore_vregs + restore_gprs +.endm + +.data +.p2align 4 +roots: +#include "intt_dilithium_1234_5678_twiddles.s" +.text + + .global intt_dilithium_1234_5678_opt_a55 + .global _intt_dilithium_1234_5678_opt_a55 + +.p2align 4 +modulus_addr: .quad 8380417 +ninv_addr: .quad 16382 +ninv_tw_addr: .quad 4197891 +intt_dilithium_1234_5678_opt_a55: +_intt_dilithium_1234_5678_opt_a55: + push_stack + + inp .req x0 + in .req x1 + count .req x2 + r_ptr0 .req x3 + r_ptr1 .req x4 + xtmp .req x5 + + data0 .req v8 + data1 .req v9 + data2 .req v10 + data3 .req v11 + data4 .req v12 + data5 .req v13 + data6 .req v14 + data7 .req v15 + data8 .req v16 + data9 .req v17 + data10 .req v18 + data11 .req v19 + data12 .req v20 + data13 .req v21 + data14 .req v22 + data15 .req v23 + + qform_data0 .req q8 + qform_data1 .req q9 + qform_data2 .req q10 + qform_data3 .req q11 + qform_data4 .req q12 + qform_data5 .req q13 + qform_data6 .req q14 + qform_data7 .req q15 + qform_data8 .req q16 + qform_data9 .req q17 +
qform_data10 .req q18 + qform_data11 .req q19 + qform_data12 .req q20 + qform_data13 .req q21 + qform_data14 .req q22 + qform_data15 .req q23 + + root0 .req v0 + root1 .req v1 + root2 .req v2 + root3 .req v3 + root0_tw .req v4 + root1_tw .req v5 + root2_tw .req v6 + root3_tw .req v7 + + + qform_root0 .req q0 + qform_root1 .req q1 + qform_root2 .req q2 + qform_root3 .req q3 + qform_root0_tw .req q4 + qform_root1_tw .req q5 + qform_root2_tw .req q6 + qform_root3_tw .req q7 + + + tmp .req v24 + qform_tmp .req q24 + t0 .req v25 + t1 .req v26 + t2 .req v27 + t3 .req v28 + + modulus .req v29 + + ASM_LOAD(r_ptr0, roots) + ASM_LOAD(r_ptr1, roots_l45) + + ASM_LOAD(xtmp, modulus_addr) + ld1r {modulus.4s}, [xtmp] + + save STACK0, inp + + mov count, #16 + + .p2align 2 + ld4 {v7.4S, v8.4S, v9.4S, v10.4S}, [x0] // *.......................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + ldr q17, [x3, #80] // ............*.............................. + // gap // ........................................... + // gap // ........................................... 
+ // gap // ........................................... + sub v3.4S, v9.4S, v10.4S // ...*....................................... + // gap // ........................................... + add v6.4S, v9.4S, v10.4S // ....*...................................... + // gap // ........................................... + sub v28.4S, v7.4S, v8.4S // .*......................................... + // gap // ........................................... + sqrdmulh v17.4S, v3.4S, v17.4S // ..............*............................ + // gap // ........................................... + ldr q21, [x3, #64] // ...........*............................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + ldr q14, [x3, #48] // .......*................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + ldr q2, [x3, #32] // ......*.................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + mul v24.4S, v3.4S, v21.4S // .............*............................. + // gap // ........................................... + sqrdmulh v14.4S, v28.4S, v14.4S // ..........*................................ + // gap // ........................................... + mul v28.4S, v28.4S, v2.4S // .........*................................. + // gap // ........................................... + ldr q21, [x3, #16] // ..................*........................ + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + add v31.4S, v7.4S, v8.4S // ..*........................................ 
+ // gap // ........................................... + mls v24.4S, v17.4S, v29.4S // ................*.......................... + // gap // ........................................... + mls v28.4S, v14.4S, v29.4S // ...............*........................... + // gap // ........................................... + sub v3.4S, v31.4S, v6.4S // .....*..................................... + // gap // ........................................... + ldr q14, [x3], #(6*16) // .................*......................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + sub v17.4S, v28.4S, v24.4S // ...................*....................... + // gap // ........................................... + sqrdmulh v2.4S, v3.4S, v21.4S // .....................*..................... + // gap // ........................................... + mul v3.4S, v3.4S, v14.4S // ....................*...................... + // gap // ........................................... + sqrdmulh v21.4S, v17.4S, v21.4S // .......................*................... + // gap // ........................................... + mul v14.4S, v17.4S, v14.4S // ......................*.................... + // gap // ........................................... + add v6.4S, v31.4S, v6.4S // ........*.................................. + // gap // ........................................... + add v17.4S, v28.4S, v24.4S // .........................*................. + // gap // ........................................... + mls v3.4S, v2.4S, v29.4S // ........................*.................. + // gap // ........................................... + mls v14.4S, v21.4S, v29.4S // ..........................*................ + // gap // ........................................... + trn2 v2.4S, v6.4S, v17.4S // ............................*.............. 
+ // gap // ........................................... + trn1 v24.4S, v6.4S, v17.4S // ...........................*............... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + trn2 v21.4S, v3.4S, v14.4S // ..............................*............ + // gap // ........................................... + trn1 v14.4S, v3.4S, v14.4S // .............................*............. + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + trn1 v17.2D, v2.2D, v21.2D // ..................................*........ + // gap // ........................................... + trn2 v3.2D, v2.2D, v21.2D // .................................*......... + // gap // ........................................... + trn1 v2.2D, v24.2D, v14.2D // ................................*.......... + // gap // ........................................... + trn2 v31.2D, v24.2D, v14.2D // ...............................*........... + // gap // ........................................... + add v15.4S, v2.4S, v17.4S // ....................................*...... + // gap // ........................................... + add v26.4S, v31.4S, v3.4S // ...................................*....... + // gap // ........................................... + ldr q13, [x4], #8 // .......................................*... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + add v14.4S, v15.4S, v26.4S // .....................................*..... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... 
+ // gap // ........................................... + // gap // ........................................... + srshr v21.4S, v14.4S, #23 // ......................................*.... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + mls v14.4S, v21.4S, v29.4S // ........................................*.. + // gap // ........................................... + ldr q1, [x4], #16 // .........................................*. + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + str q14, [x0], #(16*4) // ..........................................* + // gap // ........................................... + + // original source code + // ld4 {v6.4S, v7.4S, v8.4S, v9.4S}, [x0] // *.......................................... + // sub v14.4S, v6.4S, v7.4S // ....*...................................... + // add v2.4S, v6.4S, v7.4S // .............*............................. + // sub v3.4S, v8.4S, v9.4S // ..*........................................ + // add v24.4S, v8.4S, v9.4S // ...*....................................... + // sub v17.4S, v2.4S, v24.4S // ................*.......................... + // ldr q21, [x3, #32] // ........*.................................. + // ldr q26, [x3, #48] // .......*................................... + // add v2.4S, v2.4S, v24.4S // .......................*................... + // mul v21.4S, v14.4S, v21.4S // ...........*............................... + // sqrdmulh v14.4S, v14.4S, v26.4S // ..........*................................ 
+ // ldr q24, [x3, #64] // ......*.................................... + // ldr q26, [x3, #80] // .*......................................... + // mul v24.4S, v3.4S, v24.4S // .........*................................. + // sqrdmulh v3.4S, v3.4S, v26.4S // .....*..................................... + // mls v21.4S, v14.4S, v29.4S // ...............*........................... + // mls v24.4S, v3.4S, v29.4S // ..............*............................ + // ldr q14, [x3], #(6*16) // .................*......................... + // ldr q3, [x3, #-80] // ............*.............................. + // sub v6.4S, v21.4S, v24.4S // ..................*........................ + // mul v31.4S, v17.4S, v14.4S // ....................*...................... + // sqrdmulh v17.4S, v17.4S, v3.4S // ...................*....................... + // mul v14.4S, v6.4S, v14.4S // ......................*.................... + // sqrdmulh v3.4S, v6.4S, v3.4S // .....................*..................... + // mls v31.4S, v17.4S, v29.4S // .........................*................. + // add v17.4S, v21.4S, v24.4S // ........................*.................. + // mls v14.4S, v3.4S, v29.4S // ..........................*................ + // trn1 v3.4S, v2.4S, v17.4S // ............................*.............. + // trn2 v17.4S, v2.4S, v17.4S // ...........................*............... + // trn1 v2.4S, v31.4S, v14.4S // ..............................*............ + // trn2 v14.4S, v31.4S, v14.4S // .............................*............. + // trn2 v31.2D, v3.2D, v2.2D // ..................................*........ + // trn1 v2.2D, v3.2D, v2.2D // .................................*......... + // trn2 v3.2D, v17.2D, v14.2D // ................................*.......... + // trn1 v17.2D, v17.2D, v14.2D // ...............................*........... + // add v26.4S, v31.4S, v3.4S // ....................................*...... 
+ // add v15.4S, v2.4S, v17.4S // ...................................*....... + // add v14.4S, v15.4S, v26.4S // ......................................*.... + // srshr v21.4S, v14.4S, #23 // .......................................*... + // ldr q13, [x4], #8 // .....................................*..... + // mls v14.4S, v21.4S, v29.4S // ........................................*.. + // ldr q1, [x4], #16 // .........................................*. + // str q14, [x0], #(16*4) // ..........................................* + + sub count, count, #1 +layer5678_start: + ld4 {v6.4S, v7.4S, v8.4S, v9.4S}, [x0] // e................................................................ + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. 
+ // gap // ................................................................. + sub v17.4S, v2.4S, v17.4S // .....................................*........................... + // gap // ................................................................. + sub v21.4S, v31.4S, v3.4S // ..........................................*...................... + // gap // ................................................................. + sub v14.4S, v6.4S, v7.4S // .......e......................................................... + // gap // ................................................................. + add v2.4S, v6.4S, v7.4S // ........e........................................................ + // gap // ................................................................. + sub v3.4S, v8.4S, v9.4S // ............e.................................................... + // gap // ................................................................. + add v24.4S, v8.4S, v9.4S // .............e................................................... + // gap // ................................................................. + mul v28.4S, v17.4S, v1.S[0] // .......................................*......................... + // gap // ................................................................. + sqrdmulh v17.4S, v17.4S, v1.S[1] // ........................................*........................ + // gap // ................................................................. + mul v6.4S, v21.4S, v1.S[2] // ............................................*.................... + // gap // ................................................................. + sqrdmulh v21.4S, v21.4S, v1.S[3] // .............................................*................... + // gap // ................................................................. + sub v31.4S, v15.4S, v26.4S // ...............................................*................. 
+ // gap // ................................................................. + mls v28.4S, v17.4S, v29.4S // .........................................*....................... + // gap // ................................................................. + sub v17.4S, v2.4S, v24.4S // .................e............................................... + // gap // ................................................................. + mls v6.4S, v21.4S, v29.4S // ..............................................*.................. + // gap // ................................................................. + ldr q21, [x3, #32] // ...e............................................................. + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + ldr q26, [x3, #48] // ....e............................................................ + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + add v2.4S, v2.4S, v24.4S // ..................e.............................................. + // gap // ................................................................. + mul v21.4S, v14.4S, v21.4S // .........e....................................................... + // gap // ................................................................. + sqrdmulh v14.4S, v14.4S, v26.4S // ..........e...................................................... + // gap // ................................................................. + ldr q24, [x3, #64] // .....e........................................................... + // gap // ................................................................. 
+ // gap // ................................................................. + // gap // ................................................................. + ldr q26, [x3, #80] // ......e.......................................................... + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + mul v1.4S, v31.4S, v13.S[0] // .................................................*............... + // gap // ................................................................. + mul v24.4S, v3.4S, v24.4S // ..............e.................................................. + // gap // ................................................................. + sqrdmulh v3.4S, v3.4S, v26.4S // ...............e................................................. + // gap // ................................................................. + sqrdmulh v31.4S, v31.4S, v13.S[1] // ..................................................*.............. + // gap // ................................................................. + sub v26.4S, v28.4S, v6.4S // ....................................................*............ + // gap // ................................................................. + add v28.4S, v28.4S, v6.4S // .....................................................*........... + // gap // ................................................................. + mls v21.4S, v14.4S, v29.4S // ...........e..................................................... + // gap // ................................................................. + mls v1.4S, v31.4S, v29.4S // ...................................................*............. + // gap // ................................................................. + mls v24.4S, v3.4S, v29.4S // ................e................................................ 
+ // gap // ................................................................. + ldr q14, [x3], #(6*16) // .e............................................................... + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + ldr q3, [x3, #-80] // ..e.............................................................. + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + sub v6.4S, v21.4S, v24.4S // ......................e.......................................... + // gap // ................................................................. + mul v31.4S, v17.4S, v14.4S // ...................e............................................. + // gap // ................................................................. + sqrdmulh v17.4S, v17.4S, v3.4S // ....................e............................................ + // gap // ................................................................. + mul v14.4S, v6.4S, v14.4S // ........................e........................................ + // gap // ................................................................. + sqrdmulh v3.4S, v6.4S, v3.4S // .........................e....................................... + // gap // ................................................................. + sqrdmulh v6.4S, v26.4S, v13.S[1] // .......................................................*......... + // gap // ................................................................. + mls v31.4S, v17.4S, v29.4S // .....................e........................................... + // gap // ................................................................. 
+ add v17.4S, v21.4S, v24.4S // .......................e......................................... + // gap // ................................................................. + mls v14.4S, v3.4S, v29.4S // ..........................e...................................... + // gap // ................................................................. + mul v21.4S, v26.4S, v13.S[0] // ......................................................*.......... + // gap // ................................................................. + trn1 v3.4S, v2.4S, v17.4S // ...........................e..................................... + // gap // ................................................................. + trn2 v17.4S, v2.4S, v17.4S // ............................e.................................... + // gap // ................................................................. + trn1 v2.4S, v31.4S, v14.4S // .............................e................................... + // gap // ................................................................. + srshr v24.4S, v28.4S, #23 // ...........................................................*..... + // gap // ................................................................. + trn2 v14.4S, v31.4S, v14.4S // ..............................e.................................. + // gap // ................................................................. + trn2 v31.2D, v3.2D, v2.2D // ...............................e................................. + // gap // ................................................................. + trn1 v2.2D, v3.2D, v2.2D // .................................e............................... + // gap // ................................................................. + trn2 v3.2D, v17.2D, v14.2D // ................................e................................ + // gap // ................................................................. 
+ trn1 v17.2D, v17.2D, v14.2D // ..................................e.............................. + // gap // ................................................................. + add v26.4S, v31.4S, v3.4S // ...........................................e..................... + // gap // ................................................................. + add v15.4S, v2.4S, v17.4S // ......................................e.......................... + // gap // ................................................................. + mls v21.4S, v6.4S, v29.4S // ........................................................*........ + // gap // ................................................................. + str q1, [x0, #-32] // ...............................................................*. + // gap // ................................................................. + add v14.4S, v15.4S, v26.4S // ................................................e................ + // gap // ................................................................. + mls v28.4S, v24.4S, v29.4S // ............................................................*.... + // gap // ................................................................. + str q21, [x0, #-16] // ................................................................* + // gap // ................................................................. + srshr v21.4S, v14.4S, #23 // .........................................................e....... + // gap // ................................................................. + ldr q13, [x4], #8 // ...................................e............................. + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + mls v14.4S, v21.4S, v29.4S // ..........................................................e...... 
+ // gap // ................................................................. + str q28, [x0, #-48] // ..............................................................*.. + // gap // ................................................................. + ldr q1, [x4], #16 // ....................................e............................ + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + str q14, [x0], #(16*4) // .............................................................e... + // gap // ................................................................. + + // original source code + // ld4 {v8.4S, v9.4S, v10.4S, v11.4S}, [x0] // e................................................................e.............................................................. + // ldr q0, [x3], #(6*16) // ...............................e.................................|..............................e............................... + // ldr q4, [x3, #(-6*16 + 1*16)] // ................................e................................|...............................e.............................. + // ldr q1, [x3, #(-6*16 + 2*16)] // ...............e.................................................|..............e............................................... + // ldr q5, [x3, #(-6*16 + 3*16)] // ................e................................................|...............e.............................................. + // ldr q2, [x3, #(-6*16 + 4*16)] // ....................e............................................|...................e.......................................... + // ldr q6, [x3, #(-6*16 + 5*16)] // .....................e...........................................|....................e......................................... 
+ // sub v24.4s, v8.4s, v9.4s // ...e.............................................................|..e........................................................... + // add v8.4s, v8.4s, v9.4s // ....e............................................................|...e.......................................................... + // mul v9.4s, v24.4s, v1.4s // ..................e..............................................|.................e............................................ + // sqrdmulh v24.4s, v24.4s, v5.4s // ...................e.............................................|..................e........................................... + // mls v9.4s, v24.4s, v29.4s // ............................e....................................|...........................e.................................. + // sub v24.4s, v10.4s, v11.4s // .....e...........................................................|....e......................................................... + // add v10.4s, v10.4s, v11.4s // ......e..........................................................|.....e........................................................ + // mul v11.4s, v24.4s, v2.4s // .......................e.........................................|......................e....................................... + // sqrdmulh v24.4s, v24.4s, v6.4s // ........................e........................................|.......................e...................................... + // mls v11.4s, v24.4s, v29.4s // ..............................e..................................|.............................e................................ + // sub v24.4s, v8.4s, v10.4s // .............e...................................................|............e................................................. + // add v8.4s, v8.4s, v10.4s // .................e...............................................|................e............................................. 
+ // mul v10.4s, v24.4s, v0.4s // ..................................e..............................|.................................e............................ + // sqrdmulh v24.4s, v24.4s, v4.4s // ...................................e.............................|..................................e........................... + // mls v10.4s, v24.4s, v29.4s // .......................................e.........................|......................................e....................... + // sub v24.4s, v9.4s, v11.4s // .................................e...............................|................................e............................. + // add v9.4s, v9.4s, v11.4s // ........................................e........................|.......................................e...................... + // mul v11.4s, v24.4s, v0.4s // ....................................e............................|...................................e.......................... + // sqrdmulh v24.4s, v24.4s, v4.4s // .....................................e...........................|....................................e......................... + // mls v11.4s, v24.4s, v29.4s // .........................................e.......................|........................................e..................... + // trn1 v25.4s, v8.4s, v9.4s // ...........................................e.....................|..........................................e................... + // trn2 v26.4s, v8.4s, v9.4s // ............................................e....................|...........................................e.................. + // trn1 v27.4s, v10.4s, v11.4s // .............................................e...................|............................................e................. + // trn2 v28.4s, v10.4s, v11.4s // ...............................................e.................|..............................................e............... 
+ // trn2 v10.2d, v25.2d, v27.2d // ................................................e................|...............................................e.............. + // trn2 v11.2d, v26.2d, v28.2d // ..................................................e..............|.................................................e............ + // trn1 v8.2d, v25.2d, v27.2d // .................................................e...............|................................................e............. + // trn1 v9.2d, v26.2d, v28.2d // ...................................................e.............|..................................................e........... + // ldr q1, [x4], #8 // ............................................................e....|...........................................................e.. + // ldr q0, [x4], #16 // ...............................................................e.|.............................................................. + // sub v24.4s, v8.4s, v9.4s // .*...............................................................|*............................................................. + // add v8.4s, v8.4s, v9.4s // .....................................................e...........|....................................................e......... + // mul v9.4s, v24.4s, v0.s[0] // .......*.........................................................|......*....................................................... + // sqrdmulh v24.4s, v24.4s, v0.s[1] // ........*........................................................|.......*...................................................... + // mls v9.4s, v24.4s, v29.4s // ............*....................................................|...........*.................................................. + // sub v24.4s, v10.4s, v11.4s // ..*..............................................................|.*............................................................ 
+ // add v10.4s, v10.4s, v11.4s // ....................................................e............|...................................................e.......... + // mul v11.4s, v24.4s, v0.s[2] // .........*.......................................................|........*..................................................... + // sqrdmulh v24.4s, v24.4s, v0.s[3] // ..........*......................................................|.........*.................................................... + // mls v11.4s, v24.4s, v29.4s // ..............*..................................................|.............*................................................ + // sub v24.4s, v8.4s, v10.4s // ...........*.....................................................|..........*................................................... + // add v8.4s, v8.4s, v10.4s // ........................................................e........|.......................................................e...... + // mul v10.4s, v24.4s, v1.s[0] // ......................*..........................................|.....................*........................................ + // sqrdmulh v24.4s, v24.4s, v1.s[1] // .........................*.......................................|........................*..................................... + // mls v10.4s, v24.4s, v29.4s // .............................*...................................|............................*................................. + // sub v24.4s, v9.4s, v11.4s // ..........................*......................................|.........................*.................................... + // add v9.4s, v9.4s, v11.4s // ...........................*.....................................|..........................*................................... + // mul v11.4s, v24.4s, v1.s[0] // ..........................................*......................|.........................................*.................... 
+ // sqrdmulh v24.4s, v24.4s, v1.s[1] // ......................................*..........................|.....................................*........................ + // mls v11.4s, v24.4s, v29.4s // ......................................................*..........|.....................................................*........ + // srshr v24.4S, v8.4S, #23 // ...........................................................e.....|..........................................................e... + // mls v8.4s, v24.4s, v29.4s // .............................................................e...|............................................................e. + // srshr v24.4S, v9.4S, #23 // ..............................................*..................|.............................................*................ + // mls v9.4s, v24.4s, v29.4s // .........................................................*.......|........................................................*..... + // str q8, [x0], #(16*4) // ................................................................e|.............................................................. + // str q9, [x0, #(-16*4 + 1*16)] // ..............................................................*..|.............................................................* + // str q10, [x0, #(-16*4 + 2*16)] // .......................................................*.........|......................................................*....... + // str q11, [x0, #(-16*4 + 3*16)] // ..........................................................*......|.........................................................*.... + + sub count, count, #1 + cbnz count, layer5678_start + sub v16.4S, v31.4S, v3.4S // .*.................... + // gap // ...................... + sub v24.4S, v2.4S, v17.4S // *..................... + // gap // ...................... + sub v20.4S, v15.4S, v26.4S // ......*............... + // gap // ...................... 
+ mul v17.4S, v16.4S, v1.S[2] // ....*................. + // gap // ...................... + sqrdmulh v28.4S, v16.4S, v1.S[3] // .....*................ + // gap // ...................... + mul v26.4S, v24.4S, v1.S[0] // ..*................... + // gap // ...................... + sqrdmulh v9.4S, v24.4S, v1.S[1] // ...*.................. + // gap // ...................... + sqrdmulh v2.4S, v20.4S, v13.S[1] // ..........*........... + // gap // ...................... + mls v17.4S, v28.4S, v29.4S // ........*............. + // gap // ...................... + mul v21.4S, v20.4S, v13.S[0] // .........*............ + // gap // ...................... + mls v26.4S, v9.4S, v29.4S // .......*.............. + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + mls v21.4S, v2.4S, v29.4S // .............*........ + // gap // ...................... + add v3.4S, v26.4S, v17.4S // ............*......... + // gap // ...................... + sub v15.4S, v26.4S, v17.4S // ...........*.......... + // gap // ...................... + // gap // ...................... + // gap // ...................... + srshr v12.4S, v3.4S, #23 // ................*..... + // gap // ...................... + sqrdmulh v28.4S, v15.4S, v13.S[1] // ..............*....... + // gap // ...................... + mul v17.4S, v15.4S, v13.S[0] // ...............*...... + // gap // ...................... + mls v3.4S, v12.4S, v29.4S // ...................*.. + // gap // ...................... + str q21, [x0, #-32] // ..................*... + // gap // ...................... + // gap // ...................... + // gap // ...................... + mls v17.4S, v28.4S, v29.4S // .................*.... + // gap // ...................... + str q3, [x0, #-48] // .....................* + // gap // ...................... + // gap // ...................... + // gap // ...................... 
+ // gap // ...................... + // gap // ...................... + str q17, [x0, #-16] // ....................*. + // gap // ...................... + + // original source code + // sub v17.4S, v2.4S, v17.4S // .*.................... + // sub v21.4S, v31.4S, v3.4S // *..................... + // mul v28.4S, v17.4S, v1.S[0] // .....*................ + // sqrdmulh v17.4S, v17.4S, v1.S[1] // ......*............... + // mul v6.4S, v21.4S, v1.S[2] // ...*.................. + // sqrdmulh v21.4S, v21.4S, v1.S[3] // ....*................. + // sub v31.4S, v15.4S, v26.4S // ..*................... + // mls v28.4S, v17.4S, v29.4S // ..........*........... + // mls v6.4S, v21.4S, v29.4S // ........*............. + // mul v1.4S, v31.4S, v13.S[0] // .........*............ + // sqrdmulh v31.4S, v31.4S, v13.S[1] // .......*.............. + // sub v26.4S, v28.4S, v6.4S // .............*........ + // add v28.4S, v28.4S, v6.4S // ............*......... + // mls v1.4S, v31.4S, v29.4S // ...........*.......... + // sqrdmulh v6.4S, v26.4S, v13.S[1] // ...............*...... + // mul v21.4S, v26.4S, v13.S[0] // ................*..... + // srshr v24.4S, v28.4S, #23 // ..............*....... + // mls v21.4S, v6.4S, v29.4S // ...................*.. + // str q1, [x0, #-32] // ..................*... + // mls v28.4S, v24.4S, v29.4S // .................*.... + // str q21, [x0, #-16] // .....................* + // str q28, [x0, #-48] // ....................*. 
+ + + .unreq root0_tw + .unreq root1_tw + .unreq root2_tw + .unreq root3_tw + .unreq qform_root0_tw + .unreq qform_root1_tw + .unreq qform_root2_tw + .unreq qform_root3_tw + .unreq t0 + .unreq t1 + + root4 .req v4 + root5 .req v5 + root6 .req v6 + root7 .req v7 + qform_root4 .req q4 + qform_root5 .req q5 + qform_root6 .req q6 + qform_root7 .req q7 + ninv .req v25 + ninv_tw .req v26 + modulus_half .req v30 + neg_modulus_half .req v31 + + + restore in, STACK0 + mov count, #4 + + ASM_LOAD(xtmp, ninv_addr) + ld1r {ninv.4s}, [xtmp] + ASM_LOAD(xtmp, ninv_tw_addr) + ld1r {ninv_tw.4s}, [xtmp] + + ushr modulus_half.4S, modulus.4S, #1 + neg neg_modulus_half.4S, modulus_half.4S + + load_roots_1234 r_ptr1 + + .p2align 2 + ldr q15, [x1, #192] // ...*.................................................................................................................................................................................................................................................................................... + ldr q9, [x1, #128] // ..*..................................................................................................................................................................................................................................................................................... + ldr q19, [x1, #448] // .......*................................................................................................................................................................................................................................................................................ + add v22.4S, v9.4S, v15.4S // ......................*................................................................................................................................................................................................................................................................. 
+ ldr q20, [x1, #384] // ......*................................................................................................................................................................................................................................................................................. + ldr q11, [x1, #256] // ....*................................................................................................................................................................................................................................................................................... + sub v17.4S, v20.4S, v19.4S // ...............................*........................................................................................................................................................................................................................................................ + ldr q16, [x1, #320] // .....*.................................................................................................................................................................................................................................................................................. + sqrdmulh v28.4S, v17.4S, v5.S[1] // ..................................*..................................................................................................................................................................................................................................................... + mul v24.4S, v17.4S, v5.S[0] // .................................*...................................................................................................................................................................................................................................................... 
+ sub v8.4S, v11.4S, v16.4S // ..........................*............................................................................................................................................................................................................................................................. + ldr q17, [x1, #832] // .............*.......................................................................................................................................................................................................................................................................... + mul v13.4S, v8.4S, v4.S[2] // ............................*........................................................................................................................................................................................................................................................... + sqrdmulh v14.4S, v8.4S, v4.S[3] // .............................*.......................................................................................................................................................................................................................................................... + ldr q21, [x1, #768] // ............*........................................................................................................................................................................................................................................................................... + mls v24.4S, v28.4S, v29.4S // ...................................*.................................................................................................................................................................................................................................................... 
+ mls v13.4S, v14.4S, v29.4S // ..............................*......................................................................................................................................................................................................................................................... + add v28.4S, v21.4S, v17.4S // ...............................................*........................................................................................................................................................................................................................................ + sub v14.4S, v21.4S, v17.4S // ..............................................*......................................................................................................................................................................................................................................... + add v23.4S, v20.4S, v19.4S // ................................*....................................................................................................................................................................................................................................................... + sub v21.4S, v13.4S, v24.4S // .......................................................................*................................................................................................................................................................................................................ + add v20.4S, v11.4S, v16.4S // ...........................*............................................................................................................................................................................................................................................................ 
+ add v13.4S, v13.4S, v24.4S // ........................................................................*............................................................................................................................................................................................................... + sqrdmulh v17.4S, v21.4S, v2.S[1] // ..........................................................................*............................................................................................................................................................................................................. + mul v11.4S, v21.4S, v2.S[0] // .........................................................................*.............................................................................................................................................................................................................. + sub v21.4S, v20.4S, v23.4S // ..................................................................*..................................................................................................................................................................................................................... + ldr q16, [x1, #704] // ...........*............................................................................................................................................................................................................................................................................ + mls v11.4S, v17.4S, v29.4S // ...........................................................................*............................................................................................................................................................................................................ 
+ ldr q18, [x1, #512] // ........*............................................................................................................................................................................................................................................................................... + ldr q17, [x1, #576] // .........*.............................................................................................................................................................................................................................................................................. + ldr q12, [x1, #640] // ..........*............................................................................................................................................................................................................................................................................. + add v24.4S, v18.4S, v17.4S // .....................................*.................................................................................................................................................................................................................................................. + sub v17.4S, v18.4S, v17.4S // ....................................*................................................................................................................................................................................................................................................... + add v8.4S, v12.4S, v16.4S // ..........................................*............................................................................................................................................................................................................................................. 
+ sub v27.4S, v12.4S, v16.4S // .........................................*.............................................................................................................................................................................................................................................. + sqrdmulh v12.4S, v14.4S, v6.S[3] // .................................................*...................................................................................................................................................................................................................................... + add v10.4S, v24.4S, v8.4S // .............................................................................*.......................................................................................................................................................................................................... + sub v16.4S, v24.4S, v8.4S // ............................................................................*........................................................................................................................................................................................................... + sub v8.4S, v9.4S, v15.4S // .....................*.................................................................................................................................................................................................................................................................. + mul v19.4S, v14.4S, v6.S[2] // ................................................*....................................................................................................................................................................................................................................... 
+ sqrdmulh v18.4S, v16.4S, v2.S[3] // ...............................................................................*........................................................................................................................................................................................................ + mul v14.4S, v16.4S, v2.S[2] // ..............................................................................*......................................................................................................................................................................................................... + ldr q15, [x1, #896] // ..............*......................................................................................................................................................................................................................................................................... + mls v19.4S, v12.4S, v29.4S // ..................................................*..................................................................................................................................................................................................................................... + mul v24.4S, v17.4S, v5.S[2] // ......................................*................................................................................................................................................................................................................................................. + ldr q9, [x1, #960] // ...............*........................................................................................................................................................................................................................................................................ 
+ sqrdmulh v16.4S, v17.4S, v5.S[3] // .......................................*................................................................................................................................................................................................................................................ + add v23.4S, v20.4S, v23.4S // ...................................................................*.................................................................................................................................................................................................................... + sub v12.4S, v15.4S, v9.4S // ...................................................*.................................................................................................................................................................................................................................... + add v15.4S, v15.4S, v9.4S // ....................................................*................................................................................................................................................................................................................................... + mls v24.4S, v16.4S, v29.4S // ........................................*............................................................................................................................................................................................................................................... + mul v20.4S, v12.4S, v7.S[0] // .....................................................*.................................................................................................................................................................................................................................. 
+ add v9.4S, v28.4S, v15.4S // .......................................................................................*................................................................................................................................................................................................ + sqrdmulh v12.4S, v12.4S, v7.S[1] // ......................................................*................................................................................................................................................................................................................................. + sqrdmulh v16.4S, v21.4S, v2.S[1] // .....................................................................*.................................................................................................................................................................................................................. + add v17.4S, v10.4S, v9.4S // .....................................................................................................................*.................................................................................................................................................................. + mls v20.4S, v12.4S, v29.4S // .......................................................*................................................................................................................................................................................................................................ + mls v14.4S, v18.4S, v29.4S // ................................................................................*....................................................................................................................................................................................................... 
+ mul v12.4S, v21.4S, v2.S[0] // ....................................................................*................................................................................................................................................................................................................... + sub v21.4S, v28.4S, v15.4S // ......................................................................................*................................................................................................................................................................................................. + ldr q28, [x1, #64] // .*...................................................................................................................................................................................................................................................................................... + mls v12.4S, v16.4S, v29.4S // ......................................................................*................................................................................................................................................................................................................. + ldr q16, [x1, #0] // *....................................................................................................................................................................................................................................................................................... + sqrdmulh v15.4S, v27.4S, v6.S[1] // ............................................*........................................................................................................................................................................................................................................... 
+ mul v27.4S, v27.4S, v6.S[0] // ...........................................*............................................................................................................................................................................................................................................ + sub v18.4S, v16.4S, v28.4S // ................*....................................................................................................................................................................................................................................................................... + sub v10.4S, v10.4S, v9.4S // ....................................................................................................................*................................................................................................................................................................... + add v9.4S, v16.4S, v28.4S // .................*...................................................................................................................................................................................................................................................................... + sqrdmulh v28.4S, v18.4S, v3.S[3] // ...................*.................................................................................................................................................................................................................................................................... + mul v16.4S, v18.4S, v3.S[2] // ..................*..................................................................................................................................................................................................................................................................... 
+ sqrdmulh v18.4S, v8.4S, v4.S[1] // ........................*............................................................................................................................................................................................................................................................... + mul v8.4S, v8.4S, v4.S[0] // .......................*................................................................................................................................................................................................................................................................ + mls v27.4S, v15.4S, v29.4S // .............................................*.......................................................................................................................................................................................................................................... + mls v16.4S, v28.4S, v29.4S // ....................*................................................................................................................................................................................................................................................................... + sub v28.4S, v9.4S, v22.4S // ........................................................*............................................................................................................................................................................................................................... + mls v8.4S, v18.4S, v29.4S // .........................*.............................................................................................................................................................................................................................................................. 
+ add v9.4S, v9.4S, v22.4S // .........................................................*.............................................................................................................................................................................................................................. + sqrdmulh v15.4S, v28.4S, v1.S[3] // ...........................................................*............................................................................................................................................................................................................................ + sub v18.4S, v24.4S, v27.4S // .................................................................................*...................................................................................................................................................................................................... + add v22.4S, v16.4S, v8.4S // ..............................................................*......................................................................................................................................................................................................................... + sub v16.4S, v16.4S, v8.4S // .............................................................*.......................................................................................................................................................................................................................... + add v8.4S, v24.4S, v27.4S // ..................................................................................*..................................................................................................................................................................................................... 
+ add v27.4S, v22.4S, v13.4S // ......................................................................................................*................................................................................................................................................................................. + sqrdmulh v24.4S, v16.4S, v1.S[3] // ................................................................*....................................................................................................................................................................................................................... + sub v13.4S, v22.4S, v13.4S // .....................................................................................................*.................................................................................................................................................................................. + mul v16.4S, v16.4S, v1.S[2] // ...............................................................*........................................................................................................................................................................................................................ + mul v22.4S, v28.4S, v1.S[2] // ..........................................................*............................................................................................................................................................................................................................. + sqrdmulh v28.4S, v13.4S, v0.S[3] // ........................................................................................................*............................................................................................................................................................................... 
+ mul v13.4S, v13.4S, v0.S[2] // .......................................................................................................*................................................................................................................................................................................ + mls v16.4S, v24.4S, v29.4S // .................................................................*...................................................................................................................................................................................................................... + sqrdmulh v24.4S, v10.4S, v1.S[1] // .......................................................................................................................*................................................................................................................................................................ + mls v22.4S, v15.4S, v29.4S // ............................................................*........................................................................................................................................................................................................................... + mls v13.4S, v28.4S, v29.4S // .........................................................................................................*.............................................................................................................................................................................. + sub v15.4S, v16.4S, v11.4S // ...............................................................................................................*........................................................................................................................................................................ 
+ add v28.4S, v16.4S, v11.4S // ................................................................................................................*....................................................................................................................................................................... + sub v11.4S, v19.4S, v20.4S // ...........................................................................................*............................................................................................................................................................................................ + sqrdmulh v16.4S, v15.4S, v0.S[3] // ..................................................................................................................*..................................................................................................................................................................... + mul v15.4S, v15.4S, v0.S[2] // .................................................................................................................*...................................................................................................................................................................... + add v19.4S, v19.4S, v20.4S // ............................................................................................*........................................................................................................................................................................................... + mul v20.4S, v10.4S, v1.S[0] // ......................................................................................................................*................................................................................................................................................................. 
+ sqrdmulh v10.4S, v11.4S, v3.S[1] // ..............................................................................................*......................................................................................................................................................................................... + mul v11.4S, v11.4S, v3.S[0] // .............................................................................................*.......................................................................................................................................................................................... + mls v15.4S, v16.4S, v29.4S // ...................................................................................................................*.................................................................................................................................................................... + mls v20.4S, v24.4S, v29.4S // ........................................................................................................................*............................................................................................................................................................... + add v24.4S, v8.4S, v19.4S // ..........................................................................................................................*............................................................................................................................................................. + sub v19.4S, v8.4S, v19.4S // .........................................................................................................................*.............................................................................................................................................................. 
+ sqrdmulh v8.4S, v21.4S, v3.S[1] // .........................................................................................*.............................................................................................................................................................................................. + mul v21.4S, v21.4S, v3.S[0] // ........................................................................................*............................................................................................................................................................................................... + mls v11.4S, v10.4S, v29.4S // ...............................................................................................*........................................................................................................................................................................................ + sqrdmulh v16.4S, v19.4S, v1.S[1] // ............................................................................................................................*........................................................................................................................................................... + add v10.4S, v22.4S, v12.4S // ...........................................................................................................*............................................................................................................................................................................ + mls v21.4S, v8.4S, v29.4S // ..........................................................................................*............................................................................................................................................................................................. 
+ add v8.4S, v9.4S, v23.4S // .................................................................................................*...................................................................................................................................................................................... + mul v19.4S, v19.4S, v1.S[0] // ...........................................................................................................................*............................................................................................................................................................ + sub v9.4S, v9.4S, v23.4S // ................................................................................................*....................................................................................................................................................................................... + sub v23.4S, v8.4S, v17.4S // ........................................................................................................................................*............................................................................................................................................... + add v8.4S, v8.4S, v17.4S // .........................................................................................................................................*.............................................................................................................................................. + mls v19.4S, v16.4S, v29.4S // .............................................................................................................................*.......................................................................................................................................................... 
+ sqrdmulh v16.4S, v23.4S, v0.S[1] // ...........................................................................................................................................*............................................................................................................................................ + mul v17.4S, v23.4S, v0.S[0] // ..........................................................................................................................................*............................................................................................................................................. + sqrdmulh v23.4S, v9.4S, v0.S[3] // ...................................................................................................*.................................................................................................................................................................................... + mul v9.4S, v9.4S, v0.S[2] // ..................................................................................................*..................................................................................................................................................................................... + sub v22.4S, v22.4S, v12.4S // ..........................................................................................................*............................................................................................................................................................................. + mls v17.4S, v16.4S, v29.4S // ............................................................................................................................................*........................................................................................................................................... 
+ sub v12.4S, v27.4S, v24.4S // .............................................................................................................................................*.......................................................................................................................................... + mls v9.4S, v23.4S, v29.4S // ....................................................................................................*................................................................................................................................................................................... + add v27.4S, v27.4S, v24.4S // ..............................................................................................................................................*......................................................................................................................................... + cmge v23.4S, v17.4S, v30.4S // .................................................................................................................................................................................*...................................................................................................... + cmge v16.4S, v31.4S, v17.4S // ................................................................................................................................................................................*....................................................................................................... + sub v24.4S, v9.4S, v20.4S // ............................................................................................................................................................*........................................................................................................................... 
+ sub v16.4S, v16.4S, v23.4S // ..................................................................................................................................................................................*..................................................................................................... + add v23.4S, v9.4S, v20.4S // .............................................................................................................................................................*.......................................................................................................................... + sqrdmulh v20.4S, v12.4S, v0.S[1] // ................................................................................................................................................*....................................................................................................................................... + mls v17.4S, v16.4S, v29.4S // ...................................................................................................................................................................................*.................................................................................................... + mul v9.4S, v12.4S, v0.S[0] // ...............................................................................................................................................*........................................................................................................................................ + sqrdmulh v16.4S, v22.4S, v0.S[3] // .............................................................................................................*.......................................................................................................................................................................... 
+ mul v12.4S, v22.4S, v0.S[2] // ............................................................................................................*........................................................................................................................................................................... + str q17, [x1, #512] // ................................................................................................................................................................................................................*....................................................................... + mul v17.4S, v24.4S, v0.S[0] // ..............................................................................................................................................................*......................................................................................................................... + sqrdmulh v22.4S, v24.4S, v0.S[1] // ...............................................................................................................................................................*........................................................................................................................ + sub count, count, #1 +layer1234_start: + mul v24.4S, v18.4S, v2.S[2] // ...................................................................................*.................................................................................................................................................................................................... + sqrdmulh v18.4S, v18.4S, v2.S[3] // ....................................................................................*................................................................................................................................................................................................... 
+ mls v12.4S, v16.4S, v29.4S // ..............................................................................................................*......................................................................................................................................................................... + sqrdmulh v16.4S, v8.4S, v26.4S // .........................................................................................................................................................................................................................*.............................................................. + mul v8.4S, v8.4S, v25.4S // ........................................................................................................................................................................................................................*............................................................... + mls v24.4S, v18.4S, v29.4S // .....................................................................................*.................................................................................................................................................................................................. + add v18.4S, v14.4S, v21.4S // ...............................................................................................................................*........................................................................................................................................................ + mls v9.4S, v20.4S, v29.4S // .................................................................................................................................................*...................................................................................................................................... 
+ mls v8.4S, v16.4S, v29.4S // ..........................................................................................................................................................................................................................*............................................................. + sub v16.4S, v24.4S, v11.4S // ...................................................................................................................................*.................................................................................................................................................... + add v11.4S, v24.4S, v11.4S // ....................................................................................................................................*................................................................................................................................................... + sub v20.4S, v14.4S, v21.4S // ..............................................................................................................................*......................................................................................................................................................... + mul v24.4S, v16.4S, v1.S[0] // .....................................................................................................................................*.................................................................................................................................................. + sqrdmulh v16.4S, v16.4S, v1.S[1] // ......................................................................................................................................*................................................................................................................................................. 
+ sub v21.4S, v28.4S, v11.4S // .......................................................................................................................................................*................................................................................................................................ + add v28.4S, v28.4S, v11.4S // ........................................................................................................................................................*............................................................................................................................... + cmge v14.4S, v8.4S, v30.4S // .................................................................................................................................................................................................................................................*...................................... + mls v24.4S, v16.4S, v29.4S // .......................................................................................................................................*................................................................................................................................................ + cmge v16.4S, v31.4S, v8.4S // ................................................................................................................................................................................................................................................*....................................... + mul v11.4S, v20.4S, v1.S[0] // ................................................................................................................................*....................................................................................................................................................... 
+ sub v14.4S, v16.4S, v14.4S // ..................................................................................................................................................................................................................................................*..................................... + add v16.4S, v15.4S, v24.4S // ............................................................................................................................................................................*........................................................................................................... + sub v15.4S, v15.4S, v24.4S // ...........................................................................................................................................................................*............................................................................................................ + sqrdmulh v20.4S, v20.4S, v1.S[1] // .................................................................................................................................*...................................................................................................................................................... + mul v24.4S, v21.4S, v0.S[0] // .........................................................................................................................................................*.............................................................................................................................. + mls v8.4S, v14.4S, v29.4S // ...................................................................................................................................................................................................................................................*.................................... 
+ sub v14.4S, v10.4S, v18.4S // ..................................................................................................................................................*..................................................................................................................................... + mls v11.4S, v20.4S, v29.4S // ..................................................................................................................................*..................................................................................................................................................... + cmge v20.4S, v9.4S, v30.4S // .....................................................................................................................................................................................*.................................................................................................. + str q8, [x1], #(16) // ................................................................................................................................................................................................................................................................................*....... + cmge v8.4S, v31.4S, v9.4S // ....................................................................................................................................................................................*................................................................................................... + sqrdmulh v21.4S, v21.4S, v0.S[1] // ..........................................................................................................................................................*............................................................................................................................. 
+ sub v8.4S, v8.4S, v20.4S // ......................................................................................................................................................................................*................................................................................................. + sqrdmulh v20.4S, v23.4S, v26.4S // .....................................................................................................................................................................................................................................*.................................................. + add v10.4S, v10.4S, v18.4S // ...................................................................................................................................................*.................................................................................................................................... + mls v9.4S, v8.4S, v29.4S // .......................................................................................................................................................................................*................................................................................................ + mls v24.4S, v21.4S, v29.4S // ...........................................................................................................................................................*............................................................................................................................ + sqrdmulh v8.4S, v16.4S, v26.4S // ..............................................................................................................................................................................................................................................*......................................... 
+ mul v16.4S, v16.4S, v25.4S // .............................................................................................................................................................................................................................................*.......................................... + sqrdmulh v18.4S, v10.4S, v26.4S // ...............................................................................................................................................................................................................................*........................................................ + mul v21.4S, v27.4S, v25.4S // ...........................................................................................................................................................................................................................*............................................................ + sqrdmulh v27.4S, v27.4S, v26.4S // ............................................................................................................................................................................................................................*........................................................... + mls v16.4S, v8.4S, v29.4S // ...............................................................................................................................................................................................................................................*........................................ + mul v10.4S, v10.4S, v25.4S // ..............................................................................................................................................................................................................................*......................................................... 
+ mls v17.4S, v22.4S, v29.4S // ................................................................................................................................................................*....................................................................................................................... + mls v21.4S, v27.4S, v29.4S // .............................................................................................................................................................................................................................*.......................................................... + mul v8.4S, v23.4S, v25.4S // ....................................................................................................................................................................................................................................*................................................... + mls v10.4S, v18.4S, v29.4S // ................................................................................................................................................................................................................................*....................................................... + cmge v18.4S, v16.4S, v30.4S // .............................................................................................................................................................................................................................................................................*.......... + cmge v27.4S, v31.4S, v16.4S // ............................................................................................................................................................................................................................................................................*........... 
+ cmge v23.4S, v31.4S, v21.4S // ....................................................................................................................................................................................................................................................*................................... + mls v8.4S, v20.4S, v29.4S // ......................................................................................................................................................................................................................................*................................................. + cmge v20.4S, v21.4S, v30.4S // .....................................................................................................................................................................................................................................................*.................................. + sub v27.4S, v27.4S, v18.4S // ..............................................................................................................................................................................................................................................................................*......... + sub v20.4S, v23.4S, v20.4S // ......................................................................................................................................................................................................................................................*................................. + ldr q18, [x1, #320] // .....e.................................................................................................................................................................................................................................................................................. 
+ ldr q22, [x1, #256] // ....e................................................................................................................................................................................................................................................................................... + cmge v23.4S, v31.4S, v24.4S // ............................................................................................................................................................................................*........................................................................................... + str q9, [x1, #560] // .................................................................................................................................................................................................................*...................................................................... + add v9.4S, v22.4S, v18.4S // ...........................e............................................................................................................................................................................................................................................................ + sub v18.4S, v22.4S, v18.4S // ..........................e............................................................................................................................................................................................................................................................. + cmge v22.4S, v24.4S, v30.4S // .............................................................................................................................................................................................*.......................................................................................... 
+ mls v16.4S, v27.4S, v29.4S // ...............................................................................................................................................................................................................................................................................*........ + sub v23.4S, v23.4S, v22.4S // ..............................................................................................................................................................................................*......................................................................................... + mul v27.4S, v18.4S, v4.S[2] // ............................e........................................................................................................................................................................................................................................................... + mls v21.4S, v20.4S, v29.4S // .......................................................................................................................................................................................................................................................*................................ + add v22.4S, v12.4S, v11.4S // .......................................................................................................................................................................*................................................................................................................ + add v20.4S, v13.4S, v19.4S // ..................................................................................................................................................................*..................................................................................................................... 
+ sub v12.4S, v12.4S, v11.4S // ......................................................................................................................................................................*................................................................................................................. + str q21, [x1, #48] // .................................................................................................................................................................................................................................................................................*...... + mul v21.4S, v20.4S, v25.4S // .......................................................................................................................................................................................................................................*................................................ + mul v11.4S, v12.4S, v0.S[0] // ........................................................................................................................................................................*............................................................................................................... + str q16, [x1, #432] // .......................................................................................................................................................................................................................................................................................* + sqrdmulh v20.4S, v20.4S, v26.4S // ........................................................................................................................................................................................................................................*............................................... 
+ mul v16.4S, v14.4S, v0.S[0] // ....................................................................................................................................................*................................................................................................................................... + sqrdmulh v12.4S, v12.4S, v0.S[1] // .........................................................................................................................................................................*.............................................................................................................. + sub v13.4S, v13.4S, v19.4S // .................................................................................................................................................................*...................................................................................................................... + mls v21.4S, v20.4S, v29.4S // .........................................................................................................................................................................................................................................*.............................................. + sqrdmulh v20.4S, v14.4S, v0.S[1] // .....................................................................................................................................................*.................................................................................................................................. + mul v14.4S, v15.4S, v0.S[0] // .............................................................................................................................................................................*.......................................................................................................... 
+ sqrdmulh v19.4S, v28.4S, v26.4S // ..................................................................................................................................................................................................................................*..................................................... + sqrdmulh v18.4S, v18.4S, v4.S[3] // .............................e.......................................................................................................................................................................................................................................................... + mls v16.4S, v20.4S, v29.4S // ......................................................................................................................................................*................................................................................................................................. + cmge v20.4S, v31.4S, v10.4S // ........................................................................................................................................................................................................................................................*............................... + mls v24.4S, v23.4S, v29.4S // ...............................................................................................................................................................................................*........................................................................................ + ldr q23, [x1, #576] // .........e.............................................................................................................................................................................................................................................................................. 
+ mls v11.4S, v12.4S, v29.4S // ..........................................................................................................................................................................*............................................................................................................. + str q24, [x1, #688] // ...................................................................................................................................................................................................................*.................................................................... + mul v24.4S, v28.4S, v25.4S // .................................................................................................................................................................................................................................*...................................................... + cmge v28.4S, v10.4S, v30.4S // .........................................................................................................................................................................................................................................................*.............................. + sqrdmulh v12.4S, v13.4S, v0.S[1] // ....................................................................................................................................................................*................................................................................................................... + sub v20.4S, v20.4S, v28.4S // ..........................................................................................................................................................................................................................................................*............................. 
+ mls v24.4S, v19.4S, v29.4S // ...................................................................................................................................................................................................................................*.................................................... + mul v19.4S, v13.4S, v0.S[0] // ...................................................................................................................................................................*.................................................................................................................... + mls v10.4S, v20.4S, v29.4S // ...........................................................................................................................................................................................................................................................*............................ + sqrdmulh v20.4S, v22.4S, v26.4S // ...........................................................................................................................................................................................................................................*............................................ + cmge v28.4S, v31.4S, v24.4S // ............................................................................................................................................................................................................................................................*........................... + cmge v13.4S, v24.4S, v30.4S // .............................................................................................................................................................................................................................................................*.......................... 
+ mls v19.4S, v12.4S, v29.4S // .....................................................................................................................................................................*.................................................................................................................. + sub v12.4S, v28.4S, v13.4S // ..............................................................................................................................................................................................................................................................*......................... + sqrdmulh v15.4S, v15.4S, v0.S[1] // ..............................................................................................................................................................................*......................................................................................................... + cmge v13.4S, v16.4S, v30.4S // .........................................................................................................................................................................................*.............................................................................................. + mls v24.4S, v12.4S, v29.4S // ...............................................................................................................................................................................................................................................................*........................ + cmge v28.4S, v31.4S, v19.4S // ....................................................................................................................................................................................................*................................................................................... 
+ mls v14.4S, v15.4S, v29.4S // ...............................................................................................................................................................................*........................................................................................................ + cmge v15.4S, v19.4S, v30.4S // .....................................................................................................................................................................................................*.................................................................................. + str q24, [x1, #176] // ...................................................................................................................................................................................................................................................................................*.... + sub v15.4S, v28.4S, v15.4S // ......................................................................................................................................................................................................*................................................................................. + cmge v24.4S, v17.4S, v30.4S // .................................................................................................................................................................................................*...................................................................................... + cmge v28.4S, v31.4S, v17.4S // ................................................................................................................................................................................................*....................................................................................... 
+ mls v19.4S, v15.4S, v29.4S // .......................................................................................................................................................................................................*................................................................................ + sub v12.4S, v28.4S, v24.4S // ..................................................................................................................................................................................................*..................................................................................... + cmge v15.4S, v31.4S, v14.4S // ............................................................................................................................................................................................................*........................................................................... + ldr q28, [x1, #448] // .......e................................................................................................................................................................................................................................................................................ + ldr q24, [x1, #384] // ......e................................................................................................................................................................................................................................................................................. + str q10, [x1, #112] // ..................................................................................................................................................................................................................................................................................*..... 
+ mls v17.4S, v12.4S, v29.4S // ...................................................................................................................................................................................................*.................................................................................... + add v10.4S, v24.4S, v28.4S // ................................e....................................................................................................................................................................................................................................................... + mls v27.4S, v18.4S, v29.4S // ..............................e......................................................................................................................................................................................................................................................... + sub v24.4S, v24.4S, v28.4S // ...............................e........................................................................................................................................................................................................................................................ + str q17, [x1, #752] // ....................................................................................................................................................................................................................*................................................................... + add v17.4S, v9.4S, v10.4S // ...................................................................e.................................................................................................................................................................................................................... 
+ mul v18.4S, v24.4S, v5.S[0] // .................................e...................................................................................................................................................................................................................................................... + sub v9.4S, v9.4S, v10.4S // ..................................................................e..................................................................................................................................................................................................................... + ldr q10, [x1, #192] // ...e.................................................................................................................................................................................................................................................................................... + mul v12.4S, v22.4S, v25.4S // ..........................................................................................................................................................................................................................................*............................................. + str q19, [x1, #816] // .....................................................................................................................................................................................................................*.................................................................. + ldr q19, [x1, #128] // ..e..................................................................................................................................................................................................................................................................................... 
+ sqrdmulh v28.4S, v24.4S, v5.S[1] // ..................................e..................................................................................................................................................................................................................................................... + cmge v22.4S, v31.4S, v16.4S // ........................................................................................................................................................................................*............................................................................................... + add v24.4S, v19.4S, v10.4S // ......................e................................................................................................................................................................................................................................................................. + sub v13.4S, v22.4S, v13.4S // ..........................................................................................................................................................................................*............................................................................................. + mls v18.4S, v28.4S, v29.4S // ...................................e.................................................................................................................................................................................................................................................... + mls v12.4S, v20.4S, v29.4S // ............................................................................................................................................................................................................................................*........................................... 
+ mls v16.4S, v13.4S, v29.4S // ...........................................................................................................................................................................................*............................................................................................ + cmge v20.4S, v31.4S, v8.4S // ................................................................................................................................................................................................................................................................*....................... + sub v22.4S, v27.4S, v18.4S // .......................................................................e................................................................................................................................................................................................................ + add v13.4S, v27.4S, v18.4S // ........................................................................e............................................................................................................................................................................................................... + sub v27.4S, v19.4S, v10.4S // .....................e.................................................................................................................................................................................................................................................................. + cmge v10.4S, v14.4S, v30.4S // .............................................................................................................................................................................................................*.......................................................................... 
+ ldr q28, [x1, #0] // e....................................................................................................................................................................................................................................................................................... + ldr q19, [x1, #64] // .e...................................................................................................................................................................................................................................................................................... + sub v18.4S, v15.4S, v10.4S // ..............................................................................................................................................................................................................*......................................................................... + sqrdmulh v15.4S, v27.4S, v4.S[1] // ........................e............................................................................................................................................................................................................................................................... + sub v10.4S, v28.4S, v19.4S // ................e....................................................................................................................................................................................................................................................................... + add v28.4S, v28.4S, v19.4S // .................e...................................................................................................................................................................................................................................................................... 
+ mul v19.4S, v27.4S, v4.S[0] // .......................e................................................................................................................................................................................................................................................................ + mul v27.4S, v10.4S, v3.S[2] // ..................e..................................................................................................................................................................................................................................................................... + sqrdmulh v10.4S, v10.4S, v3.S[3] // ...................e.................................................................................................................................................................................................................................................................... + mls v14.4S, v18.4S, v29.4S // ...............................................................................................................................................................................................................*........................................................................ + mls v19.4S, v15.4S, v29.4S // .........................e.............................................................................................................................................................................................................................................................. + str q16, [x1, #624] // ..................................................................................................................................................................................................................*..................................................................... 
+ mls v27.4S, v10.4S, v29.4S // ....................e................................................................................................................................................................................................................................................................... + sub v16.4S, v28.4S, v24.4S // ........................................................e............................................................................................................................................................................................................................... + add v24.4S, v28.4S, v24.4S // .........................................................e.............................................................................................................................................................................................................................. + cmge v18.4S, v31.4S, v21.4S // ....................................................................................................................................................................................................................................................................*................... + add v28.4S, v27.4S, v19.4S // ..............................................................e......................................................................................................................................................................................................................... + sub v10.4S, v27.4S, v19.4S // .............................................................e.......................................................................................................................................................................................................................... 
+ cmge v19.4S, v21.4S, v30.4S // .....................................................................................................................................................................................................................................................................*.................. + add v27.4S, v28.4S, v13.4S // ......................................................................................................e................................................................................................................................................................................. + sub v28.4S, v28.4S, v13.4S // .....................................................................................................e.................................................................................................................................................................................. + sub v13.4S, v18.4S, v19.4S // ......................................................................................................................................................................................................................................................................*................. + cmge v15.4S, v31.4S, v11.4S // ........................................................................................................................................................................................................*............................................................................... + str q14, [x1, #944] // .......................................................................................................................................................................................................................*................................................................ 
+ ldr q14, [x1, #832] // .............e.......................................................................................................................................................................................................................................................................... + cmge v18.4S, v11.4S, v30.4S // .........................................................................................................................................................................................................*.............................................................................. + ldr q19, [x1, #512] // ........e............................................................................................................................................................................................................................................................................... + sub v18.4S, v15.4S, v18.4S // ..........................................................................................................................................................................................................*............................................................................. + mul v15.4S, v9.4S, v2.S[0] // ....................................................................e................................................................................................................................................................................................................... + sqrdmulh v9.4S, v9.4S, v2.S[1] // .....................................................................e.................................................................................................................................................................................................................. 
+ mls v11.4S, v18.4S, v29.4S // ...........................................................................................................................................................................................................*............................................................................ + sub v18.4S, v19.4S, v23.4S // ....................................e................................................................................................................................................................................................................................................... + add v23.4S, v19.4S, v23.4S // .....................................e.................................................................................................................................................................................................................................................. + mls v15.4S, v9.4S, v29.4S // ......................................................................e................................................................................................................................................................................................................. + cmge v19.4S, v8.4S, v30.4S // .................................................................................................................................................................................................................................................................*...................... + str q11, [x1, #880] // ......................................................................................................................................................................................................................*................................................................. 
+ mul v11.4S, v22.4S, v2.S[0] // .........................................................................e.............................................................................................................................................................................................................. + sqrdmulh v22.4S, v22.4S, v2.S[1] // ..........................................................................e............................................................................................................................................................................................................. + sub v9.4S, v20.4S, v19.4S // ..................................................................................................................................................................................................................................................................*..................... + mul v19.4S, v10.4S, v1.S[2] // ...............................................................e........................................................................................................................................................................................................................ + sqrdmulh v20.4S, v10.4S, v1.S[3] // ................................................................e....................................................................................................................................................................................................................... + mls v8.4S, v9.4S, v29.4S // ...................................................................................................................................................................................................................................................................*.................... 
+ sqrdmulh v9.4S, v18.4S, v5.S[3] // .......................................e................................................................................................................................................................................................................................................ + sub v10.4S, v24.4S, v17.4S // ................................................................................................e....................................................................................................................................................................................... + mls v11.4S, v22.4S, v29.4S // ...........................................................................e............................................................................................................................................................................................................ + str q8, [x1, #240] // ....................................................................................................................................................................................................................................................................................*... + add v8.4S, v24.4S, v17.4S // .................................................................................................e...................................................................................................................................................................................... + mul v24.4S, v18.4S, v5.S[2] // ......................................e................................................................................................................................................................................................................................................. 
+ cmge v18.4S, v31.4S, v12.4S // ........................................................................................................................................................................................................................................................................*............... + cmge v17.4S, v12.4S, v30.4S // .........................................................................................................................................................................................................................................................................*.............. + mls v19.4S, v20.4S, v29.4S // .................................................................e...................................................................................................................................................................................................................... + sub v20.4S, v18.4S, v17.4S // ..........................................................................................................................................................................................................................................................................*............. + ldr q18, [x1, #704] // ...........e............................................................................................................................................................................................................................................................................ + ldr q17, [x1, #640] // ..........e............................................................................................................................................................................................................................................................................. 
+ mul v22.4S, v16.4S, v1.S[2] // ..........................................................e............................................................................................................................................................................................................................. + mls v12.4S, v20.4S, v29.4S // ...........................................................................................................................................................................................................................................................................*............ + add v20.4S, v17.4S, v18.4S // ..........................................e............................................................................................................................................................................................................................................. + sub v17.4S, v17.4S, v18.4S // .........................................e.............................................................................................................................................................................................................................................. + str q12, [x1, #368] // ......................................................................................................................................................................................................................................................................................*. + sqrdmulh v12.4S, v16.4S, v1.S[3] // ...........................................................e............................................................................................................................................................................................................................ 
+ sqrdmulh v18.4S, v17.4S, v6.S[1] // ............................................e........................................................................................................................................................................................................................................... + mul v17.4S, v17.4S, v6.S[0] // ...........................................e............................................................................................................................................................................................................................................ + mls v24.4S, v9.4S, v29.4S // ........................................e............................................................................................................................................................................................................................................... + add v9.4S, v23.4S, v20.4S // .............................................................................e.......................................................................................................................................................................................................... + sub v16.4S, v23.4S, v20.4S // ............................................................................e........................................................................................................................................................................................................... + mls v17.4S, v18.4S, v29.4S // .............................................e.......................................................................................................................................................................................................................................... 
+ ldr q20, [x1, #768] // ............e........................................................................................................................................................................................................................................................................... + ldr q23, [x1, #960] // ...............e........................................................................................................................................................................................................................................................................ + sub v18.4S, v24.4S, v17.4S // .................................................................................e...................................................................................................................................................................................................... + add v17.4S, v24.4S, v17.4S // ..................................................................................e..................................................................................................................................................................................................... + ldr q24, [x1, #896] // ..............e......................................................................................................................................................................................................................................................................... + mls v22.4S, v12.4S, v29.4S // ............................................................e........................................................................................................................................................................................................................... 
+ mls v21.4S, v13.4S, v29.4S // .......................................................................................................................................................................................................................................................................*................ + sub v12.4S, v24.4S, v23.4S // ...................................................e.................................................................................................................................................................................................................................... + add v13.4S, v24.4S, v23.4S // ....................................................e................................................................................................................................................................................................................................... + add v23.4S, v20.4S, v14.4S // ...............................................e........................................................................................................................................................................................................................................ + str q21, [x1, #304] // .....................................................................................................................................................................................................................................................................................*.. + sqrdmulh v24.4S, v12.4S, v7.S[1] // ......................................................e................................................................................................................................................................................................................................. 
+ mul v12.4S, v12.4S, v7.S[0] // .....................................................e.................................................................................................................................................................................................................................. + sub v21.4S, v23.4S, v13.4S // ......................................................................................e................................................................................................................................................................................................. + add v23.4S, v23.4S, v13.4S // .......................................................................................e................................................................................................................................................................................................ + sub v20.4S, v20.4S, v14.4S // ..............................................e......................................................................................................................................................................................................................................... + mul v13.4S, v28.4S, v0.S[2] // .......................................................................................................e................................................................................................................................................................................ + mls v12.4S, v24.4S, v29.4S // .......................................................e................................................................................................................................................................................................................................ 
+ mul v14.4S, v16.4S, v2.S[2] // ..............................................................................e......................................................................................................................................................................................................... + sqrdmulh v24.4S, v16.4S, v2.S[3] // ...............................................................................e........................................................................................................................................................................................................ + sqrdmulh v16.4S, v28.4S, v0.S[3] // ........................................................................................................e............................................................................................................................................................................... + sub v28.4S, v9.4S, v23.4S // ....................................................................................................................e................................................................................................................................................................... + add v9.4S, v9.4S, v23.4S // .....................................................................................................................e.................................................................................................................................................................. + mls v14.4S, v24.4S, v29.4S // ................................................................................e....................................................................................................................................................................................................... 
+ sqrdmulh v24.4S, v28.4S, v1.S[1] // .......................................................................................................................e................................................................................................................................................................ + sqrdmulh v23.4S, v10.4S, v0.S[3] // ...................................................................................................e.................................................................................................................................................................................... + mls v13.4S, v16.4S, v29.4S // .........................................................................................................e.............................................................................................................................................................................. + mul v28.4S, v28.4S, v1.S[0] // ......................................................................................................................e................................................................................................................................................................. + mul v16.4S, v10.4S, v0.S[2] // ..................................................................................................e..................................................................................................................................................................................... + add v10.4S, v22.4S, v15.4S // ...........................................................................................................e............................................................................................................................................................................ 
+ sub v22.4S, v22.4S, v15.4S // ..........................................................................................................e............................................................................................................................................................................. + sub v15.4S, v8.4S, v9.4S // ........................................................................................................................................e............................................................................................................................................... + mls v28.4S, v24.4S, v29.4S // ........................................................................................................................e............................................................................................................................................................... + mls v16.4S, v23.4S, v29.4S // ....................................................................................................e................................................................................................................................................................................... + mul v24.4S, v15.4S, v0.S[0] // ..........................................................................................................................................e............................................................................................................................................. + sqrdmulh v23.4S, v15.4S, v0.S[1] // ...........................................................................................................................................e............................................................................................................................................ 
+ sub v15.4S, v19.4S, v11.4S // ...............................................................................................................e........................................................................................................................................................................ + add v8.4S, v8.4S, v9.4S // .........................................................................................................................................e.............................................................................................................................................. + sub v9.4S, v16.4S, v28.4S // ............................................................................................................................................................e........................................................................................................................... + mls v24.4S, v23.4S, v29.4S // ............................................................................................................................................e........................................................................................................................................... + add v23.4S, v16.4S, v28.4S // .............................................................................................................................................................e.......................................................................................................................... + add v28.4S, v19.4S, v11.4S // ................................................................................................................e....................................................................................................................................................................... 
+ sqrdmulh v16.4S, v15.4S, v0.S[3] // ..................................................................................................................e..................................................................................................................................................................... + cmge v19.4S, v31.4S, v24.4S // ................................................................................................................................................................................e....................................................................................................... + cmge v11.4S, v24.4S, v30.4S // .................................................................................................................................................................................e...................................................................................................... + mul v15.4S, v15.4S, v0.S[2] // .................................................................................................................e...................................................................................................................................................................... + sub v19.4S, v19.4S, v11.4S // ..................................................................................................................................................................................e..................................................................................................... + sqrdmulh v11.4S, v20.4S, v6.S[3] // .................................................e...................................................................................................................................................................................................................................... 
+ mul v20.4S, v20.4S, v6.S[2] // ................................................e....................................................................................................................................................................................................................................... + mls v15.4S, v16.4S, v29.4S // ...................................................................................................................e.................................................................................................................................................................... + sqrdmulh v16.4S, v22.4S, v0.S[3] // .............................................................................................................e.......................................................................................................................................................................... + mls v24.4S, v19.4S, v29.4S // ...................................................................................................................................................................................e.................................................................................................... + mls v20.4S, v11.4S, v29.4S // ..................................................e..................................................................................................................................................................................................................................... + sqrdmulh v11.4S, v21.4S, v3.S[1] // .........................................................................................e.............................................................................................................................................................................................. 
+ mul v21.4S, v21.4S, v3.S[0] // ........................................................................................e............................................................................................................................................................................................... + str q24, [x1, #512] // ................................................................................................................................................................................................................e....................................................................... + add v19.4S, v20.4S, v12.4S // ............................................................................................e........................................................................................................................................................................................... + sub v20.4S, v20.4S, v12.4S // ...........................................................................................e............................................................................................................................................................................................ + mul v12.4S, v22.4S, v0.S[2] // ............................................................................................................e........................................................................................................................................................................... + mls v21.4S, v11.4S, v29.4S // ..........................................................................................e............................................................................................................................................................................................. 
+ sub v24.4S, v17.4S, v19.4S // .........................................................................................................................e.............................................................................................................................................................. + mul v11.4S, v20.4S, v3.S[0] // .............................................................................................e.......................................................................................................................................................................................... + add v17.4S, v17.4S, v19.4S // ..........................................................................................................................e............................................................................................................................................................. + mul v19.4S, v24.4S, v1.S[0] // ...........................................................................................................................e............................................................................................................................................................ + sqrdmulh v22.4S, v24.4S, v1.S[1] // ............................................................................................................................e........................................................................................................................................................... + sub v24.4S, v27.4S, v17.4S // .............................................................................................................................................e.......................................................................................................................................... 
+ add v27.4S, v27.4S, v17.4S // ..............................................................................................................................................e......................................................................................................................................... + mul v17.4S, v9.4S, v0.S[0] // ..............................................................................................................................................................e......................................................................................................................... + sqrdmulh v20.4S, v20.4S, v3.S[1] // ..............................................................................................e......................................................................................................................................................................................... + mls v19.4S, v22.4S, v29.4S // .............................................................................................................................e.......................................................................................................................................................... + sqrdmulh v22.4S, v9.4S, v0.S[1] // ...............................................................................................................................................................e........................................................................................................................ + mul v9.4S, v24.4S, v0.S[0] // ...............................................................................................................................................e........................................................................................................................................ 
+ mls v11.4S, v20.4S, v29.4S // ...............................................................................................e........................................................................................................................................................................................ + sqrdmulh v20.4S, v24.4S, v0.S[1] // ................................................................................................................................................e....................................................................................................................................... + + // original source code + // ldr q8, [x1, #0] // .....................................................................................e...........................................................................................................................................|...........................................................................................................................................e............................................................................ + // ldr q9, [x1, #(1*(512/8))] // ......................................................................................e..........................................................................................................................................|............................................................................................................................................e........................................................................... 
+ // ldr q10, [x1, #(2*(512/8))] // ........................................................................e........................................................................................................................................................|..............................................................................................................................e......................................................................................... + // ldr q11, [x1, #(3*(512/8))] // .....................................................................e...........................................................................................................................................................|...........................................................................................................................e............................................................................................ + // ldr q12, [x1, #(4*(512/8))] // .e...............................................................................................................................................................................................................................|.......................................................e................................................................................................................................................................ + // ldr q13, [x1, #(5*(512/8))] // e................................................................................................................................................................................................................................|......................................................e................................................................................................................................................................. 
+ // ldr q14, [x1, #(6*(512/8))] // ...........................................................e.....................................................................................................................................................................|.................................................................................................................e...................................................................................................... + // ldr q15, [x1, #(7*(512/8))] // ..........................................................e......................................................................................................................................................................|................................................................................................................e....................................................................................................... + // ldr q16, [x1, #(8*(512/8))] // ...............................................................................................................e.................................................................................................................|.....................................................................................................................................................................e.................................................. + // ldr q17, [x1, #(9*(512/8))] // ..............................e..................................................................................................................................................................................................|....................................................................................e................................................................................................................................... 
+ // ldr q18, [x1, #(10*(512/8))] // ..........................................................................................................................................e......................................................................................|................................................................................................................................................................................................e....................... + // ldr q19, [x1, #(11*(512/8))] // .........................................................................................................................................e.......................................................................................|...............................................................................................................................................................................................e........................ + // ldr q20, [x1, #(12*(512/8))] // .......................................................................................................................................................e.........................................................................|.............................................................................................................................................................................................................e.......... + // ldr q21, [x1, #(13*(512/8))] // .............................................................................................................e...................................................................................................................|...................................................................................................................................................................e.................................................... 
+ // ldr q22, [x1, #(14*(512/8))] // ...........................................................................................................................................................e.....................................................................|.................................................................................................................................................................................................................e...... + // ldr q23, [x1, #(15*(512/8))] // ........................................................................................................................................................e........................................................................|..............................................................................................................................................................................................................e......... + // sub v24.4s, v8.4s, v9.4s // .........................................................................................e.......................................................................................................................................|...............................................................................................................................................e........................................................................ + // add v8.4s, v8.4s, v9.4s // ..........................................................................................e......................................................................................................................................|................................................................................................................................................e....................................................................... 
+ // mul v9.4s, v24.4s, v3.s[2] // ............................................................................................e....................................................................................................................................|..................................................................................................................................................e..................................................................... + // sqrdmulh v24.4s, v24.4s, v3.s[3] // .............................................................................................e...................................................................................................................................|...................................................................................................................................................e.................................................................... + // mls v9.4s, v24.4s, v29.4s // .................................................................................................e...............................................................................................................................|.......................................................................................................................................................e................................................................ + // sub v24.4s, v10.4s, v11.4s // ...................................................................................e.............................................................................................................................................|.........................................................................................................................................e.............................................................................. 
+ // add v10.4s, v10.4s, v11.4s // ...........................................................................e.....................................................................................................................................................|.................................................................................................................................e...................................................................................... + // mul v11.4s, v24.4s, v4.s[0] // ...........................................................................................e.....................................................................................................................................|.................................................................................................................................................e...................................................................... + // sqrdmulh v24.4s, v24.4s, v4.s[1] // ........................................................................................e........................................................................................................................................|..............................................................................................................................................e......................................................................... + // mls v11.4s, v24.4s, v29.4s // ...............................................................................................e.................................................................................................................................|.....................................................................................................................................................e.................................................................. 
+ // sub v24.4s, v12.4s, v13.4s // .....e...........................................................................................................................................................................................................................|...........................................................e............................................................................................................................................................ + // add v12.4s, v12.4s, v13.4s // ....e............................................................................................................................................................................................................................|..........................................................e............................................................................................................................................................. + // mul v13.4s, v24.4s, v4.s[2] // .........e.......................................................................................................................................................................................................................|...............................................................e........................................................................................................................................................ + // sqrdmulh v24.4s, v24.4s, v4.s[3] // ..........................e......................................................................................................................................................................................................|................................................................................e....................................................................................................................................... 
+ // mls v13.4s, v24.4s, v29.4s // ...............................................................e.................................................................................................................................................................|.....................................................................................................................e.................................................................................................. + // sub v24.4s, v14.4s, v15.4s // ................................................................e................................................................................................................................................................|......................................................................................................................e................................................................................................. + // add v14.4s, v14.4s, v15.4s // ..............................................................e..................................................................................................................................................................|....................................................................................................................e................................................................................................... + // mul v15.4s, v24.4s, v5.s[0] // ...................................................................e.............................................................................................................................................................|.........................................................................................................................e.............................................................................................. 
+ // sqrdmulh v24.4s, v24.4s, v5.s[1] // .........................................................................e.......................................................................................................................................................|...............................................................................................................................e........................................................................................ + // mls v15.4s, v24.4s, v29.4s // .............................................................................e...................................................................................................................................................|...................................................................................................................................e.................................................................................... + // sub v24.4s, v16.4s, v17.4s // ....................................................................................................................e............................................................................................................|..........................................................................................................................................................................e............................................. + // add v16.4s, v16.4s, v17.4s // .....................................................................................................................e...........................................................................................................|...........................................................................................................................................................................e............................................ 
+ // mul v17.4s, v24.4s, v5.s[2] // ....................................................................................................................................e............................................................................................|..........................................................................................................................................................................................e............................. + // sqrdmulh v24.4s, v24.4s, v5.s[3] // ...............................................................................................................................e.................................................................................................|.....................................................................................................................................................................................e.................................. + // mls v17.4s, v24.4s, v29.4s // ...................................................................................................................................................e.............................................................................|.........................................................................................................................................................................................................e.............. + // sub v24.4s, v18.4s, v19.4s // ..............................................................................................................................................e..................................................................................|....................................................................................................................................................................................................e................... 
+ // add v18.4s, v18.4s, v19.4s // .............................................................................................................................................e...................................................................................|...................................................................................................................................................................................................e.................... + // mul v19.4s, v24.4s, v6.s[0] // ..................................................................................................................................................e..............................................................................|........................................................................................................................................................................................................e............... + // sqrdmulh v24.4s, v24.4s, v6.s[1] // .................................................................................................................................................e...............................................................................|.......................................................................................................................................................................................................e................ + // mls v19.4s, v24.4s, v29.4s // ......................................................................................................................................................e..........................................................................|............................................................................................................................................................................................................e........... 
+ // sub v24.4s, v20.4s, v21.4s // ......................................................................................................................................................................e..........................................................|........................................................................................................................................................................................................................ + // add v20.4s, v20.4s, v21.4s // ................................................................................................................................................................e................................................................|......................................................................................................................................................................................................................e. + // mul v21.4s, v24.4s, v6.s[2] // .......................................................................................................................................................................................................e.........................|........................................................................................................................................................................................................................ + // sqrdmulh v24.4s, v24.4s, v6.s[3] // ......................................................................................................................................................................................................e..........................|........................................................................................................................................................................................................................ 
+ // mls v21.4s, v24.4s, v29.4s // ...........................................................................................................................................................................................................e.....................|........................................................................................................................................................................................................................ + // sub v24.4s, v22.4s, v23.4s // ..............................................................................................................................................................e..................................................................|....................................................................................................................................................................................................................e... + // add v22.4s, v22.4s, v23.4s // ...............................................................................................................................................................e.................................................................|.....................................................................................................................................................................................................................e.. + // mul v23.4s, v24.4s, v7.s[0] // ...................................................................................................................................................................e.............................................................|........................................................................................................................................................................................................................ 
+ // sqrdmulh v24.4s, v24.4s, v7.s[1] // ..................................................................................................................................................................e..............................................................|........................................................................................................................................................................................................................ + // mls v23.4s, v24.4s, v29.4s // ........................................................................................................................................................................e........................................................|........................................................................................................................................................................................................................ + // sub v24.4s, v8.4s, v10.4s // ..................................................................................................e..............................................................................................................................|........................................................................................................................................................e............................................................... + // add v8.4s, v8.4s, v10.4s // ...................................................................................................e.............................................................................................................................|.........................................................................................................................................................e.............................................................. 
+ // mul v10.4s, v24.4s, v1.s[2] // ...........................................................................................................................................e.....................................................................................|.................................................................................................................................................................................................e...................... + // sqrdmulh v24.4s, v24.4s, v1.s[3] // ................................................................................................................................................e................................................................................|......................................................................................................................................................................................................e................. + // mls v10.4s, v24.4s, v29.4s // ............................................................................................................................................................e....................................................................|..................................................................................................................................................................................................................e..... + // sub v24.4s, v9.4s, v11.4s // ......................................................................................................e..........................................................................................................................|............................................................................................................................................................e........................................................... 
+ // add v9.4s, v9.4s, v11.4s // .....................................................................................................e...........................................................................................................................|...........................................................................................................................................................e............................................................ + // mul v11.4s, v24.4s, v1.s[2] // ............................................................................................................................e....................................................................................................|..................................................................................................................................................................................e..................................... + // sqrdmulh v24.4s, v24.4s, v1.s[3] // .............................................................................................................................e...................................................................................................|...................................................................................................................................................................................e.................................... + // mls v11.4s, v24.4s, v29.4s // .......................................................................................................................................e.........................................................................................|.............................................................................................................................................................................................e.......................... 
+ // sub v24.4s, v12.4s, v14.4s // ....................................................................e............................................................................................................................................................|..........................................................................................................................e............................................................................................. + // add v12.4s, v12.4s, v14.4s // ..................................................................e..............................................................................................................................................................|........................................................................................................................e............................................................................................... + // mul v14.4s, v24.4s, v2.s[0] // .................................................................................................................e...............................................................................................................|.......................................................................................................................................................................e................................................ + // sqrdmulh v24.4s, v24.4s, v2.s[1] // ..................................................................................................................e..............................................................................................................|........................................................................................................................................................................e............................................... 
+ // mls v14.4s, v24.4s, v29.4s // ......................................................................................................................e..........................................................................................................|............................................................................................................................................................................e........................................... + // sub v24.4s, v13.4s, v15.4s // .................................................................................e...............................................................................................................................................|.......................................................................................................................................e................................................................................ + // add v13.4s, v13.4s, v15.4s // ..................................................................................e..............................................................................................................................................|........................................................................................................................................e............................................................................... + // mul v15.4s, v24.4s, v2.s[0] // .........................................................................................................................e.......................................................................................................|...............................................................................................................................................................................e........................................ 
+ // sqrdmulh v24.4s, v24.4s, v2.s[1] // ..........................................................................................................................e......................................................................................................|................................................................................................................................................................................e....................................... + // mls v15.4s, v24.4s, v29.4s // .................................................................................................................................e...............................................................................................|.......................................................................................................................................................................................e................................ + // sub v24.4s, v16.4s, v18.4s // .....................................................................................................................................................e...........................................................................|...........................................................................................................................................................................................................e............ + // add v16.4s, v16.4s, v18.4s // ....................................................................................................................................................e............................................................................|..........................................................................................................................................................................................................e............. 
+ // mul v18.4s, v24.4s, v2.s[2] // .........................................................................................................................................................................e.......................................................|........................................................................................................................................................................................................................ + // sqrdmulh v24.4s, v24.4s, v2.s[3] // ..........................................................................................................................................................................e......................................................|........................................................................................................................................................................................................................ + // mls v18.4s, v24.4s, v29.4s // ..............................................................................................................................................................................e..................................................|........................................................................................................................................................................................................................ + // sub v24.4s, v17.4s, v19.4s // .........................................................................................................................................................e.......................................................................|...............................................................................................................................................................................................................e........ 
+ // add v17.4s, v17.4s, v19.4s // ..........................................................................................................................................................e......................................................................|................................................................................................................................................................................................................e....... + // mul v19.4s, v24.4s, v2.s[2] // .................................................................................................................................................................................................................................*........................................................................................................................................................................................................................ + // sqrdmulh v24.4s, v24.4s, v2.s[3] // .................................................................................................................................................................................................................................|*....................................................................................................................................................................................................................... + // mls v19.4s, v24.4s, v29.4s // .................................................................................................................................................................................................................................|....*................................................................................................................................................................................................................... 
+ // sub v24.4s, v20.4s, v22.4s // ....................................................................................................................................................................e............................................................|........................................................................................................................................................................................................................ + // add v20.4s, v20.4s, v22.4s // .....................................................................................................................................................................e...........................................................|........................................................................................................................................................................................................................ + // mul v22.4s, v24.4s, v3.s[0] // .............................................................................................................................................................................................................e...................|........................................................................................................................................................................................................................ + // sqrdmulh v24.4s, v24.4s, v3.s[1] // ............................................................................................................................................................................................................e....................|........................................................................................................................................................................................................................ 
+ // mls v22.4s, v24.4s, v29.4s // ..................................................................................................................................................................................................................e..............|........................................................................................................................................................................................................................ + // sub v24.4s, v21.4s, v23.4s // ................................................................................................................................................................................................................e................|........................................................................................................................................................................................................................ + // add v21.4s, v21.4s, v23.4s // ...............................................................................................................................................................................................................e.................|........................................................................................................................................................................................................................ + // mul v23.4s, v24.4s, v3.s[0] // ....................................................................................................................................................................................................................e............|........................................................................................................................................................................................................................ 
+ // sqrdmulh v24.4s, v24.4s, v3.s[1] // ...........................................................................................................................................................................................................................e.....|........................................................................................................................................................................................................................ + // mls v23.4s, v24.4s, v29.4s // ...............................................................................................................................................................................................................................e.|........................................................................................................................................................................................................................ + // sub v24.4s, v8.4s, v12.4s // ................................................................................................................................e................................................................................................|......................................................................................................................................................................................e................................. + // add v8.4s, v8.4s, v12.4s // ...................................................................................................................................e.............................................................................................|.........................................................................................................................................................................................e.............................. 
+ // mul v12.4s, v24.4s, v0.s[2] // ...................................................................................................................................................................................e.............................................|........................................................................................................................................................................................................................ + // sqrdmulh v24.4s, v24.4s, v0.s[3] // ................................................................................................................................................................................e................................................|........................................................................................................................................................................................................................ + // mls v12.4s, v24.4s, v29.4s // ........................................................................................................................................................................................e........................................|........................................................................................................................................................................................................................ + // sub v24.4s, v9.4s, v13.4s // .........................................................................................................e.......................................................................................................................|...............................................................................................................................................................e........................................................ 
+ // add v9.4s, v9.4s, v13.4s // ........................................................................................................e........................................................................................................................|..............................................................................................................................................................e......................................................... + // mul v13.4s, v24.4s, v0.s[2] // .......................................................................................................................................................................e.........................................................|........................................................................................................................................................................................................................ + // sqrdmulh v24.4s, v24.4s, v0.s[3] // ...........................................................................................................................................................................e.....................................................|........................................................................................................................................................................................................................ + // mls v13.4s, v24.4s, v29.4s // .................................................................................................................................................................................e...............................................|........................................................................................................................................................................................................................ 
+ // sub v24.4s, v10.4s, v14.4s // .....................................................................................................................................................................................e...........................................|........................................................................................................................................................................................................................ + // add v10.4s, v10.4s, v14.4s // ....................................................................................................................................................................................e............................................|........................................................................................................................................................................................................................ + // mul v14.4s, v24.4s, v0.s[2] // .................................................................................................................................................................................................................e...............|........................................................................................................................................................................................................................ + // sqrdmulh v24.4s, v24.4s, v0.s[3] // .........................................................................................................................................................................................................e.......................|........................................................................................................................................................................................................................ 
+ // mls v14.4s, v24.4s, v29.4s // .................................................................................................................................................................................................................................|.*...................................................................................................................................................................................................................... + // sub v24.4s, v11.4s, v15.4s // ...........................................................................................................................................................................................e.....................................|........................................................................................................................................................................................................................ + // add v11.4s, v11.4s, v15.4s // ................................................................................................................................................................................................e................................|........................................................................................................................................................................................................................ + // mul v15.4s, v24.4s, v0.s[2] // ....................................................................................................................................................................................................e............................|........................................................................................................................................................................................................................ 
+ // sqrdmulh v24.4s, v24.4s, v0.s[3] // .................................................................................................................................................................................................e...............................|........................................................................................................................................................................................................................ + // mls v15.4s, v24.4s, v29.4s // ........................................................................................................................................................................................................e........................|........................................................................................................................................................................................................................ + // sub v24.4s, v16.4s, v20.4s // ............................................................................................................................................................................e....................................................|........................................................................................................................................................................................................................ + // add v16.4s, v16.4s, v20.4s // .............................................................................................................................................................................e...................................................|........................................................................................................................................................................................................................ 
+ // mul v20.4s, v24.4s, v1.s[0] // ..................................................................................................................................................................................e..............................................|........................................................................................................................................................................................................................ + // sqrdmulh v24.4s, v24.4s, v1.s[1] // ...............................................................................................................................................................................e.................................................|........................................................................................................................................................................................................................ + // mls v20.4s, v24.4s, v29.4s // .......................................................................................................................................................................................e.........................................|........................................................................................................................................................................................................................ + // sub v24.4s, v17.4s, v21.4s // ...................................................................................................................................................................................................................e.............|........................................................................................................................................................................................................................ 
+ // add v17.4s, v17.4s, v21.4s // .....................................................................................................................................................................................................................e...........|........................................................................................................................................................................................................................ + // mul v21.4s, v24.4s, v1.s[0] // ......................................................................................................................................................................................................................e..........|........................................................................................................................................................................................................................ + // sqrdmulh v24.4s, v24.4s, v1.s[1] // .......................................................................................................................................................................................................................e.........|........................................................................................................................................................................................................................ + // mls v21.4s, v24.4s, v29.4s // ............................................................................................................................................................................................................................e....|........................................................................................................................................................................................................................ 
+ // sub v24.4s, v18.4s, v22.4s // .................................................................................................................................................................................................................................|..........*............................................................................................................................................................................................................. + // add v18.4s, v18.4s, v22.4s // .................................................................................................................................................................................................................................|.....*.................................................................................................................................................................................................................. + // mul v22.4s, v24.4s, v1.s[0] // .................................................................................................................................................................................................................................|..................*..................................................................................................................................................................................................... + // sqrdmulh v24.4s, v24.4s, v1.s[1] // .................................................................................................................................................................................................................................|......................*................................................................................................................................................................................................. 
+ // mls v22.4s, v24.4s, v29.4s // .................................................................................................................................................................................................................................|..........................*............................................................................................................................................................................................. + // sub v24.4s, v19.4s, v23.4s // .................................................................................................................................................................................................................................|........*............................................................................................................................................................................................................... + // add v19.4s, v19.4s, v23.4s // .................................................................................................................................................................................................................................|.........*.............................................................................................................................................................................................................. + // mul v23.4s, v24.4s, v1.s[0] // .................................................................................................................................................................................................................................|...........*............................................................................................................................................................................................................ 
+ // sqrdmulh v24.4s, v24.4s, v1.s[1] // .................................................................................................................................................................................................................................|............*........................................................................................................................................................................................................... + // mls v23.4s, v24.4s, v29.4s // .................................................................................................................................................................................................................................|................*....................................................................................................................................................................................................... + // sub v24.4s, v8.4s, v16.4s // ......................................................................................................................................................................................e..........................................|........................................................................................................................................................................................................................ + // add v8.4s, v8.4s, v16.4s // ............................................................................................................................................................................................e....................................|........................................................................................................................................................................................................................ 
+ // mul v16.4s, v24.4s, v0.s[0] // .........................................................................................................................................................................................e.......................................|........................................................................................................................................................................................................................ + // sqrdmulh v24.4s, v24.4s, v0.s[1] // ..........................................................................................................................................................................................e......................................|........................................................................................................................................................................................................................ + // mls v16.4s, v24.4s, v29.4s // ..............................................................................................................................................................................................e..................................|........................................................................................................................................................................................................................ + // sub v24.4s, v9.4s, v17.4s // ........................................................................................................................................................................................................................e........|........................................................................................................................................................................................................................ 
+ // add v9.4s, v9.4s, v17.4s // .........................................................................................................................................................................................................................e.......|........................................................................................................................................................................................................................ + // mul v17.4s, v24.4s, v0.s[0] // ..............................................................................................................................................................................................................................e..|........................................................................................................................................................................................................................ + // sqrdmulh v24.4s, v24.4s, v0.s[1] // ................................................................................................................................................................................................................................e|........................................................................................................................................................................................................................ + // mls v17.4s, v24.4s, v29.4s // .................................................................................................................................................................................................................................|......*................................................................................................................................................................................................................. 
+ // sub v24.4s, v10.4s, v18.4s // .................................................................................................................................................................................................................................|.........................*.............................................................................................................................................................................................. + // add v10.4s, v10.4s, v18.4s // .................................................................................................................................................................................................................................|.................................*...................................................................................................................................................................................... + // mul v18.4s, v24.4s, v0.s[0] // ...................*.............................................................................................................................................................................................................|.........................................................................*.............................................................................................................................................. + // sqrdmulh v24.4s, v24.4s, v0.s[1] // .......................*.........................................................................................................................................................................................................|.............................................................................*.......................................................................................................................................... 
+ // mls v18.4s, v24.4s, v29.4s // ...........................*.....................................................................................................................................................................................................|.................................................................................*...................................................................................................................................... + // sub v24.4s, v11.4s, v19.4s // .................................................................................................................................................................................................................................|.............*.......................................................................................................................................................................................................... + // add v11.4s, v11.4s, v19.4s // .................................................................................................................................................................................................................................|..............*......................................................................................................................................................................................................... + // mul v19.4s, v24.4s, v0.s[0] // .................................................................................................................................................................................................................................|.......................*................................................................................................................................................................................................ 
+ // sqrdmulh v24.4s, v24.4s, v0.s[1] // .................................................................................................................................................................................................................................|..............................*......................................................................................................................................................................................... + // mls v19.4s, v24.4s, v29.4s // .................................................................................................................................................................................................................................|...................................*.................................................................................................................................................................................... + // sub v24.4s, v12.4s, v20.4s // .............................................................................................................................................................................................e...................................|........................................................................................................................................................................................................................ + // add v12.4s, v12.4s, v20.4s // ...............................................................................................................................................................................................e.................................|........................................................................................................................................................................................................................ 
+ // mul v20.4s, v24.4s, v0.s[0] // ..........................................................................................................................................................................................................................e......|........................................................................................................................................................................................................................ + // sqrdmulh v24.4s, v24.4s, v0.s[1] // .............................................................................................................................................................................................................................e...|........................................................................................................................................................................................................................ + // mls v20.4s, v24.4s, v29.4s // .................................................................................................................................................................................................................................|...........................................*............................................................................................................................................................................ + // sub v24.4s, v13.4s, v21.4s // .....................*...........................................................................................................................................................................................................|...........................................................................*............................................................................................................................................ 
+ // add v13.4s, v13.4s, v21.4s // ............*....................................................................................................................................................................................................................|..................................................................*..................................................................................................................................................... + // mul v21.4s, v24.4s, v0.s[0] // ......................................*..........................................................................................................................................................................................|............................................................................................*........................................................................................................................... + // sqrdmulh v24.4s, v24.4s, v0.s[1] // ...................................*.............................................................................................................................................................................................|.........................................................................................*.............................................................................................................................. + // mls v21.4s, v24.4s, v29.4s // ...........................................*.....................................................................................................................................................................................|.................................................................................................*...................................................................................................................... 
+ // sub v24.4s, v14.4s, v22.4s // .............*...................................................................................................................................................................................................................|...................................................................*.................................................................................................................................................... + // add v14.4s, v14.4s, v22.4s // ...........*.....................................................................................................................................................................................................................|.................................................................*...................................................................................................................................................... + // mul v22.4s, v24.4s, v0.s[0] // ................*................................................................................................................................................................................................................|......................................................................*................................................................................................................................................. + // sqrdmulh v24.4s, v24.4s, v0.s[1] // ....................*............................................................................................................................................................................................................|..........................................................................*............................................................................................................................................. 
+ // mls v22.4s, v24.4s, v29.4s // ...............................*.................................................................................................................................................................................................|.....................................................................................*.................................................................................................................................. + // sub v24.4s, v15.4s, v23.4s // .................................................................................................................................................................................................................................|.....................*.................................................................................................................................................................................................. + // add v15.4s, v15.4s, v23.4s // .................................................................................................................................................................................................................................|....................*................................................................................................................................................................................................... + // mul v23.4s, v24.4s, v0.s[0] // ........................*........................................................................................................................................................................................................|..............................................................................*......................................................................................................................................... 
+ // sqrdmulh v24.4s, v24.4s, v0.s[1] // .............................................*...................................................................................................................................................................................|...................................................................................................*.................................................................................................................... + // mls v23.4s, v24.4s, v29.4s // .................................................*...............................................................................................................................................................................|.......................................................................................................*................................................................................................................ + // cmge v27.4s, v31.4s, v16.4s // ..................................................................................................................................................................................................e..............................|........................................................................................................................................................................................................................ + // cmge v28.4s, v16.4s, v30.4s // ...................................................................................................................................................................................................e.............................|........................................................................................................................................................................................................................ 
+ // sub v28.4s, v27.4s, v28.4s // .....................................................................................................................................................................................................e...........................|........................................................................................................................................................................................................................ + // mls v16.4s, v28.4s, v29.4s // ..........................................................................................................................................................................................................e......................|........................................................................................................................................................................................................................ + // cmge v27.4s, v31.4s, v17.4s // .................................................................................................................................................................................................................................|.............................*.......................................................................................................................................................................................... + // cmge v28.4s, v17.4s, v30.4s // .................................................................................................................................................................................................................................|...........................*............................................................................................................................................................................................ 
+ // sub v28.4s, v27.4s, v28.4s // .................................................................................................................................................................................................................................|...............................*........................................................................................................................................................................................ + // mls v17.4s, v28.4s, v29.4s // .................................................................................................................................................................................................................................|..................................*..................................................................................................................................................................................... + // cmge v27.4s, v31.4s, v18.4s // ..........................................................................*......................................................................................................................................................|................................................................................................................................*....................................................................................... + // cmge v28.4s, v18.4s, v30.4s // ..............................................*..................................................................................................................................................................................|....................................................................................................*................................................................................................................... 
+ // sub v28.4s, v27.4s, v28.4s // ............................................................................*....................................................................................................................................................|..................................................................................................................................*..................................................................................... + // mls v18.4s, v28.4s, v29.4s // ...............................................................................*.................................................................................................................................................|.....................................................................................................................................*.................................................................................. + // cmge v27.4s, v31.4s, v19.4s // ..*..............................................................................................................................................................................................................................|........................................................*............................................................................................................................................................... + // cmge v28.4s, v19.4s, v30.4s // ......*..........................................................................................................................................................................................................................|............................................................*........................................................................................................................................................... 
+ // sub v28.4s, v27.4s, v28.4s // ........*........................................................................................................................................................................................................................|..............................................................*......................................................................................................................................................... + // mls v19.4s, v28.4s, v29.4s // .............................*...................................................................................................................................................................................................|...................................................................................*.................................................................................................................................... + // cmge v27.4s, v31.4s, v20.4s // ......................................................*..........................................................................................................................................................................|............................................................................................................*........................................................................................................... + // cmge v28.4s, v20.4s, v30.4s // .....................................................*...........................................................................................................................................................................|...........................................................................................................*............................................................................................................ 
+ // sub v28.4s, v27.4s, v28.4s // ........................................................*........................................................................................................................................................................|..............................................................................................................*......................................................................................................... + // mls v20.4s, v28.4s, v29.4s // .............................................................*...................................................................................................................................................................|...................................................................................................................*.................................................................................................... + // cmge v27.4s, v31.4s, v21.4s // ................................................*................................................................................................................................................................................|......................................................................................................*................................................................................................................. + // cmge v28.4s, v21.4s, v30.4s // ..................................................*..............................................................................................................................................................................|........................................................................................................*............................................................................................................... 
+ // sub v28.4s, v27.4s, v28.4s // ....................................................*............................................................................................................................................................................|..........................................................................................................*............................................................................................................. + // mls v21.4s, v28.4s, v29.4s // .......................................................*.........................................................................................................................................................................|.............................................................................................................*.......................................................................................................... + // cmge v27.4s, v31.4s, v22.4s // ...........................................................................................................*.....................................................................................................................|.................................................................................................................................................................*...................................................... + // cmge v28.4s, v22.4s, v30.4s // ..............................................................................................................*..................................................................................................................|....................................................................................................................................................................*................................................... 
+ // sub v28.4s, v27.4s, v28.4s // ................................................................................................................*................................................................................................................|......................................................................................................................................................................*................................................. + // mls v22.4s, v28.4s, v29.4s // ...................................................................................................................*.............................................................................................................|.........................................................................................................................................................................*.............................................. + // cmge v27.4s, v31.4s, v23.4s // .........................................................*.......................................................................................................................................................................|...............................................................................................................*........................................................................................................ + // cmge v28.4s, v23.4s, v30.4s // ....................................................................................*............................................................................................................................................|..........................................................................................................................................*............................................................................. 
+ // sub v28.4s, v27.4s, v28.4s // .......................................................................................*.........................................................................................................................................|.............................................................................................................................................*.......................................................................... + // mls v23.4s, v28.4s, v29.4s // ..............................................................................................*..................................................................................................................................|....................................................................................................................................................*................................................................... + // str q16, [x1, #(8*(512/8))] // ..............................................................................................................................................................................................................e..................|........................................................................................................................................................................................................................ + // str q17, [x1, #(9*(512/8))] // ...*.............................................................................................................................................................................................................................|.........................................................*.............................................................................................................................................................. 
+ // str q18, [x1, #(10*(512/8))] // ................................................................................................*................................................................................................................................|......................................................................................................................................................*................................................................. + // str q19, [x1, #(11*(512/8))] // ................................*................................................................................................................................................................................................|......................................................................................*................................................................................................................................. + // str q20, [x1, #(12*(512/8))] // .................................................................*...............................................................................................................................................................|.......................................................................................................................*................................................................................................ + // str q21, [x1, #(13*(512/8))] // .......................................................................*.........................................................................................................................................................|.............................................................................................................................*.......................................................................................... 
+ // str q22, [x1, #(14*(512/8))] // ........................................................................................................................*........................................................................................................|..............................................................................................................................................................................*......................................... + // str q23, [x1, #(15*(512/8))] // ............................................................................................................*....................................................................................................................|..................................................................................................................................................................*..................................................... + // mul v16.4s, v8.4s, v25.4s // .................................................................................................................................................................................................................................|...*.................................................................................................................................................................................................................... + // sqrdmulh v8.4s, v8.4s, v26.4s // .................................................................................................................................................................................................................................|..*..................................................................................................................................................................................................................... 
+ // mls v16.4s, v8.4s, v29.4s // .................................................................................................................................................................................................................................|.......*................................................................................................................................................................................................................ + // mul v17.4s, v9.4s, v25.4s // .................................................................................................................................................................................................................................|.......................................*................................................................................................................................................................................ + // sqrdmulh v9.4s, v9.4s, v26.4s // .................................................................................................................................................................................................................................|........................................*............................................................................................................................................................................... + // mls v17.4s, v9.4s, v29.4s // .................................................................................................................................................................................................................................|............................................*........................................................................................................................................................................... 
+ // mul v18.4s, v10.4s, v25.4s // .................................................................................................................................................................................................................................|..........................................*............................................................................................................................................................................. + // sqrdmulh v10.4s, v10.4s, v26.4s // .................................................................................................................................................................................................................................|......................................*................................................................................................................................................................................. + // mls v18.4s, v10.4s, v29.4s // .................................................................................................................................................................................................................................|..............................................*......................................................................................................................................................................... + // mul v19.4s, v11.4s, v25.4s // .................................*...............................................................................................................................................................................................|.......................................................................................*................................................................................................................................ 
+ // sqrdmulh v11.4s, v11.4s, v26.4s // .........................*.......................................................................................................................................................................................................|...............................................................................*........................................................................................................................................ + // mls v19.4s, v11.4s, v29.4s // .....................................*...........................................................................................................................................................................................|...........................................................................................*............................................................................................................................ + // mul v20.4s, v12.4s, v25.4s // .................................................................................................................................................................................................................................|.............................................*.......................................................................................................................................................................... + // sqrdmulh v12.4s, v12.4s, v26.4s // .................................................................................................................................................................................................................................|................................*....................................................................................................................................................................................... 
+ // mls v20.4s, v12.4s, v29.4s // .................................................................................................................................................................................................................................|..................................................*..................................................................................................................................................................... + // mul v21.4s, v13.4s, v25.4s // ...............*.................................................................................................................................................................................................................|.....................................................................*.................................................................................................................................................. + // sqrdmulh v13.4s, v13.4s, v26.4s // ..................*..............................................................................................................................................................................................................|........................................................................*............................................................................................................................................... + // mls v21.4s, v13.4s, v29.4s // ......................*..........................................................................................................................................................................................................|............................................................................*........................................................................................................................................... 
+ // mul v22.4s, v14.4s, v25.4s // ......................................................................*..........................................................................................................................................................|............................................................................................................................*........................................................................................... + // sqrdmulh v14.4s, v14.4s, v26.4s // ........................................*........................................................................................................................................................................................|..............................................................................................*......................................................................................................................... + // mls v22.4s, v14.4s, v29.4s // ..............................................................................*..................................................................................................................................................|....................................................................................................................................*................................................................................... + // mul v23.4s, v15.4s, v25.4s // .................................................................................................................................................................................................................................|.....................................*.................................................................................................................................................................................. 
+ // sqrdmulh v15.4s, v15.4s, v26.4s // .................................................................................................................................................................................................................................|....................................*................................................................................................................................................................................... + // mls v23.4s, v15.4s, v29.4s // .................................................................................................................................................................................................................................|.........................................*.............................................................................................................................................................................. + // cmge v27.4s, v31.4s, v16.4s // .................................................................................................................................................................................................................................|.................*...................................................................................................................................................................................................... + // cmge v28.4s, v16.4s, v30.4s // .................................................................................................................................................................................................................................|...............*........................................................................................................................................................................................................ 
+ // sub v28.4s, v27.4s, v28.4s // .................................................................................................................................................................................................................................|...................*.................................................................................................................................................................................................... + // mls v16.4s, v28.4s, v29.4s // .................................................................................................................................................................................................................................|........................*............................................................................................................................................................................................... + // cmge v27.4s, v31.4s, v17.4s // .................................................................................................................................................................................................................................|.................................................*...................................................................................................................................................................... + // cmge v28.4s, v17.4s, v30.4s // .................................................................................................................................................................................................................................|...................................................*.................................................................................................................................................................... 
+ // sub v28.4s, v27.4s, v28.4s // .................................................................................................................................................................................................................................|.....................................................*.................................................................................................................................................................. + // mls v17.4s, v28.4s, v29.4s // ..........*......................................................................................................................................................................................................................|................................................................*....................................................................................................................................................... + // cmge v27.4s, v31.4s, v18.4s // ............................*....................................................................................................................................................................................................|..................................................................................*..................................................................................................................................... + // cmge v28.4s, v18.4s, v30.4s // ..................................*..............................................................................................................................................................................................|........................................................................................*............................................................................................................................... 
+ // sub v28.4s, v27.4s, v28.4s // ....................................*............................................................................................................................................................................................|..........................................................................................*............................................................................................................................. + // mls v18.4s, v28.4s, v29.4s // .......................................*.........................................................................................................................................................................................|.............................................................................................*.......................................................................................................................... + // cmge v27.4s, v31.4s, v19.4s // .........................................*.......................................................................................................................................................................................|...............................................................................................*........................................................................................................................ + // cmge v28.4s, v19.4s, v30.4s // ..........................................*......................................................................................................................................................................................|................................................................................................*....................................................................................................................... 
+ // sub v28.4s, v27.4s, v28.4s // ............................................*....................................................................................................................................................................................|..................................................................................................*..................................................................................................................... + // mls v19.4s, v28.4s, v29.4s // ...............................................*.................................................................................................................................................................................|.....................................................................................................*.................................................................................................................. + // cmge v27.4s, v31.4s, v20.4s // ................................................................................*................................................................................................................................................|......................................................................................................................................*................................................................................. + // cmge v28.4s, v20.4s, v30.4s // .......................................................................................................................*.........................................................................................................|.............................................................................................................................................................................*.......................................... 
+ // sub v28.4s, v27.4s, v28.4s // ...........................................................................................................................*.....................................................................................................|.................................................................................................................................................................................*...................................... + // mls v20.4s, v28.4s, v29.4s // ..............................................................................................................................*..................................................................................................|....................................................................................................................................................................................*................................... + // cmge v27.4s, v31.4s, v21.4s // ....................................................................................................*............................................................................................................................|..........................................................................................................................................................*............................................................. + // cmge v28.4s, v21.4s, v30.4s // .......................................................................................................*.........................................................................................................................|.............................................................................................................................................................*.......................................................... 
+ // sub v28.4s, v27.4s, v28.4s // ..........................................................................................................*......................................................................................................................|................................................................................................................................................................*....................................................... + // mls v21.4s, v28.4s, v29.4s // .............................................................................................................................................................*...................................................................|...................................................................................................................................................................................................................*.... + // cmge v27.4s, v31.4s, v22.4s // .....................................................................................................................................*...........................................................................................|...........................................................................................................................................................................................*............................ + // cmge v28.4s, v22.4s, v30.4s // ......................................................................................................................................*..........................................................................................|............................................................................................................................................................................................*........................... 
+ // sub v28.4s, v27.4s, v28.4s // ........................................................................................................................................*........................................................................................|..............................................................................................................................................................................................*......................... + // mls v22.4s, v28.4s, v29.4s // ............................................................................................................................................*....................................................................................|..................................................................................................................................................................................................*..................... + // cmge v27.4s, v31.4s, v23.4s // .................................................................................................................................................................................................................................|................................................*....................................................................................................................................................................... + // cmge v28.4s, v23.4s, v30.4s // .................................................................................................................................................................................................................................|...............................................*........................................................................................................................................................................ 
+ // sub v28.4s, v27.4s, v28.4s // .................................................................................................................................................................................................................................|....................................................*................................................................................................................................................................... + // mls v23.4s, v28.4s, v29.4s // .......*.........................................................................................................................................................................................................................|.............................................................*.......................................................................................................................................................... + // str q16, [x1], #(16) // .................................................................................................................................................................................................................................|............................*........................................................................................................................................................................................... + // str q17, [x1, #(-16 + 1*(512/8))] // ..............*..................................................................................................................................................................................................................|....................................................................*................................................................................................................................................... 
+ // str q18, [x1, #(-16 + 2*(512/8))] // ............................................................*....................................................................................................................................................................|..................................................................................................................*..................................................................................................... + // str q19, [x1, #(-16 + 3*(512/8))] // ...................................................*.............................................................................................................................................................................|.........................................................................................................*.............................................................................................................. + // str q20, [x1, #(-16 + 4*(512/8))] // ..................................................................................................................................*..............................................................................................|........................................................................................................................................................................................*............................... 
+ // str q21, [x1, #(-16 + 5*(512/8))] // .................................................................................................................................................................*...............................................................|.......................................................................................................................................................................................................................* + // str q22, [x1, #(-16 + 6*(512/8))] // ...............................................................................................................................................*.................................................................................|.....................................................................................................................................................................................................*.................. + // str q23, [x1, #(-16 + 7*(512/8))] // .................*...............................................................................................................................................................................................................|.......................................................................*................................................................................................................................................ + + sub count, count, #1 + cbnz count, layer1234_start + mls v12.4S, v16.4S, v29.4S // ..............................................................................................................*......................................................................................................................................................................... 
+ mul v16.4S, v8.4S, v25.4S // ........................................................................................................................................................................................................................*............................................................... + sqrdmulh v24.4S, v8.4S, v26.4S // .........................................................................................................................................................................................................................*.............................................................. + mul v8.4S, v18.4S, v2.S[2] // ...................................................................................*.................................................................................................................................................................................................... + sqrdmulh v18.4S, v18.4S, v2.S[3] // ....................................................................................*................................................................................................................................................................................................... + mls v9.4S, v20.4S, v29.4S // .................................................................................................................................................*...................................................................................................................................... + mls v16.4S, v24.4S, v29.4S // ..........................................................................................................................................................................................................................*............................................................. 
+ sqrdmulh v24.4S, v27.4S, v26.4S // ............................................................................................................................................................................................................................*........................................................... + mls v8.4S, v18.4S, v29.4S // .....................................................................................*.................................................................................................................................................................................................. + cmge v18.4S, v9.4S, v30.4S // .....................................................................................................................................................................................*.................................................................................................. + mls v17.4S, v22.4S, v29.4S // ................................................................................................................................................................*....................................................................................................................... + mul v27.4S, v27.4S, v25.4S // ...........................................................................................................................................................................................................................*............................................................ + sub v22.4S, v8.4S, v11.4S // ...................................................................................................................................*.................................................................................................................................................... 
+ add v8.4S, v8.4S, v11.4S // ....................................................................................................................................*................................................................................................................................................... + cmge v11.4S, v31.4S, v9.4S // ....................................................................................................................................................................................*................................................................................................... + mul v20.4S, v22.4S, v1.S[0] // .....................................................................................................................................*.................................................................................................................................................. + sqrdmulh v22.4S, v22.4S, v1.S[1] // ......................................................................................................................................*................................................................................................................................................. + mls v27.4S, v24.4S, v29.4S // .............................................................................................................................................................................................................................*.......................................................... + add v24.4S, v28.4S, v8.4S // ........................................................................................................................................................*............................................................................................................................... 
+ sub v11.4S, v11.4S, v18.4S // ......................................................................................................................................................................................*................................................................................................. + mls v20.4S, v22.4S, v29.4S // .......................................................................................................................................*................................................................................................................................................ + add v18.4S, v14.4S, v21.4S // ...............................................................................................................................*........................................................................................................................................................ + mls v9.4S, v11.4S, v29.4S // .......................................................................................................................................................................................*................................................................................................ + add v22.4S, v13.4S, v19.4S // ..................................................................................................................................................................*..................................................................................................................... + add v11.4S, v15.4S, v20.4S // ............................................................................................................................................................................*........................................................................................................... 
+ sub v15.4S, v15.4S, v20.4S // ...........................................................................................................................................................................*............................................................................................................ + str q9, [x1, #576] // .................................................................................................................................................................................................................*...................................................................... + mul v9.4S, v11.4S, v25.4S // .............................................................................................................................................................................................................................................*.......................................... + sub v13.4S, v13.4S, v19.4S // .................................................................................................................................................................*...................................................................................................................... + sub v20.4S, v14.4S, v21.4S // ..............................................................................................................................*......................................................................................................................................................... + sub v21.4S, v28.4S, v8.4S // .......................................................................................................................................................*................................................................................................................................ 
+ mul v8.4S, v13.4S, v0.S[0] // ...................................................................................................................................................................*.................................................................................................................... + sqrdmulh v19.4S, v13.4S, v0.S[1] // ....................................................................................................................................................................*................................................................................................................... + sqrdmulh v14.4S, v24.4S, v26.4S // ..................................................................................................................................................................................................................................*..................................................... + sqrdmulh v28.4S, v15.4S, v0.S[1] // ..............................................................................................................................................................................*......................................................................................................... + mul v13.4S, v15.4S, v0.S[0] // .............................................................................................................................................................................*.......................................................................................................... + mls v8.4S, v19.4S, v29.4S // .....................................................................................................................................................................*.................................................................................................................. 
+ mul v19.4S, v24.4S, v25.4S // .................................................................................................................................................................................................................................*...................................................... + sqrdmulh v15.4S, v11.4S, v26.4S // ..............................................................................................................................................................................................................................................*......................................... + mls v13.4S, v28.4S, v29.4S // ...............................................................................................................................................................................*........................................................................................................ + cmge v11.4S, v8.4S, v30.4S // .....................................................................................................................................................................................................*.................................................................................. + cmge v28.4S, v31.4S, v8.4S // ....................................................................................................................................................................................................*................................................................................... + mls v19.4S, v14.4S, v29.4S // ...................................................................................................................................................................................................................................*.................................................... 
+ sub v28.4S, v28.4S, v11.4S // ......................................................................................................................................................................................................*................................................................................. + cmge v24.4S, v31.4S, v13.4S // ............................................................................................................................................................................................................*........................................................................... + cmge v14.4S, v13.4S, v30.4S // .............................................................................................................................................................................................................*.......................................................................... + mls v8.4S, v28.4S, v29.4S // .......................................................................................................................................................................................................*................................................................................ + sub v28.4S, v24.4S, v14.4S // ..............................................................................................................................................................................................................*......................................................................... + mls v9.4S, v15.4S, v29.4S // ...............................................................................................................................................................................................................................................*........................................ 
+ cmge v14.4S, v31.4S, v17.4S // ................................................................................................................................................................................................*....................................................................................... + mls v13.4S, v28.4S, v29.4S // ...............................................................................................................................................................................................................*........................................................................ + cmge v28.4S, v31.4S, v16.4S // ................................................................................................................................................................................................................................................*....................................... + cmge v24.4S, v9.4S, v30.4S // .............................................................................................................................................................................................................................................................................*.......... + cmge v15.4S, v31.4S, v9.4S // ............................................................................................................................................................................................................................................................................*........... + str q13, [x1, #960] // .......................................................................................................................................................................................................................*................................................................ 
+ sqrdmulh v13.4S, v23.4S, v26.4S // .....................................................................................................................................................................................................................................*.................................................. + cmge v11.4S, v31.4S, v27.4S // ....................................................................................................................................................................................................................................................*................................... + str q8, [x1, #832] // .....................................................................................................................................................................................................................*.................................................................. + cmge v8.4S, v27.4S, v30.4S // .....................................................................................................................................................................................................................................................*.................................. + sub v24.4S, v15.4S, v24.4S // ..............................................................................................................................................................................................................................................................................*......... + sub v15.4S, v11.4S, v8.4S // ......................................................................................................................................................................................................................................................*................................. 
+ mul v11.4S, v20.4S, v1.S[0] // ................................................................................................................................*....................................................................................................................................................... + sqrdmulh v20.4S, v20.4S, v1.S[1] // .................................................................................................................................*...................................................................................................................................................... + mls v9.4S, v24.4S, v29.4S // ...............................................................................................................................................................................................................................................................................*........ + mls v27.4S, v15.4S, v29.4S // .......................................................................................................................................................................................................................................................*................................ + sub v15.4S, v10.4S, v18.4S // ..................................................................................................................................................*..................................................................................................................................... + mls v11.4S, v20.4S, v29.4S // ..................................................................................................................................*..................................................................................................................................................... 
+ add v8.4S, v10.4S, v18.4S // ...................................................................................................................................................*.................................................................................................................................... + str q27, [x1, #64] // .................................................................................................................................................................................................................................................................................*...... + sqrdmulh v20.4S, v15.4S, v0.S[1] // .....................................................................................................................................................*.................................................................................................................................. + sub v24.4S, v12.4S, v11.4S // ......................................................................................................................................................................*................................................................................................................. + add v12.4S, v12.4S, v11.4S // .......................................................................................................................................................................*................................................................................................................ + mul v18.4S, v15.4S, v0.S[0] // ....................................................................................................................................................*................................................................................................................................... 
+ sqrdmulh v27.4S, v24.4S, v0.S[1] // .........................................................................................................................................................................*.............................................................................................................. + mul v11.4S, v24.4S, v0.S[0] // ........................................................................................................................................................................*............................................................................................................... + sqrdmulh v15.4S, v12.4S, v26.4S // ...........................................................................................................................................................................................................................................*............................................ + mul v12.4S, v12.4S, v25.4S // ..........................................................................................................................................................................................................................................*............................................. + mls v18.4S, v20.4S, v29.4S // ......................................................................................................................................................*................................................................................................................................. + mls v11.4S, v27.4S, v29.4S // ..........................................................................................................................................................................*............................................................................................................. 
+ cmge v10.4S, v16.4S, v30.4S // .................................................................................................................................................................................................................................................*...................................... + mls v12.4S, v15.4S, v29.4S // ............................................................................................................................................................................................................................................*........................................... + cmge v27.4S, v31.4S, v18.4S // ........................................................................................................................................................................................*............................................................................................... + cmge v24.4S, v31.4S, v11.4S // ........................................................................................................................................................................................................*............................................................................... + cmge v15.4S, v11.4S, v30.4S // .........................................................................................................................................................................................................*.............................................................................. + mul v20.4S, v21.4S, v0.S[0] // .........................................................................................................................................................*.............................................................................................................................. 
+ sub v28.4S, v28.4S, v10.4S // ..................................................................................................................................................................................................................................................*..................................... + cmge v10.4S, v17.4S, v30.4S // .................................................................................................................................................................................................*...................................................................................... + mul v23.4S, v23.4S, v25.4S // ....................................................................................................................................................................................................................................*................................................... + mls v16.4S, v28.4S, v29.4S // ...................................................................................................................................................................................................................................................*.................................... + sub v28.4S, v14.4S, v10.4S // ..................................................................................................................................................................................................*..................................................................................... + cmge v14.4S, v18.4S, v30.4S // .........................................................................................................................................................................................*.............................................................................................. 
+ mls v23.4S, v13.4S, v29.4S // ......................................................................................................................................................................................................................................*................................................. + str q16, [x1], #(16) // ................................................................................................................................................................................................................................................................................*....... + sub v13.4S, v27.4S, v14.4S // ..........................................................................................................................................................................................*............................................................................................. + mls v17.4S, v28.4S, v29.4S // ...................................................................................................................................................................................................*.................................................................................... + sqrdmulh v27.4S, v22.4S, v26.4S // ........................................................................................................................................................................................................................................*............................................... + mls v18.4S, v13.4S, v29.4S // ...........................................................................................................................................................................................*............................................................................................ 
+ mul v22.4S, v22.4S, v25.4S // .......................................................................................................................................................................................................................................*................................................ + cmge v28.4S, v31.4S, v12.4S // ........................................................................................................................................................................................................................................................................*............... + sqrdmulh v14.4S, v8.4S, v26.4S // ...............................................................................................................................................................................................................................*........................................................ + mul v10.4S, v8.4S, v25.4S // ..............................................................................................................................................................................................................................*......................................................... + mls v22.4S, v27.4S, v29.4S // .........................................................................................................................................................................................................................................*.............................................. + cmge v8.4S, v23.4S, v30.4S // .................................................................................................................................................................................................................................................................*...................... 
+ cmge v27.4S, v31.4S, v23.4S // ................................................................................................................................................................................................................................................................*....................... + mls v10.4S, v14.4S, v29.4S // ................................................................................................................................................................................................................................*....................................................... + cmge v14.4S, v31.4S, v22.4S // ....................................................................................................................................................................................................................................................................*................... + cmge v13.4S, v22.4S, v30.4S // .....................................................................................................................................................................................................................................................................*.................. + sub v27.4S, v27.4S, v8.4S // ..................................................................................................................................................................................................................................................................*..................... + sub v14.4S, v14.4S, v13.4S // ......................................................................................................................................................................................................................................................................*................. 
+ sqrdmulh v21.4S, v21.4S, v0.S[1] // ..........................................................................................................................................................*............................................................................................................................. + cmge v8.4S, v31.4S, v10.4S // ........................................................................................................................................................................................................................................................*............................... + cmge v13.4S, v10.4S, v30.4S // .........................................................................................................................................................................................................................................................*.............................. + mls v22.4S, v14.4S, v29.4S // .......................................................................................................................................................................................................................................................................*................ + sub v13.4S, v8.4S, v13.4S // ..........................................................................................................................................................................................................................................................*............................. + mls v20.4S, v21.4S, v29.4S // ...........................................................................................................................................................*............................................................................................................................ 
+ sub v15.4S, v24.4S, v15.4S // ..........................................................................................................................................................................................................*............................................................................. + mls v23.4S, v27.4S, v29.4S // ...................................................................................................................................................................................................................................................................*.................... + str q22, [x1, #304] // .....................................................................................................................................................................................................................................................................................*.. + cmge v24.4S, v20.4S, v30.4S // .............................................................................................................................................................................................*.......................................................................................... + cmge v27.4S, v31.4S, v20.4S // ............................................................................................................................................................................................*........................................................................................... + str q18, [x1, #624] // ..................................................................................................................................................................................................................*..................................................................... 
+ sub v21.4S, v27.4S, v24.4S // ..............................................................................................................................................................................................*......................................................................................... + str q23, [x1, #240] // ....................................................................................................................................................................................................................................................................................*... + mls v11.4S, v15.4S, v29.4S // ...........................................................................................................................................................................................................*............................................................................ + mls v20.4S, v21.4S, v29.4S // ...............................................................................................................................................................................................*........................................................................................ + mls v10.4S, v13.4S, v29.4S // ...........................................................................................................................................................................................................................................................*............................ 
+ str q9, [x1, #432] // .......................................................................................................................................................................................................................................................................................* + cmge v9.4S, v31.4S, v19.4S // ............................................................................................................................................................................................................................................................*........................... + str q20, [x1, #688] // ...................................................................................................................................................................................................................*.................................................................... + cmge v20.4S, v12.4S, v30.4S // .........................................................................................................................................................................................................................................................................*.............. + str q10, [x1, #112] // ..................................................................................................................................................................................................................................................................................*..... + sub v28.4S, v28.4S, v20.4S // ..........................................................................................................................................................................................................................................................................*............. 
+ cmge v20.4S, v19.4S, v30.4S // .............................................................................................................................................................................................................................................................*.......................... + str q17, [x1, #752] // ....................................................................................................................................................................................................................*................................................................... + sub v14.4S, v9.4S, v20.4S // ..............................................................................................................................................................................................................................................................*......................... + mls v12.4S, v28.4S, v29.4S // ...........................................................................................................................................................................................................................................................................*............ + str q11, [x1, #880] // ......................................................................................................................................................................................................................*................................................................. + mls v19.4S, v14.4S, v29.4S // ...............................................................................................................................................................................................................................................................*........................ 
+ str q12, [x1, #368] // ......................................................................................................................................................................................................................................................................................*. + str q19, [x1, #176] // ...................................................................................................................................................................................................................................................................................*.... + + pop_stack + ret \ No newline at end of file diff --git a/tests/ntt_dilithium/manual/intt_dilithium_1234_5678_opt_a72.s b/tests/ntt_dilithium/manual/intt_dilithium_1234_5678_opt_a72.s new file mode 100644 index 0000000..3440ef6 --- /dev/null +++ b/tests/ntt_dilithium/manual/intt_dilithium_1234_5678_opt_a72.s @@ -0,0 +1,1794 @@ +/// +/// Copyright (c) 2022 Arm Limited +/// Copyright (c) 2022 Hanno Becker +/// Copyright (c) 2023 Amin Abdulrahman, Matthias Kannwischer +/// SPDX-License-Identifier: MIT +/// +/// Permission is hereby granted, free of charge, to any person obtaining a copy +/// of this software and associated documentation files (the "Software"), to deal +/// in the Software without restriction, including without limitation the rights +/// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +/// copies of the Software, and to permit persons to whom the Software is +/// furnished to do so, subject to the following conditions: +/// +/// The above copyright notice and this permission notice shall be included in all +/// copies or substantial portions of the Software. +/// +/// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +/// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +/// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE
+/// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+/// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+/// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+/// SOFTWARE.
+///
+
+// Needed to provide ASM_LOAD directive
+// NOTE(review): the header name was missing from this directive; <hal_env.h>
+// is presumed to be the header that defines ASM_LOAD -- confirm.
+#include <hal_env.h>
+
+// NOTE
+// We use a lot of trivial macros to simplify the parsing burden for Slothy
+// The macros are not unfolded by Slothy and thus interpreted as instructions,
+// which are easier to parse due to e.g. the lack of size specifiers and simpler
+// syntax for pre and post increment for loads and stores.
+
+// Eventually, NeLight should include a proper parser for AArch64,
+// but for initial investigations, the below is enough.
+
+// ---------------------------------------------------------------------------
+// 128-bit vector loads/stores.
+// \vec is an alias name; the access uses the register named qform_\vec, so a
+// matching `qform_<name> .req qN` alias must exist wherever these are used.
+// ---------------------------------------------------------------------------
+
+// Load qform_\vec from [\base + \offset]; \base is left unchanged.
+.macro ldr_vo vec, base, offset
+        ldr qform_\vec, [\base, #\offset]
+.endm
+// Load qform_\vec from [\base], then advance \base by \inc (post-index).
+.macro ldr_vi vec, base, inc
+        ldr qform_\vec, [\base], #\inc
+.endm
+// Store qform_\vec to [\base + \offset]; \base is left unchanged.
+.macro str_vo vec, base, offset
+        str qform_\vec, [\base, #\offset]
+.endm
+// Store qform_\vec to [\base], then advance \base by \inc (post-index).
+.macro str_vi vec, base, inc
+        str qform_\vec, [\base], #\inc
+.endm
+
+// ---------------------------------------------------------------------------
+// Lane-wise arithmetic on four 32-bit lanes (.4s): \d = op(\a, \b).
+// ---------------------------------------------------------------------------
+
+// \d = \a - \b
+.macro vsub d,a,b
+        sub \d\().4s, \a\().4s, \b\().4s
+.endm
+// \d = \a + \b
+.macro vadd d,a,b
+        add \d\().4s, \a\().4s, \b\().4s
+.endm
+// \d = SQRDMULH(\a, \b): signed saturating rounding doubling multiply,
+// returning the high half of each product.
+.macro vqrdmulh d,a,b
+        sqrdmulh \d\().4s, \a\().4s, \b\().4s
+.endm
+// \d = low 32 bits of \a * \b
+.macro vmul d,a,b
+        mul \d\().4s, \a\().4s, \b\().4s
+.endm
+// \d = \d - \a * \b (multiply-subtract accumulating into \d)
+.macro vmls d,a,b
+        mls \d\().4s, \a\().4s, \b\().4s
+.endm
+
+// ---------------------------------------------------------------------------
+// By-element variants: the second operand is the single 32-bit lane \b.s[\i],
+// applied to every lane of \a.
+// ---------------------------------------------------------------------------
+
+// \d = SQRDMULH(\a, \b.s[\i])
+.macro vqrdmulhq d,a,b,i
+        sqrdmulh \d\().4s, \a\().4s, \b\().s[\i]
+.endm
+// \d = low 32 bits of \a * \b.s[\i]
+.macro vmulq d,a,b,i
+        mul \d\().4s, \a\().4s, \b\().s[\i]
+.endm
+// \d = \d - \a * \b.s[\i]
+.macro vmlsq d,a,b,i
+        mls \d\().4s, \a\().4s, \b\().s[\i]
+.endm
+
+// ---------------------------------------------------------------------------
+// TRN1/TRN2 interleaves, on 64-bit (.2d) or 32-bit (.4s) elements.
+// ---------------------------------------------------------------------------
+
+.macro trn1_d d,a,b
+        trn1 \d\().2d, \a\().2d, \b\().2d
+.endm
+.macro trn2_d d,a,b
+        trn2 \d\().2d, \a\().2d, \b\().2d
+.endm
+.macro trn1_s d,a,b
+        trn1 \d\().4s, \a\().4s, \b\().4s
+.endm
+.macro trn2_s d,a,b
+        trn2 \d\().4s, \a\().4s, \b\().4s
+.endm
+
+// Multiply \src by a constant held in one lane of \const and reduce:
+//   \dst = lo32(\src * \const.s[\idx0])
+//   \src = SQRDMULH(\src, \const.s[\idx1])      (\src is clobbered)
+//   \dst = \dst - \src * modulus
+// `modulus` must be a register alias defined by the caller.
+// Presumably lane \idx1 holds the precomputed "twisted" companion of the
+// constant in lane \idx0 -- confirm against the twiddle table layout.
+.macro mulmodq dst, src, const, idx0, idx1
+        vmulq \dst, \src, \const, \idx0
+        vqrdmulhq \src, \src, \const, \idx1
+        vmls \dst, \src, modulus
+.endm
+
+// Multiply \src by the vector constant \const and reduce:
+//   \dst = lo32(\src * \const)
+//   \src = SQRDMULH(\src, \const_twisted)       (\src is clobbered)
+//   \dst = \dst - \src * modulus
+// `modulus` must be a register alias defined by the caller.
+.macro mulmod dst, src, const, const_twisted
+        vmul \dst, \src, \const
+        vqrdmulh \src, \src, \const_twisted
+        vmls \dst, \src, modulus
+.endm
+
+// Reduce each lane of \a in place: \a = \a - round(\a / 2^23) * modulus.
+// Clobbers the `tmp` register alias.
+.macro montg_reduce a
+        srshr tmp.4S, \a\().4S, #23
+        vmls \a, tmp, modulus
+.endm
+
+// Conditionally add or subtract one multiple of modulus per lane:
+//   \tmp1 = all-ones where \neg_modulus_half >= \a, else 0
+//   \tmp2 = all-ones where \a >= \modulus_half,     else 0
+//   \tmp2 = \tmp1 - \tmp2      (-1 -> add modulus, +1 -> subtract modulus)
+//   \a    = \a - \tmp2 * modulus
+// Clobbers \tmp1 and \tmp2.
+.macro canonical_reduce a, modulus_half, neg_modulus_half, tmp1, tmp2
+        cmge \tmp1\().4s, \neg_modulus_half\().4s, \a\().4s
+        cmge \tmp2\().4s, \a\().4s, \modulus_half\().4s
+        sub \tmp2\().4s, \tmp1\().4s, \tmp2\().4s
+        vmls \a, \tmp2, modulus
+.endm
+
+// Butterfly with the root taken from lanes of \root:
+//   tmp = \a - \b;  \a = \a + \b;  \b = mulmodq(tmp, \root, \idx0, \idx1)
+// Clobbers the `tmp` register alias.
+.macro gs_butterfly a, b, root, idx0, idx1
+        vsub tmp, \a, \b
+        vadd \a, \a, \b
+        mulmodq \b, tmp, \root, \idx0, \idx1
+.endm
+
+// Same instruction sequence as `mulmod` (kept as a separately named macro).
+.macro mulmod_v dst, src, const, const_twisted
+        vmul \dst, \src, \const
+        vqrdmulh \src, \src, \const_twisted
+        vmls \dst, \src, modulus
+.endm
+
+// Butterfly with full-vector root operands:
+//   tmp = \a - \b;  \a = \a + \b;  \b = mulmod(tmp, \root, \root_twisted)
+// Clobbers the `tmp` register alias.
+.macro gs_butterfly_v a, b, root, root_twisted
+        vsub tmp, \a, \b
+        vadd \a, \a, \b
+        mulmod \b, tmp, \root, \root_twisted
+.endm
+
+// Apply `mulmod` with the constants `ninv` / `ninv_tw` to eight vectors:
+// \dstK = mulmod(\srcK, ninv, ninv_tw) for K = 0..7. Every \srcK is clobbered.
+// `ninv` and `ninv_tw` must be register aliases defined by the caller.
+.macro mul_ninv dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, src0, src1, src2, src3, src4, src5, src6, src7
+        mulmod \dst0, \src0, ninv, ninv_tw
+        mulmod \dst1, \src1, ninv, ninv_tw
+        mulmod \dst2, \src2, ninv, ninv_tw
+        mulmod \dst3, \src3, ninv, ninv_tw
+        mulmod \dst4, \src4, ninv, ninv_tw
+        mulmod \dst5, \src5, ninv, ninv_tw
+        mulmod \dst6, \src6, ninv, ninv_tw
+        mulmod \dst7, \src7, ninv, ninv_tw
+.endm
+
+// Load eight consecutive 16-byte vectors at \r_ptr into root0..root7 and
+// advance \r_ptr by 8*16 bytes. The first load post-increments the pointer,
+// so the remaining loads use negative offsets relative to the new value.
+// Requires qform_root0..qform_root7 aliases.
+.macro load_roots_1234 r_ptr
+        ldr_vi root0, \r_ptr, (8*16)
+        ldr_vo root1, \r_ptr, (-8*16 + 1*16)
+        ldr_vo root2, \r_ptr, (-8*16 + 2*16)
+        ldr_vo root3, \r_ptr, (-8*16 + 3*16)
+        ldr_vo root4, \r_ptr, (-8*16 + 4*16)
+        ldr_vo root5, \r_ptr, (-8*16 + 5*16)
+        ldr_vo root6, \r_ptr, (-8*16 + 6*16)
+        ldr_vo root7, \r_ptr, (-8*16 + 7*16)
+.endm
+
+// Load one 16-byte vector at \r_ptr0 and advance the pointer by 16 bytes.
+.macro load_next_roots_56 root0, r_ptr0
+        ldr_vi \root0, \r_ptr0, 16
+.endm
+
+// Load one 16-byte vector at \r_ptr0 but advance the pointer by only 8 bytes,
+// so the next load overlaps the upper half of this one.
+.macro load_next_roots_6 root0, r_ptr0
+        ldr_vi \root0, \r_ptr0, 8
+.endm
+
+// Load six consecutive 16-byte vectors at \r_ptr1, in memory order
+// root0, root0_tw, root1, root1_tw, root2, root2_tw, and advance \r_ptr1 by
+// 6*16 bytes (post-increment on the first load, negative offsets afterwards).
+.macro load_next_roots_78 root0, root0_tw, root1, root1_tw, root2, root2_tw, r_ptr1
+        ldr_vi \root0, \r_ptr1, (6*16)
+        ldr_vo \root0_tw, \r_ptr1, (-6*16 + 1*16)
+        ldr_vo \root1, \r_ptr1, (-6*16 + 2*16)
+        ldr_vo \root1_tw, \r_ptr1, (-6*16 + 3*16)
+        ldr_vo \root2, \r_ptr1, (-6*16 + 4*16)
+        ldr_vo \root2_tw, \r_ptr1, (-6*16 + 5*16)
+.endm
+
+// In-place 4x4 transpose of 32-bit lanes across \data0..\data3:
+// afterwards lane j of \data<i> holds what was lane i of \data<j>.
+// Clobbers the t0..t3 register aliases.
+.macro transpose4 data
+        trn1_s t0, \data\()0, \data\()1
+        trn2_s t1, \data\()0, \data\()1
+        trn1_s t2, \data\()2, \data\()3
+        trn2_s t3, \data\()2, \data\()3
+
+        trn2_d \data\()2, t0, t2
+        trn2_d \data\()3, t1, t3
+        trn1_d \data\()0, t0, t2
+        trn1_d \data\()1, t1, t3
+.endm
+
+// Reserve 6*16 bytes of stack and store x19..x29 there.
+// Must be paired with restore_gprs.
+.macro save_gprs // slothy:no-unfold
+        sub sp, sp, #(16*6)
+        stp x19, x20, [sp, #16*0]
+        stp x21, x22, [sp, #16*1]
+        stp x23, x24, [sp, #16*2]
+        stp x25, x26, [sp, #16*3]
+        stp x27, x28, [sp, #16*4]
+        str x29, [sp, #16*5]
+.endm
+
+// Reload x19..x29 from the area written by save_gprs and release it.
+.macro restore_gprs // slothy:no-unfold
+        ldp x19, x20, [sp, #16*0]
+        ldp x21, x22, [sp, #16*1]
+        ldp x23, x24, [sp, #16*2]
+        ldp x25, x26, [sp, #16*3]
+        ldp x27, x28, [sp, #16*4]
+        ldr x29, [sp, #16*5]
+        add sp, sp, #(16*6)
+.endm
+
+// Reserve 4*16 bytes of stack and store d8..d15 (the low 64 bits of v8..v15).
+// Must be paired with restore_vregs.
+.macro save_vregs // slothy:no-unfold
+        sub sp, sp, #(16*4)
+        stp d8, d9, [sp, #16*0]
+        stp d10, d11, [sp, #16*1]
+        stp d12, d13, [sp, #16*2]
+        stp d14, d15, [sp, #16*3]
+.endm
+
+// Reload d8..d15 from the area written by save_vregs and release it.
+.macro restore_vregs // slothy:no-unfold
+        ldp d8, d9, [sp, #16*0]
+        ldp d10, d11, [sp, #16*1]
+        ldp d12, d13, [sp, #16*2]
+        ldp d14, d15, [sp, #16*3]
+        add sp, sp, #(16*4)
+.endm
+
+// Size in bytes of the scratch area reserved by push_stack, and the offset
+// of its first slot (for use with `save` / `restore`).
+#define STACK_SIZE 16
+#define STACK0 0
+
+// Load register \a from the stack slot at byte offset \loc.
+.macro restore a, loc // slothy:no-unfold
+        ldr \a, [sp, #\loc\()]
+.endm
+// Store register \a to the stack slot at byte offset \loc.
+// Note the argument order: slot first, register second.
+.macro save loc, a // slothy:no-unfold
+        str \a, [sp, #\loc\()]
+.endm
+// Function prologue: save x19..x29 and d8..d15, then reserve STACK_SIZE bytes
+// of scratch space at the new top of stack.
+.macro push_stack // slothy:no-unfold
+        save_gprs
+        save_vregs
+        sub sp, sp, #STACK_SIZE
+.endm
+
+// Function epilogue: undo push_stack in reverse order.
+.macro pop_stack // slothy:no-unfold
+        add sp, sp, #STACK_SIZE
+        restore_vregs
+        restore_gprs
+.endm
+
+// Twiddle table, 16-byte aligned, placed in the data section.
+.data
+.p2align 4
+roots:
+#include "intt_dilithium_1234_5678_twiddles.s"
+.text
+
+        .global intt_dilithium_1234_5678_opt_a72
+        .global _intt_dilithium_1234_5678_opt_a72
+
+// 64-bit constants emitted in the text section just before the entry point.
+.p2align 4
+modulus_addr: .quad 8380417
+ninv_addr: .quad 16382
+ninv_tw_addr: .quad 4197891
+intt_dilithium_1234_5678_opt_a72:
+_intt_dilithium_1234_5678_opt_a72: + push_stack + + inp .req x0 + in .req x1 + count .req x2 + r_ptr0 .req x3 + r_ptr1 .req x4 + xtmp .req x5 + + data0 .req v8 + data1 .req v9 + data2 .req v10 + data3 .req v11 + data4 .req v12 + data5 .req v13 + data6 .req v14 + data7 .req v15 + data8 .req v16 + data9 .req v17 + data10 .req v18 + data11 .req v19 + data12 .req v20 + data13 .req v21 + data14 .req v22 + data15 .req v23 + + qform_data0 .req q8 + qform_data1 .req q9 + qform_data2 .req q10 + qform_data3 .req q11 + qform_data4 .req q12 + qform_data5 .req q13 + qform_data6 .req q14 + qform_data7 .req q15 + qform_data8 .req q16 + qform_data9 .req q17 + qform_data10 .req q18 + qform_data11 .req q19 + qform_data12 .req q20 + qform_data13 .req q21 + qform_data14 .req q22 + qform_data15 .req q23 + + root0 .req v0 + root1 .req v1 + root2 .req v2 + root3 .req v3 + root0_tw .req v4 + root1_tw .req v5 + root2_tw .req v6 + root3_tw .req v7 + + + qform_root0 .req q0 + qform_root1 .req q1 + qform_root2 .req q2 + qform_root3 .req q3 + qform_root0_tw .req q4 + qform_root1_tw .req q5 + qform_root2_tw .req q6 + qform_root3_tw .req q7 + + + tmp .req v24 + qform_tmp .req q24 + t0 .req v25 + t1 .req v26 + t2 .req v27 + t3 .req v28 + + modulus .req v29 + + ASM_LOAD(r_ptr0, roots) + ASM_LOAD(r_ptr1, roots_l45) + + ASM_LOAD(xtmp, modulus_addr) + ld1r {modulus.4s}, [xtmp] + + save STACK0, inp + + mov count, #16 + + .p2align 2 + ld4 {v10.4S, v11.4S, v12.4S, v13.4S}, [x0] // *................................................. + ldr q18, [x3, #48] // .*................................................ + ldr q22, [x3, #64] // ..*............................................... + ldr q26, [x3, #32] // ....*............................................. + ldr q30, [x3], #(6*16) // ...*.............................................. + // gap // .................................................. + ldr q27, [x3, #-16] // .....*............................................ 
+ ldr q15, [x3, #-80] // ........*......................................... + // gap // .................................................. + ldr q7, [x4], #8 // ...................................*.............. + // gap // .................................................. + // gap // .................................................. + add v24.4S, v10.4S, v11.4S // ......*........................................... + sub v10.4S, v10.4S, v11.4S // .......*.......................................... + // gap // .................................................. + add v8.4S, v12.4S, v13.4S // .........*........................................ + sub v13.4S, v12.4S, v13.4S // ..........*....................................... + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + sqrdmulh v18.4S, v10.4S, v18.4S // ...........*...................................... + // gap // .................................................. + // gap // .................................................. + sub v6.4S, v24.4S, v8.4S // ............*..................................... + // gap // .................................................. + // gap // .................................................. + mul v11.4S, v13.4S, v22.4S // .................*................................ + add v24.4S, v24.4S, v8.4S // ..............*................................... + ldr q22, [x4], #16 // ....................................*............. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + sqrdmulh v27.4S, v13.4S, v27.4S // ...............*.................................. + // gap // .................................................. 
+ // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + mul v10.4S, v10.4S, v26.4S // .............*.................................... + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + mls v10.4S, v18.4S, v29.4S // ................*................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + mls v11.4S, v27.4S, v29.4S // ..................*............................... + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + sqrdmulh v18.4S, v6.4S, v15.4S // ......................*........................... + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + mul v26.4S, v6.4S, v30.4S // ...................*.............................. + // gap // .................................................. 
+ // gap // .................................................. + sub v27.4S, v10.4S, v11.4S // ....................*............................. + // gap // .................................................. + // gap // .................................................. + add v10.4S, v10.4S, v11.4S // .....................*............................ + // gap // .................................................. + // gap // .................................................. + mls v26.4S, v18.4S, v29.4S // ...........................*...................... + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + sqrdmulh v18.4S, v27.4S, v15.4S // .......................*.......................... + trn1 v13.4S, v24.4S, v10.4S // ........................*......................... + // gap // .................................................. + trn2 v10.4S, v24.4S, v10.4S // ..........................*....................... + // gap // .................................................. + // gap // .................................................. + mul v30.4S, v27.4S, v30.4S // .........................*........................ + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + mls v30.4S, v18.4S, v29.4S // ............................*..................... 
+ // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + trn1 v18.4S, v26.4S, v30.4S // .............................*.................... + trn2 v26.4S, v26.4S, v30.4S // ..............................*................... + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + trn2 v27.2D, v13.2D, v18.2D // ................................*................. + trn2 v30.2D, v10.2D, v26.2D // ...............................*.................. + // gap // .................................................. + trn1 v18.2D, v13.2D, v18.2D // .................................*................ + trn1 v10.2D, v10.2D, v26.2D // ..................................*............... + // gap // .................................................. 
+ // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + add v26.4S, v27.4S, v30.4S // ......................................*........... + sub v2.4S, v27.4S, v30.4S // ...............................................*.. + // gap // .................................................. + add v30.4S, v18.4S, v10.4S // .....................................*............ + sub v10.4S, v18.4S, v10.4S // .......................................*.......... + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + add v18.4S, v30.4S, v26.4S // ........................................*......... + sub v26.4S, v30.4S, v26.4S // .........................................*........ + // gap // .................................................. + sqrdmulh v23.4S, v10.4S, v22.S[1] // ..........................................*....... + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + srshr v30.4S, v18.4S, #23 // ...........................................*...... + mul v24.4S, v26.4S, v7.S[0] // ............................................*..... + // gap // .................................................. + // gap // .................................................. 
+ // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + mls v18.4S, v30.4S, v29.4S // .............................................*.... + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + sqrdmulh v16.4S, v26.4S, v7.S[1] // ..............................................*... + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + mul v25.4S, v10.4S, v22.S[0] // ................................................*. + // gap // .................................................. + // gap // .................................................. + str q18, [x0], #(16*4) // .................................................* + // gap // .................................................. + // gap // .................................................. + + // original source code + // ld4 {v11.4S, v12.4S, v13.4S, v14.4S}, [x0] // *................................................. + // ldr q20, [x3, #48] // .*................................................ + // ldr q18, [x3, #64] // ..*............................................... + // ldr q6, [x3], #(6*16) // ....*............................................. + // ldr q0, [x3, #-64] // ...*.............................................. 
+ // ldr q4, [x3, #-16] // .....*............................................ + // add v31.4S, v11.4S, v12.4S // ........*......................................... + // sub v12.4S, v11.4S, v12.4S // .........*........................................ + // ldr q5, [x3, #-80] // ......*........................................... + // add v17.4S, v13.4S, v14.4S // ..........*....................................... + // sub v11.4S, v13.4S, v14.4S // ...........*...................................... + // sqrdmulh v3.4S, v12.4S, v20.4S // ............*..................................... + // sub v30.4S, v31.4S, v17.4S // .............*.................................... + // mul v14.4S, v12.4S, v0.4S // ..................*............................... + // add v28.4S, v31.4S, v17.4S // ...............*.................................. + // sqrdmulh v19.4S, v11.4S, v4.4S // .................*................................ + // mls v14.4S, v3.4S, v29.4S // ...................*.............................. + // mul v13.4S, v11.4S, v18.4S // ..............*................................... + // mls v13.4S, v19.4S, v29.4S // ....................*............................. + // mul v27.4S, v30.4S, v6.4S // ......................*........................... + // sub v3.4S, v14.4S, v13.4S // .......................*.......................... + // add v2.4S, v14.4S, v13.4S // ........................*......................... + // sqrdmulh v0.4S, v30.4S, v5.4S // .....................*............................ + // sqrdmulh v10.4S, v3.4S, v5.4S // ..........................*....................... + // trn1 v30.4S, v28.4S, v2.4S // ...........................*...................... + // mul v18.4S, v3.4S, v6.4S // .............................*.................... + // trn2 v2.4S, v28.4S, v2.4S // ............................*..................... + // mls v27.4S, v0.4S, v29.4S // .........................*........................ 
+ // mls v18.4S, v10.4S, v29.4S // ..............................*................... + // trn1 v31.4S, v27.4S, v18.4S // ...............................*.................. + // trn2 v3.4S, v27.4S, v18.4S // ................................*................. + // trn2 v21.2D, v2.2D, v3.2D // ..................................*............... + // trn2 v0.2D, v30.2D, v31.2D // .................................*................ + // trn1 v27.2D, v30.2D, v31.2D // ...................................*.............. + // trn1 v19.2D, v2.2D, v3.2D // ....................................*............. + // ldr q7, [x4], #8 // .......*.......................................... + // ldr q22, [x4], #16 // ................*................................. + // add v2.4S, v27.4S, v19.4S // .......................................*.......... + // add v4.4S, v0.4S, v21.4S // .....................................*............ + // sub v3.4S, v27.4S, v19.4S // ........................................*......... + // add v18.4S, v2.4S, v4.4S // .........................................*........ + // sub v2.4S, v2.4S, v4.4S // ..........................................*....... + // sqrdmulh v23.4S, v3.4S, v22.S[1] // ...........................................*...... + // srshr v10.4S, v18.4S, #23 // ............................................*..... + // mul v24.4S, v2.4S, v7.S[0] // .............................................*.... + // mls v18.4S, v10.4S, v29.4S // ..............................................*... + // sqrdmulh v16.4S, v2.4S, v7.S[1] // ...............................................*.. + // sub v2.4S, v0.4S, v21.4S // ......................................*........... + // mul v25.4S, v3.4S, v22.S[0] // ................................................*. 
+ // str q18, [x0], #(16*4) // .................................................* + + sub count, count, #1 +layer5678_start: + ld4 {v11.4S, v12.4S, v13.4S, v14.4S}, [x0] // e................................................................ + mul v10.4S, v2.4S, v22.S[2] // ............................................*.................... + ldr q20, [x3, #48] // ....e............................................................ + ldr q18, [x3, #64] // .....e........................................................... + ldr q6, [x3], #(6*16) // .e............................................................... + // gap // ................................................................. + mls v25.4S, v23.4S, v29.4S // .........................................*....................... + ldr q0, [x3, #-64] // ...e............................................................. + // gap // ................................................................. + ldr q4, [x3, #-16] // ......e.......................................................... + // gap // ................................................................. + // gap // ................................................................. + add v31.4S, v11.4S, v12.4S // ........e........................................................ + sub v12.4S, v11.4S, v12.4S // .......e......................................................... + ldr q5, [x3, #-80] // ..e.............................................................. + sqrdmulh v21.4S, v2.4S, v22.S[3] // .............................................*................... + add v17.4S, v13.4S, v14.4S // .............e................................................... + // gap // ................................................................. + sub v11.4S, v13.4S, v14.4S // ............e.................................................... + // gap // ................................................................. 
+ // gap // ................................................................. + sqrdmulh v3.4S, v12.4S, v20.4S // ..........e...................................................... + // gap // ................................................................. + // gap // ................................................................. + sub v30.4S, v31.4S, v17.4S // .................e............................................... + // gap // ................................................................. + // gap // ................................................................. + mul v14.4S, v12.4S, v0.4S // .........e....................................................... + add v28.4S, v31.4S, v17.4S // ..................e.............................................. + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + sqrdmulh v19.4S, v11.4S, v4.4S // ...............e................................................. + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + mls v14.4S, v3.4S, v29.4S // ...........e..................................................... + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. 
+ // gap // ................................................................. + mul v13.4S, v11.4S, v18.4S // ..............e.................................................. + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + mls v13.4S, v19.4S, v29.4S // ................e................................................ + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + mls v10.4S, v21.4S, v29.4S // ..............................................*.................. + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + mul v27.4S, v30.4S, v6.4S // ...................e............................................. + // gap // ................................................................. + // gap // ................................................................. + sub v3.4S, v14.4S, v13.4S // ......................e.......................................... + // gap // ................................................................. 
+ // gap // ................................................................. + add v2.4S, v14.4S, v13.4S // .......................e......................................... + sqrdmulh v0.4S, v30.4S, v5.4S // ....................e............................................ + // gap // ................................................................. + sub v8.4S, v25.4S, v10.4S // ....................................................*............ + // gap // ................................................................. + // gap // ................................................................. + add v13.4S, v25.4S, v10.4S // .....................................................*........... + sqrdmulh v10.4S, v3.4S, v5.4S // .........................e....................................... + // gap // ................................................................. + trn1 v30.4S, v28.4S, v2.4S // ...........................e..................................... + // gap // ................................................................. + // gap // ................................................................. + mul v18.4S, v3.4S, v6.4S // ........................e........................................ + trn2 v2.4S, v28.4S, v2.4S // ............................e.................................... + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + mls v27.4S, v0.4S, v29.4S // .....................e........................................... + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. 
+ // gap // ................................................................. + // gap // ................................................................. + mls v18.4S, v10.4S, v29.4S // ..........................e...................................... + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + sqrdmulh v14.4S, v8.4S, v7.S[1] // .......................................................*......... + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + trn1 v31.4S, v27.4S, v18.4S // .............................e................................... + trn2 v3.4S, v27.4S, v18.4S // ..............................e.................................. + // gap // ................................................................. + mls v24.4S, v16.4S, v29.4S // ...................................................*............. + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. 
+ // gap // ................................................................. + srshr v25.4S, v13.4S, #23 // ...........................................................*..... + trn2 v21.2D, v2.2D, v3.2D // ................................e................................ + trn2 v0.2D, v30.2D, v31.2D // ...............................e................................. + // gap // ................................................................. + trn1 v27.2D, v30.2D, v31.2D // .................................e............................... + trn1 v19.2D, v2.2D, v3.2D // ..................................e.............................. + // gap // ................................................................. + mul v26.4S, v8.4S, v7.S[0] // ......................................................*.......... + ldr q7, [x4], #8 // ...................................e............................. + ldr q22, [x4], #16 // ....................................e............................ + str q24, [x0, #-32] // ...............................................................*. + // gap // ................................................................. + // gap // ................................................................. + add v2.4S, v27.4S, v19.4S // ......................................e.......................... + add v4.4S, v0.4S, v21.4S // ...........................................e..................... + // gap // ................................................................. + sub v3.4S, v27.4S, v19.4S // .....................................e........................... + mls v26.4S, v14.4S, v29.4S // ........................................................*........ + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. 
+ // gap // ................................................................. + mls v13.4S, v25.4S, v29.4S // ............................................................*.... + add v18.4S, v2.4S, v4.4S // ................................................e................ + // gap // ................................................................. + sub v2.4S, v2.4S, v4.4S // ...............................................e................. + // gap // ................................................................. + // gap // ................................................................. + sqrdmulh v23.4S, v3.4S, v22.S[1] // ........................................e........................ + // gap // ................................................................. + // gap // ................................................................. + str q26, [x0, #-16] // ................................................................* + srshr v10.4S, v18.4S, #23 // .........................................................e....... + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + mul v24.4S, v2.4S, v7.S[0] // .................................................e............... + str q13, [x0, #-48] // ..............................................................*.. + // gap // ................................................................. + // gap // ................................................................. + mls v18.4S, v10.4S, v29.4S // ..........................................................e...... + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. 
+ // gap // ................................................................. + // gap // ................................................................. + sqrdmulh v16.4S, v2.4S, v7.S[1] // ..................................................e.............. + sub v2.4S, v0.4S, v21.4S // ..........................................e...................... + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + mul v25.4S, v3.4S, v22.S[0] // .......................................e......................... + // gap // ................................................................. + // gap // ................................................................. + str q18, [x0], #(16*4) // .............................................................e... + // gap // ................................................................. + // gap // ................................................................. + + // original source code + // ld4 {v8.4S, v9.4S, v10.4S, v11.4S}, [x0] // e................................................................e........................................................... + // ldr q0, [x3], #(6*16) // ....e............................................................|...e....................................................... + // ldr q4, [x3, #(-6*16 + 1*16)] // ..........e......................................................|.........e................................................. + // ldr q1, [x3, #(-6*16 + 2*16)] // ......e..........................................................|.....e..................................................... 
+ // ldr q5, [x3, #(-6*16 + 3*16)] // ..e..............................................................|.e......................................................... + // ldr q2, [x3, #(-6*16 + 4*16)] // ...e.............................................................|..e........................................................ + // ldr q6, [x3, #(-6*16 + 5*16)] // .......e.........................................................|......e.................................................... + // sub v24.4s, v8.4s, v9.4s // .........e.......................................................|........e.................................................. + // add v8.4s, v8.4s, v9.4s // ........e........................................................|.......e................................................... + // mul v9.4s, v24.4s, v1.4s // ................e................................................|...............e........................................... + // sqrdmulh v24.4s, v24.4s, v5.4s // ..............e..................................................|.............e............................................. + // mls v9.4s, v24.4s, v29.4s // ...................e.............................................|..................e........................................ + // sub v24.4s, v10.4s, v11.4s // .............e...................................................|............e.............................................. + // add v10.4s, v10.4s, v11.4s // ............e....................................................|...........e............................................... + // mul v11.4s, v24.4s, v2.4s // ....................e............................................|...................e....................................... + // sqrdmulh v24.4s, v24.4s, v6.4s // ..................e..............................................|.................e......................................... 
+ // mls v11.4s, v24.4s, v29.4s // .....................e...........................................|....................e...................................... + // sub v24.4s, v8.4s, v10.4s // ...............e.................................................|..............e............................................ + // add v8.4s, v8.4s, v10.4s // .................e...............................................|................e.......................................... + // mul v10.4s, v24.4s, v0.4s // .......................e.........................................|......................e.................................... + // sqrdmulh v24.4s, v24.4s, v4.4s // ..........................e......................................|.........................e................................. + // mls v10.4s, v24.4s, v29.4s // .................................e...............................|................................e.......................... + // sub v24.4s, v9.4s, v11.4s // ........................e........................................|.......................e................................... + // add v9.4s, v9.4s, v11.4s // .........................e.......................................|........................e.................................. + // mul v11.4s, v24.4s, v0.4s // ...............................e.................................|..............................e............................ + // sqrdmulh v24.4s, v24.4s, v4.4s // .............................e...................................|............................e.............................. + // mls v11.4s, v24.4s, v29.4s // ..................................e..............................|.................................e......................... + // trn1 v25.4s, v8.4s, v9.4s // ..............................e..................................|.............................e............................. 
+ // trn2 v26.4s, v8.4s, v9.4s // ................................e................................|...............................e........................... + // trn1 v27.4s, v10.4s, v11.4s // ....................................e............................|...................................e....................... + // trn2 v28.4s, v10.4s, v11.4s // .....................................e...........................|....................................e...................... + // trn2 v10.2d, v25.2d, v27.2d // .........................................e.......................|........................................e.................. + // trn2 v11.2d, v26.2d, v28.2d // ........................................e........................|.......................................e................... + // trn1 v8.2d, v25.2d, v27.2d // ..........................................e......................|.........................................e................. + // trn1 v9.2d, v26.2d, v28.2d // ...........................................e.....................|..........................................e................ + // ldr q1, [x4], #8 // .............................................e...................|............................................e.............. + // ldr q0, [x4], #16 // ..............................................e..................|.............................................e............. + // sub v24.4s, v8.4s, v9.4s // ..................................................e..............|.................................................e......... + // add v8.4s, v8.4s, v9.4s // ................................................e................|...............................................e........... + // mul v9.4s, v24.4s, v0.s[0] // ...............................................................e.|........................................................... 
+ // sqrdmulh v24.4s, v24.4s, v0.s[1] // .......................................................e.........|......................................................e.... + // mls v9.4s, v24.4s, v29.4s // .....*...........................................................|....*...................................................... + // sub v24.4s, v10.4s, v11.4s // ..............................................................e..|........................................................... + // add v10.4s, v10.4s, v11.4s // .................................................e...............|................................................e.......... + // mul v11.4s, v24.4s, v0.s[2] // .*...............................................................|*.......................................................... + // sqrdmulh v24.4s, v24.4s, v0.s[3] // ...........*.....................................................|..........*................................................ + // mls v11.4s, v24.4s, v29.4s // ......................*..........................................|.....................*..................................... + // sub v24.4s, v8.4s, v10.4s // ......................................................e..........|.....................................................e..... + // add v8.4s, v8.4s, v10.4s // .....................................................e...........|....................................................e...... + // mul v10.4s, v24.4s, v1.s[0] // ..........................................................e......|.........................................................e. + // sqrdmulh v24.4s, v24.4s, v1.s[1] // .............................................................e...|........................................................... + // mls v10.4s, v24.4s, v29.4s // ......................................*..........................|.....................................*..................... 
+ // sub v24.4s, v9.4s, v11.4s // ...........................*.....................................|..........................*................................ + // add v9.4s, v9.4s, v11.4s // ............................*....................................|...........................*............................... + // mul v11.4s, v24.4s, v1.s[0] // ............................................*....................|...........................................*............... + // sqrdmulh v24.4s, v24.4s, v1.s[1] // ...................................*.............................|..................................*........................ + // mls v11.4s, v24.4s, v29.4s // ...................................................*.............|..................................................*........ + // srshr v24.4S, v8.4S, #23 // .........................................................e.......|........................................................e.. + // mls v8.4s, v24.4s, v29.4s // ............................................................e....|........................................................... + // srshr v24.4S, v9.4S, #23 // .......................................*.........................|......................................*.................... + // mls v9.4s, v24.4s, v29.4s // ....................................................*............|...................................................*....... + // str q8, [x0], #(16*4) // ................................................................e|........................................................... + // str q9, [x0, #(-16*4 + 1*16)] // ...........................................................*.....|..........................................................* + // str q10, [x0, #(-16*4 + 2*16)] // ...............................................*.................|..............................................*............ 
+ // str q11, [x0, #(-16*4 + 3*16)] // ........................................................*........|.......................................................*... + + sub count, count, #1 + cbnz count, layer5678_start + sqrdmulh v20.4S, v2.4S, v22.S[3] // ..*............ + // gap // ............... + // gap // ............... + // gap // ............... + // gap // ............... + // gap // ............... + mul v4.4S, v2.4S, v22.S[2] // *.............. + // gap // ............... + // gap // ............... + // gap // ............... + // gap // ............... + // gap // ............... + mls v25.4S, v23.4S, v29.4S // .*............. + // gap // ............... + // gap // ............... + // gap // ............... + // gap // ............... + // gap // ............... + mls v4.4S, v20.4S, v29.4S // ...*........... + // gap // ............... + // gap // ............... + // gap // ............... + // gap // ............... + // gap // ............... + mls v24.4S, v16.4S, v29.4S // .......*....... + // gap // ............... + // gap // ............... + // gap // ............... + // gap // ............... + // gap // ............... + // gap // ............... + // gap // ............... + // gap // ............... + sub v30.4S, v25.4S, v4.4S // ....*.......... + add v10.4S, v25.4S, v4.4S // .....*......... + // gap // ............... + // gap // ............... + // gap // ............... + // gap // ............... + str q24, [x0, #-32] // ..........*.... + // gap // ............... + // gap // ............... + // gap // ............... + sqrdmulh v28.4S, v30.4S, v7.S[1] // ......*........ + srshr v4.4S, v10.4S, #23 // ........*...... + // gap // ............... + // gap // ............... + // gap // ............... + mul v23.4S, v30.4S, v7.S[0] // .........*..... + // gap // ............... + // gap // ............... + // gap // ............... + // gap // ............... + // gap // ............... + // gap // ............... 
+ mls v10.4S, v4.4S, v29.4S // ............*.. + // gap // ............... + // gap // ............... + // gap // ............... + // gap // ............... + mls v23.4S, v28.4S, v29.4S // ...........*... + // gap // ............... + // gap // ............... + // gap // ............... + // gap // ............... + // gap // ............... + // gap // ............... + // gap // ............... + // gap // ............... + str q10, [x0, #-48] // ..............* + // gap // ............... + // gap // ............... + // gap // ............... + // gap // ............... + // gap // ............... + str q23, [x0, #-16] // .............*. + // gap // ............... + // gap // ............... + + // original source code + // mul v10.4S, v2.4S, v22.S[2] // .*............. + // mls v25.4S, v23.4S, v29.4S // ..*............ + // sqrdmulh v21.4S, v2.4S, v22.S[3] // *.............. + // mls v10.4S, v21.4S, v29.4S // ...*........... + // sub v8.4S, v25.4S, v10.4S // .....*......... + // add v13.4S, v25.4S, v10.4S // ......*........ + // sqrdmulh v14.4S, v8.4S, v7.S[1] // ........*...... + // mls v24.4S, v16.4S, v29.4S // ....*.......... + // srshr v25.4S, v13.4S, #23 // .........*..... + // mul v26.4S, v8.4S, v7.S[0] // ..........*.... + // str q24, [x0, #-32] // .......*....... + // mls v26.4S, v14.4S, v29.4S // ............*.. + // mls v13.4S, v25.4S, v29.4S // ...........*... + // str q26, [x0, #-16] // ..............* + // str q13, [x0, #-48] // .............*. 
+ /* Hand-over between stages: the aliases used by the layer5678 loop above (root0_tw..root3_tw, their qform_ counterparts, t0, t1) are released here with .unreq so the names no longer refer to any register. */ + + .unreq root0_tw + .unreq root1_tw + .unreq root2_tw + .unreq root3_tw + .unreq qform_root0_tw + .unreq qform_root1_tw + .unreq qform_root2_tw + .unreq qform_root3_tw + .unreq t0 + .unreq t1 + /* New aliases for the code that follows: root4..root7 name v4..v7, and qform_root4..qform_root7 name the same four registers through their q (128-bit) names q4..q7. ninv, ninv_tw, modulus_half and neg_modulus_half name v25, v26, v30 and v31. */ + root4 .req v4 + root5 .req v5 + root6 .req v6 + root7 .req v7 + qform_root4 .req q4 + qform_root5 .req q5 + qform_root6 .req q6 + qform_root7 .req q7 + ninv .req v25 + ninv_tw .req v26 + modulus_half .req v30 + neg_modulus_half .req v31 + /* restore and STACK0 are defined outside this chunk; NOTE(review): presumably this reloads the saved value of in from a stack slot, confirm against the macro. count is then set to 4; NOTE(review): presumably the trip count of the loop that follows, which is not visible here. */ + + restore in, STACK0 + mov count, #4 + /* ASM_LOAD is a macro defined elsewhere; NOTE(review): presumably it places the address of the named symbol in xtmp, confirm. Each ld1r then loads one 32-bit element from [xtmp] and replicates it into all four lanes of ninv, respectively ninv_tw. */ + ASM_LOAD(xtmp, ninv_addr) + ld1r {ninv.4s}, [xtmp] + ASM_LOAD(xtmp, ninv_tw_addr) + ld1r {ninv_tw.4s}, [xtmp] + /* Per 32-bit lane: modulus_half = modulus logically shifted right by 1, neg_modulus_half = negated modulus_half. The modulus alias is bound outside this chunk. */ + ushr modulus_half.4S, modulus.4S, #1 + neg neg_modulus_half.4S, modulus_half.4S + /* load_roots_1234 is a macro defined outside this chunk; NOTE(review): presumably it loads the twiddle factors for layers 1 to 4 through r_ptr1, confirm against its definition. */ + load_roots_1234 r_ptr1 + /* Align the next instruction to a 4-byte (2^2) boundary. The ldr instructions below start the next instruction stream; the dotted trailer on each line is left untouched (NOTE(review): it looks like a scheduling tool's position diagram, with the star marking the instruction's slot). */ + .p2align 2 + ldr q10, [x1, #448] // .......*................................................................................................................................................................................................................................................................................ + ldr q18, [x1, #384] // ......*................................................................................................................................................................................................................................................................................. + ldr q22, [x1, #512] // ........*............................................................................................................................................................................................................................................................................... + ldr q27, [x1, #192] // ...*.................................................................................................................................................................................................................................................................................... 
+ ldr q13, [x1, #128] // ..*..................................................................................................................................................................................................................................................................................... + ldr q15, [x1, #576] // .........*.............................................................................................................................................................................................................................................................................. + ldr q24, [x1, #64] // .*...................................................................................................................................................................................................................................................................................... + ldr q8, [x1, #0] // *....................................................................................................................................................................................................................................................................................... + sub v11.4S, v18.4S, v10.4S // ...............................*........................................................................................................................................................................................................................................................ + add v10.4S, v18.4S, v10.4S // ................................*....................................................................................................................................................................................................................................................... 
+ ldr q18, [x1, #640] // ..........*............................................................................................................................................................................................................................................................................. + ldr q23, [x1, #256] // ....*................................................................................................................................................................................................................................................................................... + add v16.4S, v22.4S, v15.4S // .....................................*.................................................................................................................................................................................................................................................. + sub v9.4S, v22.4S, v15.4S // ....................................*................................................................................................................................................................................................................................................... + ldr q20, [x1, #320] // .....*.................................................................................................................................................................................................................................................................................. + sub v12.4S, v13.4S, v27.4S // .....................*.................................................................................................................................................................................................................................................................. 
+ ldr q22, [x1, #704] // ...........*............................................................................................................................................................................................................................................................................ + sqrdmulh v17.4S, v11.4S, v5.S[1] // ..................................*..................................................................................................................................................................................................................................................... + add v27.4S, v13.4S, v27.4S // ......................*................................................................................................................................................................................................................................................................. + mul v15.4S, v11.4S, v5.S[0] // .................................*...................................................................................................................................................................................................................................................... + add v13.4S, v8.4S, v24.4S // .................*...................................................................................................................................................................................................................................................................... + sub v24.4S, v8.4S, v24.4S // ................*....................................................................................................................................................................................................................................................................... 
+ add v19.4S, v23.4S, v20.4S // ...........................*............................................................................................................................................................................................................................................................ + sqrdmulh v21.4S, v12.4S, v4.S[1] // ........................*............................................................................................................................................................................................................................................................... + sub v11.4S, v13.4S, v27.4S // ........................................................*............................................................................................................................................................................................................................... + mul v28.4S, v12.4S, v4.S[0] // .......................*................................................................................................................................................................................................................................................................ + add v8.4S, v13.4S, v27.4S // .........................................................*.............................................................................................................................................................................................................................. + add v12.4S, v19.4S, v10.4S // ...................................................................*.................................................................................................................................................................................................................... 
+ mls v28.4S, v21.4S, v29.4S // .........................*.............................................................................................................................................................................................................................................................. + sub v21.4S, v8.4S, v12.4S // ................................................................................................*....................................................................................................................................................................................... + add v8.4S, v8.4S, v12.4S // .................................................................................................*...................................................................................................................................................................................... + sub v10.4S, v19.4S, v10.4S // ..................................................................*..................................................................................................................................................................................................................... + mul v13.4S, v11.4S, v1.S[2] // ..........................................................*............................................................................................................................................................................................................................. + sub v14.4S, v18.4S, v22.4S // .........................................*.............................................................................................................................................................................................................................................. 
+ add v27.4S, v18.4S, v22.4S // ..........................................*............................................................................................................................................................................................................................................. + sqrdmulh v22.4S, v11.4S, v1.S[3] // ...........................................................*............................................................................................................................................................................................................................ + sub v23.4S, v23.4S, v20.4S // ..........................*............................................................................................................................................................................................................................................................. + mls v15.4S, v17.4S, v29.4S // ...................................*.................................................................................................................................................................................................................................................... + sqrdmulh v17.4S, v24.4S, v3.S[3] // ...................*.................................................................................................................................................................................................................................................................... + sqrdmulh v18.4S, v9.4S, v5.S[3] // .......................................*................................................................................................................................................................................................................................................ 
+ mul v20.4S, v24.4S, v3.S[2] // ..................*..................................................................................................................................................................................................................................................................... + mls v20.4S, v17.4S, v29.4S // ....................*................................................................................................................................................................................................................................................................... + sqrdmulh v24.4S, v10.4S, v2.S[1] // .....................................................................*.................................................................................................................................................................................................................. + mul v17.4S, v23.4S, v4.S[2] // ............................*........................................................................................................................................................................................................................................................... + sub v19.4S, v20.4S, v28.4S // .............................................................*.......................................................................................................................................................................................................................... + add v12.4S, v20.4S, v28.4S // ..............................................................*......................................................................................................................................................................................................................... 
+ mul v28.4S, v10.4S, v2.S[0] // ....................................................................*................................................................................................................................................................................................................... + sqrdmulh v10.4S, v23.4S, v4.S[3] // .............................*.......................................................................................................................................................................................................................................................... + mls v28.4S, v24.4S, v29.4S // ......................................................................*................................................................................................................................................................................................................. + sqrdmulh v24.4S, v19.4S, v1.S[3] // ................................................................*....................................................................................................................................................................................................................... + mls v17.4S, v10.4S, v29.4S // ..............................*......................................................................................................................................................................................................................................................... + ldr q10, [x1, #960] // ...............*........................................................................................................................................................................................................................................................................ 
+ mul v23.4S, v19.4S, v1.S[2] // ...............................................................*........................................................................................................................................................................................................................ + mls v23.4S, v24.4S, v29.4S // .................................................................*...................................................................................................................................................................................................................... + ldr q24, [x1, #896] // ..............*......................................................................................................................................................................................................................................................................... + add v19.4S, v17.4S, v15.4S // ........................................................................*............................................................................................................................................................................................................... + mls v13.4S, v22.4S, v29.4S // ............................................................*........................................................................................................................................................................................................................... + sub v15.4S, v17.4S, v15.4S // .......................................................................*................................................................................................................................................................................................................ 
+ ldr q22, [x1, #832] // .............*.......................................................................................................................................................................................................................................................................... + add v20.4S, v24.4S, v10.4S // ....................................................*................................................................................................................................................................................................................................... + sub v10.4S, v24.4S, v10.4S // ...................................................*.................................................................................................................................................................................................................................... + mul v17.4S, v9.4S, v5.S[2] // ......................................*................................................................................................................................................................................................................................................. + sub v9.4S, v16.4S, v27.4S // ............................................................................*........................................................................................................................................................................................................... + add v11.4S, v16.4S, v27.4S // .............................................................................*.......................................................................................................................................................................................................... 
+ ldr q27, [x1, #768] // ............*........................................................................................................................................................................................................................................................................... + sqrdmulh v24.4S, v15.4S, v2.S[1] // ..........................................................................*............................................................................................................................................................................................................. + mul v16.4S, v15.4S, v2.S[0] // .........................................................................*.............................................................................................................................................................................................................. + mls v17.4S, v18.4S, v29.4S // ........................................*............................................................................................................................................................................................................................................... + sub v18.4S, v27.4S, v22.4S // ..............................................*......................................................................................................................................................................................................................................... + add v27.4S, v27.4S, v22.4S // ...............................................*........................................................................................................................................................................................................................................ 
+ mls v16.4S, v24.4S, v29.4S // ...........................................................................*............................................................................................................................................................................................................ + mul v15.4S, v10.4S, v7.S[0] // .....................................................*.................................................................................................................................................................................................................................. + sqrdmulh v10.4S, v10.4S, v7.S[1] // ......................................................*................................................................................................................................................................................................................................. + sqrdmulh v24.4S, v18.4S, v6.S[3] // .................................................*...................................................................................................................................................................................................................................... + sqrdmulh v22.4S, v9.4S, v2.S[3] // ...............................................................................*........................................................................................................................................................................................................ + mls v15.4S, v10.4S, v29.4S // .......................................................*................................................................................................................................................................................................................................ 
+ mul v10.4S, v18.4S, v6.S[2] // ................................................*....................................................................................................................................................................................................................................... + mls v10.4S, v24.4S, v29.4S // ..................................................*..................................................................................................................................................................................................................................... + sub v24.4S, v12.4S, v19.4S // .....................................................................................................*.................................................................................................................................................................................. + mul v18.4S, v9.4S, v2.S[2] // ..............................................................................*......................................................................................................................................................................................................... + add v9.4S, v12.4S, v19.4S // ......................................................................................................*................................................................................................................................................................................. + sub v12.4S, v27.4S, v20.4S // ......................................................................................*................................................................................................................................................................................................. 
+ mls v18.4S, v22.4S, v29.4S // ................................................................................*....................................................................................................................................................................................................... + sub v22.4S, v10.4S, v15.4S // ...........................................................................................*............................................................................................................................................................................................ + add v27.4S, v27.4S, v20.4S // .......................................................................................*................................................................................................................................................................................................ + sqrdmulh v19.4S, v12.4S, v3.S[1] // .........................................................................................*.............................................................................................................................................................................................. + add v15.4S, v10.4S, v15.4S // ............................................................................................*........................................................................................................................................................................................... + mul v10.4S, v12.4S, v3.S[0] // ........................................................................................*............................................................................................................................................................................................... 
+ add v20.4S, v11.4S, v27.4S // .....................................................................................................................*.................................................................................................................................................................. + sub v27.4S, v11.4S, v27.4S // ....................................................................................................................*................................................................................................................................................................... + sqrdmulh v12.4S, v24.4S, v0.S[3] // ........................................................................................................*............................................................................................................................................................................... + add v11.4S, v23.4S, v16.4S // ................................................................................................................*....................................................................................................................................................................... + mul v24.4S, v24.4S, v0.S[2] // .......................................................................................................*................................................................................................................................................................................ + sub v16.4S, v23.4S, v16.4S // ...............................................................................................................*........................................................................................................................................................................ 
+ mls v10.4S, v19.4S, v29.4S // ..........................................................................................*............................................................................................................................................................................................. + sub v19.4S, v8.4S, v20.4S // ........................................................................................................................................*............................................................................................................................................... + add v8.4S, v8.4S, v20.4S // .........................................................................................................................................*.............................................................................................................................................. + sub v20.4S, v13.4S, v28.4S // ..........................................................................................................*............................................................................................................................................................................. + sqrdmulh v23.4S, v14.4S, v6.S[1] // ............................................*........................................................................................................................................................................................................................................... + add v13.4S, v13.4S, v28.4S // ...........................................................................................................*............................................................................................................................................................................ 
+ mul v14.4S, v14.4S, v6.S[0] // ...........................................*............................................................................................................................................................................................................................................ + sub v28.4S, v18.4S, v10.4S // ..............................................................................................................................*......................................................................................................................................................... + add v18.4S, v18.4S, v10.4S // ...............................................................................................................................*........................................................................................................................................................ + sqrdmulh v10.4S, v22.4S, v3.S[1] // ..............................................................................................*......................................................................................................................................................................................... + mls v14.4S, v23.4S, v29.4S // .............................................*.......................................................................................................................................................................................................................................... + mul v23.4S, v22.4S, v3.S[0] // .............................................................................................*.......................................................................................................................................................................................... 
+ mls v23.4S, v10.4S, v29.4S // ...............................................................................................*........................................................................................................................................................................................ + add v22.4S, v17.4S, v14.4S // ..................................................................................*..................................................................................................................................................................................................... + sub v14.4S, v17.4S, v14.4S // .................................................................................*...................................................................................................................................................................................................... + mls v24.4S, v12.4S, v29.4S // .........................................................................................................*.............................................................................................................................................................................. + add v12.4S, v22.4S, v15.4S // ..........................................................................................................................*............................................................................................................................................................. + sqrdmulh v17.4S, v16.4S, v0.S[3] // ..................................................................................................................*..................................................................................................................................................................... 
+ sub v15.4S, v22.4S, v15.4S // .........................................................................................................................*.............................................................................................................................................................. + sqrdmulh v22.4S, v27.4S, v1.S[1] // .......................................................................................................................*................................................................................................................................................................ + sub v10.4S, v9.4S, v12.4S // .............................................................................................................................................*.......................................................................................................................................... + mul v27.4S, v27.4S, v1.S[0] // ......................................................................................................................*................................................................................................................................................................. + add v9.4S, v9.4S, v12.4S // ..............................................................................................................................................*......................................................................................................................................... + sqrdmulh v12.4S, v10.4S, v0.S[1] // ................................................................................................................................................*....................................................................................................................................... 
+ mul v16.4S, v16.4S, v0.S[2] // .................................................................................................................*...................................................................................................................................................................... + mls v16.4S, v17.4S, v29.4S // ...................................................................................................................*.................................................................................................................................................................... + mul v17.4S, v10.4S, v0.S[0] // ...............................................................................................................................................*........................................................................................................................................ + sqrdmulh v10.4S, v19.4S, v0.S[1] // ...........................................................................................................................................*............................................................................................................................................ + mul v19.4S, v19.4S, v0.S[0] // ..........................................................................................................................................*............................................................................................................................................. + mls v17.4S, v12.4S, v29.4S // .................................................................................................................................................*...................................................................................................................................... 
+ mul v12.4S, v21.4S, v0.S[2] // ..................................................................................................*..................................................................................................................................................................................... + sqrdmulh v21.4S, v21.4S, v0.S[3] // ...................................................................................................*.................................................................................................................................................................................... + mls v27.4S, v22.4S, v29.4S // ........................................................................................................................*............................................................................................................................................................... + mls v19.4S, v10.4S, v29.4S // ............................................................................................................................................*........................................................................................................................................... + mls v12.4S, v21.4S, v29.4S // ....................................................................................................*................................................................................................................................................................................... + sqrdmulh v10.4S, v14.4S, v2.S[3] // ....................................................................................*................................................................................................................................................................................................... 
+ mul v22.4S, v28.4S, v1.S[0] // ................................................................................................................................*....................................................................................................................................................... + sub v21.4S, v12.4S, v27.4S // ............................................................................................................................................................*........................................................................................................................... + add v12.4S, v12.4S, v27.4S // .............................................................................................................................................................*.......................................................................................................................... + mul v27.4S, v14.4S, v2.S[2] // ...................................................................................*.................................................................................................................................................................................................... + mls v27.4S, v10.4S, v29.4S // .....................................................................................*.................................................................................................................................................................................................. + add v10.4S, v13.4S, v18.4S // ...................................................................................................................................................*.................................................................................................................................... 
+ sub v18.4S, v13.4S, v18.4S // ..................................................................................................................................................*..................................................................................................................................... + mul v13.4S, v15.4S, v1.S[0] // ...........................................................................................................................*............................................................................................................................................................ + sqrdmulh v15.4S, v15.4S, v1.S[1] // ............................................................................................................................*........................................................................................................................................................... + sub v14.4S, v27.4S, v23.4S // ...................................................................................................................................*.................................................................................................................................................... + sub count, count, #1 +layer1234_start: + sqrdmulh v28.4S, v28.4S, v1.S[1] // .................................................................................................................................*...................................................................................................................................................... + add v23.4S, v27.4S, v23.4S // ....................................................................................................................................*................................................................................................................................................... 
+ cmge v27.4S, v31.4S, v19.4S // ................................................................................................................................................................................*....................................................................................................... + mls v13.4S, v15.4S, v29.4S // .............................................................................................................................*.......................................................................................................................................................... + sub v15.4S, v11.4S, v23.4S // .......................................................................................................................................................*................................................................................................................................ + add v11.4S, v11.4S, v23.4S // ........................................................................................................................................................*............................................................................................................................... + sqrdmulh v23.4S, v20.4S, v0.S[3] // .............................................................................................................*.......................................................................................................................................................................... + mls v22.4S, v28.4S, v29.4S // ..................................................................................................................................*..................................................................................................................................................... 
+ cmge v28.4S, v19.4S, v30.4S // .................................................................................................................................................................................*...................................................................................................... + mul v20.4S, v20.4S, v0.S[2] // ............................................................................................................*........................................................................................................................................................................... + sub v28.4S, v27.4S, v28.4S // ..................................................................................................................................................................................*..................................................................................................... + sqrdmulh v27.4S, v14.4S, v1.S[1] // ......................................................................................................................................*................................................................................................................................................. + mls v20.4S, v23.4S, v29.4S // ..............................................................................................................*......................................................................................................................................................................... + mul v23.4S, v14.4S, v1.S[0] // .....................................................................................................................................*.................................................................................................................................................. 
+ mls v23.4S, v27.4S, v29.4S // .......................................................................................................................................*................................................................................................................................................ + mls v19.4S, v28.4S, v29.4S // ...................................................................................................................................................................................*.................................................................................................... + sub v28.4S, v24.4S, v13.4S // .................................................................................................................................................................*...................................................................................................................... + add v13.4S, v24.4S, v13.4S // ..................................................................................................................................................................*..................................................................................................................... + sqrdmulh v27.4S, v15.4S, v0.S[1] // ..........................................................................................................................................................*............................................................................................................................. + sub v24.4S, v20.4S, v22.4S // ......................................................................................................................................................................*................................................................................................................. 
+ add v14.4S, v20.4S, v22.4S // .......................................................................................................................................................................*................................................................................................................ + mul v22.4S, v15.4S, v0.S[0] // .........................................................................................................................................................*.............................................................................................................................. + add v15.4S, v16.4S, v23.4S // ............................................................................................................................................................................*........................................................................................................... + str q19, [x1, #512] // ................................................................................................................................................................................................................*....................................................................... + sub v19.4S, v16.4S, v23.4S // ...........................................................................................................................................................................*............................................................................................................ + cmge v23.4S, v31.4S, v17.4S // ....................................................................................................................................................................................*................................................................................................... 
+ sqrdmulh v16.4S, v18.4S, v0.S[1] // .....................................................................................................................................................*.................................................................................................................................. + mul v20.4S, v18.4S, v0.S[0] // ....................................................................................................................................................*................................................................................................................................... + mls v22.4S, v27.4S, v29.4S // ...........................................................................................................................................................*............................................................................................................................ + cmge v18.4S, v17.4S, v30.4S // .....................................................................................................................................................................................*.................................................................................................. + mul v27.4S, v19.4S, v0.S[0] // .............................................................................................................................................................................*.......................................................................................................... + sub v18.4S, v23.4S, v18.4S // ......................................................................................................................................................................................*................................................................................................. 
+ sqrdmulh v19.4S, v19.4S, v0.S[1] // ..............................................................................................................................................................................*......................................................................................................... + mls v17.4S, v18.4S, v29.4S // .......................................................................................................................................................................................*................................................................................................ + sqrdmulh v18.4S, v21.4S, v0.S[1] // ...............................................................................................................................................................*........................................................................................................................ + mul v21.4S, v21.4S, v0.S[0] // ..............................................................................................................................................................*......................................................................................................................... + str q17, [x1, #576] // .................................................................................................................................................................................................................*...................................................................... + mls v20.4S, v16.4S, v29.4S // ......................................................................................................................................................*................................................................................................................................. 
+ cmge v16.4S, v22.4S, v30.4S // .............................................................................................................................................................................................*.......................................................................................... + sqrdmulh v23.4S, v9.4S, v26.4S // ............................................................................................................................................................................................................................*........................................................... + mul v17.4S, v9.4S, v25.4S // ...........................................................................................................................................................................................................................*............................................................ + cmge v9.4S, v31.4S, v22.4S // ............................................................................................................................................................................................*........................................................................................... + mls v21.4S, v18.4S, v29.4S // ................................................................................................................................................................*....................................................................................................................... + sub v16.4S, v9.4S, v16.4S // ..............................................................................................................................................................................................*......................................................................................... 
+ sqrdmulh v18.4S, v24.4S, v0.S[1] // .........................................................................................................................................................................*.............................................................................................................. + mul v24.4S, v24.4S, v0.S[0] // ........................................................................................................................................................................*............................................................................................................... + mls v27.4S, v19.4S, v29.4S // ...............................................................................................................................................................................*........................................................................................................ + cmge v19.4S, v31.4S, v20.4S // ........................................................................................................................................................................................*............................................................................................... + mls v24.4S, v18.4S, v29.4S // ..........................................................................................................................................................................*............................................................................................................. + cmge v18.4S, v20.4S, v30.4S // .........................................................................................................................................................................................*.............................................................................................. 
+ sqrdmulh v9.4S, v28.4S, v0.S[1] // ....................................................................................................................................................................*................................................................................................................... + sub v19.4S, v19.4S, v18.4S // ..........................................................................................................................................................................................*............................................................................................. + sqrdmulh v18.4S, v11.4S, v26.4S // ..................................................................................................................................................................................................................................*..................................................... + mls v20.4S, v19.4S, v29.4S // ...........................................................................................................................................................................................*............................................................................................ + mls v22.4S, v16.4S, v29.4S // ...............................................................................................................................................................................................*........................................................................................ + mul v19.4S, v11.4S, v25.4S // .................................................................................................................................................................................................................................*...................................................... 
+ str q20, [x1, #640] // ..................................................................................................................................................................................................................*..................................................................... + mul v11.4S, v28.4S, v0.S[0] // ...................................................................................................................................................................*.................................................................................................................... + str q22, [x1, #704] // ...................................................................................................................................................................................................................*.................................................................... + cmge v16.4S, v31.4S, v24.4S // ........................................................................................................................................................................................................*............................................................................... + ldr q22, [x1, #464] // .......e................................................................................................................................................................................................................................................................................ + mls v11.4S, v9.4S, v29.4S // .....................................................................................................................................................................*.................................................................................................................. 
+ cmge v9.4S, v24.4S, v30.4S // .........................................................................................................................................................................................................*.............................................................................. + cmge v20.4S, v31.4S, v27.4S // ............................................................................................................................................................................................................*........................................................................... + mls v19.4S, v18.4S, v29.4S // ...................................................................................................................................................................................................................................*.................................................... + cmge v18.4S, v27.4S, v30.4S // .............................................................................................................................................................................................................*.......................................................................... + sub v9.4S, v16.4S, v9.4S // ..........................................................................................................................................................................................................*............................................................................. + mls v17.4S, v23.4S, v29.4S // .............................................................................................................................................................................................................................*.......................................................... 
+ cmge v23.4S, v21.4S, v30.4S // .................................................................................................................................................................................................*...................................................................................... + sub v18.4S, v20.4S, v18.4S // ..............................................................................................................................................................................................................*......................................................................... + mls v24.4S, v9.4S, v29.4S // ...........................................................................................................................................................................................................*............................................................................ + cmge v9.4S, v11.4S, v30.4S // .....................................................................................................................................................................................................*.................................................................................. + cmge v16.4S, v31.4S, v11.4S // ....................................................................................................................................................................................................*................................................................................... + mls v27.4S, v18.4S, v29.4S // ...............................................................................................................................................................................................................*........................................................................ 
+ cmge v18.4S, v31.4S, v21.4S // ................................................................................................................................................................................................*....................................................................................... + cmge v20.4S, v17.4S, v30.4S // .....................................................................................................................................................................................................................................................*.................................. + sub v9.4S, v16.4S, v9.4S // ......................................................................................................................................................................................................*................................................................................. + sqrdmulh v28.4S, v8.4S, v26.4S // .........................................................................................................................................................................................................................*.............................................................. + sub v18.4S, v18.4S, v23.4S // ..................................................................................................................................................................................................*..................................................................................... + str q24, [x1, #896] // ......................................................................................................................................................................................................................*................................................................. 
+ mul v16.4S, v8.4S, v25.4S // ........................................................................................................................................................................................................................*............................................................... + cmge v8.4S, v31.4S, v19.4S // ............................................................................................................................................................................................................................................................*........................... + str q27, [x1, #960] // .......................................................................................................................................................................................................................*................................................................ + cmge v24.4S, v31.4S, v17.4S // ....................................................................................................................................................................................................................................................*................................... + mls v21.4S, v18.4S, v29.4S // ...................................................................................................................................................................................................*.................................................................................... + sub v18.4S, v24.4S, v20.4S // ......................................................................................................................................................................................................................................................*................................. 
+ ldr q24, [x1, #400] // ......e................................................................................................................................................................................................................................................................................. + cmge v27.4S, v19.4S, v30.4S // .............................................................................................................................................................................................................................................................*.......................... + mls v11.4S, v9.4S, v29.4S // .......................................................................................................................................................................................................*................................................................................ + sub v23.4S, v8.4S, v27.4S // ..............................................................................................................................................................................................................................................................*......................... + mls v16.4S, v28.4S, v29.4S // ..........................................................................................................................................................................................................................*............................................................. + ldr q8, [x1, #272] // ....e................................................................................................................................................................................................................................................................................... 
+ str q21, [x1, #768] // ....................................................................................................................................................................................................................*................................................................... + sub v27.4S, v24.4S, v22.4S // ...............................e........................................................................................................................................................................................................................................................ + sqrdmulh v9.4S, v13.4S, v26.4S // ........................................................................................................................................................................................................................................*............................................... + str q11, [x1, #832] // .....................................................................................................................................................................................................................*.................................................................. + mul v13.4S, v13.4S, v25.4S // .......................................................................................................................................................................................................................................*................................................ + cmge v11.4S, v31.4S, v16.4S // ................................................................................................................................................................................................................................................*....................................... 
+ mls v19.4S, v23.4S, v29.4S // ...............................................................................................................................................................................................................................................................*........................ + cmge v21.4S, v16.4S, v30.4S // .................................................................................................................................................................................................................................................*...................................... + mls v17.4S, v18.4S, v29.4S // .......................................................................................................................................................................................................................................................*................................ + sqrdmulh v18.4S, v12.4S, v26.4S // .....................................................................................................................................................................................................................................*.................................................. + str q19, [x1, #192] // ...................................................................................................................................................................................................................................................................................*.... + mul v20.4S, v12.4S, v25.4S // ....................................................................................................................................................................................................................................*................................................... 
+ str q17, [x1, #64] // .................................................................................................................................................................................................................................................................................*...... + mul v23.4S, v15.4S, v25.4S // .............................................................................................................................................................................................................................................*.......................................... + sqrdmulh v17.4S, v15.4S, v26.4S // ..............................................................................................................................................................................................................................................*......................................... + sub v15.4S, v11.4S, v21.4S // ..................................................................................................................................................................................................................................................*..................................... + mul v12.4S, v10.4S, v25.4S // ..............................................................................................................................................................................................................................*......................................................... + sqrdmulh v11.4S, v10.4S, v26.4S // ...............................................................................................................................................................................................................................*........................................................ 
+ add v10.4S, v24.4S, v22.4S // ................................e....................................................................................................................................................................................................................................................... + mls v13.4S, v9.4S, v29.4S // .........................................................................................................................................................................................................................................*.............................................. + mls v20.4S, v18.4S, v29.4S // ......................................................................................................................................................................................................................................*................................................. + cmge v22.4S, v31.4S, v13.4S // ....................................................................................................................................................................................................................................................................*................... + ldr q19, [x1, #336] // .....e.................................................................................................................................................................................................................................................................................. + cmge v18.4S, v13.4S, v30.4S // .....................................................................................................................................................................................................................................................................*.................. 
+ mls v12.4S, v11.4S, v29.4S // ................................................................................................................................................................................................................................*....................................................... + sqrdmulh v21.4S, v14.4S, v26.4S // ...........................................................................................................................................................................................................................................*............................................ + sub v28.4S, v22.4S, v18.4S // ......................................................................................................................................................................................................................................................................*................. + cmge v9.4S, v20.4S, v30.4S // .................................................................................................................................................................................................................................................................*...................... + mul v22.4S, v14.4S, v25.4S // ..........................................................................................................................................................................................................................................*............................................. + cmge v24.4S, v31.4S, v20.4S // ................................................................................................................................................................................................................................................................*....................... 
+ cmge v11.4S, v31.4S, v12.4S // ........................................................................................................................................................................................................................................................*............................... + mls v23.4S, v17.4S, v29.4S // ...............................................................................................................................................................................................................................................*........................................ + cmge v18.4S, v12.4S, v30.4S // .........................................................................................................................................................................................................................................................*.............................. + sub v24.4S, v24.4S, v9.4S // ..................................................................................................................................................................................................................................................................*..................... + mls v22.4S, v21.4S, v29.4S // ............................................................................................................................................................................................................................................*........................................... + sub v18.4S, v11.4S, v18.4S // ..........................................................................................................................................................................................................................................................*............................. 
+ ldr q17, [x1, #592] // .........e.............................................................................................................................................................................................................................................................................. + mls v20.4S, v24.4S, v29.4S // ...................................................................................................................................................................................................................................................................*.................... + add v24.4S, v8.4S, v19.4S // ...........................e............................................................................................................................................................................................................................................................ + cmge v9.4S, v23.4S, v30.4S // .............................................................................................................................................................................................................................................................................*.......... + mls v12.4S, v18.4S, v29.4S // ...........................................................................................................................................................................................................................................................*............................ + cmge v21.4S, v31.4S, v23.4S // ............................................................................................................................................................................................................................................................................*........... 
+ cmge v11.4S, v31.4S, v22.4S // ........................................................................................................................................................................................................................................................................*............... + mls v13.4S, v28.4S, v29.4S // .......................................................................................................................................................................................................................................................................*................ + cmge v18.4S, v22.4S, v30.4S // .........................................................................................................................................................................................................................................................................*.............. + str q20, [x1, #256] // ....................................................................................................................................................................................................................................................................................*... + sub v28.4S, v21.4S, v9.4S // ..............................................................................................................................................................................................................................................................................*......... + mls v16.4S, v15.4S, v29.4S // ...................................................................................................................................................................................................................................................*.................................... 
+ sub v20.4S, v8.4S, v19.4S // ..........................e............................................................................................................................................................................................................................................................. + ldr q21, [x1, #80] // .e...................................................................................................................................................................................................................................................................................... + sqrdmulh v8.4S, v27.4S, v5.S[1] // ..................................e..................................................................................................................................................................................................................................................... + ldr q14, [x1, #720] // ...........e............................................................................................................................................................................................................................................................................ + ldr q9, [x1, #16] // e....................................................................................................................................................................................................................................................................................... + sub v11.4S, v11.4S, v18.4S // ..........................................................................................................................................................................................................................................................................*............. 
+ mls v23.4S, v28.4S, v29.4S // ...............................................................................................................................................................................................................................................................................*........ + ldr q15, [x1, #208] // ...e.................................................................................................................................................................................................................................................................................... + ldr q28, [x1, #144] // ..e..................................................................................................................................................................................................................................................................................... + str q12, [x1, #128] // ..................................................................................................................................................................................................................................................................................*..... + str q13, [x1, #320] // .....................................................................................................................................................................................................................................................................................*.. + add v13.4S, v24.4S, v10.4S // ...................................................................e.................................................................................................................................................................................................................... 
+ sub v12.4S, v24.4S, v10.4S // ..................................................................e..................................................................................................................................................................................................................... + mul v19.4S, v20.4S, v4.S[2] // ............................e........................................................................................................................................................................................................................................................... + ldr q24, [x1, #528] // ........e............................................................................................................................................................................................................................................................................... + add v10.4S, v9.4S, v21.4S // .................e...................................................................................................................................................................................................................................................................... + add v18.4S, v28.4S, v15.4S // ......................e................................................................................................................................................................................................................................................................. + sqrdmulh v20.4S, v20.4S, v4.S[3] // .............................e.......................................................................................................................................................................................................................................................... 
+ sub v28.4S, v28.4S, v15.4S // .....................e.................................................................................................................................................................................................................................................................. + str q23, [x1, #448] // .......................................................................................................................................................................................................................................................................................* + sub v21.4S, v9.4S, v21.4S // ................e....................................................................................................................................................................................................................................................................... + mls v22.4S, v11.4S, v29.4S // ...........................................................................................................................................................................................................................................................................*............ + add v9.4S, v10.4S, v18.4S // .........................................................e.............................................................................................................................................................................................................................. + sqrdmulh v11.4S, v28.4S, v4.S[1] // ........................e............................................................................................................................................................................................................................................................... 
+ str q16, [x1], #(16) // ................................................................................................................................................................................................................................................................................*....... + ldr q16, [x1, #640] // ..........e............................................................................................................................................................................................................................................................................. + sqrdmulh v15.4S, v21.4S, v3.S[3] // ...................e.................................................................................................................................................................................................................................................................... + mul v23.4S, v28.4S, v4.S[0] // .......................e................................................................................................................................................................................................................................................................ + str q22, [x1, #368] // ......................................................................................................................................................................................................................................................................................*. + mul v22.4S, v21.4S, v3.S[2] // ..................e..................................................................................................................................................................................................................................................................... 
+ add v21.4S, v24.4S, v17.4S // .....................................e.................................................................................................................................................................................................................................................. + sub v24.4S, v24.4S, v17.4S // ....................................e................................................................................................................................................................................................................................................... + ldr q17, [x1, #896] // ..............e......................................................................................................................................................................................................................................................................... + ldr q28, [x1, #960] // ...............e........................................................................................................................................................................................................................................................................ + mls v22.4S, v15.4S, v29.4S // ....................e................................................................................................................................................................................................................................................................... + sub v15.4S, v10.4S, v18.4S // ........................................................e............................................................................................................................................................................................................................... 
+ add v10.4S, v16.4S, v14.4S // ..........................................e............................................................................................................................................................................................................................................. + sub v16.4S, v16.4S, v14.4S // .........................................e.............................................................................................................................................................................................................................................. + mul v14.4S, v27.4S, v5.S[0] // .................................e...................................................................................................................................................................................................................................................... + sub v27.4S, v9.4S, v13.4S // ................................................................................................e....................................................................................................................................................................................... + mls v23.4S, v11.4S, v29.4S // .........................e.............................................................................................................................................................................................................................................................. + add v18.4S, v21.4S, v10.4S // .............................................................................e.......................................................................................................................................................................................................... 
+ sub v21.4S, v21.4S, v10.4S // ............................................................................e........................................................................................................................................................................................................... + mls v14.4S, v8.4S, v29.4S // ...................................e.................................................................................................................................................................................................................................................... + add v8.4S, v9.4S, v13.4S // .................................................................................................e...................................................................................................................................................................................... + add v11.4S, v17.4S, v28.4S // ....................................................e................................................................................................................................................................................................................................... + mls v19.4S, v20.4S, v29.4S // ..............................e......................................................................................................................................................................................................................................................... + sub v10.4S, v17.4S, v28.4S // ...................................................e.................................................................................................................................................................................................................................... 
+ add v9.4S, v22.4S, v23.4S // ..............................................................e......................................................................................................................................................................................................................... + sqrdmulh v13.4S, v12.4S, v2.S[1] // .....................................................................e.................................................................................................................................................................................................................. + mul v12.4S, v12.4S, v2.S[0] // ....................................................................e................................................................................................................................................................................................................... + add v28.4S, v19.4S, v14.4S // ........................................................................e............................................................................................................................................................................................................... + sqrdmulh v20.4S, v24.4S, v5.S[3] // .......................................e................................................................................................................................................................................................................................................ + mul v17.4S, v24.4S, v5.S[2] // ......................................e................................................................................................................................................................................................................................................. 
+ sub v24.4S, v19.4S, v14.4S // .......................................................................e................................................................................................................................................................................................................ + sub v14.4S, v22.4S, v23.4S // .............................................................e.......................................................................................................................................................................................................................... + mls v12.4S, v13.4S, v29.4S // ......................................................................e................................................................................................................................................................................................................. + sub v23.4S, v9.4S, v28.4S // .....................................................................................................e.................................................................................................................................................................................. + add v9.4S, v9.4S, v28.4S // ......................................................................................................e................................................................................................................................................................................. + mul v19.4S, v21.4S, v2.S[2] // ..............................................................................e......................................................................................................................................................................................................... 
+ sqrdmulh v21.4S, v21.4S, v2.S[3] // ...............................................................................e........................................................................................................................................................................................................ + sqrdmulh v28.4S, v14.4S, v1.S[3] // ................................................................e....................................................................................................................................................................................................................... + mul v22.4S, v14.4S, v1.S[2] // ...............................................................e........................................................................................................................................................................................................................ + sqrdmulh v13.4S, v15.4S, v1.S[3] // ...........................................................e............................................................................................................................................................................................................................ + mul v15.4S, v15.4S, v1.S[2] // ..........................................................e............................................................................................................................................................................................................................. + mls v19.4S, v21.4S, v29.4S // ................................................................................e....................................................................................................................................................................................................... 
+ mul v21.4S, v27.4S, v0.S[2] // ..................................................................................................e..................................................................................................................................................................................... + sqrdmulh v27.4S, v27.4S, v0.S[3] // ...................................................................................................e.................................................................................................................................................................................... + mul v14.4S, v16.4S, v6.S[0] // ...........................................e............................................................................................................................................................................................................................................ + sqrdmulh v16.4S, v16.4S, v6.S[1] // ............................................e........................................................................................................................................................................................................................................... + mls v22.4S, v28.4S, v29.4S // .................................................................e...................................................................................................................................................................................................................... + ldr q28, [x1, #768] // ............e........................................................................................................................................................................................................................................................................... 
+ mls v15.4S, v13.4S, v29.4S // ............................................................e........................................................................................................................................................................................................................... + ldr q13, [x1, #832] // .............e.......................................................................................................................................................................................................................................................................... + mls v17.4S, v20.4S, v29.4S // ........................................e............................................................................................................................................................................................................................................... + mls v14.4S, v16.4S, v29.4S // .............................................e.......................................................................................................................................................................................................................................... + add v16.4S, v28.4S, v13.4S // ...............................................e........................................................................................................................................................................................................................................ + mls v21.4S, v27.4S, v29.4S // ....................................................................................................e................................................................................................................................................................................... 
+ sub v27.4S, v16.4S, v11.4S // ......................................................................................e................................................................................................................................................................................................. + add v16.4S, v16.4S, v11.4S // .......................................................................................e................................................................................................................................................................................................ + sqrdmulh v20.4S, v10.4S, v7.S[1] // ......................................................e................................................................................................................................................................................................................................. + sqrdmulh v11.4S, v27.4S, v3.S[1] // .........................................................................................e.............................................................................................................................................................................................. + sub v28.4S, v28.4S, v13.4S // ..............................................e......................................................................................................................................................................................................................................... + mul v13.4S, v10.4S, v7.S[0] // .....................................................e.................................................................................................................................................................................................................................. 
+ sqrdmulh v10.4S, v28.4S, v6.S[3] // .................................................e...................................................................................................................................................................................................................................... + mls v13.4S, v20.4S, v29.4S // .......................................................e................................................................................................................................................................................................................................ + sub v20.4S, v15.4S, v12.4S // ..........................................................................................................e............................................................................................................................................................................. + add v12.4S, v15.4S, v12.4S // ...........................................................................................................e............................................................................................................................................................................ + mul v15.4S, v28.4S, v6.S[2] // ................................................e....................................................................................................................................................................................................................................... + add v28.4S, v18.4S, v16.4S // .....................................................................................................................e.................................................................................................................................................................. 
+ sub v18.4S, v18.4S, v16.4S // ....................................................................................................................e................................................................................................................................................................... + mls v15.4S, v10.4S, v29.4S // ..................................................e..................................................................................................................................................................................................................................... + sub v16.4S, v17.4S, v14.4S // .................................................................................e...................................................................................................................................................................................................... + add v14.4S, v17.4S, v14.4S // ..................................................................................e..................................................................................................................................................................................................... + sqrdmulh v10.4S, v18.4S, v1.S[1] // .......................................................................................................................e................................................................................................................................................................ + sub v17.4S, v8.4S, v28.4S // ........................................................................................................................................e............................................................................................................................................... 
+ add v8.4S, v8.4S, v28.4S // .........................................................................................................................................e.............................................................................................................................................. + mul v28.4S, v27.4S, v3.S[0] // ........................................................................................e............................................................................................................................................................................................... + add v27.4S, v15.4S, v13.4S // ............................................................................................e........................................................................................................................................................................................... + sub v15.4S, v15.4S, v13.4S // ...........................................................................................e............................................................................................................................................................................................ + mul v13.4S, v18.4S, v1.S[0] // ......................................................................................................................e................................................................................................................................................................. + mls v13.4S, v10.4S, v29.4S // ........................................................................................................................e............................................................................................................................................................... 
+ add v10.4S, v14.4S, v27.4S // ..........................................................................................................................e............................................................................................................................................................. + sub v14.4S, v14.4S, v27.4S // .........................................................................................................................e.............................................................................................................................................................. + mls v28.4S, v11.4S, v29.4S // ..........................................................................................e............................................................................................................................................................................................. + sub v18.4S, v9.4S, v10.4S // .............................................................................................................................................e.......................................................................................................................................... + add v9.4S, v9.4S, v10.4S // ..............................................................................................................................................e......................................................................................................................................... + mul v11.4S, v24.4S, v2.S[0] // .........................................................................e.............................................................................................................................................................................................................. 
+ sqrdmulh v27.4S, v24.4S, v2.S[1] // ..........................................................................e............................................................................................................................................................................................................. + add v10.4S, v19.4S, v28.4S // ...............................................................................................................................e........................................................................................................................................................ + sub v28.4S, v19.4S, v28.4S // ..............................................................................................................................e......................................................................................................................................................... + sqrdmulh v19.4S, v23.4S, v0.S[3] // ........................................................................................................e............................................................................................................................................................................... + mul v24.4S, v23.4S, v0.S[2] // .......................................................................................................e................................................................................................................................................................................ + mls v24.4S, v19.4S, v29.4S // .........................................................................................................e.............................................................................................................................................................................. 
+ mul v19.4S, v17.4S, v0.S[0] // ..........................................................................................................................................e............................................................................................................................................. + sqrdmulh v23.4S, v17.4S, v0.S[1] // ...........................................................................................................................................e............................................................................................................................................ + mls v11.4S, v27.4S, v29.4S // ...........................................................................e............................................................................................................................................................................................................ + mul v17.4S, v18.4S, v0.S[0] // ...............................................................................................................................................e........................................................................................................................................ + sqrdmulh v27.4S, v18.4S, v0.S[1] // ................................................................................................................................................e....................................................................................................................................... + mls v19.4S, v23.4S, v29.4S // ............................................................................................................................................e........................................................................................................................................... 
+ sub v23.4S, v22.4S, v11.4S // ...............................................................................................................e........................................................................................................................................................................ + add v11.4S, v22.4S, v11.4S // ................................................................................................................e....................................................................................................................................................................... + sqrdmulh v22.4S, v16.4S, v2.S[3] // ....................................................................................e................................................................................................................................................................................................... + mls v17.4S, v27.4S, v29.4S // .................................................................................................................................................e...................................................................................................................................... + mul v27.4S, v16.4S, v2.S[2] // ...................................................................................e.................................................................................................................................................................................................... + mls v27.4S, v22.4S, v29.4S // .....................................................................................e.................................................................................................................................................................................................. 
+ sqrdmulh v18.4S, v15.4S, v3.S[1] // ..............................................................................................e......................................................................................................................................................................................... + sqrdmulh v22.4S, v23.4S, v0.S[3] // ..................................................................................................................e..................................................................................................................................................................... + mul v16.4S, v23.4S, v0.S[2] // .................................................................................................................e...................................................................................................................................................................... + mul v23.4S, v15.4S, v3.S[0] // .............................................................................................e.......................................................................................................................................................................................... + mls v23.4S, v18.4S, v29.4S // ...............................................................................................e........................................................................................................................................................................................ + sub v18.4S, v12.4S, v10.4S // ..................................................................................................................................................e..................................................................................................................................... 
+ add v10.4S, v12.4S, v10.4S // ...................................................................................................................................................e.................................................................................................................................... + add v12.4S, v21.4S, v13.4S // .............................................................................................................................................................e.......................................................................................................................... + sqrdmulh v15.4S, v14.4S, v1.S[1] // ............................................................................................................................e........................................................................................................................................................... + sub v21.4S, v21.4S, v13.4S // ............................................................................................................................................................e........................................................................................................................... + mul v13.4S, v14.4S, v1.S[0] // ...........................................................................................................................e............................................................................................................................................................ + mls v16.4S, v22.4S, v29.4S // ...................................................................................................................e.................................................................................................................................................................... 
+ mul v22.4S, v28.4S, v1.S[0] // ................................................................................................................................e....................................................................................................................................................... + sub v14.4S, v27.4S, v23.4S // ...................................................................................................................................e.................................................................................................................................................... + + // original source code + // ldr q8, [x1, #0] // ....................................................................................e.......................................................................................................................................|...............................................................................................................................................e........................ + // ldr q9, [x1, #(1*(512/8))] // .................................................................................e..........................................................................................................................................|............................................................................................................................................e........................... + // ldr q10, [x1, #(2*(512/8))] // ........................................................................................e...................................................................................................................................|...................................................................................................................................................e.................... 
+ // ldr q11, [x1, #(3*(512/8))] // .......................................................................................e....................................................................................................................................|..................................................................................................................................................e..................... + // ldr q12, [x1, #(4*(512/8))] // ...............................e............................................................................................................................................................................................|..........................................................................................e............................................................................. + // ldr q13, [x1, #(5*(512/8))] // ......................................................e.....................................................................................................................................................................|.................................................................................................................e...................................................... + // ldr q14, [x1, #(6*(512/8))] // ..........................e.................................................................................................................................................................................................|.....................................................................................e.................................................................................. 
+ // ldr q15, [x1, #(7*(512/8))] // e...........................................................................................................................................................................................................................|...........................................................e............................................................................................................ + // ldr q16, [x1, #(8*(512/8))] // ..............................................................................................e.............................................................................................................................|.........................................................................................................................................................e.............. + // ldr q17, [x1, #(9*(512/8))] // ....................................................................e.......................................................................................................................................................|...............................................................................................................................e........................................ + // ldr q18, [x1, #(10*(512/8))] // .........................................................................................................e..................................................................................................................|....................................................................................................................................................................e... 
+ // ldr q19, [x1, #(11*(512/8))] // ...................................................................................e........................................................................................................................................|..............................................................................................................................................e......................... + // ldr q20, [x1, #(12*(512/8))] // .......................................................................................................................................................e....................................................................|........................................................................................................................................................................ + // ldr q21, [x1, #(13*(512/8))] // .........................................................................................................................................................e..................................................................|........................................................................................................................................................................ + // ldr q22, [x1, #(14*(512/8))] // ................................................................................................................e...........................................................................................................|........................................................................................................................................................................ 
+ // ldr q23, [x1, #(15*(512/8))] // .................................................................................................................e..........................................................................................................|........................................................................................................................................................................ + // sub v24.4s, v8.4s, v9.4s // ....................................................................................................e.......................................................................................................................|...............................................................................................................................................................e........ + // add v8.4s, v8.4s, v9.4s // ...............................................................................................e............................................................................................................................|..........................................................................................................................................................e............. + // mul v9.4s, v24.4s, v3.s[2] // .............................................................................................................e..............................................................................................................|........................................................................................................................................................................ 
+ // sqrdmulh v24.4s, v24.4s, v3.s[3] // ..........................................................................................................e.................................................................................................................|.....................................................................................................................................................................e.. + // mls v9.4s, v24.4s, v29.4s // ..................................................................................................................e.........................................................................................................|........................................................................................................................................................................ + // sub v24.4s, v10.4s, v11.4s // ..................................................................................................e.........................................................................................................................|.............................................................................................................................................................e.......... + // add v10.4s, v10.4s, v11.4s // ................................................................................................e...........................................................................................................................|...........................................................................................................................................................e............ 
+ // mul v11.4s, v24.4s, v4.s[0] // ...........................................................................................................e................................................................................................................|......................................................................................................................................................................e. + // sqrdmulh v24.4s, v24.4s, v4.s[1] // .......................................................................................................e....................................................................................................................|..................................................................................................................................................................e..... + // mls v11.4s, v24.4s, v29.4s // ........................................................................................................................e...................................................................................................|........................................................................................................................................................................ + // sub v24.4s, v12.4s, v13.4s // ................................................................................e...........................................................................................................................................|...........................................................................................................................................e............................ 
+ // add v12.4s, v12.4s, v13.4s // ......................................................................e.....................................................................................................................................................|.................................................................................................................................e...................................... + // mul v13.4s, v24.4s, v4.s[2] // .............................................................................................e..............................................................................................................................|........................................................................................................................................................e............... + // sqrdmulh v24.4s, v24.4s, v4.s[3] // .................................................................................................e..........................................................................................................................|............................................................................................................................................................e........... + // mls v13.4s, v24.4s, v29.4s // ..............................................................................................................................e.............................................................................................|........................................................................................................................................................................ 
+ // sub v24.4s, v14.4s, v15.4s // .................................e..........................................................................................................................................................................................|............................................................................................e........................................................................... + // add v14.4s, v14.4s, v15.4s // ..................................................e.........................................................................................................................................................................|.............................................................................................................e.......................................................... + // mul v15.4s, v24.4s, v5.s[0] // ......................................................................................................................e.....................................................................................................|........................................................................................................................................................................ + // sqrdmulh v24.4s, v24.4s, v5.s[1] // ..................................................................................e.........................................................................................................................................|.............................................................................................................................................e.......................... 
+ // mls v15.4s, v24.4s, v29.4s // ...........................................................................................................................e................................................................................................|........................................................................................................................................................................ + // sub v24.4s, v16.4s, v17.4s // ...............................................................................................................e............................................................................................................|........................................................................................................................................................................ + // add v16.4s, v16.4s, v17.4s // ..............................................................................................................e.............................................................................................................|........................................................................................................................................................................ + // mul v17.4s, v24.4s, v5.s[2] // .....................................................................................................................................e......................................................................................|........................................................................................................................................................................ 
+ // sqrdmulh v24.4s, v24.4s, v5.s[3] // ....................................................................................................................................e.......................................................................................|........................................................................................................................................................................ + // mls v17.4s, v24.4s, v29.4s // ..........................................................................................................................................................e.................................................................|........................................................................................................................................................................ + // sub v24.4s, v18.4s, v19.4s // .....................................................................................................................e......................................................................................................|........................................................................................................................................................................ + // add v18.4s, v18.4s, v19.4s // ....................................................................................................................e.......................................................................................................|........................................................................................................................................................................ 
+ // mul v19.4s, v24.4s, v6.s[0] // ....................................................................................................................................................e.......................................................................|........................................................................................................................................................................ + // sqrdmulh v24.4s, v24.4s, v6.s[1] // .....................................................................................................................................................e......................................................................|........................................................................................................................................................................ + // mls v19.4s, v24.4s, v29.4s // ...........................................................................................................................................................e................................................................|........................................................................................................................................................................ + // sub v24.4s, v20.4s, v21.4s // ..................................................................................................................................................................e.........................................................|........................................................................................................................................................................ 
+ // add v20.4s, v20.4s, v21.4s // ............................................................................................................................................................e...............................................................|........................................................................................................................................................................ + // mul v21.4s, v24.4s, v6.s[2] // ........................................................................................................................................................................e...................................................|........................................................................................................................................................................ + // sqrdmulh v24.4s, v24.4s, v6.s[3] // ....................................................................................................................................................................e.......................................................|........................................................................................................................................................................ + // mls v21.4s, v24.4s, v29.4s // ...........................................................................................................................................................................e................................................|........................................................................................................................................................................ 
+ // sub v24.4s, v22.4s, v23.4s // ...............................................................................................................................e............................................................................................|........................................................................................................................................................................ + // add v22.4s, v22.4s, v23.4s // .............................................................................................................................e..............................................................................................|........................................................................................................................................................................ + // mul v23.4s, v24.4s, v7.s[0] // ...................................................................................................................................................................e........................................................|........................................................................................................................................................................ + // sqrdmulh v24.4s, v24.4s, v7.s[1] // ................................................................................................................................................................e...........................................................|........................................................................................................................................................................ 
+ // mls v23.4s, v24.4s, v29.4s // .....................................................................................................................................................................e......................................................|........................................................................................................................................................................ + // sub v24.4s, v8.4s, v10.4s // ...................................................................................................................e........................................................................................................|........................................................................................................................................................................ + // add v8.4s, v8.4s, v10.4s // ......................................................................................................e.....................................................................................................................|.................................................................................................................................................................e...... + // mul v10.4s, v24.4s, v1.s[2] // ................................................................................................................................................e...........................................................................|........................................................................................................................................................................ 
+ // sqrdmulh v24.4s, v24.4s, v1.s[3] // ...............................................................................................................................................e............................................................................|........................................................................................................................................................................ + // mls v10.4s, v24.4s, v29.4s // ........................................................................................................................................................e...................................................................|........................................................................................................................................................................ + // sub v24.4s, v9.4s, v11.4s // .......................................................................................................................................e....................................................................................|........................................................................................................................................................................ + // add v9.4s, v9.4s, v11.4s // ................................................................................................................................e...........................................................................................|........................................................................................................................................................................ 
+ // mul v11.4s, v24.4s, v1.s[2] // ..............................................................................................................................................e.............................................................................|........................................................................................................................................................................ + // sqrdmulh v24.4s, v24.4s, v1.s[3] // .............................................................................................................................................e..............................................................................|........................................................................................................................................................................ + // mls v11.4s, v24.4s, v29.4s // ......................................................................................................................................................e.....................................................................|........................................................................................................................................................................ + // sub v24.4s, v12.4s, v14.4s // ............................................................................................e...............................................................................................................................|.......................................................................................................................................................e................ 
+ // add v12.4s, v12.4s, v14.4s // ...........................................................................................e................................................................................................................................|......................................................................................................................................................e................. + // mul v14.4s, v24.4s, v2.s[0] // ..................................................................................................................................e.........................................................................................|........................................................................................................................................................................ + // sqrdmulh v24.4s, v24.4s, v2.s[1] // .................................................................................................................................e..........................................................................................|........................................................................................................................................................................ + // mls v14.4s, v24.4s, v29.4s // ........................................................................................................................................e...................................................................................|........................................................................................................................................................................ 
+ // sub v24.4s, v13.4s, v15.4s // ......................................................................................................................................e.....................................................................................|........................................................................................................................................................................ + // add v13.4s, v13.4s, v15.4s // ...................................................................................................................................e........................................................................................|........................................................................................................................................................................ + // mul v15.4s, v24.4s, v2.s[0] // ...........................................................................................................................................................................................e................................|........................................................................................................................................................................ + // sqrdmulh v24.4s, v24.4s, v2.s[1] // ............................................................................................................................................................................................e...............................|........................................................................................................................................................................ 
+ // mls v15.4s, v24.4s, v29.4s // ....................................................................................................................................................................................................e.......................|........................................................................................................................................................................ + // sub v24.4s, v16.4s, v18.4s // ..........................................................................................................................e.................................................................................................|........................................................................................................................................................................ + // add v16.4s, v16.4s, v18.4s // .........................................................................................................................e..................................................................................................|........................................................................................................................................................................ + // mul v18.4s, v24.4s, v2.s[2] // ...........................................................................................................................................e................................................................................|........................................................................................................................................................................ 
+ // sqrdmulh v24.4s, v24.4s, v2.s[3] // ............................................................................................................................................e...............................................................................|........................................................................................................................................................................ + // mls v18.4s, v24.4s, v29.4s // .................................................................................................................................................e..........................................................................|........................................................................................................................................................................ + // sub v24.4s, v17.4s, v19.4s // ............................................................................................................................................................................e...............................................|........................................................................................................................................................................ + // add v17.4s, v17.4s, v19.4s // .............................................................................................................................................................................e..............................................|........................................................................................................................................................................ 
+ // mul v19.4s, v24.4s, v2.s[2] // ............................................................................................................................................................................................................e...............|........................................................................................................................................................................ + // sqrdmulh v24.4s, v24.4s, v2.s[3] // ..........................................................................................................................................................................................................e.................|........................................................................................................................................................................ + // mls v19.4s, v24.4s, v29.4s // .............................................................................................................................................................................................................e..............|........................................................................................................................................................................ + // sub v24.4s, v20.4s, v22.4s // ..............................................................................................................................................................e.............................................................|........................................................................................................................................................................ 
+ // add v20.4s, v20.4s, v22.4s // ...............................................................................................................................................................e............................................................|........................................................................................................................................................................ + // mul v22.4s, v24.4s, v3.s[0] // .................................................................................................................................................................................e..........................................|........................................................................................................................................................................ + // sqrdmulh v24.4s, v24.4s, v3.s[1] // .................................................................................................................................................................e..........................................................|........................................................................................................................................................................ + // mls v22.4s, v24.4s, v29.4s // ........................................................................................................................................................................................e...................................|........................................................................................................................................................................ 
+ // sub v24.4s, v21.4s, v23.4s // ...................................................................................................................................................................................e........................................|........................................................................................................................................................................ + // add v21.4s, v21.4s, v23.4s // ..................................................................................................................................................................................e.........................................|........................................................................................................................................................................ + // mul v23.4s, v24.4s, v3.s[0] // .................................................................................................................................................................................................................e..........|........................................................................................................................................................................ + // sqrdmulh v24.4s, v24.4s, v3.s[1] // ..............................................................................................................................................................................................................e.............|........................................................................................................................................................................ 
+ // mls v23.4s, v24.4s, v29.4s // ..................................................................................................................................................................................................................e.........|........................................................................................................................................................................ + // sub v24.4s, v8.4s, v12.4s // .......................................................................................................................e....................................................................................................|........................................................................................................................................................................ + // add v8.4s, v8.4s, v12.4s // ............................................................................................................................e...............................................................................................|........................................................................................................................................................................ + // mul v12.4s, v24.4s, v0.s[2] // ..................................................................................................................................................e.........................................................................|........................................................................................................................................................................ 
+ // sqrdmulh v24.4s, v24.4s, v0.s[3] // ...................................................................................................................................................e........................................................................|........................................................................................................................................................................ + // mls v12.4s, v24.4s, v29.4s // .............................................................................................................................................................e..............................................................|........................................................................................................................................................................ + // sub v24.4s, v9.4s, v13.4s // .........................................................................................................................................e..................................................................................|........................................................................................................................................................................ + // add v9.4s, v9.4s, v13.4s // ..........................................................................................................................................e.................................................................................|........................................................................................................................................................................ 
+ // mul v13.4s, v24.4s, v0.s[2] // ................................................................................................................................................................................................e...........................|........................................................................................................................................................................ + // sqrdmulh v24.4s, v24.4s, v0.s[3] // ...............................................................................................................................................................................................e............................|........................................................................................................................................................................ + // mls v13.4s, v24.4s, v29.4s // .................................................................................................................................................................................................e..........................|........................................................................................................................................................................ + // sub v24.4s, v10.4s, v14.4s // ......................................................................................................................................................................e.....................................................|........................................................................................................................................................................ 
+ // add v10.4s, v10.4s, v14.4s // .......................................................................................................................................................................e....................................................|........................................................................................................................................................................ + // mul v14.4s, v24.4s, v0.s[2] // ............................................................................................................................................................................................................................|........*............................................................................................................................................................... + // sqrdmulh v24.4s, v24.4s, v0.s[3] // ............................................................................................................................................................................................................................|.....*.................................................................................................................................................................. + // mls v14.4s, v24.4s, v29.4s // ............................................................................................................................................................................................................................|...........*............................................................................................................................................................ 
+ // sub v24.4s, v11.4s, v15.4s // ........................................................................................................................................................................................................e...................|........................................................................................................................................................................ + // add v11.4s, v11.4s, v15.4s // .........................................................................................................................................................................................................e..................|........................................................................................................................................................................ + // mul v15.4s, v24.4s, v0.s[2] // ................................................................................................................................................................................................................e...........|........................................................................................................................................................................ + // sqrdmulh v24.4s, v24.4s, v0.s[3] // ...............................................................................................................................................................................................................e............|........................................................................................................................................................................ 
+ // mls v15.4s, v24.4s, v29.4s // .........................................................................................................................................................................................................................e..|........................................................................................................................................................................ + // sub v24.4s, v16.4s, v20.4s // ..........................................................................................................................................................................e.................................................|........................................................................................................................................................................ + // add v16.4s, v16.4s, v20.4s // .........................................................................................................................................................................e..................................................|........................................................................................................................................................................ + // mul v20.4s, v24.4s, v1.s[0] // ....................................................................................................................................................................................e.......................................|........................................................................................................................................................................ 
+ // sqrdmulh v24.4s, v24.4s, v1.s[1] // ..............................................................................................................................................................................e.............................................|........................................................................................................................................................................ + // mls v20.4s, v24.4s, v29.4s // .....................................................................................................................................................................................e......................................|........................................................................................................................................................................ + // sub v24.4s, v17.4s, v21.4s // .......................................................................................................................................................................................e....................................|........................................................................................................................................................................ + // add v17.4s, v17.4s, v21.4s // ......................................................................................................................................................................................e.....................................|........................................................................................................................................................................ 
+ // mul v21.4s, v24.4s, v1.s[0] // ........................................................................................................................................................................................................................e...|........................................................................................................................................................................ + // sqrdmulh v24.4s, v24.4s, v1.s[1] // ......................................................................................................................................................................................................................e.....|........................................................................................................................................................................ + // mls v21.4s, v24.4s, v29.4s // ............................................................................................................................................................................................................................|..*..................................................................................................................................................................... + // sub v24.4s, v18.4s, v22.4s // ..............................................................................................................................................................................................e.............................|........................................................................................................................................................................ 
+ // add v18.4s, v18.4s, v22.4s // .............................................................................................................................................................................................e..............................|........................................................................................................................................................................ + // mul v22.4s, v24.4s, v1.s[0] // ..........................................................................................................................................................................................................................e.|........................................................................................................................................................................ + // sqrdmulh v24.4s, v24.4s, v1.s[1] // ............................................................................................................................................................................................................................*........................................................................................................................................................................ + // mls v22.4s, v24.4s, v29.4s // ............................................................................................................................................................................................................................|......*................................................................................................................................................................. 
+ // sub v24.4s, v19.4s, v23.4s // ...........................................................................................................................................................................................................................e|........................................................................................................................................................................ + // add v19.4s, v19.4s, v23.4s // ............................................................................................................................................................................................................................|*....................................................................................................................................................................... + // mul v23.4s, v24.4s, v1.s[0] // ............................................................................................................................................................................................................................|............*........................................................................................................................................................... + // sqrdmulh v24.4s, v24.4s, v1.s[1] // ............................................................................................................................................................................................................................|..........*............................................................................................................................................................. 
+ // mls v23.4s, v24.4s, v29.4s // ............................................................................................................................................................................................................................|.............*.......................................................................................................................................................... + // sub v24.4s, v8.4s, v16.4s // ...............................................................................................................................................................................e............................................|........................................................................................................................................................................ + // add v8.4s, v8.4s, v16.4s // ................................................................................................................................................................................e...........................................|........................................................................................................................................................................ + // mul v16.4s, v24.4s, v0.s[0] // ..................................................................................................................................................................................................e.........................|........................................................................................................................................................................ 
+ // sqrdmulh v24.4s, v24.4s, v0.s[1] // ...................................................................................................................................................................................................e........................|........................................................................................................................................................................ + // mls v16.4s, v24.4s, v29.4s // .......................................................................................................................................................................................................e....................|........................................................................................................................................................................ + // sub v24.4s, v9.4s, v17.4s // .........................................................................................................................................................................................e..................................|........................................................................................................................................................................ + // add v9.4s, v9.4s, v17.4s // ..........................................................................................................................................................................................e.................................|........................................................................................................................................................................ 
+ // mul v17.4s, v24.4s, v0.s[0] // .....................................................................................................................................................................................................e......................|........................................................................................................................................................................ + // sqrdmulh v24.4s, v24.4s, v0.s[1] // ......................................................................................................................................................................................................e.....................|........................................................................................................................................................................ + // mls v17.4s, v24.4s, v29.4s // ...........................................................................................................................................................................................................e................|........................................................................................................................................................................ + // sub v24.4s, v10.4s, v18.4s // ...................................................................................................................................................................................................................e........|........................................................................................................................................................................ 
+ // add v10.4s, v10.4s, v18.4s // ....................................................................................................................................................................................................................e.......|........................................................................................................................................................................ + // mul v18.4s, v24.4s, v0.s[0] // ............................................................................................................................................................................................................................|..........................*............................................................................................................................................. + // sqrdmulh v24.4s, v24.4s, v0.s[1] // ............................................................................................................................................................................................................................|.........................*.............................................................................................................................................. + // mls v18.4s, v24.4s, v29.4s // ............................................................................................................................................................................................................................|....................................*................................................................................................................................... 
+ // sub v24.4s, v11.4s, v19.4s // ............................................................................................................................................................................................................................|...*.................................................................................................................................................................... + // add v11.4s, v11.4s, v19.4s // ............................................................................................................................................................................................................................|....*................................................................................................................................................................... + // mul v19.4s, v24.4s, v0.s[0] // ............................................................................................................................................................................................................................|....................*................................................................................................................................................... + // sqrdmulh v24.4s, v24.4s, v0.s[1] // ............................................................................................................................................................................................................................|.................*...................................................................................................................................................... 
+ // mls v19.4s, v24.4s, v29.4s // ............................................................................................................................................................................................................................|...........................*............................................................................................................................................ + // sub v24.4s, v12.4s, v20.4s // .......................................................................................................................................................................................................................e....|........................................................................................................................................................................ + // add v12.4s, v12.4s, v20.4s // .....................................................................................................................................................................................................................e......|........................................................................................................................................................................ + // mul v20.4s, v24.4s, v0.s[0] // ............................................................................................................................................................................................................................|..................................*..................................................................................................................................... 
+ // sqrdmulh v24.4s, v24.4s, v0.s[1] // ............................................................................................................................................................................................................................|.................................*...................................................................................................................................... + // mls v20.4s, v24.4s, v29.4s // ............................................................................................................................................................................................................................|.........................................*.............................................................................................................................. + // sub v24.4s, v13.4s, v21.4s // ............................................................................................................................................................................................................................|...............*........................................................................................................................................................ + // add v13.4s, v13.4s, v21.4s // ............................................................................................................................................................................................................................|................*....................................................................................................................................................... 
+ // mul v21.4s, v24.4s, v0.s[0] // ............................................................................................................................................................................................................................|........................................................*............................................................................................................... + // sqrdmulh v24.4s, v24.4s, v0.s[1] // ............................................................................................................................................................................................................................|.................................................*...................................................................................................................... + // mls v21.4s, v24.4s, v29.4s // .*..........................................................................................................................................................................................................................|............................................................*........................................................................................................... + // sub v24.4s, v14.4s, v22.4s // ............................................................................................................................................................................................................................|..................*..................................................................................................................................................... 
+ // add v14.4s, v14.4s, v22.4s // ............................................................................................................................................................................................................................|...................*.................................................................................................................................................... + // mul v22.4s, v24.4s, v0.s[0] // ............................................................................................................................................................................................................................|............................................*........................................................................................................................... + // sqrdmulh v24.4s, v24.4s, v0.s[1] // ............................................................................................................................................................................................................................|...........................................*............................................................................................................................ + // mls v22.4s, v24.4s, v29.4s // ............................................................................................................................................................................................................................|...............................................*........................................................................................................................ 
+ // sub v24.4s, v15.4s, v23.4s // ............................................................................................................................................................................................................................|.......................*................................................................................................................................................ + // add v15.4s, v15.4s, v23.4s // ............................................................................................................................................................................................................................|.....................*.................................................................................................................................................. + // mul v23.4s, v24.4s, v0.s[0] // ............................................................................................................................................................................................................................|.............................*.......................................................................................................................................... + // sqrdmulh v24.4s, v24.4s, v0.s[1] // ............................................................................................................................................................................................................................|...............................*........................................................................................................................................ 
+ // mls v23.4s, v24.4s, v29.4s // ............................................................................................................................................................................................................................|.............................................*.......................................................................................................................... + // cmge v27.4s, v31.4s, v16.4s // ............................................................................................................................................................................................................................|.*...................................................................................................................................................................... + // cmge v28.4s, v16.4s, v30.4s // ............................................................................................................................................................................................................................|.......*................................................................................................................................................................ + // sub v28.4s, v27.4s, v28.4s // ............................................................................................................................................................................................................................|.........*.............................................................................................................................................................. 
+ // mls v16.4s, v28.4s, v29.4s // ............................................................................................................................................................................................................................|..............*......................................................................................................................................................... + // cmge v27.4s, v31.4s, v17.4s // ............................................................................................................................................................................................................................|........................*............................................................................................................................................... + // cmge v28.4s, v17.4s, v30.4s // ............................................................................................................................................................................................................................|............................*........................................................................................................................................... + // sub v28.4s, v27.4s, v28.4s // ............................................................................................................................................................................................................................|..............................*......................................................................................................................................... 
+ // mls v17.4s, v28.4s, v29.4s // ............................................................................................................................................................................................................................|................................*....................................................................................................................................... + // cmge v27.4s, v31.4s, v18.4s // ............................................................................................................................................................................................................................|..............................................*......................................................................................................................... + // cmge v28.4s, v18.4s, v30.4s // ............................................................................................................................................................................................................................|................................................*....................................................................................................................... + // sub v28.4s, v27.4s, v28.4s // ............................................................................................................................................................................................................................|..................................................*..................................................................................................................... 
+ // mls v18.4s, v28.4s, v29.4s // ............................................................................................................................................................................................................................|....................................................*................................................................................................................... + // cmge v27.4s, v31.4s, v19.4s // ............................................................................................................................................................................................................................|........................................*............................................................................................................................... + // cmge v28.4s, v19.4s, v30.4s // ............................................................................................................................................................................................................................|.....................................*.................................................................................................................................. + // sub v28.4s, v27.4s, v28.4s // ............................................................................................................................................................................................................................|..........................................*............................................................................................................................. 
+ // mls v19.4s, v28.4s, v29.4s // ............................................................................................................................................................................................................................|.....................................................*.................................................................................................................. + // cmge v27.4s, v31.4s, v20.4s // ..............*.............................................................................................................................................................................................................|.........................................................................*.............................................................................................. + // cmge v28.4s, v20.4s, v30.4s // ........*...................................................................................................................................................................................................................|...................................................................*.................................................................................................... + // sub v28.4s, v27.4s, v28.4s // ..................*.........................................................................................................................................................................................................|.............................................................................*.......................................................................................... 
+ // mls v20.4s, v28.4s, v29.4s // ........................*...................................................................................................................................................................................................|...................................................................................*.................................................................................... + // cmge v27.4s, v31.4s, v21.4s // ............*...............................................................................................................................................................................................................|.......................................................................*................................................................................................ + // cmge v28.4s, v21.4s, v30.4s // ...........*................................................................................................................................................................................................................|......................................................................*................................................................................................. + // sub v28.4s, v27.4s, v28.4s // ................*...........................................................................................................................................................................................................|...........................................................................*............................................................................................ 
+ // mls v21.4s, v28.4s, v29.4s // ............................*...............................................................................................................................................................................................|.......................................................................................*................................................................................ + // cmge v27.4s, v31.4s, v22.4s // ............................................................................................................................................................................................................................|..........................................................*............................................................................................................. + // cmge v28.4s, v22.4s, v30.4s // ..*.........................................................................................................................................................................................................................|.............................................................*.......................................................................................................... + // sub v28.4s, v27.4s, v28.4s // ......*.....................................................................................................................................................................................................................|.................................................................*...................................................................................................... 
+ // mls v22.4s, v28.4s, v29.4s // ..........*.................................................................................................................................................................................................................|.....................................................................*.................................................................................................. + // cmge v27.4s, v31.4s, v23.4s // ...*........................................................................................................................................................................................................................|..............................................................*......................................................................................................... + // cmge v28.4s, v23.4s, v30.4s // .....*......................................................................................................................................................................................................................|................................................................*....................................................................................................... + // sub v28.4s, v27.4s, v28.4s // .........*..................................................................................................................................................................................................................|....................................................................*................................................................................................... 
+ // mls v23.4s, v28.4s, v29.4s // .............*..............................................................................................................................................................................................................|........................................................................*............................................................................................... + // str q16, [x1, #(8*(512/8))] // ............................................................................................................................................................................................................................|......................*................................................................................................................................................. + // str q17, [x1, #(9*(512/8))] // ............................................................................................................................................................................................................................|...................................*.................................................................................................................................... + // str q18, [x1, #(10*(512/8))] // ............................................................................................................................................................................................................................|.......................................................*................................................................................................................ 
+ // str q19, [x1, #(11*(512/8))] // ............................................................................................................................................................................................................................|.........................................................*.............................................................................................................. + // str q20, [x1, #(12*(512/8))] // ................................*...........................................................................................................................................................................................|...........................................................................................*............................................................................ + // str q21, [x1, #(13*(512/8))] // ...................................*........................................................................................................................................................................................|..............................................................................................*......................................................................... + // str q22, [x1, #(14*(512/8))] // ...................*........................................................................................................................................................................................................|..............................................................................*......................................................................................... 
+ // str q23, [x1, #(15*(512/8))] // ......................*.....................................................................................................................................................................................................|.................................................................................*...................................................................................... + // mul v16.4s, v8.4s, v25.4s // ....................*.......................................................................................................................................................................................................|...............................................................................*........................................................................................ + // sqrdmulh v8.4s, v8.4s, v26.4s // .................*..........................................................................................................................................................................................................|............................................................................*........................................................................................... + // mls v16.4s, v8.4s, v29.4s // ..............................*.............................................................................................................................................................................................|.........................................................................................*.............................................................................. 
+ // mul v17.4s, v9.4s, v25.4s // ............................................................................................................................................................................................................................|.......................................*................................................................................................................................ + // sqrdmulh v9.4s, v9.4s, v26.4s // ............................................................................................................................................................................................................................|......................................*................................................................................................................................. + // mls v17.4s, v9.4s, v29.4s // .......*....................................................................................................................................................................................................................|..................................................................*..................................................................................................... + // mul v18.4s, v10.4s, v25.4s // ................................................*...........................................................................................................................................................................|...........................................................................................................*............................................................ 
+ // sqrdmulh v10.4s, v10.4s, v26.4s // .................................................*..........................................................................................................................................................................|............................................................................................................*........................................................... + // mls v18.4s, v10.4s, v29.4s // ........................................................*...................................................................................................................................................................|...................................................................................................................*.................................................... + // mul v19.4s, v11.4s, v25.4s // ............................................................................................................................................................................................................................|......................................................*................................................................................................................. + // sqrdmulh v11.4s, v11.4s, v26.4s // ............................................................................................................................................................................................................................|...................................................*.................................................................................................................... 
+ // mls v19.4s, v11.4s, v29.4s // ....*.......................................................................................................................................................................................................................|...............................................................*........................................................................................................ + // mul v20.4s, v12.4s, v25.4s // ...........................................*................................................................................................................................................................................|......................................................................................................*................................................................. + // sqrdmulh v12.4s, v12.4s, v26.4s // .........................................*..................................................................................................................................................................................|....................................................................................................*................................................................... + // mls v20.4s, v12.4s, v29.4s // ....................................................*.......................................................................................................................................................................|...............................................................................................................*........................................................ 
+ // mul v21.4s, v13.4s, v25.4s // ....................................*.......................................................................................................................................................................................|...............................................................................................*........................................................................ + // sqrdmulh v13.4s, v13.4s, v26.4s // ..................................*.........................................................................................................................................................................................|.............................................................................................*.......................................................................... + // mls v21.4s, v13.4s, v29.4s // ...................................................*........................................................................................................................................................................|..............................................................................................................*......................................................... + // mul v22.4s, v14.4s, v25.4s // ............................................................*...............................................................................................................................................................|.......................................................................................................................*................................................ 
+ // sqrdmulh v14.4s, v14.4s, v26.4s // .........................................................*..................................................................................................................................................................|....................................................................................................................*................................................... + // mls v22.4s, v14.4s, v29.4s // ..................................................................*.........................................................................................................................................................|.............................................................................................................................*.......................................... + // mul v23.4s, v15.4s, v25.4s // .............................................*..............................................................................................................................................................................|........................................................................................................*............................................................... + // sqrdmulh v15.4s, v15.4s, v26.4s // ..............................................*.............................................................................................................................................................................|.........................................................................................................*.............................................................. 
+ // mls v23.4s, v15.4s, v29.4s // ...............................................................*............................................................................................................................................................|..........................................................................................................................*............................................. + // cmge v27.4s, v31.4s, v16.4s // .....................................*......................................................................................................................................................................................|................................................................................................*....................................................................... + // cmge v28.4s, v16.4s, v30.4s // .......................................*....................................................................................................................................................................................|..................................................................................................*..................................................................... + // sub v28.4s, v27.4s, v28.4s // ...............................................*............................................................................................................................................................................|..........................................................................................................*............................................................. 
+ // mls v16.4s, v28.4s, v29.4s // ...............................................................................*............................................................................................................................................|..........................................................................................................................................*............................. + // cmge v27.4s, v31.4s, v17.4s // .......................*....................................................................................................................................................................................................|..................................................................................*..................................................................................... + // cmge v28.4s, v17.4s, v30.4s // ...............*............................................................................................................................................................................................................|..........................................................................*............................................................................................. + // sub v28.4s, v27.4s, v28.4s // .........................*..................................................................................................................................................................................................|....................................................................................*................................................................................... 
+ // mls v17.4s, v28.4s, v29.4s // ........................................*...................................................................................................................................................................................|...................................................................................................*.................................................................... + // cmge v27.4s, v31.4s, v18.4s // ..............................................................*.............................................................................................................................................................|.........................................................................................................................*.............................................. + // cmge v28.4s, v18.4s, v30.4s // ................................................................*...........................................................................................................................................................|...........................................................................................................................*............................................ + // sub v28.4s, v27.4s, v28.4s // ...................................................................*........................................................................................................................................................|..............................................................................................................................*......................................... 
+ // mls v18.4s, v28.4s, v29.4s // ........................................................................*...................................................................................................................................................|...................................................................................................................................*.................................... + // cmge v27.4s, v31.4s, v19.4s // .....................*......................................................................................................................................................................................................|................................................................................*....................................................................................... + // cmge v28.4s, v19.4s, v30.4s // ...........................*................................................................................................................................................................................................|......................................................................................*................................................................................. + // sub v28.4s, v27.4s, v28.4s // .............................*..............................................................................................................................................................................................|........................................................................................*............................................................................... 
+ // mls v19.4s, v28.4s, v29.4s // ......................................*.....................................................................................................................................................................................|.................................................................................................*...................................................................... + // cmge v27.4s, v31.4s, v20.4s // .............................................................*..............................................................................................................................................................|........................................................................................................................*............................................... + // cmge v28.4s, v20.4s, v30.4s // ...........................................................*................................................................................................................................................................|......................................................................................................................*................................................. + // sub v28.4s, v27.4s, v28.4s // .................................................................*..........................................................................................................................................................|............................................................................................................................*........................................... 
+ // mls v20.4s, v28.4s, v29.4s // .....................................................................*......................................................................................................................................................|................................................................................................................................*....................................... + // cmge v27.4s, v31.4s, v21.4s // .....................................................*......................................................................................................................................................................|................................................................................................................*....................................................... + // cmge v28.4s, v21.4s, v30.4s // .......................................................*....................................................................................................................................................................|..................................................................................................................*..................................................... + // sub v28.4s, v27.4s, v28.4s // ..........................................................*.................................................................................................................................................................|.....................................................................................................................*.................................................. 
+ // mls v21.4s, v28.4s, v29.4s // ...........................................................................*................................................................................................................................................|......................................................................................................................................*................................. + // cmge v27.4s, v31.4s, v22.4s // ..........................................................................*.................................................................................................................................................|.....................................................................................................................................*.................................. + // cmge v28.4s, v22.4s, v30.4s // ............................................................................*...............................................................................................................................................|.......................................................................................................................................*................................ + // sub v28.4s, v27.4s, v28.4s // .....................................................................................*......................................................................................................................................|................................................................................................................................................*....................... 
+ // mls v22.4s, v28.4s, v29.4s // .....................................................................................................*......................................................................................................................|................................................................................................................................................................*....... + // cmge v27.4s, v31.4s, v23.4s // .........................................................................*..................................................................................................................................................|....................................................................................................................................*................................... + // cmge v28.4s, v23.4s, v30.4s // .......................................................................*....................................................................................................................................................|..................................................................................................................................*..................................... + // sub v28.4s, v27.4s, v28.4s // ..............................................................................*.............................................................................................................................................|.........................................................................................................................................*.............................. 
+ // mls v23.4s, v28.4s, v29.4s // ......................................................................................*.....................................................................................................................................|.................................................................................................................................................*...................... + // str q16, [x1], #(16) // ........................................................................................................*...................................................................................................................|...................................................................................................................................................................*.... + // str q17, [x1, #(-16 + 1*(512/8))] // ............................................*...............................................................................................................................................................................|.......................................................................................................*................................................................ + // str q18, [x1, #(-16 + 2*(512/8))] // .........................................................................................*..................................................................................................................................|....................................................................................................................................................*................... 
+ // str q19, [x1, #(-16 + 3*(512/8))] // ..........................................*.................................................................................................................................................................................|.....................................................................................................*.................................................................. + // str q20, [x1, #(-16 + 4*(512/8))] // .............................................................................*..............................................................................................................................................|........................................................................................................................................*............................... + // str q21, [x1, #(-16 + 5*(512/8))] // ..........................................................................................*.................................................................................................................................|.....................................................................................................................................................*.................. 
+ // str q22, [x1, #(-16 + 6*(512/8))] // ............................................................................................................*...............................................................................................................|.......................................................................................................................................................................* + // str q23, [x1, #(-16 + 7*(512/8))] // ...................................................................................................*........................................................................................................................|..............................................................................................................................................................*......... + + sub count, count, #1 + cbnz count, layer1234_start + sqrdmulh v28.4S, v28.4S, v1.S[1] // .................................................................................................................................*...................................................................................................................................................... + add v23.4S, v27.4S, v23.4S // ....................................................................................................................................*................................................................................................................................................... + sqrdmulh v27.4S, v20.4S, v0.S[3] // .............................................................................................................*.......................................................................................................................................................................... 
+ mls v13.4S, v15.4S, v29.4S // .............................................................................................................................*.......................................................................................................................................................... + sub v15.4S, v11.4S, v23.4S // .......................................................................................................................................................*................................................................................................................................ + add v11.4S, v11.4S, v23.4S // ........................................................................................................................................................*............................................................................................................................... + mul v23.4S, v20.4S, v0.S[2] // ............................................................................................................*........................................................................................................................................................................... + cmge v20.4S, v19.4S, v30.4S // .................................................................................................................................................................................*...................................................................................................... + mls v22.4S, v28.4S, v29.4S // ..................................................................................................................................*..................................................................................................................................................... 
+ cmge v28.4S, v31.4S, v19.4S // ................................................................................................................................................................................*....................................................................................................... + mls v23.4S, v27.4S, v29.4S // ..............................................................................................................*......................................................................................................................................................................... + sub v28.4S, v28.4S, v20.4S // ..................................................................................................................................................................................*..................................................................................................... + sqrdmulh v20.4S, v15.4S, v0.S[1] // ..........................................................................................................................................................*............................................................................................................................. + mls v19.4S, v28.4S, v29.4S // ...................................................................................................................................................................................*.................................................................................................... + sqrdmulh v27.4S, v14.4S, v1.S[1] // ......................................................................................................................................*................................................................................................................................................. 
+ mul v28.4S, v15.4S, v0.S[0] // .........................................................................................................................................................*.............................................................................................................................. + str q19, [x1, #512] // ................................................................................................................................................................................................................*....................................................................... + mul v19.4S, v14.4S, v1.S[0] // .....................................................................................................................................*.................................................................................................................................................. + cmge v14.4S, v17.4S, v30.4S // .....................................................................................................................................................................................*.................................................................................................. + sqrdmulh v15.4S, v18.4S, v0.S[1] // .....................................................................................................................................................*.................................................................................................................................. + mls v19.4S, v27.4S, v29.4S // .......................................................................................................................................*................................................................................................................................................ 
+ cmge v27.4S, v31.4S, v17.4S // ....................................................................................................................................................................................*................................................................................................... + mul v18.4S, v18.4S, v0.S[0] // ....................................................................................................................................................*................................................................................................................................... + sub v14.4S, v27.4S, v14.4S // ......................................................................................................................................................................................*................................................................................................. + mls v18.4S, v15.4S, v29.4S // ......................................................................................................................................................*................................................................................................................................. + add v15.4S, v16.4S, v19.4S // ............................................................................................................................................................................*........................................................................................................... + mls v17.4S, v14.4S, v29.4S // .......................................................................................................................................................................................*................................................................................................ 
+ sub v16.4S, v16.4S, v19.4S // ...........................................................................................................................................................................*............................................................................................................ + str q17, [x1, #576] // .................................................................................................................................................................................................................*...................................................................... + mls v28.4S, v20.4S, v29.4S // ...........................................................................................................................................................*............................................................................................................................ + sub v27.4S, v23.4S, v22.4S // ......................................................................................................................................................................*................................................................................................................. + add v14.4S, v23.4S, v22.4S // .......................................................................................................................................................................*................................................................................................................ + cmge v22.4S, v18.4S, v30.4S // .........................................................................................................................................................................................*.............................................................................................. 
+ sqrdmulh v20.4S, v21.4S, v0.S[1] // ...............................................................................................................................................................*........................................................................................................................ + cmge v23.4S, v31.4S, v18.4S // ........................................................................................................................................................................................*............................................................................................... + mul v19.4S, v21.4S, v0.S[0] // ..............................................................................................................................................................*......................................................................................................................... + sub v17.4S, v24.4S, v13.4S // .................................................................................................................................................................*...................................................................................................................... + add v13.4S, v24.4S, v13.4S // ..................................................................................................................................................................*..................................................................................................................... + sub v22.4S, v23.4S, v22.4S // ..........................................................................................................................................................................................*............................................................................................. 
+ sqrdmulh v24.4S, v16.4S, v0.S[1] // ..............................................................................................................................................................................*......................................................................................................... + cmge v21.4S, v28.4S, v30.4S // .............................................................................................................................................................................................*.......................................................................................... + mul v23.4S, v16.4S, v0.S[0] // .............................................................................................................................................................................*.......................................................................................................... + cmge v16.4S, v31.4S, v28.4S // ............................................................................................................................................................................................*........................................................................................... + mls v18.4S, v22.4S, v29.4S // ...........................................................................................................................................................................................*............................................................................................ + mul v22.4S, v27.4S, v0.S[0] // ........................................................................................................................................................................*............................................................................................................... 
+ sqrdmulh v27.4S, v27.4S, v0.S[1] // .........................................................................................................................................................................*.............................................................................................................. + str q18, [x1, #640] // ..................................................................................................................................................................................................................*..................................................................... + sqrdmulh v18.4S, v17.4S, v0.S[1] // ....................................................................................................................................................................*................................................................................................................... + mls v19.4S, v20.4S, v29.4S // ................................................................................................................................................................*....................................................................................................................... + mul v20.4S, v17.4S, v0.S[0] // ...................................................................................................................................................................*.................................................................................................................... + mls v20.4S, v18.4S, v29.4S // .....................................................................................................................................................................*.................................................................................................................. 
+ cmge v18.4S, v31.4S, v19.4S // ................................................................................................................................................................................................*....................................................................................... + mls v23.4S, v24.4S, v29.4S // ...............................................................................................................................................................................*........................................................................................................ + cmge v24.4S, v19.4S, v30.4S // .................................................................................................................................................................................................*...................................................................................... + mls v22.4S, v27.4S, v29.4S // ..........................................................................................................................................................................*............................................................................................................. + sub v27.4S, v16.4S, v21.4S // ..............................................................................................................................................................................................*......................................................................................... + sub v18.4S, v18.4S, v24.4S // ..................................................................................................................................................................................................*..................................................................................... 
+ mls v28.4S, v27.4S, v29.4S // ...............................................................................................................................................................................................*........................................................................................ + cmge v27.4S, v31.4S, v20.4S // ....................................................................................................................................................................................................*................................................................................... + cmge v24.4S, v20.4S, v30.4S // .....................................................................................................................................................................................................*.................................................................................. + mul v17.4S, v9.4S, v25.4S // ...........................................................................................................................................................................................................................*............................................................ + cmge v16.4S, v23.4S, v30.4S // .............................................................................................................................................................................................................*.......................................................................... + sqrdmulh v9.4S, v9.4S, v26.4S // ............................................................................................................................................................................................................................*........................................................... 
+ cmge v21.4S, v31.4S, v23.4S // ............................................................................................................................................................................................................*........................................................................... + str q28, [x1, #704] // ...................................................................................................................................................................................................................*.................................................................... + sub v27.4S, v27.4S, v24.4S // ......................................................................................................................................................................................................*................................................................................. + mls v19.4S, v18.4S, v29.4S // ...................................................................................................................................................................................................*.................................................................................... + cmge v18.4S, v31.4S, v22.4S // ........................................................................................................................................................................................................*............................................................................... + sub v24.4S, v21.4S, v16.4S // ..............................................................................................................................................................................................................*......................................................................... 
+ mls v20.4S, v27.4S, v29.4S // .......................................................................................................................................................................................................*................................................................................ + cmge v27.4S, v22.4S, v30.4S // .........................................................................................................................................................................................................*.............................................................................. + mls v23.4S, v24.4S, v29.4S // ...............................................................................................................................................................................................................*........................................................................ + str q19, [x1, #768] // ....................................................................................................................................................................................................................*................................................................... + sub v18.4S, v18.4S, v27.4S // ..........................................................................................................................................................................................................*............................................................................. + sqrdmulh v27.4S, v8.4S, v26.4S // .........................................................................................................................................................................................................................*.............................................................. 
+ str q20, [x1, #832] // .....................................................................................................................................................................................................................*.................................................................. + mul v16.4S, v8.4S, v25.4S // ........................................................................................................................................................................................................................*............................................................... + str q23, [x1, #960] // .......................................................................................................................................................................................................................*................................................................ + mls v22.4S, v18.4S, v29.4S // ...........................................................................................................................................................................................................*............................................................................ + mul v18.4S, v10.4S, v25.4S // ..............................................................................................................................................................................................................................*......................................................... + sqrdmulh v10.4S, v10.4S, v26.4S // ...............................................................................................................................................................................................................................*........................................................ 
+ str q22, [x1, #896] // ......................................................................................................................................................................................................................*................................................................. + mul v19.4S, v11.4S, v25.4S // .................................................................................................................................................................................................................................*...................................................... + sqrdmulh v22.4S, v11.4S, v26.4S // ..................................................................................................................................................................................................................................*..................................................... + mls v19.4S, v22.4S, v29.4S // ...................................................................................................................................................................................................................................*.................................................... + mul v22.4S, v14.4S, v25.4S // ..........................................................................................................................................................................................................................................*............................................. + sqrdmulh v24.4S, v14.4S, v26.4S // ...........................................................................................................................................................................................................................................*............................................ 
+ cmge v8.4S, v31.4S, v19.4S // ............................................................................................................................................................................................................................................................*........................... + mls v17.4S, v9.4S, v29.4S // .............................................................................................................................................................................................................................*.......................................................... + cmge v11.4S, v19.4S, v30.4S // .............................................................................................................................................................................................................................................................*.......................... + mls v16.4S, v27.4S, v29.4S // ..........................................................................................................................................................................................................................*............................................................. + sub v27.4S, v8.4S, v11.4S // ..............................................................................................................................................................................................................................................................*......................... + mls v18.4S, v10.4S, v29.4S // ................................................................................................................................................................................................................................*....................................................... 
+ cmge v10.4S, v31.4S, v17.4S // ....................................................................................................................................................................................................................................................*................................... + cmge v8.4S, v17.4S, v30.4S // .....................................................................................................................................................................................................................................................*.................................. + mul v20.4S, v12.4S, v25.4S // ....................................................................................................................................................................................................................................*................................................... + cmge v11.4S, v31.4S, v16.4S // ................................................................................................................................................................................................................................................*....................................... + sqrdmulh v12.4S, v12.4S, v26.4S // .....................................................................................................................................................................................................................................*.................................................. + cmge v23.4S, v16.4S, v30.4S // .................................................................................................................................................................................................................................................*...................................... 
+ sub v10.4S, v10.4S, v8.4S // ......................................................................................................................................................................................................................................................*................................. + cmge v8.4S, v31.4S, v18.4S // ........................................................................................................................................................................................................................................................*............................... + mul v21.4S, v13.4S, v25.4S // .......................................................................................................................................................................................................................................*................................................ + sub v11.4S, v11.4S, v23.4S // ..................................................................................................................................................................................................................................................*..................................... + sqrdmulh v13.4S, v13.4S, v26.4S // ........................................................................................................................................................................................................................................*............................................... + cmge v23.4S, v18.4S, v30.4S // .........................................................................................................................................................................................................................................................*.............................. 
+ sqrdmulh v9.4S, v15.4S, v26.4S // ..............................................................................................................................................................................................................................................*......................................... + sub v8.4S, v8.4S, v23.4S // ..........................................................................................................................................................................................................................................................*............................. + mul v23.4S, v15.4S, v25.4S // .............................................................................................................................................................................................................................................*.......................................... + mls v22.4S, v24.4S, v29.4S // ............................................................................................................................................................................................................................................*........................................... + mls v20.4S, v12.4S, v29.4S // ......................................................................................................................................................................................................................................*................................................. + mls v23.4S, v9.4S, v29.4S // ...............................................................................................................................................................................................................................................*........................................ 
+ cmge v24.4S, v31.4S, v22.4S // ........................................................................................................................................................................................................................................................................*............... + mls v17.4S, v10.4S, v29.4S // .......................................................................................................................................................................................................................................................*................................ + cmge v10.4S, v22.4S, v30.4S // .........................................................................................................................................................................................................................................................................*.............. + cmge v15.4S, v31.4S, v20.4S // ................................................................................................................................................................................................................................................................*....................... + mls v21.4S, v13.4S, v29.4S // .........................................................................................................................................................................................................................................*.............................................. + cmge v13.4S, v20.4S, v30.4S // .................................................................................................................................................................................................................................................................*...................... 
+ sub v10.4S, v24.4S, v10.4S // ..........................................................................................................................................................................................................................................................................*............. + mls v16.4S, v11.4S, v29.4S // ...................................................................................................................................................................................................................................................*.................................... + cmge v24.4S, v31.4S, v23.4S // ............................................................................................................................................................................................................................................................................*........... + str q17, [x1, #64] // .................................................................................................................................................................................................................................................................................*...... + sub v13.4S, v15.4S, v13.4S // ..................................................................................................................................................................................................................................................................*..................... + mls v18.4S, v8.4S, v29.4S // ...........................................................................................................................................................................................................................................................*............................ 
+ cmge v15.4S, v23.4S, v30.4S // .............................................................................................................................................................................................................................................................................*.......... + cmge v8.4S, v31.4S, v21.4S // ....................................................................................................................................................................................................................................................................*................... + mls v20.4S, v13.4S, v29.4S // ...................................................................................................................................................................................................................................................................*.................... + cmge v13.4S, v21.4S, v30.4S // .....................................................................................................................................................................................................................................................................*.................. + str q16, [x1], #(16) // ................................................................................................................................................................................................................................................................................*....... + sub v15.4S, v24.4S, v15.4S // ..............................................................................................................................................................................................................................................................................*......... 
+ mls v19.4S, v27.4S, v29.4S // ...............................................................................................................................................................................................................................................................*........................ + str q18, [x1, #112] // ..................................................................................................................................................................................................................................................................................*..... + sub v18.4S, v8.4S, v13.4S // ......................................................................................................................................................................................................................................................................*................. + mls v23.4S, v15.4S, v29.4S // ...............................................................................................................................................................................................................................................................................*........ + str q20, [x1, #240] // ....................................................................................................................................................................................................................................................................................*... + mls v22.4S, v10.4S, v29.4S // ...........................................................................................................................................................................................................................................................................*............ 
+ str q19, [x1, #176] // ...................................................................................................................................................................................................................................................................................*.... + mls v21.4S, v18.4S, v29.4S // .......................................................................................................................................................................................................................................................................*................ + str q23, [x1, #432] // .......................................................................................................................................................................................................................................................................................* + str q22, [x1, #368] // ......................................................................................................................................................................................................................................................................................*. + str q21, [x1, #304] // .....................................................................................................................................................................................................................................................................................*.. 
+
+        pop_stack
+        ret
\ No newline at end of file
diff --git a/tests/ntt_dilithium/manual/intt_dilithium_1234_5678_opt_m1_firestorm.s b/tests/ntt_dilithium/manual/intt_dilithium_1234_5678_opt_m1_firestorm.s
new file mode 100644
index 0000000..3d99a3b
--- /dev/null
+++ b/tests/ntt_dilithium/manual/intt_dilithium_1234_5678_opt_m1_firestorm.s
@@ -0,0 +1,2096 @@
+///
+/// Copyright (c) 2022 Arm Limited
+/// Copyright (c) 2022 Hanno Becker
+/// Copyright (c) 2023 Amin Abdulrahman, Matthias Kannwischer
+/// SPDX-License-Identifier: MIT
+///
+/// Permission is hereby granted, free of charge, to any person obtaining a copy
+/// of this software and associated documentation files (the "Software"), to deal
+/// in the Software without restriction, including without limitation the rights
+/// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+/// copies of the Software, and to permit persons to whom the Software is
+/// furnished to do so, subject to the following conditions:
+///
+/// The above copyright notice and this permission notice shall be included in all
+/// copies or substantial portions of the Software.
+///
+/// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+/// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+/// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+/// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+/// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+/// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+/// SOFTWARE.
+///
+
+// Needed to provide ASM_LOAD directive
+#include
+
+// NOTE
+// We use a lot of trivial macros to simplify the parsing burden for Slothy
+// The macros are not unfolded by Slothy and thus interpreted as instructions,
+// which are easier to parse due to e.g. the lack of size specifiers and simpler
+// syntax for pre and post increment for loads and stores.
+
+// Eventually, Slothy should include a proper parser for AArch64,
+// but for initial investigations, the below is enough.
+
+.macro ldr_vo vec, base, offset
+        ldr qform_\vec, [\base, #\offset]
+.endm
+.macro ldr_vi vec, base, inc
+        ldr qform_\vec, [\base], #\inc
+.endm
+.macro str_vo vec, base, offset
+        str qform_\vec, [\base, #\offset]
+.endm
+.macro str_vi vec, base, inc
+        str qform_\vec, [\base], #\inc
+.endm
+.macro vsub d,a,b
+        sub \d\().4s, \a\().4s, \b\().4s
+.endm
+.macro vadd d,a,b
+        add \d\().4s, \a\().4s, \b\().4s
+.endm
+.macro vqrdmulh d,a,b
+        sqrdmulh \d\().4s, \a\().4s, \b\().4s
+.endm
+.macro vmul d,a,b
+        mul \d\().4s, \a\().4s, \b\().4s
+.endm
+.macro vmls d,a,b
+        mls \d\().4s, \a\().4s, \b\().4s
+.endm
+.macro vqrdmulhq d,a,b,i
+        sqrdmulh \d\().4s, \a\().4s, \b\().s[\i]
+.endm
+.macro vmulq d,a,b,i
+        mul \d\().4s, \a\().4s, \b\().s[\i]
+.endm
+.macro vmlsq d,a,b,i
+        mls \d\().4s, \a\().4s, \b\().s[\i]
+.endm
+.macro trn1_d d,a,b
+        trn1 \d\().2d, \a\().2d, \b\().2d
+.endm
+.macro trn2_d d,a,b
+        trn2 \d\().2d, \a\().2d, \b\().2d
+.endm
+.macro trn1_s d,a,b
+        trn1 \d\().4s, \a\().4s, \b\().4s
+.endm
+.macro trn2_s d,a,b
+        trn2 \d\().4s, \a\().4s, \b\().4s
+.endm
+
+.macro mulmodq dst, src, const, idx0, idx1
+        vmulq     \dst, \src, \const, \idx0      // dst = low 32 bits of src * const lane idx0
+        vqrdmulhq \src, \src, \const, \idx1      // src = rounded doubling high half of src * const lane idx1 (src is clobbered)
+        vmls      \dst, \src, modulus            // dst -= src * modulus
+.endm
+
+.macro mulmod dst, src, const, const_twisted
+        vmul      \dst, \src, \const             // dst = low 32 bits of src * const
+        vqrdmulh  \src, \src, \const_twisted     // src = rounded doubling high half of src * const_twisted (src is clobbered)
+        vmls      \dst, \src, modulus            // dst -= src * modulus
+.endm
+
+.macro montg_reduce a
+        srshr tmp.4S, \a\().4S, #23              // tmp = a shifted right by 23 (signed, rounding)
+        vmls \a, tmp, modulus                    // a -= tmp * modulus
+.endm
+
+.macro canonical_reduce a, modulus_half, neg_modulus_half, tmp1, tmp2
+        cmge \tmp1\().4s, \neg_modulus_half\().4s, \a\().4s   // tmp1 = -1 in lanes where a <= neg_modulus_half, else 0
+        cmge \tmp2\().4s, \a\().4s, \modulus_half\().4s       // tmp2 = -1 in lanes where a >= modulus_half, else 0
+        sub \tmp2\().4s, \tmp1\().4s, \tmp2\().4s             // tmp2 = tmp1 - tmp2, i.e. -1, 0 or +1 per lane
+        vmls \a, \tmp2, modulus                               // a -= tmp2 * modulus (adds or subtracts one modulus)
+.endm
+
+.macro gs_butterfly a, b, root, idx0, idx1
+        vsub tmp, \a, \b                         // tmp = a - b (uses the shared tmp alias)
+        vadd \a, \a, \b                          // a = a + b
+        mulmodq \b, tmp, \root, \idx0, \idx1     // b = tmp * root lane idx0, reduced; lane idx1 feeds the sqrdmulh step
+.endm
+
+.macro mulmod_v dst, src, const, const_twisted
+        vmul      \dst, \src, \const             // same three steps as mulmod
+        vqrdmulh  \src, \src, \const_twisted
+        vmls      \dst, \src, modulus
+.endm
+
+.macro gs_butterfly_v a, b, root, root_twisted
+        vsub tmp, \a, \b                         // tmp = a - b
+        vadd \a, \a, \b                          // a = a + b
+        mulmod \b, tmp, \root, \root_twisted     // b = tmp * root, reduced (expands mulmod, not mulmod_v)
+.endm
+
+.macro mul_ninv dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, src0, src1, src2, src3, src4, src5, src6, src7
+        mulmod \dst0, \src0, ninv, ninv_tw       // each dstN = srcN * ninv via mulmod; srcN is clobbered
+        mulmod \dst1, \src1, ninv, ninv_tw
+        mulmod \dst2, \src2, ninv, ninv_tw
+        mulmod \dst3, \src3, ninv, ninv_tw
+        mulmod \dst4, \src4, ninv, ninv_tw
+        mulmod \dst5, \src5, ninv, ninv_tw
+        mulmod \dst6, \src6, ninv, ninv_tw
+        mulmod \dst7, \src7, ninv, ninv_tw
+.endm
+
+.macro load_roots_1234 r_ptr
+        ldr_vi root0, \r_ptr, (8*16)             // load root0, then advance r_ptr by 8 vectors (128 bytes)
+        ldr_vo root1, \r_ptr, (-8*16 + 1*16)     // remaining loads index back from the advanced pointer
+        ldr_vo root2, \r_ptr, (-8*16 + 2*16)
+        ldr_vo root3, \r_ptr, (-8*16 + 3*16)
+        ldr_vo root4, \r_ptr, (-8*16 + 4*16)     // NOTE(review): no .req for root4..root7 is visible near this macro - confirm before use
+        ldr_vo root5, \r_ptr, (-8*16 + 5*16)
+        ldr_vo root6, \r_ptr, (-8*16 + 6*16)
+        ldr_vo root7, \r_ptr, (-8*16 + 7*16)
+.endm
+
+.macro load_next_roots_56 root0, r_ptr0
+        ldr_vi \root0, \r_ptr0, 16               // 16-byte load, pointer advances by 16
+.endm
+
+.macro load_next_roots_6 root0, r_ptr0
+        ldr_vi \root0, \r_ptr0, 8                // 16-byte load, but the pointer advances by only 8 bytes
+.endm
+
+.macro load_next_roots_78 root0, root0_tw, root1, root1_tw, root2, root2_tw, r_ptr1
+        ldr_vi \root0,    \r_ptr1, (6*16)            // load root0, then advance r_ptr1 by 6 vectors (96 bytes)
+        ldr_vo \root0_tw, \r_ptr1, (-6*16 + 1*16)    // remaining loads index back from the advanced pointer
+        ldr_vo \root1,    \r_ptr1, (-6*16 + 2*16)
+        ldr_vo \root1_tw, \r_ptr1, (-6*16 + 3*16)
+        ldr_vo \root2,    \r_ptr1, (-6*16 + 4*16)
+        ldr_vo \root2_tw, \r_ptr1, (-6*16 + 5*16)
+.endm
+
+.macro transpose4 data
+        trn1_s t0, \data\()0, \data\()1          // 4x4 transpose of 32-bit lanes across <data>0..<data>3; t0..t3 are scratch
+        trn2_s t1, \data\()0, \data\()1
+        trn1_s t2, \data\()2, \data\()3
+        trn2_s t3, \data\()2, \data\()3
+
+        trn2_d \data\()2, t0, t2                 // second stage interleaves 64-bit halves back into <data>0..<data>3
+        trn2_d \data\()3, t1, t3
+        trn1_d \data\()0, t0, t2
+        trn1_d \data\()1, t1, t3
+.endm
+
+// Spill x19..x29 (callee-saved under AAPCS64) into a fresh 96-byte stack area.
+.macro save_gprs // slothy:no-unfold
+        sub sp, sp, #(16*6)
+        stp x19, x20, [sp, #16*0]
+        stp x21, x22, [sp, #16*1]
+        stp x23, x24, [sp, #16*2]
+        stp x25, x26, [sp, #16*3]
+        stp x27, x28, [sp, #16*4]
+        str x29, [sp, #16*5]
+.endm
+
+.macro restore_gprs // slothy:no-unfold
+        ldp x19, x20, [sp, #16*0]                // reload in the layout written by save_gprs
+        ldp x21, x22, [sp, #16*1]
+        ldp x23, x24, [sp, #16*2]
+        ldp x25, x26, [sp, #16*3]
+        ldp x27, x28, [sp, #16*4]
+        ldr x29, [sp, #16*5]
+        add sp, sp, #(16*6)                      // release the 96-byte area
+.endm
+
+.macro save_vregs // slothy:no-unfold
+        sub sp, sp, #(16*4)                      // reserve 64 bytes
+        stp d8, d9, [sp, #16*0]                  // d-form stores: only the low 64 bits of v8..v15 are saved
+        stp d10, d11, [sp, #16*1]
+        stp d12, d13, [sp, #16*2]
+        stp d14, d15, [sp, #16*3]
+.endm
+
+.macro restore_vregs // slothy:no-unfold
+        ldp d8, d9, [sp, #16*0]                  // reload in the layout written by save_vregs
+        ldp d10, d11, [sp, #16*1]
+        ldp d12, d13, [sp, #16*2]
+        ldp d14, d15, [sp, #16*3]
+        add sp, sp, #(16*4)                      // release the 64-byte area
+.endm
+
+#define STACK_SIZE 16
+#define STACK0 0
+
+.macro restore a, loc // slothy:no-unfold
+        ldr \a, [sp, #\loc\()]
+.endm
+.macro save loc, a // slothy:no-unfold
+        str \a, [sp, #\loc\()]
+.endm
+.macro push_stack // slothy:no-unfold
+        save_gprs
+        save_vregs
+        sub sp, sp, #STACK_SIZE                  // scratch slots addressed by the save/restore macros
+.endm
+
+.macro pop_stack // slothy:no-unfold
+        add sp, sp, #STACK_SIZE                  // undo push_stack in reverse order
+        restore_vregs
+        restore_gprs
+.endm
+
+.data
+.p2align 4
+roots:
+#include "intt_dilithium_1234_5678_twiddles.s"
+.text
+
+        .global intt_dilithium_1234_5678_opt_m1_firestorm
+        .global _intt_dilithium_1234_5678_opt_m1_firestorm
+
+.p2align 4
+modulus_addr:   .quad 8380417
+ninv_addr:      .quad 16382
+ninv_tw_addr:   .quad 4197891
+intt_dilithium_1234_5678_opt_m1_firestorm:
+_intt_dilithium_1234_5678_opt_m1_firestorm:
+        push_stack
+
+        inp     .req x0
+        in      .req x1
+        count   .req x2
+        r_ptr0  .req x3
+        r_ptr1  .req x4
+        xtmp    .req x5
+
+        data0  .req v8
+        data1  .req v9
+        data2  .req v10
+        data3  .req v11
+        data4  .req v12
+        data5  .req v13
+        data6  .req v14
+        data7  .req v15
+        data8  .req v16
+        data9  .req v17
+        data10 .req v18
+        data11 .req v19
+        data12 .req v20
+        data13 .req v21
+        data14 .req v22
+        data15 .req v23
+
+        qform_data0  .req q8
+        qform_data1  .req q9
+        qform_data2  .req q10
+        qform_data3  .req q11
+
qform_data4 .req q12 + qform_data5 .req q13 + qform_data6 .req q14 + qform_data7 .req q15 + qform_data8 .req q16 + qform_data9 .req q17 + qform_data10 .req q18 + qform_data11 .req q19 + qform_data12 .req q20 + qform_data13 .req q21 + qform_data14 .req q22 + qform_data15 .req q23 + + root0 .req v0 + root1 .req v1 + root2 .req v2 + root3 .req v3 + root0_tw .req v4 + root1_tw .req v5 + root2_tw .req v6 + root3_tw .req v7 + + + qform_root0 .req q0 + qform_root1 .req q1 + qform_root2 .req q2 + qform_root3 .req q3 + qform_root0_tw .req q4 + qform_root1_tw .req q5 + qform_root2_tw .req q6 + qform_root3_tw .req q7 + + + tmp .req v24 + qform_tmp .req q24 + t0 .req v25 + t1 .req v26 + t2 .req v27 + t3 .req v28 + + modulus .req v29 + + ASM_LOAD(r_ptr0, roots) + ASM_LOAD(r_ptr1, roots_l45) + + ASM_LOAD(xtmp, modulus_addr) + ld1r {modulus.4s}, [xtmp] + + save STACK0, inp + + mov count, #16 + + .p2align 2 + ld4 {v10.4S, v11.4S, v12.4S, v13.4S}, [x0] // *.......................................... + ldr q18, [x3, #32] // .*......................................... + ldr q22, [x3, #48] // ..*........................................ + ldr q26, [x3, #64] // ...*....................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + ldr q7, [x3, #80] // ....*...................................... + ldr q27, [x3, #16] // .........*................................. + ldr q15, [x3], #(6*16) // .................*......................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + ldr q19, [x4], #8 // .......................................*... 
+ // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + sub v30.4S, v10.4S, v11.4S // .....*..................................... + add v10.4S, v10.4S, v11.4S // .......*................................... + sub v24.4S, v12.4S, v13.4S // ......*.................................... 
+ add v13.4S, v12.4S, v13.4S // ........*.................................. + ldr q11, [x4], #16 // .........................................*. + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + mul v18.4S, v30.4S, v18.4S // ..........*................................ + sqrdmulh v22.4S, v30.4S, v22.4S // ...........*............................... + mul v26.4S, v24.4S, v26.4S // ............*.............................. + sqrdmulh v7.4S, v24.4S, v7.4S // .............*............................. + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + sub v30.4S, v10.4S, v13.4S // ..............*............................ + add v10.4S, v10.4S, v13.4S // ...................*....................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... 
+ // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + mls v18.4S, v22.4S, v29.4S // ...............*........................... + mls v26.4S, v7.4S, v29.4S // ................*.......................... + sqrdmulh v22.4S, v30.4S, v27.4S // ..................*........................ + mul v7.4S, v30.4S, v15.4S // ......................*.................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + sub v13.4S, v18.4S, v26.4S // ....................*...................... + add v18.4S, v18.4S, v26.4S // ........................*.................. + mls v7.4S, v22.4S, v29.4S // ..........................*................ + // gap // ........................................... 
+ // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + trn2 v30.4S, v10.4S, v18.4S // ...........................*............... + sqrdmulh v22.4S, v13.4S, v27.4S // .....................*..................... + mul v26.4S, v13.4S, v15.4S // .......................*................... + trn1 v10.4S, v10.4S, v18.4S // ............................*.............. + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... 
+ // gap // ........................................... + // gap // ........................................... + mls v26.4S, v22.4S, v29.4S // .........................*................. + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + trn1 v22.4S, v7.4S, v26.4S // .............................*............. + trn2 v18.4S, v7.4S, v26.4S // ..............................*............ + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... 
+ // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + trn2 v28.2D, v30.2D, v18.2D // .................................*......... + trn1 v9.2D, v30.2D, v18.2D // ..................................*........ + trn1 v17.2D, v10.2D, v22.2D // ...............................*........... + trn2 v13.2D, v10.2D, v22.2D // ................................*.......... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + add v3.4S, v17.4S, v9.4S // ...................................*....... + add v2.4S, v13.4S, v28.4S // ....................................*...... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... 
+ // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + add v10.4S, v3.4S, v2.4S // .....................................*..... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + srshr v18.4S, v10.4S, #23 // ......................................*.... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... 
+ // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + mls v10.4S, v18.4S, v29.4S // ........................................*.. + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... 
+ // gap // ........................................... + str q10, [x0], #(16*4) // ..........................................* + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + + // original source code + // ld4 {v14.4S, v15.4S, v16.4S, v17.4S}, [x0] // *.......................................... + // ldr q18, [x3, #32] // .*......................................... + // ldr q23, [x3, #48] // ..*........................................ + // ldr q12, [x3, #64] // ...*....................................... + // ldr q24, [x3, #80] // ....*...................................... + // sub v5.4S, v14.4S, v15.4S // ........*.................................. + // sub v6.4S, v16.4S, v17.4S // ..........*................................ + // add v21.4S, v14.4S, v15.4S // .........*................................. + // add v30.4S, v16.4S, v17.4S // ...........*............................... + // ldr q14, [x3, #16] // .....*..................................... + // mul v1.4S, v5.4S, v18.4S // .............*............................. + // sqrdmulh v0.4S, v5.4S, v23.4S // ..............*............................ + // mul v16.4S, v6.4S, v12.4S // ...............*........................... + // sqrdmulh v9.4S, v6.4S, v24.4S // ................*.......................... + // sub v6.4S, v21.4S, v30.4S // .................*......................... + // mls v1.4S, v0.4S, v29.4S // ...................*....................... + // mls v16.4S, v9.4S, v29.4S // ....................*...................... + // ldr q12, [x3], #(6*16) // ......*.................................... 
+ // sqrdmulh v18.4S, v6.4S, v14.4S // .....................*..................... + // add v20.4S, v21.4S, v30.4S // ..................*........................ + // sub v31.4S, v1.4S, v16.4S // .......................*................... + // sqrdmulh v22.4S, v31.4S, v14.4S // ...........................*............... + // mul v15.4S, v6.4S, v12.4S // ......................*.................... + // mul v14.4S, v31.4S, v12.4S // ............................*.............. + // add v2.4S, v1.4S, v16.4S // ........................*.................. + // mls v14.4S, v22.4S, v29.4S // ..............................*............ + // mls v15.4S, v18.4S, v29.4S // .........................*................. + // trn2 v9.4S, v20.4S, v2.4S // ..........................*................ + // trn1 v2.4S, v20.4S, v2.4S // .............................*............. + // trn1 v18.4S, v15.4S, v14.4S // ...............................*........... + // trn2 v10.4S, v15.4S, v14.4S // ................................*.......... + // trn1 v17.2D, v2.2D, v18.2D // ...................................*....... + // trn2 v13.2D, v2.2D, v18.2D // ....................................*...... + // trn2 v28.2D, v9.2D, v10.2D // .................................*......... + // trn1 v9.2D, v9.2D, v10.2D // ..................................*........ + // add v3.4S, v17.4S, v9.4S // .....................................*..... + // add v2.4S, v13.4S, v28.4S // ......................................*.... + // add v21.4S, v3.4S, v2.4S // .......................................*... + // srshr v12.4S, v21.4S, #23 // ........................................*.. + // ldr q19, [x4], #8 // .......*................................... + // mls v21.4S, v12.4S, v29.4S // .........................................*. + // ldr q11, [x4], #16 // ............*.............................. 
+ // str q21, [x0], #(16*4) // ..........................................* + + sub count, count, #1 +layer5678_start: + sub v26.4S, v17.4S, v9.4S // .....................................*........................... + // gap // ................................................................. + // gap // ................................................................. + ld4 {v14.4S, v15.4S, v16.4S, v17.4S}, [x0] // e................................................................ + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + mul v4.4S, v26.4S, v11.S[0] // .......................................*......................... + sqrdmulh v8.4S, v26.4S, v11.S[1] // ........................................*........................ + ldr q18, [x3, #32] // ...e............................................................. + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. 
+ // gap // ................................................................. + // gap // ................................................................. + ldr q23, [x3, #48] // ....e............................................................ + ldr q12, [x3, #64] // .....e........................................................... + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + ldr q24, [x3, #80] // ......e.......................................................... + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + sub v2.4S, v3.4S, v2.4S // ...............................................*................. + mls v4.4S, v8.4S, v29.4S // .........................................*....................... + sub v8.4S, v13.4S, v28.4S // ..........................................*...................... + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. 
+ // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + sub v5.4S, v14.4S, v15.4S // .......e......................................................... + sub v6.4S, v16.4S, v17.4S // ............e.................................................... + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + sqrdmulh v28.4S, v8.4S, v11.S[3] // .............................................*................... + add v21.4S, v14.4S, v15.4S // ........e........................................................ + add v30.4S, v16.4S, v17.4S // .............e................................................... + ldr q14, [x3, #16] // ..e.............................................................. + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + mul v1.4S, v5.4S, v18.4S // .........e....................................................... + sqrdmulh v0.4S, v5.4S, v23.4S // ..........e...................................................... + mul v16.4S, v6.4S, v12.4S // ..............e.................................................. + sqrdmulh v9.4S, v6.4S, v24.4S // ...............e................................................. + // gap // ................................................................. 
+ // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + mul v23.4S, v8.4S, v11.S[2] // ............................................*.................... + sqrdmulh v17.4S, v2.4S, v19.S[1] // ..................................................*.............. + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + sub v6.4S, v21.4S, v30.4S // .................e............................................... + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + mls v1.4S, v0.4S, v29.4S // ...........e..................................................... + mls v16.4S, v9.4S, v29.4S // ................e................................................ + // gap // ................................................................. 
+ // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + mls v23.4S, v28.4S, v29.4S // ..............................................*.................. + // gap // ................................................................. + // gap // ................................................................. + ldr q12, [x3], #(6*16) // .e............................................................... + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + sqrdmulh v18.4S, v6.4S, v14.4S // ....................e............................................ + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + add v20.4S, v21.4S, v30.4S // ..................e.............................................. + sub v31.4S, v1.4S, v16.4S // ......................e.......................................... + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. 
+ // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + add v28.4S, v4.4S, v23.4S // .....................................................*........... + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + sqrdmulh v22.4S, v31.4S, v14.4S // .........................e....................................... + mul v15.4S, v6.4S, v12.4S // ...................e............................................. + mul v14.4S, v31.4S, v12.4S // ........................e........................................ + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + sub v25.4S, v4.4S, v23.4S // ....................................................*............ + mul v12.4S, v2.4S, v19.S[0] // .................................................*............... + srshr v6.4S, v28.4S, #23 // ...........................................................*..... + // gap // ................................................................. 
+ // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + add v2.4S, v1.4S, v16.4S // .......................e......................................... + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + mls v14.4S, v22.4S, v29.4S // ..........................e...................................... + mls v15.4S, v18.4S, v29.4S // .....................e........................................... + sqrdmulh v31.4S, v25.4S, v19.S[1] // .......................................................*......... + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + mls v12.4S, v17.4S, v29.4S // ...................................................*............. + mls v28.4S, v6.4S, v29.4S // ............................................................*.... + // gap // ................................................................. + // gap // ................................................................. 
+ // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + trn2 v9.4S, v20.4S, v2.4S // ............................e.................................... + trn1 v2.4S, v20.4S, v2.4S // ...........................e..................................... + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + trn1 v18.4S, v15.4S, v14.4S // .............................e................................... + trn2 v10.4S, v15.4S, v14.4S // ..............................e.................................. + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + str q12, [x0, #-32] // ...............................................................*. + mul v12.4S, v25.4S, v19.S[0] // ......................................................*.......... + // gap // ................................................................. + // gap // ................................................................. 
+ // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + trn1 v17.2D, v2.2D, v18.2D // .................................e............................... + trn2 v13.2D, v2.2D, v18.2D // ...............................e................................. + str q28, [x0, #-48] // ..............................................................*.. + trn2 v28.2D, v9.2D, v10.2D // ................................e................................ + trn1 v9.2D, v9.2D, v10.2D // ..................................e.............................. + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + add v3.4S, v17.4S, v9.4S // ......................................e.......................... + add v2.4S, v13.4S, v28.4S // ...........................................e..................... + // gap // ................................................................. 
+ // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + mls v12.4S, v31.4S, v29.4S // ........................................................*........ + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + add v21.4S, v3.4S, v2.4S // ................................................e................ + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. 
+ // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + str q12, [x0, #-16] // ................................................................* + srshr v12.4S, v21.4S, #23 // .........................................................e....... + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + ldr q19, [x4], #8 // ...................................e............................. + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. 
+ // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + mls v21.4S, v12.4S, v29.4S // ..........................................................e...... + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + ldr q11, [x4], #16 // ....................................e............................ + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. 
+ // gap // ................................................................. + // gap // ................................................................. + str q21, [x0], #(16*4) // .............................................................e... + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + + // original source code + // ld4 {v8.4S, v9.4S, v10.4S, v11.4S}, [x0] // e...............................................................|e.......................................................... + // ldr q0, [x3], #(6*16) // ..........................e.....................................|..........................e................................ + // ldr q4, [x3, #(-6*16 + 1*16)] // ...............e................................................|...............e........................................... + // ldr q1, [x3, #(-6*16 + 2*16)] // ...e............................................................|...e....................................................... + // ldr q5, [x3, #(-6*16 + 3*16)] // ....e...........................................................|....e...................................................... + // ldr q2, [x3, #(-6*16 + 4*16)] // .....e..........................................................|.....e..................................................... + // ldr q6, [x3, #(-6*16 + 5*16)] // ......e.........................................................|......e.................................................... 
+ // sub v24.4s, v8.4s, v9.4s // ..........e.....................................................|..........e................................................ + // add v8.4s, v8.4s, v9.4s // .............e..................................................|.............e............................................. + // mul v9.4s, v24.4s, v1.4s // ................e...............................................|................e.......................................... + // sqrdmulh v24.4s, v24.4s, v5.4s // .................e..............................................|.................e......................................... + // mls v9.4s, v24.4s, v29.4s // .......................e........................................|.......................e................................... + // sub v24.4s, v10.4s, v11.4s // ...........e....................................................|...........e............................................... + // add v10.4s, v10.4s, v11.4s // ..............e.................................................|..............e............................................ + // mul v11.4s, v24.4s, v2.4s // ..................e.............................................|..................e........................................ + // sqrdmulh v24.4s, v24.4s, v6.4s // ...................e............................................|...................e....................................... + // mls v11.4s, v24.4s, v29.4s // ........................e.......................................|........................e.................................. + // sub v24.4s, v8.4s, v10.4s // ......................e.........................................|......................e.................................... + // add v8.4s, v8.4s, v10.4s // ............................e...................................|............................e.............................. 
+ // mul v10.4s, v24.4s, v0.4s // ................................e...............................|................................e.......................... + // sqrdmulh v24.4s, v24.4s, v4.4s // ...........................e....................................|...........................e............................... + // mls v10.4s, v24.4s, v29.4s // .......................................e........................|.......................................e................... + // sub v24.4s, v9.4s, v11.4s // .............................e..................................|.............................e............................. + // add v9.4s, v9.4s, v11.4s // .....................................e..........................|.....................................e..................... + // mul v11.4s, v24.4s, v0.4s // .................................e..............................|.................................e......................... + // sqrdmulh v24.4s, v24.4s, v4.4s // ...............................e................................|...............................e........................... + // mls v11.4s, v24.4s, v29.4s // ......................................e.........................|......................................e.................... + // trn1 v25.4s, v8.4s, v9.4s // ............................................e...................|............................................e.............. + // trn2 v26.4s, v8.4s, v9.4s // ...........................................e....................|...........................................e............... + // trn1 v27.4s, v10.4s, v11.4s // .............................................e..................|.............................................e............. + // trn2 v28.4s, v10.4s, v11.4s // ..............................................e.................|..............................................e............ 
+ // trn2 v10.2d, v25.2d, v27.2d // ..................................................e.............|..................................................e........ + // trn2 v11.2d, v26.2d, v28.2d // ....................................................e...........|....................................................e...... + // trn1 v8.2d, v25.2d, v27.2d // .................................................e..............|.................................................e......... + // trn1 v9.2d, v26.2d, v28.2d // .....................................................e..........|.....................................................e..... + // ldr q1, [x4], #8 // ............................................................e...|........................................................... + // ldr q0, [x4], #16 // ..............................................................e.|........................................................... + // sub v24.4s, v8.4s, v9.4s // ................................................................*........................................................... + // add v8.4s, v8.4s, v9.4s // ......................................................e.........|......................................................e.... + // mul v9.4s, v24.4s, v0.s[0] // .*..............................................................|.*......................................................... + // sqrdmulh v24.4s, v24.4s, v0.s[1] // ..*.............................................................|..*........................................................ + // mls v9.4s, v24.4s, v29.4s // ........*.......................................................|........*.................................................. + // sub v24.4s, v10.4s, v11.4s // .........*......................................................|.........*................................................. 
+ // add v10.4s, v10.4s, v11.4s // .......................................................e........|.......................................................e... + // mul v11.4s, v24.4s, v0.s[2] // ....................*...........................................|....................*...................................... + // sqrdmulh v24.4s, v24.4s, v0.s[3] // ............*...................................................|............*.............................................. + // mls v11.4s, v24.4s, v29.4s // .........................*......................................|.........................*................................. + // sub v24.4s, v8.4s, v10.4s // .......*........................................................|.......*................................................... + // add v8.4s, v8.4s, v10.4s // .........................................................e......|.........................................................e. + // mul v10.4s, v24.4s, v1.s[0] // ...................................*............................|...................................*....................... + // sqrdmulh v24.4s, v24.4s, v1.s[1] // .....................*..........................................|.....................*..................................... + // mls v10.4s, v24.4s, v29.4s // .........................................*......................|.........................................*................. + // sub v24.4s, v9.4s, v11.4s // ..................................*.............................|..................................*........................ + // add v9.4s, v9.4s, v11.4s // ..............................*.................................|..............................*............................ + // mul v11.4s, v24.4s, v1.s[0] // ................................................*...............|................................................*.......... 
+ // sqrdmulh v24.4s, v24.4s, v1.s[1] // ........................................*.......................|........................................*.................. + // mls v11.4s, v24.4s, v29.4s // ........................................................*.......|........................................................*.. + // srshr v24.4S, v8.4S, #23 // ...........................................................e....|........................................................... + // mls v8.4s, v24.4s, v29.4s // .............................................................e..|........................................................... + // srshr v24.4S, v9.4S, #23 // ....................................*...........................|....................................*...................... + // mls v9.4s, v24.4s, v29.4s // ..........................................*.....................|..........................................*................ + // str q8, [x0], #(16*4) // ...............................................................e|........................................................... + // str q9, [x0, #(-16*4 + 1*16)] // ...................................................*............|...................................................*....... + // str q10, [x0, #(-16*4 + 2*16)] // ...............................................*................|...............................................*........... + // str q11, [x0, #(-16*4 + 3*16)] // ..........................................................*.....|..........................................................* + + sub count, count, #1 + cbnz count, layer5678_start + sub v10.4S, v17.4S, v9.4S // *..................... + sub v18.4S, v3.4S, v2.4S // ...*.................. + sub v22.4S, v13.4S, v28.4S // .....*................ + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... 
+ // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + mul v26.4S, v10.4S, v11.S[0] // .*.................... + sqrdmulh v10.4S, v10.4S, v11.S[1] // ..*................... + sqrdmulh v7.4S, v22.4S, v11.S[3] // ......*............... + mul v22.4S, v22.4S, v11.S[2] // .......*.............. + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + sqrdmulh v30.4S, v18.4S, v19.S[1] // ........*............. + mul v18.4S, v18.4S, v19.S[0] // ............*......... + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + mls v26.4S, v10.4S, v29.4S // ....*................. + mls v22.4S, v7.4S, v29.4S // .........*............ + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + mls v18.4S, v30.4S, v29.4S // ...............*...... + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... 
+ // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + sub v10.4S, v26.4S, v22.4S // ...........*.......... + add v22.4S, v26.4S, v22.4S // ..........*........... + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + str q18, [x0, #-32] // .................*.... + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + sqrdmulh v18.4S, v10.4S, v19.S[1] // ..............*....... + mul v10.4S, v10.4S, v19.S[0] // ..................*... + srshr v26.4S, v22.4S, #23 // .............*........ + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + mls v22.4S, v26.4S, v29.4S // ................*..... + mls v10.4S, v18.4S, v29.4S // ....................*. + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... 
+ // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + str q22, [x0, #-48] // ...................*.. + str q10, [x0, #-16] // .....................* + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + + // original source code + // sub v26.4S, v17.4S, v9.4S // *..................... + // mul v4.4S, v26.4S, v11.S[0] // ...*.................. + // sqrdmulh v8.4S, v26.4S, v11.S[1] // ....*................. + // sub v2.4S, v3.4S, v2.4S // .*.................... + // mls v4.4S, v8.4S, v29.4S // .........*............ + // sub v8.4S, v13.4S, v28.4S // ..*................... + // sqrdmulh v28.4S, v8.4S, v11.S[3] // .....*................ + // mul v23.4S, v8.4S, v11.S[2] // ......*............... + // sqrdmulh v17.4S, v2.4S, v19.S[1] // .......*.............. + // mls v23.4S, v28.4S, v29.4S // ..........*........... + // add v28.4S, v4.4S, v23.4S // .............*........ + // sub v25.4S, v4.4S, v23.4S // ............*......... + // mul v12.4S, v2.4S, v19.S[0] // ........*............. + // srshr v6.4S, v28.4S, #23 // .................*.... + // sqrdmulh v31.4S, v25.4S, v19.S[1] // ...............*...... + // mls v12.4S, v17.4S, v29.4S // ...........*.......... + // mls v28.4S, v6.4S, v29.4S // ..................*... + // str q12, [x0, #-32] // ..............*....... 
+ // mul v12.4S, v25.4S, v19.S[0] // ................*..... + // str q28, [x0, #-48] // ....................*. + // mls v12.4S, v31.4S, v29.4S // ...................*.. + // str q12, [x0, #-16] // .....................* + + + .unreq root0_tw + .unreq root1_tw + .unreq root2_tw + .unreq root3_tw + .unreq qform_root0_tw + .unreq qform_root1_tw + .unreq qform_root2_tw + .unreq qform_root3_tw + .unreq t0 + .unreq t1 + + root4 .req v4 + root5 .req v5 + root6 .req v6 + root7 .req v7 + qform_root4 .req q4 + qform_root5 .req q5 + qform_root6 .req q6 + qform_root7 .req q7 + ninv .req v25 + ninv_tw .req v26 + modulus_half .req v30 + neg_modulus_half .req v31 + + + restore in, STACK0 + mov count, #4 + + ASM_LOAD(xtmp, ninv_addr) + ld1r {ninv.4s}, [xtmp] + ASM_LOAD(xtmp, ninv_tw_addr) + ld1r {ninv_tw.4s}, [xtmp] + + ushr modulus_half.4S, modulus.4S, #1 + neg neg_modulus_half.4S, modulus_half.4S + + load_roots_1234 r_ptr1 + + .p2align 2 + ldr q11, [x1, #128] // ..*..................................................................................................................................................................................................................................................................................... + ldr q9, [x1, #320] // .....*.................................................................................................................................................................................................................................................................................. + ldr q12, [x1, #256] // ....*................................................................................................................................................................................................................................................................................... 
+ ldr q22, [x1, #192] // ...*.................................................................................................................................................................................................................................................................................... + ldr q20, [x1, #0] // *....................................................................................................................................................................................................................................................................................... + ldr q18, [x1, #64] // .*...................................................................................................................................................................................................................................................................................... + ldr q27, [x1, #512] // ........*............................................................................................................................................................................................................................................................................... + ldr q24, [x1, #576] // .........*.............................................................................................................................................................................................................................................................................. + ldr q21, [x1, #704] // ...........*............................................................................................................................................................................................................................................................................ 
+ ldr q15, [x1, #448] // .......*................................................................................................................................................................................................................................................................................ + ldr q14, [x1, #640] // ..........*............................................................................................................................................................................................................................................................................. + sub v10.4S, v12.4S, v9.4S // ..........................*............................................................................................................................................................................................................................................................. + ldr q19, [x1, #384] // ......*................................................................................................................................................................................................................................................................................. + sub v16.4S, v11.4S, v22.4S // .....................*.................................................................................................................................................................................................................................................................. + add v8.4S, v11.4S, v22.4S // ......................*................................................................................................................................................................................................................................................................. 
+ sub v22.4S, v20.4S, v18.4S // ................*....................................................................................................................................................................................................................................................................... + sub v11.4S, v27.4S, v24.4S // ....................................*................................................................................................................................................................................................................................................... + add v17.4S, v27.4S, v24.4S // .....................................*.................................................................................................................................................................................................................................................. + mul v13.4S, v10.4S, v4.S[2] // ............................*........................................................................................................................................................................................................................................................... + sqrdmulh v10.4S, v10.4S, v4.S[3] // .............................*.......................................................................................................................................................................................................................................................... + add v24.4S, v12.4S, v9.4S // ...........................*............................................................................................................................................................................................................................................................ 
+ add v18.4S, v20.4S, v18.4S // .................*...................................................................................................................................................................................................................................................................... + sub v28.4S, v14.4S, v21.4S // .........................................*.............................................................................................................................................................................................................................................. + sqrdmulh v23.4S, v16.4S, v4.S[1] // ........................*............................................................................................................................................................................................................................................................... + sqrdmulh v9.4S, v11.4S, v5.S[3] // .......................................*................................................................................................................................................................................................................................................ + mul v20.4S, v11.4S, v5.S[2] // ......................................*................................................................................................................................................................................................................................................. + mul v11.4S, v16.4S, v4.S[0] // .......................*................................................................................................................................................................................................................................................................ 
+ sub v12.4S, v19.4S, v15.4S // ...............................*........................................................................................................................................................................................................................................................ + add v14.4S, v14.4S, v21.4S // ..........................................*............................................................................................................................................................................................................................................. + mul v21.4S, v28.4S, v6.S[0] // ...........................................*............................................................................................................................................................................................................................................ + sqrdmulh v28.4S, v28.4S, v6.S[1] // ............................................*........................................................................................................................................................................................................................................... + mls v13.4S, v10.4S, v29.4S // ..............................*......................................................................................................................................................................................................................................................... + ldr q10, [x1, #832] // .............*.......................................................................................................................................................................................................................................................................... 
+ ldr q16, [x1, #768] // ............*........................................................................................................................................................................................................................................................................... + sqrdmulh v27.4S, v12.4S, v5.S[1] // ..................................*..................................................................................................................................................................................................................................................... + add v19.4S, v19.4S, v15.4S // ................................*....................................................................................................................................................................................................................................................... + mul v15.4S, v12.4S, v5.S[0] // .................................*...................................................................................................................................................................................................................................................... + ldr q12, [x1, #896] // ..............*......................................................................................................................................................................................................................................................................... + mls v11.4S, v23.4S, v29.4S // .........................*.............................................................................................................................................................................................................................................................. 
+ ldr q23, [x1, #960] // ...............*........................................................................................................................................................................................................................................................................ + mls v20.4S, v9.4S, v29.4S // ........................................*............................................................................................................................................................................................................................................... + sub v9.4S, v18.4S, v8.4S // ........................................................*............................................................................................................................................................................................................................... + add v8.4S, v18.4S, v8.4S // .........................................................*.............................................................................................................................................................................................................................. + mul v18.4S, v22.4S, v3.S[2] // ..................*..................................................................................................................................................................................................................................................................... + mls v21.4S, v28.4S, v29.4S // .............................................*.......................................................................................................................................................................................................................................... 
+ add v28.4S, v24.4S, v19.4S // ...................................................................*.................................................................................................................................................................................................................... + sub v24.4S, v24.4S, v19.4S // ..................................................................*..................................................................................................................................................................................................................... + sqrdmulh v19.4S, v22.4S, v3.S[3] // ...................*.................................................................................................................................................................................................................................................................... + mls v15.4S, v27.4S, v29.4S // ...................................*.................................................................................................................................................................................................................................................... + sub v27.4S, v16.4S, v10.4S // ..............................................*......................................................................................................................................................................................................................................... + add v22.4S, v16.4S, v10.4S // ...............................................*........................................................................................................................................................................................................................................ 
+ sqrdmulh v16.4S, v9.4S, v1.S[3] // ...........................................................*............................................................................................................................................................................................................................ + mul v10.4S, v9.4S, v1.S[2] // ..........................................................*............................................................................................................................................................................................................................. + add v9.4S, v12.4S, v23.4S // ....................................................*................................................................................................................................................................................................................................... + sub v12.4S, v12.4S, v23.4S // ...................................................*.................................................................................................................................................................................................................................... + sub v23.4S, v20.4S, v21.4S // .................................................................................*...................................................................................................................................................................................................... + mls v18.4S, v19.4S, v29.4S // ....................*................................................................................................................................................................................................................................................................... 
+ add v21.4S, v20.4S, v21.4S // ..................................................................................*..................................................................................................................................................................................................... + mls v10.4S, v16.4S, v29.4S // ............................................................*........................................................................................................................................................................................................................... + sqrdmulh v19.4S, v24.4S, v2.S[1] // .....................................................................*.................................................................................................................................................................................................................. + mul v16.4S, v24.4S, v2.S[0] // ....................................................................*................................................................................................................................................................................................................... + mul v24.4S, v12.4S, v7.S[0] // .....................................................*.................................................................................................................................................................................................................................. + sqrdmulh v20.4S, v12.4S, v7.S[1] // ......................................................*................................................................................................................................................................................................................................. 
+ add v12.4S, v22.4S, v9.4S // .......................................................................................*................................................................................................................................................................................................ + sub v22.4S, v22.4S, v9.4S // ......................................................................................*................................................................................................................................................................................................. + add v9.4S, v18.4S, v11.4S // ..............................................................*......................................................................................................................................................................................................................... + sub v11.4S, v18.4S, v11.4S // .............................................................*.......................................................................................................................................................................................................................... + add v18.4S, v13.4S, v15.4S // ........................................................................*............................................................................................................................................................................................................... + mls v16.4S, v19.4S, v29.4S // ......................................................................*................................................................................................................................................................................................................. 
+ mls v24.4S, v20.4S, v29.4S // .......................................................*................................................................................................................................................................................................................................ + sub v15.4S, v13.4S, v15.4S // .......................................................................*................................................................................................................................................................................................................ + add v19.4S, v17.4S, v14.4S // .............................................................................*.......................................................................................................................................................................................................... + sqrdmulh v20.4S, v11.4S, v1.S[3] // ................................................................*....................................................................................................................................................................................................................... + sub v17.4S, v17.4S, v14.4S // ............................................................................*........................................................................................................................................................................................................... + mul v14.4S, v11.4S, v1.S[2] // ...............................................................*........................................................................................................................................................................................................................ 
+ mul v13.4S, v27.4S, v6.S[2] // ................................................*....................................................................................................................................................................................................................................... + sqrdmulh v11.4S, v27.4S, v6.S[3] // .................................................*...................................................................................................................................................................................................................................... + mul v27.4S, v22.4S, v3.S[0] // ........................................................................................*............................................................................................................................................................................................... + sqrdmulh v22.4S, v22.4S, v3.S[1] // .........................................................................................*.............................................................................................................................................................................................. + mls v14.4S, v20.4S, v29.4S // .................................................................*...................................................................................................................................................................................................................... + sqrdmulh v20.4S, v15.4S, v2.S[1] // ..........................................................................*............................................................................................................................................................................................................. 
+ mul v15.4S, v15.4S, v2.S[0] // .........................................................................*.............................................................................................................................................................................................................. + mls v27.4S, v22.4S, v29.4S // ..........................................................................................*............................................................................................................................................................................................. + add v22.4S, v19.4S, v12.4S // .....................................................................................................................*.................................................................................................................................................................. + sub v19.4S, v19.4S, v12.4S // ....................................................................................................................*................................................................................................................................................................... + sub v12.4S, v8.4S, v28.4S // ................................................................................................*....................................................................................................................................................................................... + add v8.4S, v8.4S, v28.4S // .................................................................................................*...................................................................................................................................................................................... 
+ sub v28.4S, v10.4S, v16.4S // ..........................................................................................................*............................................................................................................................................................................. + add v10.4S, v10.4S, v16.4S // ...........................................................................................................*............................................................................................................................................................................ + sqrdmulh v16.4S, v23.4S, v2.S[3] // ....................................................................................*................................................................................................................................................................................................... + mul v23.4S, v23.4S, v2.S[2] // ...................................................................................*.................................................................................................................................................................................................... + mls v13.4S, v11.4S, v29.4S // ..................................................*..................................................................................................................................................................................................................................... + sub v11.4S, v8.4S, v22.4S // ........................................................................................................................................*............................................................................................................................................... 
+ add v8.4S, v8.4S, v22.4S // .........................................................................................................................................*.............................................................................................................................................. + mul v22.4S, v17.4S, v2.S[2] // ..............................................................................*......................................................................................................................................................................................................... + sqrdmulh v17.4S, v17.4S, v2.S[3] // ...............................................................................*........................................................................................................................................................................................................ + mls v15.4S, v20.4S, v29.4S // ...........................................................................*............................................................................................................................................................................................................ + mul v20.4S, v19.4S, v1.S[0] // ......................................................................................................................*................................................................................................................................................................. + sqrdmulh v19.4S, v19.4S, v1.S[1] // .......................................................................................................................*................................................................................................................................................................ 
+ mls v23.4S, v16.4S, v29.4S // .....................................................................................*.................................................................................................................................................................................................. + add v16.4S, v13.4S, v24.4S // ............................................................................................*........................................................................................................................................................................................... + mls v22.4S, v17.4S, v29.4S // ................................................................................*....................................................................................................................................................................................................... + sub v24.4S, v13.4S, v24.4S // ...........................................................................................*............................................................................................................................................................................................ + sqrdmulh v13.4S, v11.4S, v0.S[1] // ...........................................................................................................................................*............................................................................................................................................ + add v17.4S, v21.4S, v16.4S // ..........................................................................................................................*............................................................................................................................................................. 
+ sub v21.4S, v21.4S, v16.4S // .........................................................................................................................*.............................................................................................................................................................. + mul v16.4S, v11.4S, v0.S[0] // ..........................................................................................................................................*............................................................................................................................................. + sub v11.4S, v9.4S, v18.4S // .....................................................................................................*.................................................................................................................................................................................. + mls v20.4S, v19.4S, v29.4S // ........................................................................................................................*............................................................................................................................................................... + add v9.4S, v9.4S, v18.4S // ......................................................................................................*................................................................................................................................................................................. + add v18.4S, v22.4S, v27.4S // ...............................................................................................................................*........................................................................................................................................................ 
+ sub v22.4S, v22.4S, v27.4S // ..............................................................................................................................*......................................................................................................................................................... + sub v27.4S, v14.4S, v15.4S // ...............................................................................................................*........................................................................................................................................................................ + add v14.4S, v14.4S, v15.4S // ................................................................................................................*....................................................................................................................................................................... + sqrdmulh v19.4S, v12.4S, v0.S[3] // ...................................................................................................*.................................................................................................................................................................................... + mul v12.4S, v12.4S, v0.S[2] // ..................................................................................................*..................................................................................................................................................................................... + sub v15.4S, v9.4S, v17.4S // .............................................................................................................................................*.......................................................................................................................................... 
+ add v9.4S, v9.4S, v17.4S // ..............................................................................................................................................*......................................................................................................................................... + sqrdmulh v17.4S, v24.4S, v3.S[1] // ..............................................................................................*......................................................................................................................................................................................... + mul v24.4S, v24.4S, v3.S[0] // .............................................................................................*.......................................................................................................................................................................................... + mls v16.4S, v13.4S, v29.4S // ............................................................................................................................................*........................................................................................................................................... + sqrdmulh v13.4S, v22.4S, v1.S[1] // .................................................................................................................................*...................................................................................................................................................... + mls v12.4S, v19.4S, v29.4S // ....................................................................................................*................................................................................................................................................................................... 
+ sub v19.4S, v10.4S, v18.4S // ..................................................................................................................................................*..................................................................................................................................... + add v10.4S, v10.4S, v18.4S // ...................................................................................................................................................*.................................................................................................................................... + mul v18.4S, v22.4S, v1.S[0] // ................................................................................................................................*....................................................................................................................................................... + mls v24.4S, v17.4S, v29.4S // ...............................................................................................*........................................................................................................................................................................................ + mul v17.4S, v28.4S, v0.S[2] // ............................................................................................................*........................................................................................................................................................................... + sqrdmulh v22.4S, v28.4S, v0.S[3] // .............................................................................................................*.......................................................................................................................................................................... 
+ mls v18.4S, v13.4S, v29.4S // ..................................................................................................................................*..................................................................................................................................................... + mul v13.4S, v11.4S, v0.S[2] // .......................................................................................................*................................................................................................................................................................................ + sqrdmulh v11.4S, v11.4S, v0.S[3] // ........................................................................................................*............................................................................................................................................................................... + mls v17.4S, v22.4S, v29.4S // ..............................................................................................................*......................................................................................................................................................................... + sqrdmulh v22.4S, v27.4S, v0.S[3] // ..................................................................................................................*..................................................................................................................................................................... + sub v28.4S, v23.4S, v24.4S // ...................................................................................................................................*.................................................................................................................................................... 
+ add v23.4S, v23.4S, v24.4S // ....................................................................................................................................*................................................................................................................................................... + mul v24.4S, v27.4S, v0.S[2] // .................................................................................................................*...................................................................................................................................................................... + mul v27.4S, v21.4S, v1.S[0] // ...........................................................................................................................*............................................................................................................................................................ + mls v13.4S, v11.4S, v29.4S // .........................................................................................................*.............................................................................................................................................................................. + sqrdmulh v21.4S, v21.4S, v1.S[1] // ............................................................................................................................*........................................................................................................................................................... + sub count, count, #1 +layer1234_start: + mls v24.4S, v22.4S, v29.4S // ...................................................................................................................*.................................................................................................................................................................... 
+ cmge v22.4S, v16.4S, v30.4S // .................................................................................................................................................................................*...................................................................................................... + mls v27.4S, v21.4S, v29.4S // .............................................................................................................................*.......................................................................................................................................................... + cmge v21.4S, v31.4S, v16.4S // ................................................................................................................................................................................*....................................................................................................... + add v11.4S, v14.4S, v23.4S // ........................................................................................................................................................*............................................................................................................................... + sub v23.4S, v14.4S, v23.4S // .......................................................................................................................................................*................................................................................................................................ + add v14.4S, v17.4S, v18.4S // .......................................................................................................................................................................*................................................................................................................ 
+ sub v18.4S, v17.4S, v18.4S // ......................................................................................................................................................................*................................................................................................................. + sub v17.4S, v21.4S, v22.4S // ..................................................................................................................................................................................*..................................................................................................... + mul v22.4S, v18.4S, v0.S[0] // ........................................................................................................................................................................*............................................................................................................... + sqrdmulh v21.4S, v18.4S, v0.S[1] // .........................................................................................................................................................................*.............................................................................................................. + mul v18.4S, v19.4S, v0.S[0] // ....................................................................................................................................................*................................................................................................................................... + mls v16.4S, v17.4S, v29.4S // ...................................................................................................................................................................................*.................................................................................................... 
+ sqrdmulh v17.4S, v19.4S, v0.S[1] // .....................................................................................................................................................*.................................................................................................................................. + mul v19.4S, v23.4S, v0.S[0] // .........................................................................................................................................................*.............................................................................................................................. + sqrdmulh v23.4S, v23.4S, v0.S[1] // ..........................................................................................................................................................*............................................................................................................................. + mls v22.4S, v21.4S, v29.4S // ..........................................................................................................................................................................*............................................................................................................. + mul v21.4S, v28.4S, v1.S[0] // .....................................................................................................................................*.................................................................................................................................................. + sqrdmulh v28.4S, v28.4S, v1.S[1] // ......................................................................................................................................*................................................................................................................................................. 
+ mls v18.4S, v17.4S, v29.4S // ......................................................................................................................................................*................................................................................................................................. + mls v19.4S, v23.4S, v29.4S // ...........................................................................................................................................................*............................................................................................................................ + mul v17.4S, v15.4S, v0.S[0] // ...............................................................................................................................................*........................................................................................................................................ + sqrdmulh v23.4S, v15.4S, v0.S[1] // ................................................................................................................................................*....................................................................................................................................... + str q16, [x1, #512] // ................................................................................................................................................................................................................*....................................................................... + sub v16.4S, v13.4S, v27.4S // .................................................................................................................................................................*...................................................................................................................... 
+ add v13.4S, v13.4S, v27.4S // ..................................................................................................................................................................*..................................................................................................................... + sub v27.4S, v12.4S, v20.4S // ............................................................................................................................................................*........................................................................................................................... + add v12.4S, v12.4S, v20.4S // .............................................................................................................................................................*.......................................................................................................................... + mls v17.4S, v23.4S, v29.4S // .................................................................................................................................................*...................................................................................................................................... + mls v21.4S, v28.4S, v29.4S // .......................................................................................................................................*................................................................................................................................................ + cmge v15.4S, v31.4S, v18.4S // ........................................................................................................................................................................................*............................................................................................... 
+ cmge v20.4S, v18.4S, v30.4S // .........................................................................................................................................................................................*.............................................................................................. + cmge v28.4S, v19.4S, v30.4S // .............................................................................................................................................................................................*.......................................................................................... + cmge v23.4S, v31.4S, v19.4S // ............................................................................................................................................................................................*........................................................................................... + sub v15.4S, v15.4S, v20.4S // ..........................................................................................................................................................................................*............................................................................................. + mul v20.4S, v27.4S, v0.S[0] // ..............................................................................................................................................................*......................................................................................................................... + sqrdmulh v27.4S, v27.4S, v0.S[1] // ...............................................................................................................................................................*........................................................................................................................ 
+ sub v23.4S, v23.4S, v28.4S // ..............................................................................................................................................................................................*......................................................................................... + sub v28.4S, v24.4S, v21.4S // ...........................................................................................................................................................................*............................................................................................................ + mls v18.4S, v15.4S, v29.4S // ...........................................................................................................................................................................................*............................................................................................ + add v15.4S, v24.4S, v21.4S // ............................................................................................................................................................................*........................................................................................................... + cmge v24.4S, v31.4S, v22.4S // ........................................................................................................................................................................................................*............................................................................... + cmge v21.4S, v22.4S, v30.4S // .........................................................................................................................................................................................................*.............................................................................. 
+ mls v19.4S, v23.4S, v29.4S // ...............................................................................................................................................................................................*........................................................................................ + mls v20.4S, v27.4S, v29.4S // ................................................................................................................................................................*....................................................................................................................... + sqrdmulh v27.4S, v28.4S, v0.S[1] // ..............................................................................................................................................................................*......................................................................................................... + mul v23.4S, v28.4S, v0.S[0] // .............................................................................................................................................................................*.......................................................................................................... + sub v28.4S, v24.4S, v21.4S // ..........................................................................................................................................................................................................*............................................................................. + mul v21.4S, v13.4S, v25.4S // .......................................................................................................................................................................................................................................*................................................ 
+ mul v24.4S, v16.4S, v0.S[0] // ...................................................................................................................................................................*.................................................................................................................... + str q18, [x1, #640] // ..................................................................................................................................................................................................................*..................................................................... + sqrdmulh v18.4S, v13.4S, v26.4S // ........................................................................................................................................................................................................................................*............................................... + sqrdmulh v13.4S, v16.4S, v0.S[1] // ....................................................................................................................................................................*................................................................................................................... + mul v16.4S, v8.4S, v25.4S // ........................................................................................................................................................................................................................*............................................................... + sqrdmulh v8.4S, v8.4S, v26.4S // .........................................................................................................................................................................................................................*.............................................................. 
+ mls v22.4S, v28.4S, v29.4S // ...........................................................................................................................................................................................................*............................................................................ + mls v16.4S, v8.4S, v29.4S // ..........................................................................................................................................................................................................................*............................................................. + sqrdmulh v8.4S, v11.4S, v26.4S // ..................................................................................................................................................................................................................................*..................................................... + mls v23.4S, v27.4S, v29.4S // ...............................................................................................................................................................................*........................................................................................................ + str q22, [x1, #896] // ......................................................................................................................................................................................................................*................................................................. + str q19, [x1, #704] // ...................................................................................................................................................................................................................*.................................................................... 
+ mul v19.4S, v11.4S, v25.4S // .................................................................................................................................................................................................................................*...................................................... + cmge v27.4S, v31.4S, v17.4S // ....................................................................................................................................................................................*................................................................................................... + mls v21.4S, v18.4S, v29.4S // .........................................................................................................................................................................................................................................*.............................................. + cmge v18.4S, v17.4S, v30.4S // .....................................................................................................................................................................................*.................................................................................................. + mul v22.4S, v14.4S, v25.4S // ..........................................................................................................................................................................................................................................*............................................. + mls v24.4S, v13.4S, v29.4S // .....................................................................................................................................................................*.................................................................................................................. 
+ cmge v13.4S, v31.4S, v20.4S // ................................................................................................................................................................................................*....................................................................................... + cmge v28.4S, v20.4S, v30.4S // .................................................................................................................................................................................................*...................................................................................... + mls v19.4S, v8.4S, v29.4S // ...................................................................................................................................................................................................................................*.................................................... + sub v8.4S, v27.4S, v18.4S // ......................................................................................................................................................................................*................................................................................................. + cmge v11.4S, v31.4S, v23.4S // ............................................................................................................................................................................................................*........................................................................... + cmge v18.4S, v23.4S, v30.4S // .............................................................................................................................................................................................................*.......................................................................... 
+ sub v27.4S, v13.4S, v28.4S // ..................................................................................................................................................................................................*..................................................................................... + cmge v28.4S, v31.4S, v16.4S // ................................................................................................................................................................................................................................................*....................................... + sqrdmulh v13.4S, v14.4S, v26.4S // ...........................................................................................................................................................................................................................................*............................................ + sub v11.4S, v11.4S, v18.4S // ..............................................................................................................................................................................................................*......................................................................... + mls v17.4S, v8.4S, v29.4S // .......................................................................................................................................................................................*................................................................................................ + mul v18.4S, v10.4S, v25.4S // ..............................................................................................................................................................................................................................*......................................................... 
+ sqrdmulh v8.4S, v10.4S, v26.4S // ...............................................................................................................................................................................................................................*........................................................ + cmge v14.4S, v19.4S, v30.4S // .............................................................................................................................................................................................................................................................*.......................... + cmge v10.4S, v31.4S, v19.4S // ............................................................................................................................................................................................................................................................*........................... + mls v20.4S, v27.4S, v29.4S // ...................................................................................................................................................................................................*.................................................................................... + cmge v27.4S, v16.4S, v30.4S // .................................................................................................................................................................................................................................................*...................................... + mls v23.4S, v11.4S, v29.4S // ...............................................................................................................................................................................................................*........................................................................ 
+ sub v11.4S, v10.4S, v14.4S // ..............................................................................................................................................................................................................................................................*......................... + sub v10.4S, v28.4S, v27.4S // ..................................................................................................................................................................................................................................................*..................................... + str q17, [x1, #576] // .................................................................................................................................................................................................................*...................................................................... + mls v22.4S, v13.4S, v29.4S // ............................................................................................................................................................................................................................................*........................................... + str q20, [x1, #768] // ....................................................................................................................................................................................................................*................................................................... + sqrdmulh v27.4S, v12.4S, v26.4S // .....................................................................................................................................................................................................................................*.................................................. 
+ mul v17.4S, v9.4S, v25.4S // ...........................................................................................................................................................................................................................*............................................................ + mls v16.4S, v10.4S, v29.4S // ...................................................................................................................................................................................................................................................*.................................... + sqrdmulh v10.4S, v9.4S, v26.4S // ............................................................................................................................................................................................................................*........................................................... + mul v20.4S, v12.4S, v25.4S // ....................................................................................................................................................................................................................................*................................................... + cmge v13.4S, v31.4S, v21.4S // ....................................................................................................................................................................................................................................................................*................... + str q23, [x1, #960] // .......................................................................................................................................................................................................................*................................................................ 
+ mul v23.4S, v15.4S, v25.4S // .............................................................................................................................................................................................................................................*.......................................... + sqrdmulh v28.4S, v15.4S, v26.4S // ..............................................................................................................................................................................................................................................*......................................... + cmge v15.4S, v21.4S, v30.4S // .....................................................................................................................................................................................................................................................................*.................. + cmge v12.4S, v31.4S, v22.4S // ........................................................................................................................................................................................................................................................................*............... + mls v20.4S, v27.4S, v29.4S // ......................................................................................................................................................................................................................................*................................................. + mls v17.4S, v10.4S, v29.4S // .............................................................................................................................................................................................................................*.......................................................... 
+ str q16, [x1], #(16) // ................................................................................................................................................................................................................................................................................*....... + cmge v10.4S, v24.4S, v30.4S // .....................................................................................................................................................................................................*.................................................................................. + cmge v16.4S, v22.4S, v30.4S // .........................................................................................................................................................................................................................................................................*.............. + sub v27.4S, v13.4S, v15.4S // ......................................................................................................................................................................................................................................................................*................. + mls v23.4S, v28.4S, v29.4S // ...............................................................................................................................................................................................................................................*........................................ + ldr q28, [x1, #192] // ...e.................................................................................................................................................................................................................................................................................... 
+ mls v18.4S, v8.4S, v29.4S // ................................................................................................................................................................................................................................*....................................................... + sub v13.4S, v12.4S, v16.4S // ..........................................................................................................................................................................................................................................................................*............. + ldr q12, [x1, #128] // ..e..................................................................................................................................................................................................................................................................................... + sub v15.4S, v12.4S, v28.4S // .....................e.................................................................................................................................................................................................................................................................. + cmge v9.4S, v31.4S, v23.4S // ............................................................................................................................................................................................................................................................................*........... + mls v19.4S, v11.4S, v29.4S // ...............................................................................................................................................................................................................................................................*........................ 
+ cmge v11.4S, v23.4S, v30.4S // .............................................................................................................................................................................................................................................................................*.......... + cmge v16.4S, v31.4S, v20.4S // ................................................................................................................................................................................................................................................................*....................... + cmge v8.4S, v20.4S, v30.4S // .................................................................................................................................................................................................................................................................*...................... + cmge v14.4S, v31.4S, v24.4S // ....................................................................................................................................................................................................*................................................................................... + mls v22.4S, v13.4S, v29.4S // ...........................................................................................................................................................................................................................................................................*............ + ldr q13, [x1, #0] // e....................................................................................................................................................................................................................................................................................... 
+ add v28.4S, v12.4S, v28.4S // ......................e................................................................................................................................................................................................................................................................. + sub v12.4S, v9.4S, v11.4S // ..............................................................................................................................................................................................................................................................................*......... + cmge v9.4S, v31.4S, v17.4S // ....................................................................................................................................................................................................................................................*................................... + ldr q11, [x1, #64] // .e...................................................................................................................................................................................................................................................................................... + sub v8.4S, v16.4S, v8.4S // ..................................................................................................................................................................................................................................................................*..................... + str q19, [x1, #176] // ...................................................................................................................................................................................................................................................................................*.... 
+ mls v21.4S, v27.4S, v29.4S // .......................................................................................................................................................................................................................................................................*................ + sub v27.4S, v14.4S, v10.4S // ......................................................................................................................................................................................................*................................................................................. + cmge v16.4S, v17.4S, v30.4S // .....................................................................................................................................................................................................................................................*.................................. + ldr q14, [x1, #320] // .....e.................................................................................................................................................................................................................................................................................. + ldr q19, [x1, #256] // ....e................................................................................................................................................................................................................................................................................... + mls v23.4S, v12.4S, v29.4S // ...............................................................................................................................................................................................................................................................................*........ 
+ sub v12.4S, v9.4S, v16.4S // ......................................................................................................................................................................................................................................................*................................. + mls v20.4S, v8.4S, v29.4S // ...................................................................................................................................................................................................................................................................*.................... + mls v24.4S, v27.4S, v29.4S // .......................................................................................................................................................................................................*................................................................................ + str q22, [x1, #368] // ......................................................................................................................................................................................................................................................................................*. + sub v16.4S, v13.4S, v11.4S // ................e....................................................................................................................................................................................................................................................................... + str q21, [x1, #304] // .....................................................................................................................................................................................................................................................................................*.. 
+ cmge v21.4S, v18.4S, v30.4S // .........................................................................................................................................................................................................................................................*.............................. + add v9.4S, v19.4S, v14.4S // ...........................e............................................................................................................................................................................................................................................................ + add v8.4S, v13.4S, v11.4S // .................e...................................................................................................................................................................................................................................................................... + sqrdmulh v10.4S, v15.4S, v4.S[1] // ........................e............................................................................................................................................................................................................................................................... + str q24, [x1, #816] // .....................................................................................................................................................................................................................*.................................................................. + str q20, [x1, #240] // ....................................................................................................................................................................................................................................................................................*... 
+ ldr q20, [x1, #384] // ......e................................................................................................................................................................................................................................................................................. + cmge v27.4S, v31.4S, v18.4S // ........................................................................................................................................................................................................................................................*............................... + str q23, [x1, #432] // .......................................................................................................................................................................................................................................................................................* + sub v13.4S, v19.4S, v14.4S // ..........................e............................................................................................................................................................................................................................................................. + mul v11.4S, v15.4S, v4.S[0] // .......................e................................................................................................................................................................................................................................................................ + sqrdmulh v19.4S, v16.4S, v3.S[3] // ...................e.................................................................................................................................................................................................................................................................... 
+ ldr q15, [x1, #640] // ..........e............................................................................................................................................................................................................................................................................. + mul v16.4S, v16.4S, v3.S[2] // ..................e..................................................................................................................................................................................................................................................................... + ldr q23, [x1, #704] // ...........e............................................................................................................................................................................................................................................................................ + ldr q22, [x1, #448] // .......e................................................................................................................................................................................................................................................................................ + sub v24.4S, v27.4S, v21.4S // ..........................................................................................................................................................................................................................................................*............................. + mls v17.4S, v12.4S, v29.4S // .......................................................................................................................................................................................................................................................*................................ 
+ add v21.4S, v8.4S, v28.4S // .........................................................e.............................................................................................................................................................................................................................. + ldr q14, [x1, #512] // ........e............................................................................................................................................................................................................................................................................... + sub v12.4S, v8.4S, v28.4S // ........................................................e............................................................................................................................................................................................................................... + ldr q28, [x1, #576] // .........e.............................................................................................................................................................................................................................................................................. + sqrdmulh v27.4S, v13.4S, v4.S[3] // .............................e.......................................................................................................................................................................................................................................................... + mul v13.4S, v13.4S, v4.S[2] // ............................e........................................................................................................................................................................................................................................................... 
+ mls v16.4S, v19.4S, v29.4S // ....................e................................................................................................................................................................................................................................................................... + mls v18.4S, v24.4S, v29.4S // ...........................................................................................................................................................................................................................................................*............................ + sqrdmulh v19.4S, v12.4S, v1.S[3] // ...........................................................e............................................................................................................................................................................................................................ + mls v11.4S, v10.4S, v29.4S // .........................e.............................................................................................................................................................................................................................................................. + str q17, [x1, #48] // .................................................................................................................................................................................................................................................................................*...... + mls v13.4S, v27.4S, v29.4S // ..............................e......................................................................................................................................................................................................................................................... 
+ str q18, [x1, #112] // ..................................................................................................................................................................................................................................................................................*..... + add v8.4S, v20.4S, v22.4S // ................................e....................................................................................................................................................................................................................................................... + sub v17.4S, v20.4S, v22.4S // ...............................e........................................................................................................................................................................................................................................................ + mul v10.4S, v12.4S, v1.S[2] // ..........................................................e............................................................................................................................................................................................................................. + add v24.4S, v14.4S, v28.4S // .....................................e.................................................................................................................................................................................................................................................. + sub v22.4S, v15.4S, v23.4S // .........................................e.............................................................................................................................................................................................................................................. 
+ add v23.4S, v15.4S, v23.4S // ..........................................e............................................................................................................................................................................................................................................. + sub v28.4S, v14.4S, v28.4S // ....................................e................................................................................................................................................................................................................................................... + add v14.4S, v9.4S, v8.4S // ...................................................................e.................................................................................................................................................................................................................... + sub v27.4S, v9.4S, v8.4S // ..................................................................e..................................................................................................................................................................................................................... + mul v20.4S, v17.4S, v5.S[0] // .................................e...................................................................................................................................................................................................................................................... + sqrdmulh v12.4S, v17.4S, v5.S[1] // ..................................e..................................................................................................................................................................................................................................................... 
+ mul v8.4S, v28.4S, v5.S[2] // ......................................e................................................................................................................................................................................................................................................. + sqrdmulh v15.4S, v28.4S, v5.S[3] // .......................................e................................................................................................................................................................................................................................................ + sub v17.4S, v16.4S, v11.4S // .............................................................e.......................................................................................................................................................................................................................... + mls v10.4S, v19.4S, v29.4S // ............................................................e........................................................................................................................................................................................................................... + ldr q19, [x1, #896] // ..............e......................................................................................................................................................................................................................................................................... + ldr q28, [x1, #960] // ...............e........................................................................................................................................................................................................................................................................ 
+ add v18.4S, v16.4S, v11.4S // ..............................................................e......................................................................................................................................................................................................................... + mul v9.4S, v27.4S, v2.S[0] // ....................................................................e................................................................................................................................................................................................................... + add v16.4S, v21.4S, v14.4S // .................................................................................................e...................................................................................................................................................................................... + sqrdmulh v27.4S, v27.4S, v2.S[1] // .....................................................................e.................................................................................................................................................................................................................. + sub v11.4S, v24.4S, v23.4S // ............................................................................e........................................................................................................................................................................................................... + add v24.4S, v24.4S, v23.4S // .............................................................................e.......................................................................................................................................................................................................... 
+ mul v23.4S, v22.4S, v6.S[0] // ...........................................e............................................................................................................................................................................................................................................ + sqrdmulh v22.4S, v22.4S, v6.S[1] // ............................................e........................................................................................................................................................................................................................................... + mls v20.4S, v12.4S, v29.4S // ...................................e.................................................................................................................................................................................................................................................... + mls v8.4S, v15.4S, v29.4S // ........................................e............................................................................................................................................................................................................................................... + mls v9.4S, v27.4S, v29.4S // ......................................................................e................................................................................................................................................................................................................. + ldr q27, [x1, #832] // .............e.......................................................................................................................................................................................................................................................................... 
+ sub v12.4S, v21.4S, v14.4S // ................................................................................................e....................................................................................................................................................................................... + ldr q15, [x1, #768] // ............e........................................................................................................................................................................................................................................................................... + sub v14.4S, v19.4S, v28.4S // ...................................................e.................................................................................................................................................................................................................................... + mls v23.4S, v22.4S, v29.4S // .............................................e.......................................................................................................................................................................................................................................... + add v28.4S, v19.4S, v28.4S // ....................................................e................................................................................................................................................................................................................................... + add v19.4S, v13.4S, v20.4S // ........................................................................e............................................................................................................................................................................................................... 
+ sub v13.4S, v13.4S, v20.4S // .......................................................................e................................................................................................................................................................................................................ + mul v21.4S, v14.4S, v7.S[0] // .....................................................e.................................................................................................................................................................................................................................. + sqrdmulh v22.4S, v14.4S, v7.S[1] // ......................................................e................................................................................................................................................................................................................................. + mul v14.4S, v11.4S, v2.S[2] // ..............................................................................e......................................................................................................................................................................................................... + sqrdmulh v20.4S, v11.4S, v2.S[3] // ...............................................................................e........................................................................................................................................................................................................ + add v11.4S, v8.4S, v23.4S // ..................................................................................e..................................................................................................................................................................................................... 
+ sub v23.4S, v8.4S, v23.4S // .................................................................................e...................................................................................................................................................................................................... + add v8.4S, v15.4S, v27.4S // ...............................................e........................................................................................................................................................................................................................................ + sub v27.4S, v15.4S, v27.4S // ..............................................e......................................................................................................................................................................................................................................... + mls v21.4S, v22.4S, v29.4S // .......................................................e................................................................................................................................................................................................................................ + mul v15.4S, v13.4S, v2.S[0] // .........................................................................e.............................................................................................................................................................................................................. + sqrdmulh v22.4S, v13.4S, v2.S[1] // ..........................................................................e............................................................................................................................................................................................................. 
+ mls v14.4S, v20.4S, v29.4S // ................................................................................e....................................................................................................................................................................................................... + add v13.4S, v8.4S, v28.4S // .......................................................................................e................................................................................................................................................................................................ + sub v8.4S, v8.4S, v28.4S // ......................................................................................e................................................................................................................................................................................................. + sqrdmulh v28.4S, v27.4S, v6.S[3] // .................................................e...................................................................................................................................................................................................................................... + mul v27.4S, v27.4S, v6.S[2] // ................................................e....................................................................................................................................................................................................................................... + sub v20.4S, v24.4S, v13.4S // ....................................................................................................................e................................................................................................................................................................... 
+ add v13.4S, v24.4S, v13.4S // .....................................................................................................................e.................................................................................................................................................................. + mls v15.4S, v22.4S, v29.4S // ...........................................................................e............................................................................................................................................................................................................ + mls v27.4S, v28.4S, v29.4S // ..................................................e..................................................................................................................................................................................................................................... + sqrdmulh v22.4S, v20.4S, v1.S[1] // .......................................................................................................................e................................................................................................................................................................ + mul v20.4S, v20.4S, v1.S[0] // ......................................................................................................................e................................................................................................................................................................. + sqrdmulh v24.4S, v23.4S, v2.S[3] // ....................................................................................e................................................................................................................................................................................................... 
+ mul v23.4S, v23.4S, v2.S[2] // ...................................................................................e.................................................................................................................................................................................................... + add v28.4S, v18.4S, v19.4S // ......................................................................................................e................................................................................................................................................................................. + sub v19.4S, v18.4S, v19.4S // .....................................................................................................e.................................................................................................................................................................................. + mul v18.4S, v8.4S, v3.S[0] // ........................................................................................e............................................................................................................................................................................................... + mls v20.4S, v22.4S, v29.4S // ........................................................................................................................e............................................................................................................................................................... + sub v22.4S, v27.4S, v21.4S // ...........................................................................................e............................................................................................................................................................................................ 
+ add v27.4S, v27.4S, v21.4S // ............................................................................................e........................................................................................................................................................................................... + sqrdmulh v21.4S, v8.4S, v3.S[1] // .........................................................................................e.............................................................................................................................................................................................. + mls v23.4S, v24.4S, v29.4S // .....................................................................................e.................................................................................................................................................................................................. + mul v24.4S, v17.4S, v1.S[2] // ...............................................................e........................................................................................................................................................................................................................ + sqrdmulh v8.4S, v17.4S, v1.S[3] // ................................................................e....................................................................................................................................................................................................................... + sub v17.4S, v10.4S, v9.4S // ..........................................................................................................e............................................................................................................................................................................. 
+ add v10.4S, v10.4S, v9.4S // ...........................................................................................................e............................................................................................................................................................................ + sqrdmulh v9.4S, v22.4S, v3.S[1] // ..............................................................................................e......................................................................................................................................................................................... + mul v22.4S, v22.4S, v3.S[0] // .............................................................................................e.......................................................................................................................................................................................... + mls v18.4S, v21.4S, v29.4S // ..........................................................................................e............................................................................................................................................................................................. + sqrdmulh v21.4S, v19.4S, v0.S[3] // ........................................................................................................e............................................................................................................................................................................... + mls v24.4S, v8.4S, v29.4S // .................................................................e...................................................................................................................................................................................................................... 
+ add v8.4S, v16.4S, v13.4S // .........................................................................................................................................e.............................................................................................................................................. + sub v16.4S, v16.4S, v13.4S // ........................................................................................................................................e............................................................................................................................................... + mul v13.4S, v19.4S, v0.S[2] // .......................................................................................................e................................................................................................................................................................................ + sqrdmulh v19.4S, v17.4S, v0.S[3] // .............................................................................................................e.......................................................................................................................................................................... + mul v17.4S, v17.4S, v0.S[2] // ............................................................................................................e........................................................................................................................................................................... + mls v13.4S, v21.4S, v29.4S // .........................................................................................................e.............................................................................................................................................................................. 
+ mls v22.4S, v9.4S, v29.4S // ...............................................................................................e........................................................................................................................................................................................ + sqrdmulh v9.4S, v16.4S, v0.S[1] // ...........................................................................................................................................e............................................................................................................................................ + mul v16.4S, v16.4S, v0.S[0] // ..........................................................................................................................................e............................................................................................................................................. + mls v17.4S, v19.4S, v29.4S // ..............................................................................................................e......................................................................................................................................................................... + add v21.4S, v14.4S, v18.4S // ...............................................................................................................................e........................................................................................................................................................ + sub v18.4S, v14.4S, v18.4S // ..............................................................................................................................e......................................................................................................................................................... 
+ add v14.4S, v24.4S, v15.4S // ................................................................................................................e....................................................................................................................................................................... + sub v19.4S, v24.4S, v15.4S // ...............................................................................................................e........................................................................................................................................................................ + add v15.4S, v11.4S, v27.4S // ..........................................................................................................................e............................................................................................................................................................. + sqrdmulh v24.4S, v12.4S, v0.S[3] // ...................................................................................................e.................................................................................................................................................................................... + mul v12.4S, v12.4S, v0.S[2] // ..................................................................................................e..................................................................................................................................................................................... + mls v16.4S, v9.4S, v29.4S // ............................................................................................................................................e........................................................................................................................................... 
+ sub v11.4S, v11.4S, v27.4S // .........................................................................................................................e.............................................................................................................................................................. + sqrdmulh v27.4S, v18.4S, v1.S[1] // .................................................................................................................................e...................................................................................................................................................... + mul v18.4S, v18.4S, v1.S[0] // ................................................................................................................................e....................................................................................................................................................... + add v9.4S, v28.4S, v15.4S // ..............................................................................................................................................e......................................................................................................................................... + sub v15.4S, v28.4S, v15.4S // .............................................................................................................................................e.......................................................................................................................................... + sub v28.4S, v23.4S, v22.4S // ...................................................................................................................................e.................................................................................................................................................... 
+ add v23.4S, v23.4S, v22.4S // ....................................................................................................................................e................................................................................................................................................... + mls v12.4S, v24.4S, v29.4S // ....................................................................................................e................................................................................................................................................................................... + mul v24.4S, v19.4S, v0.S[2] // .................................................................................................................e...................................................................................................................................................................... + sqrdmulh v22.4S, v19.4S, v0.S[3] // ..................................................................................................................e..................................................................................................................................................................... + sub v19.4S, v10.4S, v21.4S // ..................................................................................................................................................e..................................................................................................................................... + add v10.4S, v10.4S, v21.4S // ...................................................................................................................................................e.................................................................................................................................... 
+ sqrdmulh v21.4S, v11.4S, v1.S[1] // ............................................................................................................................e........................................................................................................................................................... + mls v18.4S, v27.4S, v29.4S // ..................................................................................................................................e..................................................................................................................................................... + mul v27.4S, v11.4S, v1.S[0] // ...........................................................................................................................e............................................................................................................................................................ + + // original source code + // ldr q8, [x1, #0] // ............e...............................................................................................................................................................|.......................................................................................................................e................................................. + // ldr q9, [x1, #(1*(512/8))] // ................e...........................................................................................................................................................|...........................................................................................................................e............................................. 
+ // ldr q10, [x1, #(2*(512/8))] // ...e........................................................................................................................................................................|..............................................................................................................e.......................................................... + // ldr q11, [x1, #(3*(512/8))] // e...........................................................................................................................................................................|...........................................................................................................e............................................................. + // ldr q12, [x1, #(4*(512/8))] // .......................e....................................................................................................................................................|..................................................................................................................................e...................................... + // ldr q13, [x1, #(5*(512/8))] // ......................e.....................................................................................................................................................|.................................................................................................................................e....................................... + // ldr q14, [x1, #(6*(512/8))] // .....................................e......................................................................................................................................|................................................................................................................................................e........................ 
+ // ldr q15, [x1, #(7*(512/8))] // ..............................................e.............................................................................................................................|.........................................................................................................................................................e............... + // ldr q16, [x1, #(8*(512/8))] // ..................................................e.........................................................................................................................|.............................................................................................................................................................e........... + // ldr q17, [x1, #(9*(512/8))] // ....................................................e.......................................................................................................................|...............................................................................................................................................................e......... + // ldr q18, [x1, #(10*(512/8))] // ...........................................e................................................................................................................................|......................................................................................................................................................e.................. + // ldr q19, [x1, #(11*(512/8))] // .............................................e..............................................................................................................................|........................................................................................................................................................e................ 
+ // ldr q20, [x1, #(12*(512/8))] // ............................................................................................e...............................................................................|......................................................................................................................................................................... + // ldr q21, [x1, #(13*(512/8))] // ..........................................................................................e.................................................................................|......................................................................................................................................................................... + // ldr q22, [x1, #(14*(512/8))] // .............................................................................e..............................................................................................|......................................................................................................................................................................... + // ldr q23, [x1, #(15*(512/8))] // ..............................................................................e.............................................................................................|......................................................................................................................................................................... + // sub v24.4s, v8.4s, v9.4s // .............................e..............................................................................................................................................|........................................................................................................................................e................................ 
+ // add v8.4s, v8.4s, v9.4s // .................................e..........................................................................................................................................|............................................................................................................................................e............................ + // mul v9.4s, v24.4s, v3.s[2] // ............................................e...............................................................................................................................|.......................................................................................................................................................e................. + // sqrdmulh v24.4s, v24.4s, v3.s[3] // ..........................................e.................................................................................................................................|.....................................................................................................................................................e................... + // mls v9.4s, v24.4s, v29.4s // .......................................................e....................................................................................................................|..................................................................................................................................................................e...... + // sub v24.4s, v10.4s, v11.4s // ....e.......................................................................................................................................................................|...............................................................................................................e......................................................... 
+ // add v10.4s, v10.4s, v11.4s // .............e..............................................................................................................................................................|........................................................................................................................e................................................ + // mul v11.4s, v24.4s, v4.s[0] // .........................................e..................................................................................................................................|....................................................................................................................................................e.................... + // sqrdmulh v24.4s, v24.4s, v4.s[1] // ..................................e.........................................................................................................................................|.............................................................................................................................................e........................... + // mls v11.4s, v24.4s, v29.4s // ..........................................................e.................................................................................................................|.....................................................................................................................................................................e... + // sub v24.4s, v12.4s, v13.4s // ........................................e...................................................................................................................................|...................................................................................................................................................e..................... 
+ // add v12.4s, v12.4s, v13.4s // ................................e...........................................................................................................................................|...........................................................................................................................................e............................. + // mul v13.4s, v24.4s, v4.s[2] // ......................................................e.....................................................................................................................|.................................................................................................................................................................e....... + // sqrdmulh v24.4s, v24.4s, v4.s[3] // .....................................................e......................................................................................................................|................................................................................................................................................................e........ + // mls v13.4s, v24.4s, v29.4s // ............................................................e...............................................................................................................|.......................................................................................................................................................................e. + // sub v24.4s, v14.4s, v15.4s // ...............................................................e............................................................................................................|......................................................................................................................................................................... 
+ // add v14.4s, v14.4s, v15.4s // ..............................................................e.............................................................................................................|......................................................................................................................................................................... + // mul v15.4s, v24.4s, v5.s[0] // .......................................................................e....................................................................................................|......................................................................................................................................................................... + // sqrdmulh v24.4s, v24.4s, v5.s[1] // ........................................................................e...................................................................................................|......................................................................................................................................................................... + // mls v15.4s, v24.4s, v29.4s // .......................................................................................e....................................................................................|......................................................................................................................................................................... + // sub v24.4s, v16.4s, v17.4s // ....................................................................e.......................................................................................................|......................................................................................................................................................................... 
+ // add v16.4s, v16.4s, v17.4s // .................................................................e..........................................................................................................|......................................................................................................................................................................... + // mul v17.4s, v24.4s, v5.s[2] // .........................................................................e..................................................................................................|......................................................................................................................................................................... + // sqrdmulh v24.4s, v24.4s, v5.s[3] // ..........................................................................e.................................................................................................|......................................................................................................................................................................... + // mls v17.4s, v24.4s, v29.4s // ........................................................................................e...................................................................................|......................................................................................................................................................................... + // sub v24.4s, v18.4s, v19.4s // ..................................................................e.........................................................................................................|......................................................................................................................................................................... 
+ // add v18.4s, v18.4s, v19.4s // ...................................................................e........................................................................................................|......................................................................................................................................................................... + // mul v19.4s, v24.4s, v6.s[0] // .....................................................................................e......................................................................................|......................................................................................................................................................................... + // sqrdmulh v24.4s, v24.4s, v6.s[1] // ......................................................................................e.....................................................................................|......................................................................................................................................................................... + // mls v19.4s, v24.4s, v29.4s // ..............................................................................................e.............................................................................|......................................................................................................................................................................... + // sub v24.4s, v20.4s, v21.4s // .........................................................................................................e..................................................................|......................................................................................................................................................................... 
+ // add v20.4s, v20.4s, v21.4s // ........................................................................................................e...................................................................|......................................................................................................................................................................... + // mul v21.4s, v24.4s, v6.s[2] // .................................................................................................................e..........................................................|......................................................................................................................................................................... + // sqrdmulh v24.4s, v24.4s, v6.s[3] // ................................................................................................................e...........................................................|......................................................................................................................................................................... + // mls v21.4s, v24.4s, v29.4s // .....................................................................................................................e......................................................|......................................................................................................................................................................... + // sub v24.4s, v22.4s, v23.4s // .............................................................................................e..............................................................................|......................................................................................................................................................................... 
+ // add v22.4s, v22.4s, v23.4s // ...............................................................................................e............................................................................|......................................................................................................................................................................... + // mul v23.4s, v24.4s, v7.s[0] // ..................................................................................................e.........................................................................|......................................................................................................................................................................... + // sqrdmulh v24.4s, v24.4s, v7.s[1] // ...................................................................................................e........................................................................|......................................................................................................................................................................... + // mls v23.4s, v24.4s, v29.4s // ..........................................................................................................e.................................................................|......................................................................................................................................................................... + // sub v24.4s, v8.4s, v10.4s // ...................................................e........................................................................................................................|..............................................................................................................................................................e.......... 
+ // add v8.4s, v8.4s, v10.4s // .................................................e..........................................................................................................................|............................................................................................................................................................e............ + // mul v10.4s, v24.4s, v1.s[2] // ................................................................e...........................................................................................................|......................................................................................................................................................................... + // sqrdmulh v24.4s, v24.4s, v1.s[3] // .........................................................e..................................................................................................................|....................................................................................................................................................................e.... + // mls v10.4s, v24.4s, v29.4s // ............................................................................e...............................................................................................|......................................................................................................................................................................... + // sub v24.4s, v9.4s, v11.4s // ...........................................................................e................................................................................................|......................................................................................................................................................................... 
+ // add v9.4s, v9.4s, v11.4s // ...............................................................................e............................................................................................|......................................................................................................................................................................... + // mul v11.4s, v24.4s, v1.s[2] // ..................................................................................................................................e.........................................|......................................................................................................................................................................... + // sqrdmulh v24.4s, v24.4s, v1.s[3] // ...................................................................................................................................e........................................|......................................................................................................................................................................... + // mls v11.4s, v24.4s, v29.4s // ..........................................................................................................................................e.................................|......................................................................................................................................................................... + // sub v24.4s, v12.4s, v14.4s // ......................................................................e.....................................................................................................|......................................................................................................................................................................... 
+ // add v12.4s, v12.4s, v14.4s // .....................................................................e......................................................................................................|......................................................................................................................................................................... + // mul v14.4s, v24.4s, v2.s[0] // ................................................................................e...........................................................................................|......................................................................................................................................................................... + // sqrdmulh v24.4s, v24.4s, v2.s[1] // ..................................................................................e.........................................................................................|......................................................................................................................................................................... + // mls v14.4s, v24.4s, v29.4s // .........................................................................................e..................................................................................|......................................................................................................................................................................... + // sub v24.4s, v13.4s, v15.4s // .................................................................................................e..........................................................................|......................................................................................................................................................................... 
+ // add v13.4s, v13.4s, v15.4s // ................................................................................................e...........................................................................|......................................................................................................................................................................... + // mul v15.4s, v24.4s, v2.s[0] // ...........................................................................................................e................................................................|......................................................................................................................................................................... + // sqrdmulh v24.4s, v24.4s, v2.s[1] // ............................................................................................................e...............................................................|......................................................................................................................................................................... + // mls v15.4s, v24.4s, v29.4s // ....................................................................................................................e.......................................................|......................................................................................................................................................................... + // sub v24.4s, v16.4s, v18.4s // ...................................................................................e........................................................................................|......................................................................................................................................................................... 
+ // add v16.4s, v16.4s, v18.4s // ....................................................................................e.......................................................................................|......................................................................................................................................................................... + // mul v18.4s, v24.4s, v2.s[2] // ....................................................................................................e.......................................................................|......................................................................................................................................................................... + // sqrdmulh v24.4s, v24.4s, v2.s[3] // .....................................................................................................e......................................................................|......................................................................................................................................................................... + // mls v18.4s, v24.4s, v29.4s // .............................................................................................................e..............................................................|......................................................................................................................................................................... + // sub v24.4s, v17.4s, v19.4s // .......................................................................................................e....................................................................|......................................................................................................................................................................... 
+ // add v17.4s, v17.4s, v19.4s // ......................................................................................................e.....................................................................|......................................................................................................................................................................... + // mul v19.4s, v24.4s, v2.s[2] // .........................................................................................................................e..................................................|......................................................................................................................................................................... + // sqrdmulh v24.4s, v24.4s, v2.s[3] // ........................................................................................................................e...................................................|......................................................................................................................................................................... + // mls v19.4s, v24.4s, v29.4s // .................................................................................................................................e..........................................|......................................................................................................................................................................... + // sub v24.4s, v20.4s, v22.4s // ...............................................................................................................e............................................................|......................................................................................................................................................................... 
+ // add v20.4s, v20.4s, v22.4s // ..............................................................................................................e.............................................................|......................................................................................................................................................................... + // mul v22.4s, v24.4s, v3.s[0] // ............................................................................................................................e...............................................|......................................................................................................................................................................... + // sqrdmulh v24.4s, v24.4s, v3.s[1] // ................................................................................................................................e...........................................|......................................................................................................................................................................... + // mls v22.4s, v24.4s, v29.4s // ........................................................................................................................................e...................................|......................................................................................................................................................................... + // sub v24.4s, v21.4s, v23.4s // ..............................................................................................................................e.............................................|......................................................................................................................................................................... 
+ // add v21.4s, v21.4s, v23.4s // ...............................................................................................................................e............................................|......................................................................................................................................................................... + // mul v23.4s, v24.4s, v3.s[0] // .......................................................................................................................................e....................................|......................................................................................................................................................................... + // sqrdmulh v24.4s, v24.4s, v3.s[1] // ......................................................................................................................................e.....................................|......................................................................................................................................................................... + // mls v23.4s, v24.4s, v29.4s // .................................................................................................................................................e..........................|......................................................................................................................................................................... + // sub v24.4s, v8.4s, v12.4s // ...........................................................................................e................................................................................|......................................................................................................................................................................... 
+ // add v8.4s, v8.4s, v12.4s // .................................................................................e..........................................................................................|......................................................................................................................................................................... + // mul v12.4s, v24.4s, v0.s[2] // ...........................................................................................................................................................e................|......................................................................................................................................................................... + // sqrdmulh v24.4s, v24.4s, v0.s[3] // ..........................................................................................................................................................e.................|......................................................................................................................................................................... + // mls v12.4s, v24.4s, v29.4s // ....................................................................................................................................................................e.......|......................................................................................................................................................................... + // sub v24.4s, v9.4s, v13.4s // ...........................................................................................................................e................................................|......................................................................................................................................................................... 
+ // add v9.4s, v9.4s, v13.4s // ..........................................................................................................................e.................................................|......................................................................................................................................................................... + // mul v13.4s, v24.4s, v0.s[2] // .............................................................................................................................................e..............................|......................................................................................................................................................................... + // sqrdmulh v24.4s, v24.4s, v0.s[3] // .........................................................................................................................................e..................................|......................................................................................................................................................................... + // mls v13.4s, v24.4s, v29.4s // ................................................................................................................................................e...........................|......................................................................................................................................................................... + // sub v24.4s, v10.4s, v14.4s // ....................................................................................................................................e.......................................|......................................................................................................................................................................... 
+ // add v10.4s, v10.4s, v14.4s // .....................................................................................................................................e......................................|......................................................................................................................................................................... + // mul v14.4s, v24.4s, v0.s[2] // ...............................................................................................................................................e............................|......................................................................................................................................................................... + // sqrdmulh v24.4s, v24.4s, v0.s[3] // ..............................................................................................................................................e.............................|......................................................................................................................................................................... + // mls v14.4s, v24.4s, v29.4s // ....................................................................................................................................................e.......................|......................................................................................................................................................................... + // sub v24.4s, v11.4s, v15.4s // ........................................................................................................................................................e...................|......................................................................................................................................................................... 
+ // add v11.4s, v11.4s, v15.4s // .......................................................................................................................................................e....................|......................................................................................................................................................................... + // mul v15.4s, v24.4s, v0.s[2] // .....................................................................................................................................................................e......|......................................................................................................................................................................... + // sqrdmulh v24.4s, v24.4s, v0.s[3] // ......................................................................................................................................................................e.....|......................................................................................................................................................................... + // mls v15.4s, v24.4s, v29.4s // ............................................................................................................................................................................*......................................................................................................................................................................... + // sub v24.4s, v16.4s, v20.4s // ..................................................................................................................e.........................................................|......................................................................................................................................................................... 
+ // add v16.4s, v16.4s, v20.4s // ...................................................................................................................e........................................................|......................................................................................................................................................................... + // mul v20.4s, v24.4s, v1.s[0] // .......................................................................................................................e....................................................|......................................................................................................................................................................... + // sqrdmulh v24.4s, v24.4s, v1.s[1] // ......................................................................................................................e.....................................................|......................................................................................................................................................................... + // mls v20.4s, v24.4s, v29.4s // .............................................................................................................................e..............................................|......................................................................................................................................................................... + // sub v24.4s, v17.4s, v21.4s // .............................................................................................................................................................e..............|......................................................................................................................................................................... 
+ // add v17.4s, v17.4s, v21.4s // .........................................................................................................................................................e..................|......................................................................................................................................................................... + // mul v21.4s, v24.4s, v1.s[0] // ...........................................................................................................................................................................e|......................................................................................................................................................................... + // sqrdmulh v24.4s, v24.4s, v1.s[1] // .........................................................................................................................................................................e..|......................................................................................................................................................................... + // mls v21.4s, v24.4s, v29.4s // ............................................................................................................................................................................|.*....................................................................................................................................................................... + // sub v24.4s, v18.4s, v22.4s // ......................................................................................................................................................e.....................|......................................................................................................................................................................... 
+ // add v18.4s, v18.4s, v22.4s // .....................................................................................................................................................e......................|......................................................................................................................................................................... + // mul v22.4s, v24.4s, v1.s[0] // ...............................................................................................................................................................e............|......................................................................................................................................................................... + // sqrdmulh v24.4s, v24.4s, v1.s[1] // ..............................................................................................................................................................e.............|......................................................................................................................................................................... + // mls v22.4s, v24.4s, v29.4s // ..........................................................................................................................................................................e.|......................................................................................................................................................................... + // sub v24.4s, v19.4s, v23.4s // ..................................................................................................................................................................e.........|......................................................................................................................................................................... 
+ // add v19.4s, v19.4s, v23.4s // ...................................................................................................................................................................e........|......................................................................................................................................................................... + // mul v23.4s, v24.4s, v1.s[0] // ............................................................................................................................................................................|................*........................................................................................................................................................ + // sqrdmulh v24.4s, v24.4s, v1.s[1] // ............................................................................................................................................................................|.................*....................................................................................................................................................... + // mls v23.4s, v24.4s, v29.4s // ............................................................................................................................................................................|............................*............................................................................................................................................ + // sub v24.4s, v8.4s, v16.4s // ............................................................................................................................................e...............................|......................................................................................................................................................................... 
+ // add v8.4s, v8.4s, v16.4s // ...........................................................................................................................................e................................|......................................................................................................................................................................... + // mul v16.4s, v24.4s, v0.s[0] // ...................................................................................................................................................e........................|......................................................................................................................................................................... + // sqrdmulh v24.4s, v24.4s, v0.s[1] // ..................................................................................................................................................e.........................|......................................................................................................................................................................... + // mls v16.4s, v24.4s, v29.4s // ............................................................................................................................................................e...............|......................................................................................................................................................................... + // sub v24.4s, v9.4s, v17.4s // .................................................................................................................................................................e..........|......................................................................................................................................................................... 
+ // add v9.4s, v9.4s, v17.4s // ................................................................................................................................................................e...........|......................................................................................................................................................................... + // mul v17.4s, v24.4s, v0.s[0] // ............................................................................................................................................................................|....................*.................................................................................................................................................... + // sqrdmulh v24.4s, v24.4s, v0.s[1] // ............................................................................................................................................................................|.....................*................................................................................................................................................... + // mls v17.4s, v24.4s, v29.4s // ............................................................................................................................................................................|...........................*............................................................................................................................................. + // sub v24.4s, v10.4s, v18.4s // .......................................................................................................................................................................e....|......................................................................................................................................................................... 
+ // add v10.4s, v10.4s, v18.4s // ........................................................................................................................................................................e...|......................................................................................................................................................................... + // mul v18.4s, v24.4s, v0.s[0] // ............................................................................................................................................................................|..........*.............................................................................................................................................................. + // sqrdmulh v24.4s, v24.4s, v0.s[1] // ............................................................................................................................................................................|............*............................................................................................................................................................ + // mls v18.4s, v24.4s, v29.4s // ............................................................................................................................................................................|..................*...................................................................................................................................................... + // sub v24.4s, v11.4s, v19.4s // ............................................................................................................................................................................|....*.................................................................................................................................................................... 
+ // add v11.4s, v11.4s, v19.4s // ............................................................................................................................................................................|...*..................................................................................................................................................................... + // mul v19.4s, v24.4s, v0.s[0] // ............................................................................................................................................................................|.............*........................................................................................................................................................... + // sqrdmulh v24.4s, v24.4s, v0.s[1] // ............................................................................................................................................................................|..............*.......................................................................................................................................................... + // mls v19.4s, v24.4s, v29.4s // ............................................................................................................................................................................|...................*..................................................................................................................................................... + // sub v24.4s, v12.4s, v20.4s // ............................................................................................................................................................................|.........................*............................................................................................................................................... 
+ // add v12.4s, v12.4s, v20.4s // ............................................................................................................................................................................|..........................*.............................................................................................................................................. + // mul v20.4s, v24.4s, v0.s[0] // ............................................................................................................................................................................|..................................*...................................................................................................................................... + // sqrdmulh v24.4s, v24.4s, v0.s[1] // ............................................................................................................................................................................|...................................*..................................................................................................................................... + // mls v20.4s, v24.4s, v29.4s // ............................................................................................................................................................................|...........................................*............................................................................................................................. + // sub v24.4s, v13.4s, v21.4s // ............................................................................................................................................................................|.......................*................................................................................................................................................. 
+ // add v13.4s, v13.4s, v21.4s // ............................................................................................................................................................................|........................*................................................................................................................................................ + // mul v21.4s, v24.4s, v0.s[0] // ............................................................................................................................................................................|................................................*........................................................................................................................ + // sqrdmulh v24.4s, v24.4s, v0.s[1] // ............................................................................................................................................................................|...................................................*..................................................................................................................... + // mls v21.4s, v24.4s, v29.4s // ............................................................................................................................................................................|.................................................................*....................................................................................................... + // sub v24.4s, v14.4s, v22.4s // ............................................................................................................................................................................|......*.................................................................................................................................................................. 
+ // add v14.4s, v14.4s, v22.4s // ............................................................................................................................................................................|.....*................................................................................................................................................................... + // mul v22.4s, v24.4s, v0.s[0] // ............................................................................................................................................................................|........*................................................................................................................................................................ + // sqrdmulh v24.4s, v24.4s, v0.s[1] // ............................................................................................................................................................................|.........*............................................................................................................................................................... + // mls v22.4s, v24.4s, v29.4s // ............................................................................................................................................................................|...............*......................................................................................................................................................... + // sub v24.4s, v15.4s, v23.4s // ............................................................................................................................................................................|.....................................*................................................................................................................................... 
+ // add v15.4s, v15.4s, v23.4s // ............................................................................................................................................................................|.......................................*................................................................................................................................. + // mul v23.4s, v24.4s, v0.s[0] // ............................................................................................................................................................................|.............................................*........................................................................................................................... + // sqrdmulh v24.4s, v24.4s, v0.s[1] // ............................................................................................................................................................................|............................................*............................................................................................................................ + // mls v23.4s, v24.4s, v29.4s // ............................................................................................................................................................................|.........................................................*............................................................................................................... + // cmge v27.4s, v31.4s, v16.4s // ............................................................................................................................................................................|..*...................................................................................................................................................................... 
+ // cmge v28.4s, v16.4s, v30.4s // ............................................................................................................................................................................|*........................................................................................................................................................................ + // sub v28.4s, v27.4s, v28.4s // ............................................................................................................................................................................|.......*................................................................................................................................................................. + // mls v16.4s, v28.4s, v29.4s // ............................................................................................................................................................................|...........*............................................................................................................................................................. + // cmge v27.4s, v31.4s, v17.4s // ............................................................................................................................................................................|.............................................................*........................................................................................................... + // cmge v28.4s, v17.4s, v30.4s // ............................................................................................................................................................................|...............................................................*......................................................................................................... 
+ // sub v28.4s, v27.4s, v28.4s // ............................................................................................................................................................................|.....................................................................*................................................................................................... + // mls v17.4s, v28.4s, v29.4s // ............................................................................................................................................................................|............................................................................*............................................................................................ + // cmge v27.4s, v31.4s, v18.4s // ............................................................................................................................................................................|.............................*........................................................................................................................................... + // cmge v28.4s, v18.4s, v30.4s // ............................................................................................................................................................................|..............................*.......................................................................................................................................... + // sub v28.4s, v27.4s, v28.4s // ............................................................................................................................................................................|.................................*....................................................................................................................................... 
+ // mls v18.4s, v28.4s, v29.4s // ............................................................................................................................................................................|......................................*.................................................................................................................................. + // cmge v27.4s, v31.4s, v19.4s // ............................................................................................................................................................................|................................*........................................................................................................................................ + // cmge v28.4s, v19.4s, v30.4s // ............................................................................................................................................................................|...............................*......................................................................................................................................... + // sub v28.4s, v27.4s, v28.4s // ............................................................................................................................................................................|....................................*.................................................................................................................................... + // mls v19.4s, v28.4s, v29.4s // ............................................................................................................................................................................|..........................................*.............................................................................................................................. 
+ // cmge v27.4s, v31.4s, v20.4s // ............................................................................................................................................................................|..................................................................*...................................................................................................... + // cmge v28.4s, v20.4s, v30.4s // ............................................................................................................................................................................|...................................................................*..................................................................................................... + // sub v28.4s, v27.4s, v28.4s // ............................................................................................................................................................................|........................................................................*................................................................................................ + // mls v20.4s, v28.4s, v29.4s // ............................................................................................................................................................................|.................................................................................*....................................................................................... + // cmge v27.4s, v31.4s, v21.4s // ..........*.................................................................................................................................................................|.....................................................................................................................*................................................... 
+ // cmge v28.4s, v21.4s, v30.4s // ............................................................................................................................................................................|.......................................................................................................*................................................................. + // sub v28.4s, v27.4s, v28.4s // ....................*.......................................................................................................................................................|...............................................................................................................................*......................................... + // mls v21.4s, v28.4s, v29.4s // ...........................*................................................................................................................................................|......................................................................................................................................*.................................. + // cmge v27.4s, v31.4s, v22.4s // ............................................................................................................................................................................|........................................*................................................................................................................................ + // cmge v28.4s, v22.4s, v30.4s // ............................................................................................................................................................................|.........................................*............................................................................................................................... 
+ // sub v28.4s, v27.4s, v28.4s // ............................................................................................................................................................................|..............................................*.......................................................................................................................... + // mls v22.4s, v28.4s, v29.4s // ............................................................................................................................................................................|......................................................*.................................................................................................................. + // cmge v27.4s, v31.4s, v23.4s // ............................................................................................................................................................................|......................................................................*.................................................................................................. + // cmge v28.4s, v23.4s, v30.4s // ............................................................................................................................................................................|.......................................................................*................................................................................................. + // sub v28.4s, v27.4s, v28.4s // ............................................................................................................................................................................|...........................................................................*............................................................................................. 
+ // mls v23.4s, v28.4s, v29.4s // ............................................................................................................................................................................|...................................................................................*..................................................................................... + // str q16, [x1, #(8*(512/8))] // ............................................................................................................................................................................|......................*.................................................................................................................................................. + // str q17, [x1, #(9*(512/8))] // ............................................................................................................................................................................|......................................................................................*.................................................................................. + // str q18, [x1, #(10*(512/8))] // ............................................................................................................................................................................|.................................................*....................................................................................................................... + // str q19, [x1, #(11*(512/8))] // ............................................................................................................................................................................|...........................................................*............................................................................................................. 
+ // str q20, [x1, #(12*(512/8))] // ............................................................................................................................................................................|........................................................................................*................................................................................ + // str q21, [x1, #(13*(512/8))] // ...................................*........................................................................................................................................|..............................................................................................................................................*.......................... + // str q22, [x1, #(14*(512/8))] // ............................................................................................................................................................................|..........................................................*.............................................................................................................. + // str q23, [x1, #(15*(512/8))] // ............................................................................................................................................................................|...............................................................................................*......................................................................... + // mul v16.4s, v8.4s, v25.4s // ............................................................................................................................................................................|....................................................*.................................................................................................................... 
+ // sqrdmulh v8.4s, v8.4s, v26.4s // ............................................................................................................................................................................|.....................................................*................................................................................................................... + // mls v16.4s, v8.4s, v29.4s // ............................................................................................................................................................................|.......................................................*................................................................................................................. + // mul v17.4s, v9.4s, v25.4s // ............................................................................................................................................................................|..........................................................................................*.............................................................................. + // sqrdmulh v9.4s, v9.4s, v26.4s // ............................................................................................................................................................................|............................................................................................*............................................................................ + // mls v17.4s, v9.4s, v29.4s // ............................................................................................................................................................................|.....................................................................................................*................................................................... 
+ // mul v18.4s, v10.4s, v25.4s // ............................................................................................................................................................................|.............................................................................*........................................................................................... + // sqrdmulh v10.4s, v10.4s, v26.4s // ............................................................................................................................................................................|..............................................................................*.......................................................................................... + // mls v18.4s, v10.4s, v29.4s // .*..........................................................................................................................................................................|............................................................................................................*............................................................ + // mul v19.4s, v11.4s, v25.4s // ............................................................................................................................................................................|............................................................*............................................................................................................ + // sqrdmulh v11.4s, v11.4s, v26.4s // ............................................................................................................................................................................|........................................................*................................................................................................................ 
+ // mls v19.4s, v11.4s, v29.4s // ............................................................................................................................................................................|....................................................................*.................................................................................................... + // mul v20.4s, v12.4s, v25.4s // ............................................................................................................................................................................|.............................................................................................*........................................................................... + // sqrdmulh v12.4s, v12.4s, v26.4s // ............................................................................................................................................................................|.........................................................................................*............................................................................... + // mls v20.4s, v12.4s, v29.4s // ............................................................................................................................................................................|....................................................................................................*.................................................................... + // mul v21.4s, v13.4s, v25.4s // ............................................................................................................................................................................|...............................................*......................................................................................................................... 
+ // sqrdmulh v13.4s, v13.4s, v26.4s // ............................................................................................................................................................................|..................................................*...................................................................................................................... + // mls v21.4s, v13.4s, v29.4s // ............................................................................................................................................................................|..............................................................*.......................................................................................................... + // mul v22.4s, v14.4s, v25.4s // ............................................................................................................................................................................|................................................................*........................................................................................................ + // sqrdmulh v14.4s, v14.4s, v26.4s // ............................................................................................................................................................................|..........................................................................*.............................................................................................. + // mls v22.4s, v14.4s, v29.4s // ............................................................................................................................................................................|.......................................................................................*................................................................................. 
+ // mul v23.4s, v15.4s, v25.4s // ............................................................................................................................................................................|................................................................................................*........................................................................ + // sqrdmulh v15.4s, v15.4s, v26.4s // ............................................................................................................................................................................|.................................................................................................*....................................................................... + // mls v23.4s, v15.4s, v29.4s // ............................................................................................................................................................................|..........................................................................................................*.............................................................. + // cmge v27.4s, v31.4s, v16.4s // ............................................................................................................................................................................|.........................................................................*............................................................................................... + // cmge v28.4s, v16.4s, v30.4s // ............................................................................................................................................................................|..................................................................................*...................................................................................... 
+ // sub v28.4s, v27.4s, v28.4s // ............................................................................................................................................................................|.....................................................................................*................................................................................... + // mls v16.4s, v28.4s, v29.4s // ............................................................................................................................................................................|...........................................................................................*............................................................................. + // cmge v27.4s, v31.4s, v17.4s // ...............*............................................................................................................................................................|..........................................................................................................................*.............................................. + // cmge v28.4s, v17.4s, v30.4s // .....................*......................................................................................................................................................|................................................................................................................................*........................................ + // sub v28.4s, v27.4s, v28.4s // .........................*..................................................................................................................................................|....................................................................................................................................*.................................... 
+ // mls v17.4s, v28.4s, v29.4s // ................................................*...........................................................................................................................|...........................................................................................................................................................*............. + // cmge v27.4s, v31.4s, v18.4s // ......................................*.....................................................................................................................................|.................................................................................................................................................*....................... + // cmge v28.4s, v18.4s, v30.4s // ...............................*............................................................................................................................................|..........................................................................................................................................*.............................. + // sub v28.4s, v27.4s, v28.4s // ...............................................*............................................................................................................................|..........................................................................................................................................................*.............. + // mls v18.4s, v28.4s, v29.4s // ........................................................*...................................................................................................................|...................................................................................................................................................................*..... 
+ // cmge v27.4s, v31.4s, v19.4s // ............................................................................................................................................................................|................................................................................*........................................................................................ + // cmge v28.4s, v19.4s, v30.4s // ............................................................................................................................................................................|...............................................................................*......................................................................................... + // sub v28.4s, v27.4s, v28.4s // ............................................................................................................................................................................|....................................................................................*.................................................................................... + // mls v19.4s, v28.4s, v29.4s // ......*.....................................................................................................................................................................|.................................................................................................................*....................................................... + // cmge v27.4s, v31.4s, v20.4s // ........*...................................................................................................................................................................|...................................................................................................................*..................................................... 
+ // cmge v28.4s, v20.4s, v30.4s // .........*..................................................................................................................................................................|....................................................................................................................*.................................................... + // sub v28.4s, v27.4s, v28.4s // .................*..........................................................................................................................................................|............................................................................................................................*............................................ + // mls v20.4s, v28.4s, v29.4s // ..........................*.................................................................................................................................................|.....................................................................................................................................*................................... + // cmge v27.4s, v31.4s, v21.4s // ............................................................................................................................................................................|..............................................................................................*.......................................................................... + // cmge v28.4s, v21.4s, v30.4s // ............................................................................................................................................................................|..................................................................................................*...................................................................... 
+ // sub v28.4s, v27.4s, v28.4s // ............................................................................................................................................................................|.........................................................................................................*............................................................... + // mls v21.4s, v28.4s, v29.4s // ...................*........................................................................................................................................................|..............................................................................................................................*.......................................... + // cmge v27.4s, v31.4s, v22.4s // ............................................................................................................................................................................|...................................................................................................*..................................................................... + // cmge v28.4s, v22.4s, v30.4s // ............................................................................................................................................................................|........................................................................................................*................................................................ + // sub v28.4s, v27.4s, v28.4s // ..*.........................................................................................................................................................................|.............................................................................................................*........................................................... 
+ // mls v22.4s, v28.4s, v29.4s // ...........*................................................................................................................................................................|......................................................................................................................*.................................................. + // cmge v27.4s, v31.4s, v23.4s // .....*......................................................................................................................................................................|................................................................................................................*........................................................ + // cmge v28.4s, v23.4s, v30.4s // .......*....................................................................................................................................................................|..................................................................................................................*...................................................... + // sub v28.4s, v27.4s, v28.4s // ..............*.............................................................................................................................................................|.........................................................................................................................*............................................... + // mls v23.4s, v28.4s, v29.4s // ........................*...................................................................................................................................................|...................................................................................................................................*..................................... 
+ // str q16, [x1], #(16) // ............................................................................................................................................................................|......................................................................................................*.................................................................. + // str q17, [x1, #(-16 + 1*(512/8))] // ...........................................................*................................................................................................................|......................................................................................................................................................................*.. + // str q18, [x1, #(-16 + 2*(512/8))] // .............................................................*..............................................................................................................|........................................................................................................................................................................* + // str q19, [x1, #(-16 + 3*(512/8))] // ..................*.........................................................................................................................................................|.............................................................................................................................*........................................... + // str q20, [x1, #(-16 + 4*(512/8))] // ....................................*.......................................................................................................................................|...............................................................................................................................................*......................... 
+ // str q21, [x1, #(-16 + 5*(512/8))] // ..............................*.............................................................................................................................................|.........................................................................................................................................*............................... + // str q22, [x1, #(-16 + 6*(512/8))] // ............................*...............................................................................................................................................|.......................................................................................................................................*................................. + // str q23, [x1, #(-16 + 7*(512/8))] // .......................................*....................................................................................................................................|..................................................................................................................................................*...................... + + sub count, count, #1 + cbnz count, layer1234_start + mls v24.4S, v22.4S, v29.4S // ...................................................................................................................*.................................................................................................................................................................... + mls v27.4S, v21.4S, v29.4S // .............................................................................................................................*.......................................................................................................................................................... 
+ cmge v22.4S, v16.4S, v30.4S // .................................................................................................................................................................................*...................................................................................................... + cmge v21.4S, v31.4S, v16.4S // ................................................................................................................................................................................*....................................................................................................... + add v11.4S, v14.4S, v23.4S // ........................................................................................................................................................*............................................................................................................................... + sub v23.4S, v14.4S, v23.4S // .......................................................................................................................................................*................................................................................................................................ + add v14.4S, v17.4S, v18.4S // .......................................................................................................................................................................*................................................................................................................ + sub v17.4S, v17.4S, v18.4S // ......................................................................................................................................................................*................................................................................................................. 
+ mul v18.4S, v19.4S, v0.S[0] // ....................................................................................................................................................*................................................................................................................................... + sqrdmulh v19.4S, v19.4S, v0.S[1] // .....................................................................................................................................................*.................................................................................................................................. + sub v21.4S, v21.4S, v22.4S // ..................................................................................................................................................................................*..................................................................................................... + sub v22.4S, v12.4S, v20.4S // ............................................................................................................................................................*........................................................................................................................... + add v12.4S, v12.4S, v20.4S // .............................................................................................................................................................*.......................................................................................................................... + sqrdmulh v20.4S, v28.4S, v1.S[1] // ......................................................................................................................................*................................................................................................................................................. 
+ mul v28.4S, v28.4S, v1.S[0] // .....................................................................................................................................*.................................................................................................................................................. + mls v16.4S, v21.4S, v29.4S // ...................................................................................................................................................................................*.................................................................................................... + sub v21.4S, v13.4S, v27.4S // .................................................................................................................................................................*...................................................................................................................... + add v13.4S, v13.4S, v27.4S // ..................................................................................................................................................................*..................................................................................................................... + mls v18.4S, v19.4S, v29.4S // ......................................................................................................................................................*................................................................................................................................. + mul v19.4S, v23.4S, v0.S[0] // .........................................................................................................................................................*.............................................................................................................................. 
+ sqrdmulh v23.4S, v23.4S, v0.S[1] // ..........................................................................................................................................................*............................................................................................................................. + mls v28.4S, v20.4S, v29.4S // .......................................................................................................................................*................................................................................................................................................ + mul v20.4S, v22.4S, v0.S[0] // ..............................................................................................................................................................*......................................................................................................................... + sqrdmulh v27.4S, v22.4S, v0.S[1] // ...............................................................................................................................................................*........................................................................................................................ + mul v22.4S, v17.4S, v0.S[0] // ........................................................................................................................................................................*............................................................................................................... + str q16, [x1, #512] // ................................................................................................................................................................................................................*....................................................................... 
+ sqrdmulh v16.4S, v17.4S, v0.S[1] // .........................................................................................................................................................................*.............................................................................................................. + mul v17.4S, v15.4S, v0.S[0] // ...............................................................................................................................................*........................................................................................................................................ + mls v22.4S, v16.4S, v29.4S // ..........................................................................................................................................................................*............................................................................................................. + cmge v16.4S, v18.4S, v30.4S // .........................................................................................................................................................................................*.............................................................................................. + mls v20.4S, v27.4S, v29.4S // ................................................................................................................................................................*....................................................................................................................... + cmge v27.4S, v31.4S, v18.4S // ........................................................................................................................................................................................*............................................................................................... 
+ mls v19.4S, v23.4S, v29.4S // ...........................................................................................................................................................*............................................................................................................................ + sqrdmulh v23.4S, v15.4S, v0.S[1] // ................................................................................................................................................*....................................................................................................................................... + add v15.4S, v24.4S, v28.4S // ............................................................................................................................................................................*........................................................................................................... + sub v27.4S, v27.4S, v16.4S // ..........................................................................................................................................................................................*............................................................................................. + sub v16.4S, v24.4S, v28.4S // ...........................................................................................................................................................................*............................................................................................................ + cmge v28.4S, v22.4S, v30.4S // .........................................................................................................................................................................................................*.............................................................................. 
+ cmge v24.4S, v31.4S, v22.4S // ........................................................................................................................................................................................................*............................................................................... + mls v18.4S, v27.4S, v29.4S // ...........................................................................................................................................................................................*............................................................................................ + sqrdmulh v27.4S, v21.4S, v0.S[1] // ....................................................................................................................................................................*................................................................................................................... + mls v17.4S, v23.4S, v29.4S // .................................................................................................................................................*...................................................................................................................................... + sub v23.4S, v24.4S, v28.4S // ..........................................................................................................................................................................................................*............................................................................. + cmge v28.4S, v19.4S, v30.4S // .............................................................................................................................................................................................*.......................................................................................... 
+ mul v24.4S, v21.4S, v0.S[0] // ...................................................................................................................................................................*.................................................................................................................... + cmge v21.4S, v31.4S, v19.4S // ............................................................................................................................................................................................*........................................................................................... + mls v22.4S, v23.4S, v29.4S // ...........................................................................................................................................................................................................*............................................................................ + mul v23.4S, v16.4S, v0.S[0] // .............................................................................................................................................................................*.......................................................................................................... + sqrdmulh v16.4S, v16.4S, v0.S[1] // ..............................................................................................................................................................................*......................................................................................................... + sub v28.4S, v21.4S, v28.4S // ..............................................................................................................................................................................................*......................................................................................... 
+ mls v24.4S, v27.4S, v29.4S // .....................................................................................................................................................................*.................................................................................................................. + cmge v21.4S, v31.4S, v20.4S // ................................................................................................................................................................................................*....................................................................................... + cmge v27.4S, v31.4S, v17.4S // ....................................................................................................................................................................................*................................................................................................... + str q18, [x1, #640] // ..................................................................................................................................................................................................................*..................................................................... + cmge v18.4S, v20.4S, v30.4S // .................................................................................................................................................................................................*...................................................................................... + mls v19.4S, v28.4S, v29.4S // ...............................................................................................................................................................................................*........................................................................................ 
+ mls v23.4S, v16.4S, v29.4S // ...............................................................................................................................................................................*........................................................................................................ + sub v18.4S, v21.4S, v18.4S // ..................................................................................................................................................................................................*..................................................................................... + str q19, [x1, #704] // ...................................................................................................................................................................................................................*.................................................................... + cmge v19.4S, v24.4S, v30.4S // .....................................................................................................................................................................................................*.................................................................................. + cmge v28.4S, v31.4S, v24.4S // ....................................................................................................................................................................................................*................................................................................... + mul v21.4S, v13.4S, v25.4S // .......................................................................................................................................................................................................................................*................................................ 
+ str q22, [x1, #896] // ......................................................................................................................................................................................................................*................................................................. + cmge v16.4S, v17.4S, v30.4S // .....................................................................................................................................................................................*.................................................................................................. + mul v22.4S, v14.4S, v25.4S // ..........................................................................................................................................................................................................................................*............................................. + sqrdmulh v13.4S, v13.4S, v26.4S // ........................................................................................................................................................................................................................................*............................................... + sqrdmulh v14.4S, v14.4S, v26.4S // ...........................................................................................................................................................................................................................................*............................................ + mls v20.4S, v18.4S, v29.4S // ...................................................................................................................................................................................................*.................................................................................... 
+ sub v18.4S, v28.4S, v19.4S // ......................................................................................................................................................................................................*................................................................................. + sqrdmulh v28.4S, v8.4S, v26.4S // .........................................................................................................................................................................................................................*.............................................................. + sub v27.4S, v27.4S, v16.4S // ......................................................................................................................................................................................*................................................................................................. + mul v16.4S, v8.4S, v25.4S // ........................................................................................................................................................................................................................*............................................................... + cmge v19.4S, v31.4S, v23.4S // ............................................................................................................................................................................................................*........................................................................... + cmge v8.4S, v23.4S, v30.4S // .............................................................................................................................................................................................................*.......................................................................... 
+ mls v21.4S, v13.4S, v29.4S // .........................................................................................................................................................................................................................................*.............................................. + mls v24.4S, v18.4S, v29.4S // .......................................................................................................................................................................................................*................................................................................ + sub v8.4S, v19.4S, v8.4S // ..............................................................................................................................................................................................................*......................................................................... + str q20, [x1, #768] // ....................................................................................................................................................................................................................*................................................................... + mls v22.4S, v14.4S, v29.4S // ............................................................................................................................................................................................................................................*........................................... + mul v19.4S, v11.4S, v25.4S // .................................................................................................................................................................................................................................*...................................................... 
+ sqrdmulh v14.4S, v11.4S, v26.4S // ..................................................................................................................................................................................................................................*..................................................... + mls v17.4S, v27.4S, v29.4S // .......................................................................................................................................................................................*................................................................................................ + mls v16.4S, v28.4S, v29.4S // ..........................................................................................................................................................................................................................*............................................................. + mul v20.4S, v12.4S, v25.4S // ....................................................................................................................................................................................................................................*................................................... + mls v23.4S, v8.4S, v29.4S // ...............................................................................................................................................................................................................*........................................................................ + cmge v8.4S, v31.4S, v16.4S // ................................................................................................................................................................................................................................................*....................................... 
+ sqrdmulh v27.4S, v12.4S, v26.4S // .....................................................................................................................................................................................................................................*.................................................. + mul v18.4S, v10.4S, v25.4S // ..............................................................................................................................................................................................................................*......................................................... + str q24, [x1, #832] // .....................................................................................................................................................................................................................*.................................................................. + sqrdmulh v12.4S, v10.4S, v26.4S // ...............................................................................................................................................................................................................................*........................................................ + mls v19.4S, v14.4S, v29.4S // ...................................................................................................................................................................................................................................*.................................................... + str q17, [x1, #576] // .................................................................................................................................................................................................................*...................................................................... 
+ cmge v13.4S, v22.4S, v30.4S // .........................................................................................................................................................................................................................................................................*.............. + mul v17.4S, v9.4S, v25.4S // ...........................................................................................................................................................................................................................*............................................................ + sqrdmulh v28.4S, v9.4S, v26.4S // ............................................................................................................................................................................................................................*........................................................... + cmge v10.4S, v31.4S, v22.4S // ........................................................................................................................................................................................................................................................................*............... + cmge v11.4S, v21.4S, v30.4S // .....................................................................................................................................................................................................................................................................*.................. + cmge v9.4S, v31.4S, v21.4S // ....................................................................................................................................................................................................................................................................*................... 
+ mls v20.4S, v27.4S, v29.4S // ......................................................................................................................................................................................................................................*................................................. + str q23, [x1, #960] // .......................................................................................................................................................................................................................*................................................................ + sqrdmulh v27.4S, v15.4S, v26.4S // ..............................................................................................................................................................................................................................................*......................................... + mul v23.4S, v15.4S, v25.4S // .............................................................................................................................................................................................................................................*.......................................... + mls v18.4S, v12.4S, v29.4S // ................................................................................................................................................................................................................................*....................................................... + sub v10.4S, v10.4S, v13.4S // ..........................................................................................................................................................................................................................................................................*............. 
+ cmge v24.4S, v16.4S, v30.4S // .................................................................................................................................................................................................................................................*...................................... + mls v17.4S, v28.4S, v29.4S // .............................................................................................................................................................................................................................*.......................................................... + sub v13.4S, v9.4S, v11.4S // ......................................................................................................................................................................................................................................................................*................. + cmge v15.4S, v31.4S, v19.4S // ............................................................................................................................................................................................................................................................*........................... + sub v11.4S, v8.4S, v24.4S // ..................................................................................................................................................................................................................................................*..................................... + mls v23.4S, v27.4S, v29.4S // ...............................................................................................................................................................................................................................................*........................................ 
+ cmge v27.4S, v20.4S, v30.4S // .................................................................................................................................................................................................................................................................*...................... + cmge v9.4S, v31.4S, v20.4S // ................................................................................................................................................................................................................................................................*....................... + mls v22.4S, v10.4S, v29.4S // ...........................................................................................................................................................................................................................................................................*............ + mls v21.4S, v13.4S, v29.4S // .......................................................................................................................................................................................................................................................................*................ + mls v16.4S, v11.4S, v29.4S // ...................................................................................................................................................................................................................................................*.................................... + sub v10.4S, v9.4S, v27.4S // ..................................................................................................................................................................................................................................................................*..................... 
+ cmge v27.4S, v19.4S, v30.4S // .............................................................................................................................................................................................................................................................*.......................... + cmge v13.4S, v18.4S, v30.4S // .........................................................................................................................................................................................................................................................*.............................. + cmge v24.4S, v31.4S, v18.4S // ........................................................................................................................................................................................................................................................*............................... + cmge v8.4S, v31.4S, v17.4S // ....................................................................................................................................................................................................................................................*................................... + mls v20.4S, v10.4S, v29.4S // ...................................................................................................................................................................................................................................................................*.................... + cmge v10.4S, v17.4S, v30.4S // .....................................................................................................................................................................................................................................................*.................................. 
+ cmge v11.4S, v23.4S, v30.4S // .............................................................................................................................................................................................................................................................................*.......... + cmge v12.4S, v31.4S, v23.4S // ............................................................................................................................................................................................................................................................................*........... + sub v27.4S, v15.4S, v27.4S // ..............................................................................................................................................................................................................................................................*......................... + str q22, [x1, #384] // ......................................................................................................................................................................................................................................................................................*. + str q21, [x1, #320] // .....................................................................................................................................................................................................................................................................................*.. + sub v22.4S, v24.4S, v13.4S // ..........................................................................................................................................................................................................................................................*............................. 
+ str q16, [x1], #(16) // ................................................................................................................................................................................................................................................................................*....... + sub v10.4S, v8.4S, v10.4S // ......................................................................................................................................................................................................................................................*................................. + sub v13.4S, v12.4S, v11.4S // ..............................................................................................................................................................................................................................................................................*......... + mls v19.4S, v27.4S, v29.4S // ...............................................................................................................................................................................................................................................................*........................ + mls v18.4S, v22.4S, v29.4S // ...........................................................................................................................................................................................................................................................*............................ + str q20, [x1, #240] // ....................................................................................................................................................................................................................................................................................*... 
+ mls v23.4S, v13.4S, v29.4S // ...............................................................................................................................................................................................................................................................................*........ + mls v17.4S, v10.4S, v29.4S // .......................................................................................................................................................................................................................................................*................................ + str q19, [x1, #176] // ...................................................................................................................................................................................................................................................................................*.... + str q18, [x1, #112] // ..................................................................................................................................................................................................................................................................................*..... + str q23, [x1, #432] // .......................................................................................................................................................................................................................................................................................* + str q17, [x1, #48] // .................................................................................................................................................................................................................................................................................*...... 
+ + pop_stack + ret \ No newline at end of file diff --git a/tests/ntt_dilithium/manual/intt_dilithium_123_45678.s b/tests/ntt_dilithium/manual/intt_dilithium_123_45678.s index 8b85eb4..e505eed 100644 --- a/tests/ntt_dilithium/manual/intt_dilithium_123_45678.s +++ b/tests/ntt_dilithium/manual/intt_dilithium_123_45678.s @@ -371,6 +371,8 @@ _intt_dilithium_123_45678: consts .req v8 qform_consts .req q8 + modulus .req v29 + ASM_LOAD(r_ptr0, roots_l345) ASM_LOAD(r_ptr1, roots_l67) @@ -433,6 +435,12 @@ layer45678_start: gs_butterfly data4, data6, root1, 0, 1 gs_butterfly data5, data7, root1, 0, 1 + // Interm. Reduction + montg_reduce data0 + montg_reduce data1 + montg_reduce data4 + montg_reduce data5 + // Layer 4 gs_butterfly data0, data4, root0, 0, 1 gs_butterfly data1, data5, root0, 0, 1 @@ -461,12 +469,17 @@ layer45678_start: ninv .req v25 ninv_tw .req v26 + modulus_half .req v30 + neg_modulus_half .req v31 ASM_LOAD(xtmp, ninv_addr) ld1r {ninv.4s}, [xtmp] ASM_LOAD(xtmp, ninv_tw_addr) ld1r {ninv_tw.4s}, [xtmp] + ushr modulus_half.4S, modulus.4S, #1 + neg neg_modulus_half.4S, modulus_half.4S + mov count, #8 ASM_LOAD(r_ptr0, roots_l012) load_roots_123 @@ -499,10 +512,10 @@ layer123_start: gs_butterfly data2, data6, root0, 0, 1 gs_butterfly data3, data7, root0, 0, 1 - montg_reduce data4 - montg_reduce data5 - montg_reduce data6 - montg_reduce data7 + canonical_reduce data4, modulus_half, neg_modulus_half, t2, t3 + canonical_reduce data5, modulus_half, neg_modulus_half, t2, t3 + canonical_reduce data6, modulus_half, neg_modulus_half, t2, t3 + canonical_reduce data7, modulus_half, neg_modulus_half, t2, t3 str_vo data4, in, (4*(1024/8)) str_vo data5, in, (5*(1024/8)) @@ -511,6 +524,11 @@ layer123_start: mul_ninv data4, data5, data6, data7, data0, data1, data2, data3 + canonical_reduce data4, modulus_half, neg_modulus_half, t2, t3 + canonical_reduce data5, modulus_half, neg_modulus_half, t2, t3 + canonical_reduce data6, modulus_half, neg_modulus_half, t2, t3 + 
canonical_reduce data7, modulus_half, neg_modulus_half, t2, t3 + str_vi data4, in, (16) str_vo data5, in, (-16 + 1*(1024/8)) str_vo data6, in, (-16 + 2*(1024/8)) diff --git a/tests/ntt_dilithium/manual/intt_dilithium_123_45678_manual_ld4.s b/tests/ntt_dilithium/manual/intt_dilithium_123_45678_manual_ld4.s index c0cd992..067c375 100644 --- a/tests/ntt_dilithium/manual/intt_dilithium_123_45678_manual_ld4.s +++ b/tests/ntt_dilithium/manual/intt_dilithium_123_45678_manual_ld4.s @@ -371,6 +371,8 @@ _intt_dilithium_123_45678_manual_ld4: consts .req v8 qform_consts .req q8 + modulus .req v29 + ASM_LOAD(r_ptr0, roots_l345) ASM_LOAD(r_ptr1, roots_l67) @@ -442,6 +444,12 @@ layer45678_start: gs_butterfly data4, data6, root1, 0, 1 gs_butterfly data5, data7, root1, 0, 1 + // Interm. Reduction + montg_reduce data0 + montg_reduce data1 + montg_reduce data4 + montg_reduce data5 + // Layer 4 gs_butterfly data0, data4, root0, 0, 1 gs_butterfly data1, data5, root0, 0, 1 @@ -470,12 +478,17 @@ layer45678_start: ninv .req v25 ninv_tw .req v26 + modulus_half .req v30 + neg_modulus_half .req v31 ASM_LOAD(xtmp, ninv_addr) ld1r {ninv.4s}, [xtmp] ASM_LOAD(xtmp, ninv_tw_addr) ld1r {ninv_tw.4s}, [xtmp] + ushr modulus_half.4S, modulus.4S, #1 + neg neg_modulus_half.4S, modulus_half.4S + mov count, #8 ASM_LOAD(r_ptr0, roots_l012) load_roots_123 @@ -508,10 +521,10 @@ layer123_start: gs_butterfly data2, data6, root0, 0, 1 gs_butterfly data3, data7, root0, 0, 1 - montg_reduce data4 - montg_reduce data5 - montg_reduce data6 - montg_reduce data7 + canonical_reduce data4, modulus_half, neg_modulus_half, t2, t3 + canonical_reduce data5, modulus_half, neg_modulus_half, t2, t3 + canonical_reduce data6, modulus_half, neg_modulus_half, t2, t3 + canonical_reduce data7, modulus_half, neg_modulus_half, t2, t3 str_vo data4, in, (4*(1024/8)) str_vo data5, in, (5*(1024/8)) @@ -520,6 +533,11 @@ layer123_start: mul_ninv data4, data5, data6, data7, data0, data1, data2, data3 + canonical_reduce data4, modulus_half, 
neg_modulus_half, t2, t3 + canonical_reduce data5, modulus_half, neg_modulus_half, t2, t3 + canonical_reduce data6, modulus_half, neg_modulus_half, t2, t3 + canonical_reduce data7, modulus_half, neg_modulus_half, t2, t3 + str_vi data4, in, (16) str_vo data5, in, (-16 + 1*(1024/8)) str_vo data6, in, (-16 + 2*(1024/8)) diff --git a/tests/ntt_dilithium/manual/intt_dilithium_123_45678_manual_ld4_opt_a55.s b/tests/ntt_dilithium/manual/intt_dilithium_123_45678_manual_ld4_opt_a55.s new file mode 100644 index 0000000..41059cf --- /dev/null +++ b/tests/ntt_dilithium/manual/intt_dilithium_123_45678_manual_ld4_opt_a55.s @@ -0,0 +1,2136 @@ + +// Needed to provide ASM_LOAD directive +#include + +// NOTE +// We use a lot of trivial macros to simplify the parsing burden for Slothy +// The macros are not unfolded by Slothy and thus interpreted as instructions, +// which are easier to parse due to e.g. the lack of size specifiers and simpler +// syntax for pre and post increment for loads and stores. + +// Eventually, NeLight should include a proper parser for AArch64, +// but for initial investigations, the below is enough. 
+xtmp0 .req x10 +xtmp1 .req x11 + +.macro ldr_vo vec, base, offset // load the q-form register of vec from [base + offset] + ldr qform_\vec, [\base, #\offset] +.endm + +.macro ldr_vi vec, base, inc // load the q-form register of vec from [base], then post-increment base by inc + ldr qform_\vec, [\base], #\inc +.endm + +.macro str_vo vec, base, offset // store the q-form register of vec to [base + offset] + str qform_\vec, [\base, #\offset] +.endm +.macro str_vi vec, base, inc // store the q-form register of vec to [base], then post-increment base by inc + str qform_\vec, [\base], #\inc +.endm +.macro vqrdmulh d,a,b // d = sqrdmulh(a, b) on four 32-bit lanes + sqrdmulh \d\().4s, \a\().4s, \b\().4s +.endm +.macro vmls d,a,b // d -= a * b on four 32-bit lanes + mls \d\().4s, \a\().4s, \b\().4s +.endm +.macro vqrdmulhq d,a,b,i // d = sqrdmulh(a, lane i of b) + sqrdmulh \d\().4s, \a\().4s, \b\().s[\i] +.endm +.macro vqdmulhq d,a,b,i // d = sqdmulh(a, lane i of b) + sqdmulh \d\().4s, \a\().4s, \b\().s[\i] +.endm +.macro vmulq d,a,b,i // d = a * (lane i of b) + mul \d\().4s, \a\().4s, \b\().s[\i] +.endm +.macro vmlsq d,a,b,i // d -= a * (lane i of b) + mls \d\().4s, \a\().4s, \b\().s[\i] +.endm + +.macro mulmodq dst, src, const, idx0, idx1 // dst = src * const[idx0] - sqrdmulh(src, const[idx1]) * consts[0]; overwrites src + vmulq \dst, \src, \const, \idx0 + vqrdmulhq \src, \src, \const, \idx1 + vmlsq \dst, \src, consts, 0 +.endm + +.macro mulmod dst, src, const, const_twisted // as mulmodq, but with full-vector const and const_twisted; overwrites src + mul \dst\().4s, \src\().4s, \const\().4s + vqrdmulh \src, \src, \const_twisted + vmlsq \dst, \src, consts, 0 +.endm + +.macro montg_reduce a // a -= srshr(a, 23) * consts; overwrites tmp + srshr tmp.4S, \a\().4S, #23 + vmls \a, tmp, consts +.endm + +.macro canonical_reduce a, modulus_half, neg_modulus_half, tmp1, tmp2 // subtract modulus where a >= modulus_half, add it where a <= neg_modulus_half; overwrites tmp1 and tmp2 + cmge \tmp1\().4s, \neg_modulus_half\().4s, \a\().4s + cmge \tmp2\().4s, \a\().4s, \modulus_half\().4s + sub \tmp2\().4s, \tmp1\().4s, \tmp2\().4s + vmls \a, \tmp2, modulus +.endm + +.macro gs_butterfly a, b, root, idx0, idx1 // (a, b) <- (a + b, mulmodq(a - b, root)); overwrites tmp + sub tmp.4s, \a\().4s, \b\().4s + add \a\().4s, \a\().4s, \b\().4s + mulmodq \b, tmp, \root, \idx0, \idx1 +.endm + +.macro mulmod_v dst, src, const, const_twisted // as mulmod, but the final mls uses the full modulus vector; overwrites src + mul \dst\().4s, \src\().4s, \const\().4s + vqrdmulh \src, \src, \const_twisted + vmls \dst, \src, modulus +.endm + +.macro gs_butterfly_v a, b, root, root_twisted // (a, b) <- (a + b, mulmod(a - b, root, root_twisted)); overwrites tmp + sub tmp.4s, \a\().4s, \b\().4s + add \a\().4s, \a\().4s, \b\().4s + mulmod \b, tmp, \root, \root_twisted +.endm + +.macro mul_ninv dst0, dst1, dst2, dst3, src0, src1, src2, src3 // dstN = mulmod(srcN, ninv, ninv_tw) for N = 0..3; overwrites srcN + mulmod \dst0, \src0, ninv, ninv_tw + mulmod \dst1,
\src1, ninv, ninv_tw + mulmod \dst2, \src2, ninv, ninv_tw + mulmod \dst3, \src3, ninv, ninv_tw +.endm + +.macro load_vectors a0, a1, a2, a3, addr // load four consecutive 16-byte vectors starting at addr + ldr_vo \a0, \addr, (16*0) + ldr_vo \a1, \addr, (16*1) + ldr_vo \a2, \addr, (16*2) + ldr_vo \a3, \addr, (16*3) +.endm + +.macro load_vectors_with_offset a0, a1, a2, a3, addr, offset // load four consecutive 16-byte vectors starting at addr + offset + ldr_vo \a0, \addr, (16*0 + (\offset)) + ldr_vo \a1, \addr, (16*1 + (\offset)) + ldr_vo \a2, \addr, (16*2 + (\offset)) + ldr_vo \a3, \addr, (16*3 + (\offset)) +.endm + +.macro store_vectors_with_inc a0, a1, a2, a3, addr, inc // store four consecutive vectors at the old addr and advance addr by inc + str_vi \a0, \addr, \inc + str_vo \a1, \addr, (-(\inc) + 16*1) + str_vo \a2, \addr, (-(\inc) + 16*2) + str_vo \a3, \addr, (-(\inc) + 16*3) +.endm + +.macro vec_to_scalar_matrix out, in // move both 64-bit halves of in0..in3 into the GPRs out_00..out_31 + vext \out\()_00, \in\()0, 0 + vext \out\()_01, \in\()0, 1 + vext \out\()_10, \in\()1, 0 + vext \out\()_11, \in\()1, 1 + vext \out\()_20, \in\()2, 0 + vext \out\()_21, \in\()2, 1 + vext \out\()_30, \in\()3, 0 + vext \out\()_31, \in\()3, 1 +.endm + +.macro store_scalar_matrix_with_inc x, addr, inc // store the eight GPRs named <x>t_00..<x>t_31 at the old addr and advance addr by inc + str \x\()t_00, [\addr], #( \inc) + str \x\()t_01, [\addr, #(-(\inc) + 8*1)] + str \x\()t_10, [\addr, #(-(\inc) + 8*2)] + str \x\()t_11, [\addr, #(-(\inc) + 8*3)] + str \x\()t_20, [\addr, #(-(\inc) + 8*4)] + str \x\()t_21, [\addr, #(-(\inc) + 8*5)] + str \x\()t_30, [\addr, #(-(\inc) + 8*6)] + str \x\()t_31, [\addr, #(-(\inc) + 8*7)] +.endm + +.macro vext gpr_out, vec_in, lane // gpr_out = 64-bit lane of vec_in + umov \gpr_out\(), \vec_in\().d[\lane] +.endm + +.macro load_roots_123 // load root0..root3 from r_ptr0 and advance r_ptr0 by 64 bytes + ldr_vi root0, r_ptr0, 64 + ldr_vo root1, r_ptr0, (-64 + 16) + ldr_vo root2, r_ptr0, (-64 + 32) + ldr_vo root3, r_ptr0, (-64 + 48) +.endm + +.macro load_roots_456 // same body as load_roots_123 + ldr_vi root0, r_ptr0, 64 + ldr_vo root1, r_ptr0, (-64 + 16) + ldr_vo root2, r_ptr0, (-64 + 32) + ldr_vo root3, r_ptr0, (-64 + 48) +.endm + +.macro load_roots_78_part1 // load the first six of twelve vectors at r_ptr1 and advance r_ptr1 by 12*16 bytes + ldr_vi root0, r_ptr1, (12*16) + ldr_vo root0_tw, r_ptr1, (-12*16 + 1*16) + ldr_vo root1, r_ptr1, (-12*16 + 2*16) + ldr_vo root1_tw, r_ptr1, (-12*16 + 3*16) + ldr_vo root2, r_ptr1,
(-12*16 + 4*16) + ldr_vo root2_tw, r_ptr1, (-12*16 + 5*16) +.endm + +.macro load_roots_78_part2 // offsets carry a -12*16 bias, matching the 12*16 post-increment done in load_roots_78_part1 + ldr_vo root0, r_ptr1, (-12*16 + 6*16) + ldr_vo root0_tw, r_ptr1, (-12*16 + 7*16) + ldr_vo root1, r_ptr1, (-12*16 + 8*16) + ldr_vo root1_tw, r_ptr1, (-12*16 + 9*16) + ldr_vo root2, r_ptr1, (-12*16 + 10*16) + ldr_vo root2_tw, r_ptr1, (-12*16 + 11*16) +.endm + +.macro transpose4 data0, data1, data2, data3 // in-place 4x4 transpose of the 32-bit lanes of data0..data3; overwrites t0-t3 + trn1 t0.4s, \data0\().4s, \data1\().4s + trn2 t1.4s, \data0\().4s, \data1\().4s + trn1 t2.4s, \data2\().4s, \data3\().4s + trn2 t3.4s, \data2\().4s, \data3\().4s + + trn2 \data2\().2d, t0.2d, t2.2d + trn2 \data3\().2d, t1.2d, t3.2d + trn1 \data0\().2d, t0.2d, t2.2d + trn1 \data1\().2d, t1.2d, t3.2d +.endm + +.macro transpose_single data_out0, data_out1, data_out2, data_out3, data_in0, data_in1, data_in2, data_in3 // only the 32-bit trn1/trn2 stage of transpose4, written to separate outputs + trn1 \data_out0\().4s, \data_in0\().4s, \data_in1\().4s + trn2 \data_out1\().4s, \data_in0\().4s, \data_in1\().4s + trn1 \data_out2\().4s, \data_in2\().4s, \data_in3\().4s + trn2 \data_out3\().4s, \data_in2\().4s, \data_in3\().4s +.endm + +.macro save_gprs // slothy:no-unfold + // Reserve 6*16 bytes of stack and store x19-x30 pairwise; restore_gprs is the inverse + sub sp, sp, #(16*6) + stp x19, x20, [sp, #16*0] + stp x21, x22, [sp, #16*1] + stp x23, x24, [sp, #16*2] + stp x25, x26, [sp, #16*3] + stp x27, x28, [sp, #16*4] + stp x29, x30, [sp, #16*5] +.endm + +.macro restore_gprs // slothy:no-unfold + ldp x19, x20, [sp, #16*0] + ldp x21, x22, [sp, #16*1] + ldp x23, x24, [sp, #16*2] + ldp x25, x26, [sp, #16*3] + ldp x27, x28, [sp, #16*4] + ldp x29, x30, [sp, #16*5] + add sp, sp, #(16*6) // release the frame reserved by save_gprs +.endm + +.macro save_vregs // slothy:no-unfold + sub sp, sp, #(16*4) // reserve 4*16 bytes for d8-d15 + stp d8, d9, [sp, #16*0] + stp d10, d11, [sp, #16*1] + stp d12, d13, [sp, #16*2] + stp d14, d15, [sp, #16*3] +.endm + +.macro restore_vregs // slothy:no-unfold + ldp d8, d9, [sp, #16*0] + ldp d10, d11, [sp, #16*1] + ldp d12, d13, [sp, #16*2] + ldp d14, d15, [sp, #16*3] + add sp, sp, #(16*4) // release the frame reserved by save_vregs +.endm + +#define STACK_SIZE 16 +#define STACK0 0 + +.macro restore a, loc //
slothy:no-unfold + ldr \a, [sp, #\loc\()] // reload a from the stack slot at byte offset loc +.endm +.macro save loc, a // slothy:no-unfold + str \a, [sp, #\loc\()] // spill a to the stack slot at byte offset loc +.endm +.macro push_stack // slothy:no-unfold + save_gprs + save_vregs + sub sp, sp, #STACK_SIZE // scratch area addressed by the save and restore macros +.endm + +.macro pop_stack // slothy:no-unfold + add sp, sp, #STACK_SIZE // undo push_stack in reverse order + restore_vregs + restore_gprs +.endm + +.data +.p2align 4 +roots: +#include "intt_dilithium_123_456_78_twiddles.s" +.text + + .global intt_dilithium_123_45678_manual_ld4_opt_a55 + .global _intt_dilithium_123_45678_manual_ld4_opt_a55 + +.p2align 4 +const_addr: .word 8380417 // Dilithium modulus q; broadcast into consts by the function prologue + .word 0 + .word 0 + .word 0 +ninv_addr: .quad 16382 // NOTE(review): presumably the inverse-NTT scaling constant loaded into ninv - confirm +ninv_tw_addr: .quad 4197891 // NOTE(review): presumably the twisted companion of ninv_addr loaded into ninv_tw - confirm +intt_dilithium_123_45678_manual_ld4_opt_a55: +_intt_dilithium_123_45678_manual_ld4_opt_a55: + push_stack + + in .req x0 + inp .req x1 + inpp .req x2 + count .req x3 + r_ptr0 .req x4 + r_ptr1 .req x5 + xtmp .req x6 + + data0 .req v9 + data1 .req v10 + data2 .req v11 + data3 .req v12 + data4 .req v13 + data5 .req v14 + data6 .req v15 + data7 .req v16 + + qform_data0 .req q9 + qform_data1 .req q10 + qform_data2 .req q11 + qform_data3 .req q12 + qform_data4 .req q13 + qform_data5 .req q14 + qform_data6 .req q15 + qform_data7 .req q16 + + qform_v0 .req q0 + qform_v1 .req q1 + qform_v2 .req q2 + qform_v3 .req q3 + qform_v4 .req q4 + qform_v5 .req q5 + qform_v6 .req q6 + qform_v7 .req q7 + qform_v8 .req q8 + qform_v9 .req q9 + qform_v10 .req q10 + qform_v11 .req q11 + qform_v12 .req q12 + qform_v13 .req q13 + qform_v14 .req q14 + qform_v15 .req q15 + qform_v16 .req q16 + qform_v17 .req q17 + qform_v18 .req q18 + qform_v19 .req q19 + qform_v20 .req q20 + qform_v21 .req q21 + qform_v22 .req q22 + qform_v23 .req q23 + qform_v24 .req q24 + qform_v25 .req q25 + qform_v26 .req q26 + qform_v27 .req q27 + qform_v28 .req q28 + qform_v29 .req q29 + qform_v30 .req q30 + qform_v31 .req q31 + + x_00 .req x10 + x_01 .req x11 + x_10 .req x12 + x_11 .req x13 + x_20 .req x14 + x_21 .req x15 + x_30 .req x16 + x_31 .req x17 + + xt_00 .req x_00 + xt_01 .req
x_20 + xt_10 .req x_10 + xt_11 .req x_30 + xt_20 .req x_01 + xt_21 .req x_21 + xt_30 .req x_11 + xt_31 .req x_31 + + root0 .req v0 + root1 .req v1 + root2 .req v2 + root3 .req v3 + + qform_root0 .req q0 + qform_root1 .req q1 + qform_root2 .req q2 + qform_root3 .req q3 + + tmp .req v24 + t0 .req v25 + t1 .req v26 + t2 .req v27 + t3 .req v28 + tp0 .req v17 + tp1 .req v18 + tp2 .req v19 + tp3 .req v20 + + consts .req v8 + qform_consts .req q8 + + ASM_LOAD(r_ptr0, roots_l345) + ASM_LOAD(r_ptr1, roots_l67) + + ASM_LOAD(xtmp, const_addr) + ld1r {consts.4s}, [xtmp] + save STACK0, in + + restore inp, STACK0 + mov inp, in + add inpp, inp, #64 + mov count, #8 + + root0_tw .req v4 + root1_tw .req v5 + root2_tw .req v6 + root3_tw .req v7 + qform_root0_tw .req q4 + qform_root1_tw .req q5 + qform_root2_tw .req q6 + qform_root3_tw .req q7 + + .p2align 2 + ldr q22, [x4, #48] // ........................* + // gap // ......................... + // gap // ......................... + // gap // ......................... + ldr q23, [x1, #32] // .....*................... + // gap // ......................... + // gap // ......................... + // gap // ......................... + ldr q16, [x1, #0] // ...*..................... + // gap // ......................... + // gap // ......................... + // gap // ......................... + ldr q2, [x1, #16] // ....*.................... + // gap // ......................... + // gap // ......................... + // gap // ......................... + ldr q25, [x5, #32] // ..*...................... + // gap // ......................... + // gap // ......................... + // gap // ......................... + ldr q11, [x5, #48] // ............*............ + // gap // ......................... + // gap // ......................... + // gap // ......................... + ldr q10, [x5, #80] // ..............*.......... + // gap // ......................... + // gap // ......................... + // gap // ......................... 
+ ldr q19, [x5, #96] // ...............*......... + // gap // ......................... + // gap // ......................... + // gap // ......................... + ldr q6, [x5, #128] // .................*....... + // gap // ......................... + // gap // ......................... + // gap // ......................... + ldr q18, [x5, #144] // ..................*...... + // gap // ......................... + // gap // ......................... + // gap // ......................... + ldr q27, [x4], #64 // .....................*... + // gap // ......................... + // gap // ......................... + // gap // ......................... + ldr q4, [x2, #48] // ...........*............. + // gap // ......................... + // gap // ......................... + // gap // ......................... + ldr q3, [x5, #64] // .............*........... + // gap // ......................... + // gap // ......................... + // gap // ......................... + ldr q21, [x1, #48] // ......*.................. + // gap // ......................... + // gap // ......................... + // gap // ......................... + ldr q1, [x5, #112] // ................*........ + // gap // ......................... + // gap // ......................... + // gap // ......................... + ldr q29, [x5, #160] // ...................*..... + // gap // ......................... + // gap // ......................... + // gap // ......................... + ldr q28, [x4, #-48] // ......................*.. + // gap // ......................... + // gap // ......................... + // gap // ......................... + ldr q5, [x4, #-32] // .......................*. + // gap // ......................... + // gap // ......................... + // gap // ......................... + ldr q26, [x2, #0] // .......*................. + // gap // ......................... + // gap // ......................... + // gap // ......................... 
+ ldr q17, [x2, #16] // .........*............... + // gap // ......................... + // gap // ......................... + // gap // ......................... + ldr q30, [x2, #32] // ..........*.............. + // gap // ......................... + // gap // ......................... + // gap // ......................... + trn1 v20.4S, v23.4S, v21.4S // ........*................ + // gap // ......................... + ldr q24, [x5, #176] // ....................*.... + // gap // ......................... + // gap // ......................... + // gap // ......................... + ldr q7, [x5, #16] // .*....................... + // gap // ......................... + // gap // ......................... + // gap // ......................... + ldr q13, [x5], #(12*16) // *........................ + // gap // ......................... + + // original source code + // ldr q13, [x5], #(12*16) // ........................* + // ldr q7, [x5, #-176] // .......................*. + // ldr q25, [x5, #-160] // ....*.................... + // ldr q16, [x1, #0] // ..*...................... + // ldr q2, [x1, #16] // ...*..................... + // ldr q23, [x1, #32] // .*....................... + // ldr q21, [x1, #48] // .............*........... + // ldr q26, [x2, #0] // ..................*...... + // trn1 v20.4S, v23.4S, v21.4S // .....................*... + // ldr q17, [x2, #16] // ...................*..... + // ldr q30, [x2, #32] // ....................*.... + // ldr q4, [x2, #48] // ...........*............. + // ldr q11, [x5, #-144] // .....*................... + // ldr q3, [x5, #-128] // ............*............ + // ldr q10, [x5, #-112] // ......*.................. + // ldr q19, [x5, #-96] // .......*................. + // ldr q1, [x5, #-80] // ..............*.......... + // ldr q6, [x5, #-64] // ........*................ + // ldr q18, [x5, #-48] // .........*............... + // ldr q29, [x5, #-32] // ...............*......... 
+ // ldr q24, [x5, #-16] // ......................*.. + // ldr q27, [x4], #64 // ..........*.............. + // ldr q28, [x4, #-48] // ................*........ + // ldr q5, [x4, #-32] // .................*....... + // ldr q22, [x4, #-16] // *........................ + + sub count, count, #1 +layer45678_start: + trn1 v0.4S, v16.4S, v2.4S // ....*................................................................................................................................................................. + // gap // ...................................................................................................................................................................... + trn2 v2.4S, v16.4S, v2.4S // .....*................................................................................................................................................................ + // gap // ...................................................................................................................................................................... + trn2 v16.4S, v23.4S, v21.4S // .......*.............................................................................................................................................................. + // gap // ...................................................................................................................................................................... + trn2 v23.2D, v0.2D, v20.2D // ........*............................................................................................................................................................. + // gap // ...................................................................................................................................................................... 
+ trn1 v0.2D, v0.2D, v20.2D // ..........*........................................................................................................................................................... + // gap // ...................................................................................................................................................................... + trn2 v21.2D, v2.2D, v16.2D // .........*............................................................................................................................................................ + // gap // ...................................................................................................................................................................... + trn1 v2.2D, v2.2D, v16.2D // ...........*.......................................................................................................................................................... + // gap // ...................................................................................................................................................................... + sub v16.4S, v23.4S, v21.4S // ...................................*.................................................................................................................................. + // gap // ...................................................................................................................................................................... + add v23.4S, v23.4S, v21.4S // ....................................*................................................................................................................................. + // gap // ...................................................................................................................................................................... 
+ sub v21.4S, v0.4S, v2.4S // ..............................*....................................................................................................................................... + // gap // ...................................................................................................................................................................... + add v0.4S, v0.4S, v2.4S // ...............................*...................................................................................................................................... + // gap // ...................................................................................................................................................................... + trn1 v2.4S, v26.4S, v17.4S // ................*..................................................................................................................................................... + // gap // ...................................................................................................................................................................... + trn2 v26.4S, v26.4S, v17.4S // .................*.................................................................................................................................................... + // gap // ...................................................................................................................................................................... + trn1 v20.4S, v30.4S, v4.4S // ..................*................................................................................................................................................... + // gap // ...................................................................................................................................................................... 
+ trn2 v17.4S, v30.4S, v4.4S // ...................*.................................................................................................................................................. + // gap // ...................................................................................................................................................................... + mul v30.4S, v16.4S, v3.4S // .....................................*................................................................................................................................ + // gap // ...................................................................................................................................................................... + sqrdmulh v16.4S, v16.4S, v10.4S // ......................................*............................................................................................................................... + // gap // ...................................................................................................................................................................... + sub v4.4S, v0.4S, v23.4S // ........................................*............................................................................................................................. + // gap // ...................................................................................................................................................................... + add v0.4S, v0.4S, v23.4S // .........................................*............................................................................................................................ + // gap // ...................................................................................................................................................................... 
+ mul v23.4S, v21.4S, v25.4S // ................................*..................................................................................................................................... + // gap // ...................................................................................................................................................................... + sqrdmulh v21.4S, v21.4S, v11.4S // .................................*.................................................................................................................................... + // gap // ...................................................................................................................................................................... + trn2 v25.2D, v2.2D, v20.2D // ....................*................................................................................................................................................. + // gap // ...................................................................................................................................................................... + trn2 v11.2D, v26.2D, v17.2D // .....................*................................................................................................................................................ + // gap // ...................................................................................................................................................................... + trn1 v2.2D, v2.2D, v20.2D // ......................*............................................................................................................................................... + // gap // ...................................................................................................................................................................... 
+ trn1 v26.2D, v26.2D, v17.2D // .......................*.............................................................................................................................................. + // gap // ...................................................................................................................................................................... + mls v23.4S, v21.4S, v8.S[0] // ..................................*................................................................................................................................... + // gap // ...................................................................................................................................................................... + mls v30.4S, v16.4S, v8.S[0] // .......................................*.............................................................................................................................. + // gap // ...................................................................................................................................................................... + mul v16.4S, v4.4S, v13.4S // ..........................................*........................................................................................................................... + // gap // ...................................................................................................................................................................... + sqrdmulh v21.4S, v4.4S, v7.4S // ...........................................*.......................................................................................................................... + // gap // ...................................................................................................................................................................... 
+ sub v20.4S, v2.4S, v26.4S // ........................................................*............................................................................................................. + // gap // ...................................................................................................................................................................... + sub v17.4S, v23.4S, v30.4S // .............................................*........................................................................................................................ + // gap // ...................................................................................................................................................................... + add v23.4S, v23.4S, v30.4S // ..............................................*....................................................................................................................... + // gap // ...................................................................................................................................................................... + mls v16.4S, v21.4S, v8.S[0] // ............................................*......................................................................................................................... + // gap // ...................................................................................................................................................................... + mul v21.4S, v17.4S, v13.4S // ...............................................*...................................................................................................................... + // gap // ...................................................................................................................................................................... 
+ sqrdmulh v17.4S, v17.4S, v7.4S // ................................................*..................................................................................................................... + // gap // ...................................................................................................................................................................... + add v2.4S, v2.4S, v26.4S // .........................................................*............................................................................................................ + // gap // ...................................................................................................................................................................... + mul v26.4S, v20.4S, v6.4S // ..........................................................*........................................................................................................... + // gap // ...................................................................................................................................................................... + trn1 v30.4S, v0.4S, v23.4S // ............................................................................*......................................................................................... + // gap // ...................................................................................................................................................................... + trn2 v0.4S, v0.4S, v23.4S // .............................................................................*........................................................................................ + // gap // ...................................................................................................................................................................... 
+ mls v21.4S, v17.4S, v8.S[0] // .................................................*.................................................................................................................... + // gap // ...................................................................................................................................................................... + sqrdmulh v23.4S, v20.4S, v18.4S // ...........................................................*.......................................................................................................... + // gap // ...................................................................................................................................................................... + sub v20.4S, v25.4S, v11.4S // .............................................................*........................................................................................................ + // gap // ...................................................................................................................................................................... + add v17.4S, v25.4S, v11.4S // ..............................................................*....................................................................................................... + // gap // ...................................................................................................................................................................... + trn1 v4.4S, v16.4S, v21.4S // ..............................................................................*....................................................................................... + // gap // ...................................................................................................................................................................... 
+ mls v26.4S, v23.4S, v8.S[0] // ............................................................*......................................................................................................... + // gap // ...................................................................................................................................................................... + mul v23.4S, v20.4S, v29.4S // ...............................................................*...................................................................................................... + // gap // ...................................................................................................................................................................... + sqrdmulh v20.4S, v20.4S, v24.4S // ................................................................*..................................................................................................... + // gap // ...................................................................................................................................................................... + sub v25.4S, v2.4S, v17.4S // ..................................................................*................................................................................................... + // gap // ...................................................................................................................................................................... + add v2.4S, v2.4S, v17.4S // ...................................................................*.................................................................................................. + // gap // ...................................................................................................................................................................... 
+ trn2 v16.4S, v16.4S, v21.4S // ...............................................................................*...................................................................................... + // gap // ...................................................................................................................................................................... + mls v23.4S, v20.4S, v8.S[0] // .................................................................*.................................................................................................... + // gap // ...................................................................................................................................................................... + mul v21.4S, v25.4S, v19.4S // ....................................................................*................................................................................................. + // gap // ...................................................................................................................................................................... + sqrdmulh v20.4S, v25.4S, v1.4S // .....................................................................*................................................................................................ + // gap // ...................................................................................................................................................................... + trn2 v17.2D, v30.2D, v4.2D // ................................................................................*..................................................................................... + // gap // ...................................................................................................................................................................... 
+ sub v25.4S, v26.4S, v23.4S // .......................................................................*.............................................................................................. + // gap // ...................................................................................................................................................................... + add v23.4S, v26.4S, v23.4S // ........................................................................*............................................................................................. + // gap // ...................................................................................................................................................................... + mls v21.4S, v20.4S, v8.S[0] // ......................................................................*............................................................................................... + // gap // ...................................................................................................................................................................... + mul v26.4S, v25.4S, v19.4S // .........................................................................*............................................................................................ + // gap // ...................................................................................................................................................................... + sqrdmulh v20.4S, v25.4S, v1.4S // ..........................................................................*........................................................................................... + // gap // ...................................................................................................................................................................... 
+ trn2 v25.2D, v0.2D, v16.2D // .................................................................................*.................................................................................... + // gap // ...................................................................................................................................................................... + trn1 v30.2D, v30.2D, v4.2D // ..................................................................................*................................................................................... + // gap // ...................................................................................................................................................................... + trn1 v0.2D, v0.2D, v16.2D // ...................................................................................*.................................................................................. + // gap // ...................................................................................................................................................................... + mls v26.4S, v20.4S, v8.S[0] // ...........................................................................*.......................................................................................... + // gap // ...................................................................................................................................................................... + trn1 v16.4S, v2.4S, v23.4S // ....................................................................................*................................................................................. + // gap // ...................................................................................................................................................................... 
+ trn2 v2.4S, v2.4S, v23.4S // .....................................................................................*................................................................................ + // gap // ...................................................................................................................................................................... + sub v23.4S, v30.4S, v0.4S // ................................................................................................*..................................................................... + // gap // ...................................................................................................................................................................... + trn1 v20.4S, v21.4S, v26.4S // ......................................................................................*............................................................................... + // gap // ...................................................................................................................................................................... + trn2 v21.4S, v21.4S, v26.4S // .......................................................................................*.............................................................................. + // gap // ...................................................................................................................................................................... + add v0.4S, v30.4S, v0.4S // .................................................................................................*.................................................................... + // gap // ...................................................................................................................................................................... 
+ trn2 v26.2D, v16.2D, v20.2D // ........................................................................................*............................................................................. + // gap // ...................................................................................................................................................................... + trn2 v30.2D, v2.2D, v21.2D // .........................................................................................*............................................................................ + // gap // ...................................................................................................................................................................... + trn1 v16.2D, v16.2D, v20.2D // ..........................................................................................*........................................................................... + // gap // ...................................................................................................................................................................... + trn1 v2.2D, v2.2D, v21.2D // ...........................................................................................*.......................................................................... + // gap // ...................................................................................................................................................................... + mul v21.4S, v23.4S, v28.S[2] // ..................................................................................................*................................................................... + // gap // ...................................................................................................................................................................... 
+ sqrdmulh v23.4S, v23.4S, v28.S[3] // ...................................................................................................*.................................................................. + // gap // ...................................................................................................................................................................... + sub v20.4S, v17.4S, v25.4S // .....................................................................................................*................................................................ + // gap // ...................................................................................................................................................................... + add v17.4S, v17.4S, v25.4S // ......................................................................................................*............................................................... + // gap // ...................................................................................................................................................................... + sub v4.4S, v16.4S, v2.4S // ..........................................................................................................*........................................................... + // gap // ...................................................................................................................................................................... + mls v21.4S, v23.4S, v8.S[0] // ....................................................................................................*................................................................. + // gap // ...................................................................................................................................................................... 
+ mul v23.4S, v20.4S, v5.S[0] // .......................................................................................................*.............................................................. + // gap // ...................................................................................................................................................................... + sqrdmulh v20.4S, v20.4S, v5.S[1] // ........................................................................................................*............................................................. + // gap // ...................................................................................................................................................................... + add v2.4S, v16.4S, v2.4S // ...........................................................................................................*.......................................................... + // gap // ...................................................................................................................................................................... + mul v16.4S, v4.4S, v5.S[2] // ............................................................................................................*......................................................... + // gap // ...................................................................................................................................................................... + sqrdmulh v4.4S, v4.4S, v5.S[3] // .............................................................................................................*........................................................ + // gap // ...................................................................................................................................................................... 
+ mls v23.4S, v20.4S, v8.S[0] // .........................................................................................................*............................................................ + // gap // ...................................................................................................................................................................... + sub v20.4S, v26.4S, v30.4S // ...............................................................................................................*...................................................... + // gap // ...................................................................................................................................................................... + add v26.4S, v26.4S, v30.4S // ................................................................................................................*..................................................... + // gap // ...................................................................................................................................................................... + mls v16.4S, v4.4S, v8.S[0] // ..............................................................................................................*....................................................... + // gap // ...................................................................................................................................................................... + mul v30.4S, v20.4S, v22.S[0] // .................................................................................................................*.................................................... + // gap // ...................................................................................................................................................................... 
+ sqrdmulh v20.4S, v20.4S, v22.S[1] // ..................................................................................................................*................................................... + // gap // ...................................................................................................................................................................... + sub v4.4S, v0.4S, v17.4S // ....................................................................................................................*................................................. + // gap // ...................................................................................................................................................................... + add v0.4S, v0.4S, v17.4S // .....................................................................................................................*................................................ + // gap // ...................................................................................................................................................................... + sub v17.4S, v21.4S, v23.4S // .........................................................................................................................*............................................ + // gap // ...................................................................................................................................................................... + mls v30.4S, v20.4S, v8.S[0] // ...................................................................................................................*.................................................. + // gap // ...................................................................................................................................................................... 
+ mul v20.4S, v4.4S, v27.S[2] // ......................................................................................................................*............................................... + // gap // ...................................................................................................................................................................... + sqrdmulh v4.4S, v4.4S, v27.S[3] // .......................................................................................................................*.............................................. + // gap // ...................................................................................................................................................................... + add v23.4S, v21.4S, v23.4S // ..........................................................................................................................*........................................... + // gap // ...................................................................................................................................................................... + mul v21.4S, v17.4S, v27.S[2] // ...........................................................................................................................*.......................................... + // gap // ...................................................................................................................................................................... + sqrdmulh v17.4S, v17.4S, v27.S[3] // ............................................................................................................................*......................................... + // gap // ...................................................................................................................................................................... 
+ mls v20.4S, v4.4S, v8.S[0] // ........................................................................................................................*............................................. + // gap // ...................................................................................................................................................................... + sub v4.4S, v2.4S, v26.4S // ..............................................................................................................................*....................................... + // gap // ...................................................................................................................................................................... + add v2.4S, v2.4S, v26.4S // ...............................................................................................................................*...................................... + // gap // ...................................................................................................................................................................... + mls v21.4S, v17.4S, v8.S[0] // .............................................................................................................................*........................................ + // gap // ...................................................................................................................................................................... + mul v26.4S, v4.4S, v28.S[0] // ................................................................................................................................*..................................... + // gap // ...................................................................................................................................................................... 
+ sqrdmulh v17.4S, v4.4S, v28.S[1] // .................................................................................................................................*.................................... + // gap // ...................................................................................................................................................................... + sub v4.4S, v16.4S, v30.4S // ...................................................................................................................................*.................................. + // gap // ...................................................................................................................................................................... + add v16.4S, v16.4S, v30.4S // ....................................................................................................................................*................................. + // gap // ...................................................................................................................................................................... + sub v30.4S, v0.4S, v2.4S // ........................................................................................................................................*............................. + // gap // ...................................................................................................................................................................... + mls v26.4S, v17.4S, v8.S[0] // ..................................................................................................................................*................................... + // gap // ...................................................................................................................................................................... 
+ mul v17.4S, v4.4S, v28.S[0] // .....................................................................................................................................*................................ + // gap // ...................................................................................................................................................................... + sqrdmulh v4.4S, v4.4S, v28.S[1] // ......................................................................................................................................*............................... + // gap // ...................................................................................................................................................................... + add v0.4S, v0.4S, v2.4S // .........................................................................................................................................*............................ + // gap // ...................................................................................................................................................................... + mul v2.4S, v30.4S, v27.S[0] // ..........................................................................................................................................*........................... + // gap // ...................................................................................................................................................................... + sqrdmulh v30.4S, v30.4S, v27.S[1] // ...........................................................................................................................................*.......................... + // gap // ...................................................................................................................................................................... 
+ mls v17.4S, v4.4S, v8.S[0] // .......................................................................................................................................*.............................. + // gap // ...................................................................................................................................................................... + sub v4.4S, v23.4S, v16.4S // .............................................................................................................................................*........................ + // gap // ...................................................................................................................................................................... + add v16.4S, v23.4S, v16.4S // ..............................................................................................................................................*....................... + // gap // ...................................................................................................................................................................... + mls v2.4S, v30.4S, v8.S[0] // ............................................................................................................................................*......................... + // gap // ...................................................................................................................................................................... + mul v23.4S, v4.4S, v27.S[0] // ...............................................................................................................................................*...................... + // gap // ...................................................................................................................................................................... 
+ sqrdmulh v30.4S, v4.4S, v27.S[1] // ................................................................................................................................................*..................... + // gap // ...................................................................................................................................................................... + sub v4.4S, v20.4S, v26.4S // ..................................................................................................................................................*................... + // gap // ...................................................................................................................................................................... + add v26.4S, v20.4S, v26.4S // ...................................................................................................................................................*.................. + // gap // ...................................................................................................................................................................... + sub v20.4S, v21.4S, v17.4S // .......................................................................................................................................................*.............. + // gap // ...................................................................................................................................................................... + mls v23.4S, v30.4S, v8.S[0] // .................................................................................................................................................*.................... + // gap // ...................................................................................................................................................................... 
+ mul v30.4S, v4.4S, v27.S[0] // ....................................................................................................................................................*................. + // gap // ...................................................................................................................................................................... + sqrdmulh v4.4S, v4.4S, v27.S[1] // .....................................................................................................................................................*................ + // gap // ...................................................................................................................................................................... + add v21.4S, v21.4S, v17.4S // ........................................................................................................................................................*............. + // gap // ...................................................................................................................................................................... + mul v17.4S, v20.4S, v27.S[0] // .........................................................................................................................................................*............ + // gap // ...................................................................................................................................................................... + sqrdmulh v20.4S, v20.4S, v27.S[1] // ..........................................................................................................................................................*........... + // gap // ...................................................................................................................................................................... 
+ mls v30.4S, v4.4S, v8.S[0] // ......................................................................................................................................................*............... + // gap // ...................................................................................................................................................................... + str q0, [x1], #(16*4) // ............................................................................................................................................................*......... + // gap // ...................................................................................................................................................................... + ldr q13, [x5], #(12*16) // ........................e............................................................................................................................................. + // gap // ...................................................................................................................................................................... + // gap // ...................................................................................................................................................................... + // gap // ...................................................................................................................................................................... + mls v17.4S, v20.4S, v8.S[0] // ...........................................................................................................................................................*.......... + // gap // ...................................................................................................................................................................... 
+ str q16, [x1, #-48] // .............................................................................................................................................................*........ + // gap // ...................................................................................................................................................................... + ldr q7, [x5, #-176] // .........................e............................................................................................................................................ + // gap // ...................................................................................................................................................................... + // gap // ...................................................................................................................................................................... + // gap // ...................................................................................................................................................................... + str q26, [x1, #-32] // ..............................................................................................................................................................*....... + // gap // ...................................................................................................................................................................... + ldr q25, [x5, #-160] // ..........................e........................................................................................................................................... + // gap // ...................................................................................................................................................................... 
+ // gap // ...................................................................................................................................................................... + // gap // ...................................................................................................................................................................... + str q21, [x1, #-16] // ...............................................................................................................................................................*...... + add x1, x1, #64 // ....................................................................................................................................................................*. + ldr q16, [x1, #0] // e..................................................................................................................................................................... + // gap // ...................................................................................................................................................................... + // gap // ...................................................................................................................................................................... + // gap // ...................................................................................................................................................................... + str q2, [x2], #(16*4) // ................................................................................................................................................................*..... + // gap // ...................................................................................................................................................................... 
+ ldr q2, [x1, #16] // .e.................................................................................................................................................................... + // gap // ...................................................................................................................................................................... + // gap // ...................................................................................................................................................................... + // gap // ...................................................................................................................................................................... + str q23, [x2, #-48] // .................................................................................................................................................................*.... + // gap // ...................................................................................................................................................................... + ldr q23, [x1, #32] // ..e................................................................................................................................................................... + // gap // ...................................................................................................................................................................... + // gap // ...................................................................................................................................................................... + // gap // ...................................................................................................................................................................... 
+ str q30, [x2, #-32] // ..................................................................................................................................................................*... + // gap // ...................................................................................................................................................................... + ldr q21, [x1, #48] // ...e.................................................................................................................................................................. + // gap // ...................................................................................................................................................................... + // gap // ...................................................................................................................................................................... + // gap // ...................................................................................................................................................................... + str q17, [x2, #-16] // ...................................................................................................................................................................*.. + add x2, x2, #64 // .....................................................................................................................................................................* + ldr q26, [x2, #0] // ............e......................................................................................................................................................... + // gap // ...................................................................................................................................................................... 
+ // gap // ...................................................................................................................................................................... + // gap // ...................................................................................................................................................................... + trn1 v20.4S, v23.4S, v21.4S // ......e............................................................................................................................................................... + // gap // ...................................................................................................................................................................... + ldr q17, [x2, #16] // .............e........................................................................................................................................................ + // gap // ...................................................................................................................................................................... + // gap // ...................................................................................................................................................................... + // gap // ...................................................................................................................................................................... + ldr q30, [x2, #32] // ..............e....................................................................................................................................................... + // gap // ...................................................................................................................................................................... 
+ // gap // ...................................................................................................................................................................... + // gap // ...................................................................................................................................................................... + ldr q4, [x2, #48] // ...............e...................................................................................................................................................... + // gap // ...................................................................................................................................................................... + // gap // ...................................................................................................................................................................... + // gap // ...................................................................................................................................................................... + ldr q11, [x5, #-144] // ...........................e.......................................................................................................................................... + // gap // ...................................................................................................................................................................... + // gap // ...................................................................................................................................................................... + // gap // ...................................................................................................................................................................... 
+ ldr q3, [x5, #-128] // ............................e......................................................................................................................................... + // gap // ...................................................................................................................................................................... + // gap // ...................................................................................................................................................................... + // gap // ...................................................................................................................................................................... + ldr q10, [x5, #-112] // .............................e........................................................................................................................................ + // gap // ...................................................................................................................................................................... + // gap // ...................................................................................................................................................................... + // gap // ...................................................................................................................................................................... + ldr q19, [x5, #-96] // ..................................................e................................................................................................................... + // gap // ...................................................................................................................................................................... 
+ // gap // ...................................................................................................................................................................... + // gap // ...................................................................................................................................................................... + ldr q1, [x5, #-80] // ...................................................e.................................................................................................................. + // gap // ...................................................................................................................................................................... + // gap // ...................................................................................................................................................................... + // gap // ...................................................................................................................................................................... + ldr q6, [x5, #-64] // ....................................................e................................................................................................................. + // gap // ...................................................................................................................................................................... + // gap // ...................................................................................................................................................................... + // gap // ...................................................................................................................................................................... 
+ ldr q18, [x5, #-48] // .....................................................e................................................................................................................ + // gap // ...................................................................................................................................................................... + // gap // ...................................................................................................................................................................... + // gap // ...................................................................................................................................................................... + ldr q29, [x5, #-32] // ......................................................e............................................................................................................... + // gap // ...................................................................................................................................................................... + // gap // ...................................................................................................................................................................... + // gap // ...................................................................................................................................................................... + ldr q24, [x5, #-16] // .......................................................e.............................................................................................................. + // gap // ...................................................................................................................................................................... 
+ // gap // ...................................................................................................................................................................... + // gap // ...................................................................................................................................................................... + ldr q27, [x4], #64 // ............................................................................................e......................................................................... + // gap // ...................................................................................................................................................................... + // gap // ...................................................................................................................................................................... + // gap // ...................................................................................................................................................................... + ldr q28, [x4, #-48] // .............................................................................................e........................................................................ + // gap // ...................................................................................................................................................................... + // gap // ...................................................................................................................................................................... + // gap // ...................................................................................................................................................................... 
+ ldr q5, [x4, #-32] // ..............................................................................................e....................................................................... + // gap // ...................................................................................................................................................................... + // gap // ...................................................................................................................................................................... + // gap // ...................................................................................................................................................................... + ldr q22, [x4, #-16] // ...............................................................................................e...................................................................... + // gap // ...................................................................................................................................................................... + // gap // ...................................................................................................................................................................... + // gap // ...................................................................................................................................................................... + + // original source code + // ldr q9, [x1, #0] // ........e..........................|..........................................................................................................................................e........ + // ldr q10, [x1, #16] // ..........e........................|............................................................................................................................................e...... 
+ // ldr q11, [x1, #32] // ............e......................|..............................................................................................................................................e.... + // ldr q12, [x1, #48] // ..............e....................|................................................................................................................................................e.. + // trn1 v25.4s, v9.4s, v10.4s // ...................................*................................................................................................................................................... + // trn2 v26.4s, v9.4s, v10.4s // ...................................|*.................................................................................................................................................. + // trn1 v27.4s, v11.4s, v12.4s // ..................e................|................................................................................................................................................... + // trn2 v28.4s, v11.4s, v12.4s // ...................................|.*................................................................................................................................................. + // trn2 v11.2d, v25.2d, v27.2d // ...................................|..*................................................................................................................................................ + // trn2 v12.2d, v26.2d, v28.2d // ...................................|....*.............................................................................................................................................. + // trn1 v9.2d, v25.2d, v27.2d // ...................................|...*............................................................................................................................................... 
+ // trn1 v10.2d, v26.2d, v28.2d // ...................................|.....*............................................................................................................................................. + // ldr q13, [x2, #0] // .................e.................|................................................................................................................................................... + // ldr q14, [x2, #16] // ...................e...............|................................................................................................................................................... + // ldr q15, [x2, #32] // ....................e..............|................................................................................................................................................... + // ldr q16, [x2, #48] // .....................e.............|................................................................................................................................................... + // trn1 v25.4s, v13.4s, v14.4s // ...................................|..........*........................................................................................................................................ + // trn2 v26.4s, v13.4s, v14.4s // ...................................|...........*....................................................................................................................................... + // trn1 v27.4s, v15.4s, v16.4s // ...................................|............*...................................................................................................................................... + // trn2 v28.4s, v15.4s, v16.4s // ...................................|.............*..................................................................................................................................... 
+ // trn2 v15.2d, v25.2d, v27.2d // ...................................|....................*.............................................................................................................................. + // trn2 v16.2d, v26.2d, v28.2d // ...................................|.....................*............................................................................................................................. + // trn1 v13.2d, v25.2d, v27.2d // ...................................|......................*............................................................................................................................ + // trn1 v14.2d, v26.2d, v28.2d // ...................................|.......................*........................................................................................................................... + // ldr q0, [x5], #(12*16) // e..................................|..................................................................................................................................e................ + // ldr q4, [x5, #(-12*16 + 1*16)] // ...e...............................|.....................................................................................................................................e............. + // ldr q1, [x5, #(-12*16 + 2*16)] // .....e.............................|.......................................................................................................................................e........... + // ldr q5, [x5, #(-12*16 + 3*16)] // ......................e............|................................................................................................................................................... + // ldr q2, [x5, #(-12*16 + 4*16)] // .......................e...........|................................................................................................................................................... 
+ // ldr q6, [x5, #(-12*16 + 5*16)] // ........................e..........|................................................................................................................................................... + // sub v24.4s, v9.4s, v10.4s // ...................................|........*.......................................................................................................................................... + // add v9.4s, v9.4s, v10.4s // ...................................|.........*......................................................................................................................................... + // mul v10.4s, v24.4s, v1.4s // ...................................|..................*................................................................................................................................ + // sqrdmulh v24.4s, v24.4s, v5.4s // ...................................|...................*............................................................................................................................... + // mls v10.4s, v24.4s, v8.s[0] // ...................................|........................*.......................................................................................................................... + // sub v24.4s, v11.4s, v12.4s // ...................................|......*............................................................................................................................................ + // add v11.4s, v11.4s, v12.4s // ...................................|.......*........................................................................................................................................... + // mul v12.4s, v24.4s, v2.4s // ...................................|..............*.................................................................................................................................... 
+ // sqrdmulh v24.4s, v24.4s, v6.4s // ...................................|...............*................................................................................................................................... + // mls v12.4s, v24.4s, v8.s[0] // ...................................|.........................*......................................................................................................................... + // sub v24.4s, v9.4s, v11.4s // ...................................|................*.................................................................................................................................. + // add v9.4s, v9.4s, v11.4s // ...................................|.................*................................................................................................................................. + // mul v11.4s, v24.4s, v0.4s // ...................................|..........................*........................................................................................................................ + // sqrdmulh v24.4s, v24.4s, v4.4s // ...................................|...........................*....................................................................................................................... + // mls v11.4s, v24.4s, v8.s[0] // ...................................|...............................*................................................................................................................... + // sub v24.4s, v10.4s, v12.4s // ...................................|.............................*..................................................................................................................... + // add v10.4s, v10.4s, v12.4s // ...................................|..............................*.................................................................................................................... 
+ // mul v12.4s, v24.4s, v0.4s // ...................................|................................*.................................................................................................................. + // sqrdmulh v24.4s, v24.4s, v4.4s // ...................................|.................................*................................................................................................................. + // mls v12.4s, v24.4s, v8.s[0] // ...................................|......................................*............................................................................................................ + // ldr q0, [x5, #(-12*16 + 6*16)] // .........................e.........|................................................................................................................................................... + // ldr q4, [x5, #(-12*16 + 7*16)] // ..........................e........|................................................................................................................................................... + // ldr q1, [x5, #(-12*16 + 8*16)] // ...........................e.......|................................................................................................................................................... + // ldr q5, [x5, #(-12*16 + 9*16)] // ............................e......|................................................................................................................................................... + // ldr q2, [x5, #(-12*16 + 10*16)] // .............................e.....|................................................................................................................................................... 
+ // ldr q6, [x5, #(-12*16 + 11*16)] // ..............................e....|................................................................................................................................................... + // sub v24.4s, v13.4s, v14.4s // ...................................|............................*...................................................................................................................... + // add v13.4s, v13.4s, v14.4s // ...................................|..................................*................................................................................................................ + // mul v14.4s, v24.4s, v1.4s // ...................................|...................................*............................................................................................................... + // sqrdmulh v24.4s, v24.4s, v5.4s // ...................................|.......................................*........................................................................................................... + // mls v14.4s, v24.4s, v8.s[0] // ...................................|...........................................*....................................................................................................... + // sub v24.4s, v15.4s, v16.4s // ...................................|........................................*.......................................................................................................... + // add v15.4s, v15.4s, v16.4s // ...................................|.........................................*......................................................................................................... + // mul v16.4s, v24.4s, v2.4s // ...................................|............................................*...................................................................................................... 
+ // sqrdmulh v24.4s, v24.4s, v6.4s // ...................................|.............................................*..................................................................................................... + // mls v16.4s, v24.4s, v8.s[0] // ...................................|.................................................*................................................................................................. + // sub v24.4s, v13.4s, v15.4s // ...................................|..............................................*.................................................................................................... + // add v13.4s, v13.4s, v15.4s // ...................................|...............................................*................................................................................................... + // mul v15.4s, v24.4s, v0.4s // ...................................|..................................................*................................................................................................ + // sqrdmulh v24.4s, v24.4s, v4.4s // ...................................|...................................................*............................................................................................... + // mls v15.4s, v24.4s, v8.s[0] // ...................................|.......................................................*........................................................................................... + // sub v24.4s, v14.4s, v16.4s // ...................................|.....................................................*............................................................................................. + // add v14.4s, v14.4s, v16.4s // ...................................|......................................................*............................................................................................ 
+ // mul v16.4s, v24.4s, v0.4s // ...................................|........................................................*.......................................................................................... + // sqrdmulh v24.4s, v24.4s, v4.4s // ...................................|.........................................................*......................................................................................... + // mls v16.4s, v24.4s, v8.s[0] // ...................................|.............................................................*..................................................................................... + // trn1 v25.4s, v9.4s, v10.4s // ...................................|....................................*.............................................................................................................. + // trn2 v26.4s, v9.4s, v10.4s // ...................................|.....................................*............................................................................................................. + // trn1 v27.4s, v11.4s, v12.4s // ...................................|..........................................*........................................................................................................ + // trn2 v28.4s, v11.4s, v12.4s // ...................................|................................................*.................................................................................................. + // trn2 v11.2d, v25.2d, v27.2d // ...................................|....................................................*.............................................................................................. + // trn2 v12.2d, v26.2d, v28.2d // ...................................|..........................................................*........................................................................................ 
+ // trn1 v9.2d, v25.2d, v27.2d // ...................................|...........................................................*....................................................................................... + // trn1 v10.2d, v26.2d, v28.2d // ...................................|............................................................*...................................................................................... + // trn1 v25.4s, v13.4s, v14.4s // ...................................|..............................................................*.................................................................................... + // trn2 v26.4s, v13.4s, v14.4s // ...................................|...............................................................*................................................................................... + // trn1 v27.4s, v15.4s, v16.4s // ...................................|.................................................................*................................................................................. + // trn2 v28.4s, v15.4s, v16.4s // ...................................|..................................................................*................................................................................ + // trn2 v15.2d, v25.2d, v27.2d // ...................................|....................................................................*.............................................................................. + // trn2 v16.2d, v26.2d, v28.2d // ...................................|.....................................................................*............................................................................. + // trn1 v13.2d, v25.2d, v27.2d // ...................................|......................................................................*............................................................................ 
+ // trn1 v14.2d, v26.2d, v28.2d // ...................................|.......................................................................*........................................................................... + // ldr q0, [x4], #64 // ...............................e...|................................................................................................................................................... + // ldr q1, [x4, #(-64 + 16)] // ................................e..|................................................................................................................................................... + // ldr q2, [x4, #(-64 + 32)] // .................................e.|................................................................................................................................................... + // ldr q3, [x4, #(-64 + 48)] // ..................................e|................................................................................................................................................... + // sub v24.4s, v9.4s, v10.4s // ...................................|................................................................*.................................................................................. + // add v9.4s, v9.4s, v10.4s // ...................................|...................................................................*............................................................................... + // mul v10.4s, v24.4s, v1.s[2] // ...................................|........................................................................*.......................................................................... + // sqrdmulh v24.4s, v24.4s, v1.s[3] // ...................................|.........................................................................*......................................................................... 
+ // mls v10.4s, v24.4s, v8.s[0] // ...................................|.............................................................................*..................................................................... + // sub v24.4s, v11.4s, v12.4s // ...................................|..........................................................................*........................................................................ + // add v11.4s, v11.4s, v12.4s // ...................................|...........................................................................*....................................................................... + // mul v12.4s, v24.4s, v2.s[0] // ...................................|..............................................................................*.................................................................... + // sqrdmulh v24.4s, v24.4s, v2.s[1] // ...................................|...............................................................................*................................................................... + // mls v12.4s, v24.4s, v8.s[0] // ...................................|...................................................................................*............................................................... + // sub v24.4s, v13.4s, v14.4s // ...................................|............................................................................*...................................................................... + // add v13.4s, v13.4s, v14.4s // ...................................|................................................................................*.................................................................. + // mul v14.4s, v24.4s, v2.s[2] // ...................................|.................................................................................*................................................................. 
+ // sqrdmulh v24.4s, v24.4s, v2.s[3] // ...................................|..................................................................................*................................................................ + // mls v14.4s, v24.4s, v8.s[0] // ...................................|......................................................................................*............................................................ + // sub v24.4s, v15.4s, v16.4s // ...................................|....................................................................................*.............................................................. + // add v15.4s, v15.4s, v16.4s // ...................................|.....................................................................................*............................................................. + // mul v16.4s, v24.4s, v3.s[0] // ...................................|.......................................................................................*........................................................... + // sqrdmulh v24.4s, v24.4s, v3.s[1] // ...................................|........................................................................................*.......................................................... + // mls v16.4s, v24.4s, v8.s[0] // ...................................|............................................................................................*...................................................... + // sub v24.4s, v9.4s, v11.4s // ...................................|.........................................................................................*......................................................... + // add v9.4s, v9.4s, v11.4s // ...................................|..........................................................................................*........................................................ 
+ // mul v11.4s, v24.4s, v0.s[2] // ...................................|.............................................................................................*..................................................... + // sqrdmulh v24.4s, v24.4s, v0.s[3] // ...................................|..............................................................................................*.................................................... + // mls v11.4s, v24.4s, v8.s[0] // ...................................|..................................................................................................*................................................ + // sub v24.4s, v10.4s, v12.4s // ...................................|...........................................................................................*....................................................... + // add v10.4s, v10.4s, v12.4s // ...................................|...............................................................................................*................................................... + // mul v12.4s, v24.4s, v0.s[2] // ...................................|................................................................................................*.................................................. + // sqrdmulh v24.4s, v24.4s, v0.s[3] // ...................................|.................................................................................................*................................................. + // mls v12.4s, v24.4s, v8.s[0] // ...................................|.....................................................................................................*............................................. + // sub v24.4s, v13.4s, v15.4s // ...................................|...................................................................................................*............................................... 
+ // add v13.4s, v13.4s, v15.4s // ...................................|....................................................................................................*.............................................. + // mul v15.4s, v24.4s, v1.s[0] // ...................................|......................................................................................................*............................................ + // sqrdmulh v24.4s, v24.4s, v1.s[1] // ...................................|.......................................................................................................*........................................... + // mls v15.4s, v24.4s, v8.s[0] // ...................................|...........................................................................................................*....................................... + // sub v24.4s, v14.4s, v16.4s // ...................................|........................................................................................................*.......................................... + // add v14.4s, v14.4s, v16.4s // ...................................|.........................................................................................................*......................................... + // mul v16.4s, v24.4s, v1.s[0] // ...................................|............................................................................................................*...................................... + // sqrdmulh v24.4s, v24.4s, v1.s[1] // ...................................|.............................................................................................................*..................................... + // mls v16.4s, v24.4s, v8.s[0] // ...................................|.................................................................................................................*................................. 
+ // sub v24.4s, v9.4s, v13.4s // ...................................|..........................................................................................................*........................................ + // add v9.4s, v9.4s, v13.4s // ...................................|..............................................................................................................*.................................... + // mul v13.4s, v24.4s, v0.s[0] // ...................................|...............................................................................................................*................................... + // sqrdmulh v24.4s, v24.4s, v0.s[1] // ...................................|................................................................................................................*.................................. + // mls v13.4s, v24.4s, v8.s[0] // ...................................|....................................................................................................................*.............................. + // sub v24.4s, v10.4s, v14.4s // ...................................|..................................................................................................................*................................ + // add v10.4s, v10.4s, v14.4s // ...................................|...................................................................................................................*............................... + // mul v14.4s, v24.4s, v0.s[0] // ...................................|.....................................................................................................................*............................. + // sqrdmulh v24.4s, v24.4s, v0.s[1] // ...................................|......................................................................................................................*............................ 
+ // mls v14.4s, v24.4s, v8.s[0] // ...................................|..........................................................................................................................*........................ + // sub v24.4s, v11.4s, v15.4s // ...................................|.......................................................................................................................*........................... + // add v11.4s, v11.4s, v15.4s // ...................................|........................................................................................................................*.......................... + // mul v15.4s, v24.4s, v0.s[0] // ...................................|...........................................................................................................................*....................... + // sqrdmulh v24.4s, v24.4s, v0.s[1] // ...................................|............................................................................................................................*...................... + // mls v15.4s, v24.4s, v8.s[0] // ...................................|................................................................................................................................*.................. + // sub v24.4s, v12.4s, v16.4s // ...................................|.........................................................................................................................*......................... + // add v12.4s, v12.4s, v16.4s // ...................................|.............................................................................................................................*..................... + // mul v16.4s, v24.4s, v0.s[0] // ...................................|..............................................................................................................................*.................... 
+ // sqrdmulh v24.4s, v24.4s, v0.s[1] // ...................................|...............................................................................................................................*................... + // mls v16.4s, v24.4s, v8.s[0] // .*.................................|...................................................................................................................................*............... + // str q9, [x1], #(16*4) // ...................................|.................................................................................................................................*................. + // str q10, [x1, #(-16*4 + 1*16)] // ..*................................|....................................................................................................................................*.............. + // str q11, [x1, #(-16*4 + 2*16)] // ....*..............................|......................................................................................................................................*............ + // str q12, [x1, #(-16*4 + 3*16)] // ......*............................|........................................................................................................................................*.......... + // str q13, [x2], #(16*4) // .........*.........................|...........................................................................................................................................*....... + // str q14, [x2, #(-16*4 + 1*16)] // ...........*.......................|.............................................................................................................................................*..... + // str q15, [x2, #(-16*4 + 2*16)] // .............*.....................|...............................................................................................................................................*... 
+ // str q16, [x2, #(-16*4 + 3*16)] // ...............*...................|.................................................................................................................................................*. + // add x1, x1, #64 // .......*...........................|.........................................................................................................................................*......... + // add x2, x2, #64 // ................*..................|..................................................................................................................................................* + + sub count, count, #1 + cbnz count, layer45678_start + trn2 v0.4S, v30.4S, v4.4S // ..............*.............................................................................................................................. + // gap // ............................................................................................................................................. + trn1 v9.4S, v30.4S, v4.4S // .............*............................................................................................................................... + // gap // ............................................................................................................................................. + trn2 v15.4S, v26.4S, v17.4S // ............*................................................................................................................................ + // gap // ............................................................................................................................................. + trn1 v14.4S, v26.4S, v17.4S // ...........*................................................................................................................................. 
+ // gap // ............................................................................................................................................. + trn1 v30.4S, v16.4S, v2.4S // *............................................................................................................................................ + // gap // ............................................................................................................................................. + trn2 v4.4S, v23.4S, v21.4S // ..*.......................................................................................................................................... + // gap // ............................................................................................................................................. + trn2 v26.4S, v16.4S, v2.4S // .*........................................................................................................................................... + // gap // ............................................................................................................................................. + trn1 v23.2D, v30.2D, v20.2D // ....*........................................................................................................................................ + // gap // ............................................................................................................................................. + trn1 v2.2D, v15.2D, v0.2D // ........................*.................................................................................................................... + // gap // ............................................................................................................................................. + trn1 v12.2D, v26.2D, v4.2D // ......*...................................................................................................................................... 
+ // gap // ............................................................................................................................................. + trn2 v16.2D, v15.2D, v0.2D // ......................*...................................................................................................................... + // gap // ............................................................................................................................................. + trn2 v0.2D, v14.2D, v9.2D // .....................*....................................................................................................................... + // gap // ............................................................................................................................................. + sub v21.4S, v23.4S, v12.4S // .........*................................................................................................................................... + // gap // ............................................................................................................................................. + sub v15.4S, v0.4S, v16.4S // .........................................*................................................................................................... + // gap // ............................................................................................................................................. + add v31.4S, v23.4S, v12.4S // ..........*.................................................................................................................................. + // gap // ............................................................................................................................................. + add v17.4S, v0.4S, v16.4S // ..........................................*.................................................................................................. 
+ // gap // ............................................................................................................................................. + sqrdmulh v16.4S, v21.4S, v11.4S // ....................*........................................................................................................................ + // gap // ............................................................................................................................................. + mul v25.4S, v21.4S, v25.4S // ...................*......................................................................................................................... + // gap // ............................................................................................................................................. + sqrdmulh v23.4S, v15.4S, v24.4S // ..............................................*.............................................................................................. + // gap // ............................................................................................................................................. + trn1 v21.2D, v14.2D, v9.2D // .......................*..................................................................................................................... + // gap // ............................................................................................................................................. + trn2 v11.2D, v26.2D, v4.2D // .....*....................................................................................................................................... + // gap // ............................................................................................................................................. + add v0.4S, v21.4S, v2.4S // ...................................*......................................................................................................... 
+ // gap // ............................................................................................................................................. + mls v25.4S, v16.4S, v8.S[0] // .........................*................................................................................................................... + // gap // ............................................................................................................................................. + sub v9.4S, v21.4S, v2.4S // .............................*............................................................................................................... + // gap // ............................................................................................................................................. + add v21.4S, v0.4S, v17.4S // ................................................*............................................................................................ + // gap // ............................................................................................................................................. + sub v0.4S, v0.4S, v17.4S // ...............................................*............................................................................................. + // gap // ............................................................................................................................................. + trn2 v12.2D, v30.2D, v20.2D // ...*......................................................................................................................................... + // gap // ............................................................................................................................................. + sqrdmulh v4.4S, v9.4S, v18.4S // ........................................*.................................................................................................... 
+ // gap // ............................................................................................................................................. + mul v30.4S, v0.4S, v19.4S // ...................................................*......................................................................................... + // gap // ............................................................................................................................................. + sqrdmulh v16.4S, v0.4S, v1.4S // ....................................................*........................................................................................ + // gap // ............................................................................................................................................. + sub v18.4S, v12.4S, v11.4S // .......*..................................................................................................................................... + // gap // ............................................................................................................................................. + mul v20.4S, v9.4S, v6.4S // ....................................*........................................................................................................ + // gap // ............................................................................................................................................. + mul v26.4S, v15.4S, v29.4S // .............................................*............................................................................................... + // gap // ............................................................................................................................................. + mul v6.4S, v18.4S, v3.4S // ...............*............................................................................................................................. 
+ // gap // ............................................................................................................................................. + sqrdmulh v17.4S, v18.4S, v10.4S // ................*............................................................................................................................ + // gap // ............................................................................................................................................. + mls v20.4S, v4.4S, v8.S[0] // ............................................*................................................................................................ + // gap // ............................................................................................................................................. + mls v26.4S, v23.4S, v8.S[0] // ..................................................*.......................................................................................... + // gap // ............................................................................................................................................. + add v4.4S, v12.4S, v11.4S // ........*.................................................................................................................................... + // gap // ............................................................................................................................................. + mls v6.4S, v17.4S, v8.S[0] // ..........................*.................................................................................................................. + // gap // ............................................................................................................................................. + mls v30.4S, v16.4S, v8.S[0] // ........................................................*.................................................................................... 
+ // gap // ............................................................................................................................................. + sub v16.4S, v20.4S, v26.4S // ......................................................*...................................................................................... + // gap // ............................................................................................................................................. + sub v17.4S, v31.4S, v4.4S // .................*........................................................................................................................... + // gap // ............................................................................................................................................. + sub v23.4S, v25.4S, v6.4S // ..............................*.............................................................................................................. + // gap // ............................................................................................................................................. + mul v0.4S, v16.4S, v19.4S // .........................................................*................................................................................... + // gap // ............................................................................................................................................. + mul v19.4S, v17.4S, v13.4S // ...........................*................................................................................................................. + // gap // ............................................................................................................................................. + sqrdmulh v2.4S, v23.4S, v7.4S // ..................................*.......................................................................................................... 
+ // gap // ............................................................................................................................................. + mul v3.4S, v23.4S, v13.4S // .................................*........................................................................................................... + // gap // ............................................................................................................................................. + sqrdmulh v16.4S, v16.4S, v1.4S // ..........................................................*.................................................................................. + // gap // ............................................................................................................................................. + add v26.4S, v20.4S, v26.4S // .......................................................*..................................................................................... + // gap // ............................................................................................................................................. + add v14.4S, v25.4S, v6.4S // ...............................*............................................................................................................. + // gap // ............................................................................................................................................. + mls v3.4S, v2.4S, v8.S[0] // .......................................*..................................................................................................... + // gap // ............................................................................................................................................. + mls v0.4S, v16.4S, v8.S[0] // ..............................................................*.............................................................................. 
+ // gap // ............................................................................................................................................. + sqrdmulh v10.4S, v17.4S, v7.4S // ............................*................................................................................................................ + // gap // ............................................................................................................................................. + trn1 v2.4S, v21.4S, v26.4S // ...............................................................*............................................................................. + // gap // ............................................................................................................................................. + add v20.4S, v31.4S, v4.4S // ..................*.......................................................................................................................... + // gap // ............................................................................................................................................. + trn1 v6.4S, v30.4S, v0.4S // ..................................................................*.......................................................................... + // gap // ............................................................................................................................................. + mls v19.4S, v10.4S, v8.S[0] // ................................*............................................................................................................ + // gap // ............................................................................................................................................. + trn2 v23.4S, v30.4S, v0.4S // ...................................................................*......................................................................... 
+ // gap // ............................................................................................................................................. + trn1 v11.2D, v2.2D, v6.2D // .......................................................................*..................................................................... + // gap // ............................................................................................................................................. + trn2 v12.4S, v21.4S, v26.4S // ................................................................*............................................................................ + // gap // ............................................................................................................................................. + trn1 v21.4S, v19.4S, v3.4S // ...........................................*................................................................................................. + // gap // ............................................................................................................................................. + trn2 v30.4S, v19.4S, v3.4S // .................................................*........................................................................................... + // gap // ............................................................................................................................................. + trn2 v16.4S, v20.4S, v14.4S // ......................................*...................................................................................................... + // gap // ............................................................................................................................................. + trn1 v26.2D, v12.2D, v23.2D // ........................................................................*.................................................................... 
+ // gap // ............................................................................................................................................. + trn2 v17.2D, v2.2D, v6.2D // .....................................................................*....................................................................... + // gap // ............................................................................................................................................. + trn1 v13.2D, v16.2D, v30.2D // .............................................................*............................................................................... + // gap // ............................................................................................................................................. + sub v0.4S, v11.4S, v26.4S // .............................................................................*............................................................... + // gap // ............................................................................................................................................. + trn1 v25.4S, v20.4S, v14.4S // .....................................*....................................................................................................... + // gap // ............................................................................................................................................. + trn2 v7.2D, v16.2D, v30.2D // ...........................................................*................................................................................. + // gap // ............................................................................................................................................. + sqrdmulh v2.4S, v0.4S, v5.S[3] // ...................................................................................*......................................................... 
+ // gap // ............................................................................................................................................. + mul v20.4S, v0.4S, v5.S[2] // ..................................................................................*.......................................................... + // gap // ............................................................................................................................................. + trn2 v3.2D, v25.2D, v21.2D // .....................................................*....................................................................................... + // gap // ............................................................................................................................................. + trn1 v10.2D, v25.2D, v21.2D // ............................................................*................................................................................ + // gap // ............................................................................................................................................. + sub v16.4S, v3.4S, v7.4S // ...........................................................................*................................................................. + // gap // ............................................................................................................................................. + mls v20.4S, v2.4S, v8.S[0] // .......................................................................................*..................................................... + // gap // ............................................................................................................................................. + sub v30.4S, v10.4S, v13.4S // .................................................................*........................................................................... 
+ // gap // ............................................................................................................................................. + sqrdmulh v0.4S, v16.4S, v5.S[1] // ................................................................................*............................................................ + // gap // ............................................................................................................................................. + mul v16.4S, v16.4S, v5.S[0] // ...............................................................................*............................................................. + // gap // ............................................................................................................................................. + sqrdmulh v21.4S, v30.4S, v28.S[3] // ..........................................................................*.................................................................. + // gap // ............................................................................................................................................. + trn2 v2.2D, v12.2D, v23.2D // ......................................................................*...................................................................... + // gap // ............................................................................................................................................. + mul v25.4S, v30.4S, v28.S[2] // .........................................................................*................................................................... + // gap // ............................................................................................................................................. + sub v23.4S, v17.4S, v2.4S // .....................................................................................*....................................................... 
+ // gap // ............................................................................................................................................. + add v17.4S, v17.4S, v2.4S // ......................................................................................*...................................................... + // gap // ............................................................................................................................................. + mls v16.4S, v0.4S, v8.S[0] // ....................................................................................*........................................................ + // gap // ............................................................................................................................................. + sqrdmulh v2.4S, v23.4S, v22.S[1] // .........................................................................................*................................................... + // gap // ............................................................................................................................................. + mul v23.4S, v23.4S, v22.S[0] // ........................................................................................*.................................................... + // gap // ............................................................................................................................................. + add v4.4S, v11.4S, v26.4S // .................................................................................*........................................................... + // gap // ............................................................................................................................................. + mls v25.4S, v21.4S, v8.S[0] // ..............................................................................*.............................................................. 
+ // gap // ............................................................................................................................................. + add v3.4S, v3.4S, v7.4S // ............................................................................*................................................................ + // gap // ............................................................................................................................................. + mls v23.4S, v2.4S, v8.S[0] // .............................................................................................*............................................... + // gap // ............................................................................................................................................. + add v30.4S, v4.4S, v17.4S // .....................................................................................................*....................................... + // gap // ............................................................................................................................................. + sub v26.4S, v25.4S, v16.4S // ............................................................................................*................................................ + // gap // ............................................................................................................................................. + add v2.4S, v25.4S, v16.4S // ................................................................................................*............................................ + // gap // ............................................................................................................................................. + add v16.4S, v20.4S, v23.4S // ..........................................................................................................*.................................. 
+ // gap // ............................................................................................................................................. + sqrdmulh v21.4S, v26.4S, v27.S[3] // ..................................................................................................*.......................................... + // gap // ............................................................................................................................................. + mul v26.4S, v26.4S, v27.S[2] // .................................................................................................*........................................... + // gap // ............................................................................................................................................. + add v0.4S, v2.4S, v16.4S // ....................................................................................................................*........................ + // gap // ............................................................................................................................................. + add v25.4S, v10.4S, v13.4S // ....................................................................*........................................................................ + // gap // ............................................................................................................................................. + sub v23.4S, v20.4S, v23.4S // .........................................................................................................*................................... + // gap // ............................................................................................................................................. + str q0, [x1, #16] // ....................................................................................................................................*........ 
+ // gap // ............................................................................................................................................. + mls v26.4S, v21.4S, v8.S[0] // ......................................................................................................*...................................... + // gap // ............................................................................................................................................. + sub v20.4S, v25.4S, v3.4S // ..........................................................................................*.................................................. + // gap // ............................................................................................................................................. + sqrdmulh v0.4S, v23.4S, v28.S[1] // ..............................................................................................................*.............................. + // gap // ............................................................................................................................................. + mul v23.4S, v23.4S, v28.S[0] // .............................................................................................................*............................... + // gap // ............................................................................................................................................. + sqrdmulh v21.4S, v20.4S, v27.S[3] // ...............................................................................................*............................................. + // gap // ............................................................................................................................................. + sub v17.4S, v4.4S, v17.4S // ....................................................................................................*........................................ 
+ // gap // ............................................................................................................................................. + mul v11.4S, v20.4S, v27.S[2] // ..............................................................................................*.............................................. + // gap // ............................................................................................................................................. + mls v23.4S, v0.4S, v8.S[0] // ..................................................................................................................*.......................... + // gap // ............................................................................................................................................. + sub v16.4S, v2.4S, v16.4S // ...................................................................................................................*......................... + // gap // ............................................................................................................................................. + sqrdmulh v20.4S, v17.4S, v28.S[1] // ........................................................................................................*.................................... + // gap // ............................................................................................................................................. + mul v17.4S, v17.4S, v28.S[0] // .......................................................................................................*..................................... + // gap // ............................................................................................................................................. + sub v4.4S, v26.4S, v23.4S // ..........................................................................................................................*.................. 
+ // gap // ............................................................................................................................................. + add v10.4S, v26.4S, v23.4S // ..............................................................................................................................*.............. + // gap // ............................................................................................................................................. + mls v11.4S, v21.4S, v8.S[0] // ...................................................................................................*......................................... + // gap // ............................................................................................................................................. + sqrdmulh v2.4S, v4.4S, v27.S[1] // ................................................................................................................................*............ + // gap // ............................................................................................................................................. + mul v0.4S, v4.4S, v27.S[0] // ...............................................................................................................................*............. + // gap // ............................................................................................................................................. + mls v17.4S, v20.4S, v8.S[0] // ............................................................................................................*................................ + // gap // ............................................................................................................................................. + sqrdmulh v23.4S, v16.4S, v27.S[1] // .......................................................................................................................*..................... 
+ // gap // ............................................................................................................................................. + mul v26.4S, v16.4S, v27.S[0] // ......................................................................................................................*...................... + // gap // ............................................................................................................................................. + add v25.4S, v25.4S, v3.4S // ...........................................................................................*................................................. + // gap // ............................................................................................................................................. + sub v21.4S, v11.4S, v17.4S // ........................................................................................................................*.................... + // gap // ............................................................................................................................................. + add v16.4S, v11.4S, v17.4S // .........................................................................................................................*................... + // gap // ............................................................................................................................................. + mls v26.4S, v23.4S, v8.S[0] // ...........................................................................................................................*................. + // gap // ............................................................................................................................................. + sqrdmulh v29.4S, v21.4S, v27.S[1] // .............................................................................................................................*............... 
+ // gap // ............................................................................................................................................. + sub v4.4S, v25.4S, v30.4S // ...........................................................................................................*................................. + // gap // ............................................................................................................................................. + add v17.4S, v25.4S, v30.4S // ...............................................................................................................*............................. + // gap // ............................................................................................................................................. + str q16, [x1, #32] // .....................................................................................................................................*....... + // gap // ............................................................................................................................................. + sqrdmulh v20.4S, v4.4S, v27.S[1] // .................................................................................................................*........................... + // gap // ............................................................................................................................................. + mul v23.4S, v4.4S, v27.S[0] // ................................................................................................................*............................ + // gap // ............................................................................................................................................. + str q17, [x1], #(16*4) // ..................................................................................................................................*.......... 
+ // gap // ............................................................................................................................................. + mul v16.4S, v21.4S, v27.S[0] // ............................................................................................................................*................ + // gap // ............................................................................................................................................. + str q10, [x1, #-16] // ......................................................................................................................................*...... + add x1, x1, #64 // .......................................................................................................................................*..... + mls v23.4S, v20.4S, v8.S[0] // .....................................................................................................................*....................... + // gap // ............................................................................................................................................. + str q26, [x2, #16] // .........................................................................................................................................*... + // gap // ............................................................................................................................................. + mls v16.4S, v29.4S, v8.S[0] // .................................................................................................................................*........... + // gap // ............................................................................................................................................. + mls v0.4S, v2.4S, v8.S[0] // ...................................................................................................................................*......... 
+ // gap // ............................................................................................................................................. + str q23, [x2], #(16*4) // ........................................................................................................................................*.... + // gap // ............................................................................................................................................. + // gap // ............................................................................................................................................. + // gap // ............................................................................................................................................. + str q16, [x2, #-32] // ..........................................................................................................................................*.. + // gap // ............................................................................................................................................. + // gap // ............................................................................................................................................. + // gap // ............................................................................................................................................. + str q0, [x2, #-16] // ...........................................................................................................................................*. + add x2, x2, #64 // ............................................................................................................................................* + + // original source code + // trn1 v0.4S, v16.4S, v2.4S // ....*........................................................................................................................................ 
+ // trn2 v2.4S, v16.4S, v2.4S // ......*...................................................................................................................................... + // trn2 v16.4S, v23.4S, v21.4S // .....*....................................................................................................................................... + // trn2 v23.2D, v0.2D, v20.2D // ..........................*.................................................................................................................. + // trn1 v0.2D, v0.2D, v20.2D // .......*..................................................................................................................................... + // trn2 v21.2D, v2.2D, v16.2D // ....................*........................................................................................................................ + // trn1 v2.2D, v2.2D, v16.2D // .........*................................................................................................................................... + // sub v16.4S, v23.4S, v21.4S // ..............................*.............................................................................................................. + // add v23.4S, v23.4S, v21.4S // .....................................*....................................................................................................... + // sub v21.4S, v0.4S, v2.4S // ............*................................................................................................................................ + // add v0.4S, v0.4S, v2.4S // ..............*.............................................................................................................................. + // trn1 v2.4S, v26.4S, v17.4S // ...*......................................................................................................................................... 
+ // trn2 v26.4S, v26.4S, v17.4S // ..*.......................................................................................................................................... + // trn1 v20.4S, v30.4S, v4.4S // .*........................................................................................................................................... + // trn2 v17.4S, v30.4S, v4.4S // *............................................................................................................................................ + // mul v30.4S, v16.4S, v3.4S // .................................*........................................................................................................... + // sqrdmulh v16.4S, v16.4S, v10.4S // ..................................*.......................................................................................................... + // sub v4.4S, v0.4S, v23.4S // .........................................*................................................................................................... + // add v0.4S, v0.4S, v23.4S // ......................................................*...................................................................................... + // mul v23.4S, v21.4S, v25.4S // .................*........................................................................................................................... + // sqrdmulh v21.4S, v21.4S, v11.4S // ................*............................................................................................................................ + // trn2 v25.2D, v2.2D, v20.2D // ...........*................................................................................................................................. + // trn2 v11.2D, v26.2D, v17.2D // ..........*.................................................................................................................................. 
+ // trn1 v2.2D, v2.2D, v20.2D // ...................*......................................................................................................................... + // trn1 v26.2D, v26.2D, v17.2D // ........*.................................................................................................................................... + // mls v23.4S, v21.4S, v8.S[0] // ......................*...................................................................................................................... + // mls v30.4S, v16.4S, v8.S[0] // ......................................*...................................................................................................... + // mul v16.4S, v4.4S, v13.4S // ............................................*................................................................................................ + // sqrdmulh v21.4S, v4.4S, v7.4S // ....................................................*........................................................................................ + // sub v20.4S, v2.4S, v26.4S // .......................*..................................................................................................................... + // sub v17.4S, v23.4S, v30.4S // ..........................................*.................................................................................................. + // add v23.4S, v23.4S, v30.4S // .................................................*........................................................................................... + // mls v16.4S, v21.4S, v8.S[0] // ........................................................*.................................................................................... + // mul v21.4S, v17.4S, v13.4S // ..............................................*.............................................................................................. 
+ // sqrdmulh v17.4S, v17.4S, v7.4S // .............................................*............................................................................................... + // add v2.4S, v2.4S, v26.4S // .....................*....................................................................................................................... + // mul v26.4S, v20.4S, v6.4S // ...............................*............................................................................................................. + // trn1 v30.4S, v0.4S, v23.4S // ...................................................................*......................................................................... + // trn2 v0.4S, v0.4S, v23.4S // ..............................................................*.............................................................................. + // mls v21.4S, v17.4S, v8.S[0] // ..................................................*.......................................................................................... + // sqrdmulh v23.4S, v20.4S, v18.4S // ...........................*................................................................................................................. + // sub v20.4S, v25.4S, v11.4S // .............*............................................................................................................................... + // add v17.4S, v25.4S, v11.4S // ...............*............................................................................................................................. + // trn1 v4.4S, v16.4S, v21.4S // ............................................................*................................................................................ + // mls v26.4S, v23.4S, v8.S[0] // ...................................*......................................................................................................... 
+ // mul v23.4S, v20.4S, v29.4S // ................................*............................................................................................................ + // sqrdmulh v20.4S, v20.4S, v24.4S // ..................*.......................................................................................................................... + // sub v25.4S, v2.4S, v17.4S // .........................*................................................................................................................... + // add v2.4S, v2.4S, v17.4S // ........................*.................................................................................................................... + // trn2 v16.4S, v16.4S, v21.4S // .............................................................*............................................................................... + // mls v23.4S, v20.4S, v8.S[0] // ....................................*........................................................................................................ + // mul v21.4S, v25.4S, v19.4S // ............................*................................................................................................................ + // sqrdmulh v20.4S, v25.4S, v1.4S // .............................*............................................................................................................... + // trn2 v17.2D, v30.2D, v4.2D // .......................................................................*..................................................................... + // sub v25.4S, v26.4S, v23.4S // ........................................*.................................................................................................... + // add v23.4S, v26.4S, v23.4S // ................................................*............................................................................................ 
+ // mls v21.4S, v20.4S, v8.S[0] // .......................................*..................................................................................................... + // mul v26.4S, v25.4S, v19.4S // ...........................................*................................................................................................. + // sqrdmulh v20.4S, v25.4S, v1.4S // ...............................................*............................................................................................. + // trn2 v25.2D, v0.2D, v16.2D // ....................................................................*........................................................................ + // trn1 v30.2D, v30.2D, v4.2D // ........................................................................*.................................................................... + // trn1 v0.2D, v0.2D, v16.2D // .................................................................*........................................................................... + // mls v26.4S, v20.4S, v8.S[0] // ...................................................*......................................................................................... + // trn1 v16.4S, v2.4S, v23.4S // .....................................................*....................................................................................... + // trn2 v2.4S, v2.4S, v23.4S // ...........................................................*................................................................................. + // sub v23.4S, v30.4S, v0.4S // ...........................................................................*................................................................. + // trn1 v20.4S, v21.4S, v26.4S // .......................................................*..................................................................................... 
+ // trn2 v21.4S, v21.4S, v26.4S // .........................................................*................................................................................... + // add v0.4S, v30.4S, v0.4S // .................................................................................................*........................................... + // trn2 v26.2D, v16.2D, v20.2D // ................................................................*............................................................................ + // trn2 v30.2D, v2.2D, v21.2D // ...............................................................................*............................................................. + // trn1 v16.2D, v16.2D, v20.2D // ..........................................................*.................................................................................. + // trn1 v2.2D, v2.2D, v21.2D // ...............................................................*............................................................................. + // mul v21.4S, v23.4S, v28.S[2] // ................................................................................*............................................................ + // sqrdmulh v23.4S, v23.4S, v28.S[3] // ..............................................................................*.............................................................. + // sub v20.4S, v17.4S, v25.4S // .........................................................................*................................................................... + // add v17.4S, v17.4S, v25.4S // ........................................................................................*.................................................... + // sub v4.4S, v16.4S, v2.4S // ..................................................................*.......................................................................... 
+ // mls v21.4S, v23.4S, v8.S[0] // .......................................................................................*..................................................... + // mul v23.4S, v20.4S, v5.S[0] // .............................................................................*............................................................... + // sqrdmulh v20.4S, v20.4S, v5.S[1] // ............................................................................*................................................................ + // add v2.4S, v16.4S, v2.4S // ......................................................................................*...................................................... + // mul v16.4S, v4.4S, v5.S[2] // ......................................................................*...................................................................... + // sqrdmulh v4.4S, v4.4S, v5.S[3] // .....................................................................*....................................................................... + // mls v23.4S, v20.4S, v8.S[0] // ...................................................................................*......................................................... + // sub v20.4S, v26.4S, v30.4S // .................................................................................*........................................................... + // add v26.4S, v26.4S, v30.4S // ..................................................................................*.......................................................... + // mls v16.4S, v4.4S, v8.S[0] // ..........................................................................*.................................................................. + // mul v30.4S, v20.4S, v22.S[0] // .....................................................................................*....................................................... 
+ // sqrdmulh v20.4S, v20.4S, v22.S[1] // ....................................................................................*........................................................ + // sub v4.4S, v0.4S, v17.4S // .....................................................................................................*....................................... + // add v0.4S, v0.4S, v17.4S // .......................................................................................................................*..................... + // sub v17.4S, v21.4S, v23.4S // ...........................................................................................*................................................. + // mls v30.4S, v20.4S, v8.S[0] // .........................................................................................*................................................... + // mul v20.4S, v4.4S, v27.S[2] // ..........................................................................................................*.................................. + // sqrdmulh v4.4S, v4.4S, v27.S[3] // ........................................................................................................*.................................... + // add v23.4S, v21.4S, v23.4S // ............................................................................................*................................................ + // mul v21.4S, v17.4S, v27.S[2] // ...............................................................................................*............................................. + // sqrdmulh v17.4S, v17.4S, v27.S[3] // ..............................................................................................*.............................................. + // mls v20.4S, v4.4S, v8.S[0] // .................................................................................................................*........................... 
+ // sub v4.4S, v2.4S, v26.4S // .........................................................................................................*................................... + // add v2.4S, v2.4S, v26.4S // ..........................................................................................*.................................................. + // mls v21.4S, v17.4S, v8.S[0] // ....................................................................................................*........................................ + // mul v26.4S, v4.4S, v28.S[0] // ..............................................................................................................*.............................. + // sqrdmulh v17.4S, v4.4S, v28.S[1] // .............................................................................................................*............................... + // sub v4.4S, v16.4S, v30.4S // ..................................................................................................*.......................................... + // add v16.4S, v16.4S, v30.4S // .............................................................................................*............................................... + // sub v30.4S, v0.4S, v2.4S // ............................................................................................................................*................ + // mls v26.4S, v17.4S, v8.S[0] // ....................................................................................................................*........................ + // mul v17.4S, v4.4S, v28.S[0] // .......................................................................................................*..................................... + // sqrdmulh v4.4S, v4.4S, v28.S[1] // ......................................................................................................*...................................... 
+ // add v0.4S, v0.4S, v2.4S // .............................................................................................................................*............... + // mul v2.4S, v30.4S, v27.S[0] // ................................................................................................................................*............ + // sqrdmulh v30.4S, v30.4S, v27.S[1] // ...............................................................................................................................*............. + // mls v17.4S, v4.4S, v8.S[0] // ...........................................................................................................*................................. + // sub v4.4S, v23.4S, v16.4S // ............................................................................................................*................................ + // add v16.4S, v23.4S, v16.4S // ................................................................................................*............................................ + // mls v2.4S, v30.4S, v8.S[0] // .....................................................................................................................................*....... + // mul v23.4S, v4.4S, v27.S[0] // ......................................................................................................................*...................... + // sqrdmulh v30.4S, v4.4S, v27.S[1] // .....................................................................................................................*....................... + // sub v4.4S, v20.4S, v26.4S // ........................................................................................................................*.................... + // add v26.4S, v20.4S, v26.4S // .........................................................................................................................*................... 
+ // sub v20.4S, v21.4S, v17.4S // ...............................................................................................................*............................. + // mls v23.4S, v30.4S, v8.S[0] // ..........................................................................................................................*.................. + // mul v30.4S, v4.4S, v27.S[0] // ..................................................................................................................................*.......... + // sqrdmulh v4.4S, v4.4S, v27.S[1] // ...........................................................................................................................*................. + // add v21.4S, v21.4S, v17.4S // ................................................................................................................*............................ + // mul v17.4S, v20.4S, v27.S[0] // ...................................................................................................................*......................... + // sqrdmulh v20.4S, v20.4S, v27.S[1] // ..................................................................................................................*.......................... + // mls v30.4S, v4.4S, v8.S[0] // .......................................................................................................................................*..... + // str q0, [x1], #(16*4) // .................................................................................................................................*........... + // mls v17.4S, v20.4S, v8.S[0] // ........................................................................................................................................*.... + // str q16, [x1, #-48] // ...................................................................................................*......................................... 
+        // str q26, [x1, #-32] // ..............................................................................................................................*..............
+        // str q21, [x1, #-16] // ...................................................................................................................................*.........
+        // add x1, x1, #64 // ....................................................................................................................................*........
+        // str q2, [x2], #(16*4) // .........................................................................................................................................*...
+        // str q23, [x2, #-48] // ......................................................................................................................................*......
+        // str q30, [x2, #-32] // ..........................................................................................................................................*..
+        // str q17, [x2, #-16] // ...........................................................................................................................................*.
+        // add x2, x2, #64 // ............................................................................................................................................*
+
+
+// -----------------------------------------------------------------------------
+// Setup for the loop at layer123_start: broadcast the two ninv constants, set the loop counter, load the roots, then issue the hoisted loads and adds ahead of the loop.
+        ninv .req v25 // alias for v25; the loop below uses v25.4S as the multiplier of its full-vector mul instructions
+        ninv_tw .req v26 // alias for v26; the loop below uses v26.4S as the multiplier of the sqrdmulh paired with each of those mul instructions
+
+        ASM_LOAD(xtmp, ninv_addr) // NOTE(review): ASM_LOAD is defined outside this chunk; presumably leaves the address of ninv_addr in xtmp - confirm
+        ld1r {ninv.4s}, [xtmp] // load one 32-bit word from [xtmp] and replicate it into all four lanes of ninv
+        ASM_LOAD(xtmp, ninv_tw_addr) // NOTE(review): same assumption as above, for ninv_tw_addr
+        ld1r {ninv_tw.4s}, [xtmp] // load one 32-bit word from [xtmp] and replicate it into all four lanes of ninv_tw
+// Loop counter and roots: count is set to 8 here and decremented once, just before layer123_start, after the hoisted instructions below.
+        mov count, #8
+        ASM_LOAD(r_ptr0, roots_l012) // NOTE(review): presumably r_ptr0 = address of roots_l012; confirm against the ASM_LOAD definition
+        load_roots_123 // macro defined outside this chunk; NOTE(review): presumably fills the root registers from r_ptr0 - confirm
+// Hoisted instructions: eight 16-byte loads from x0 at byte offsets 0, 128, ..., 896 and four adds on the loaded vectors (v19, v6, v30, v10). The * column on each line is the zero-based position of that instruction in the original listing further down.
+        .p2align 2
+        ldr q13, [x0, #768] // .....*......
+        // gap // ............
+        // gap // ............
+        // gap // ............
+        ldr q7, [x0, #896] // .......*....
+        // gap // ............
+        // gap // ............
+        // gap // ............
+        ldr q4, [x0, #512] // ...*........
+        // gap // ............
+        // gap // ............
+        // gap // ............
+        ldr q11, [x0, #640] // ....*.......
+        // gap // ............
+        // gap // ............
+        // gap // ............
+        ldr q20, [x0, #384] // ........*...
+        // gap // ............
+        // gap // ............
+        // gap // ............
+        ldr q21, [x0, #256] // ..*.........
+        // gap // ............
+        // gap // ............
+        // gap // ............
+        add v19.4S, v4.4S, v11.4S // ......*.....
+        // gap // ............
+        add v6.4S, v13.4S, v7.4S // .........*..
+        // gap // ............
+        ldr q17, [x0, #0] // *...........
+        // gap // ............
+        // gap // ............
+        // gap // ............
+        add v30.4S, v21.4S, v20.4S // ..........*.
+        // gap // ............
+        add v10.4S, v19.4S, v6.4S // ...........*
+        // gap // ............
+        ldr q23, [x0, #128] // .*..........
+        // gap // ............
+
+        // original source code: the same 12 instructions in their order before reordering; the * column is the zero-based position of each instruction in the reordered sequence above
+        // ldr q17, [x0, #0] // ........*...
+        // ldr q23, [x0, #128] // ...........*
+        // ldr q21, [x0, #256] // .....*......
+        // ldr q4, [x0, #512] // ..*.........
+        // ldr q11, [x0, #640] // ...*........
+        // ldr q13, [x0, #768] // *...........
+        // add v19.4S, v4.4S, v11.4S // ......*.....
+        // ldr q7, [x0, #896] // .*..........
+        // ldr q20, [x0, #384] // ....*.......
+        // add v6.4S, v13.4S, v7.4S // .......*....
+        // add v30.4S, v21.4S, v20.4S // .........*..
+        // add v10.4S, v19.4S, v6.4S // ..........*.
+
+        sub count, count, #1 // count was set to 8 above; one is subtracted here before the loop label. NOTE(review): presumably accounts for the loads and adds hoisted above - confirm against the loop tail
+layer123_start:
+        sub v16.4S, v17.4S, v23.4S // ........*.......................................................................................
+        // gap // ................................................................................................
+        add v23.4S, v17.4S, v23.4S // .........*......................................................................................
+        // gap // ................................................................................................
+ sub v21.4S, v21.4S, v20.4S // .............*.................................................................................. + // gap // ................................................................................................ + mul v20.4S, v16.4S, v1.S[2] // ..........*..................................................................................... + // gap // ................................................................................................ + sqrdmulh v16.4S, v16.4S, v1.S[3] // ...........*.................................................................................... + // gap // ................................................................................................ + sub v17.4S, v23.4S, v30.4S // ............................*................................................................... + // gap // ................................................................................................ + add v23.4S, v23.4S, v30.4S // .............................*.................................................................. + // gap // ................................................................................................ + mul v30.4S, v21.4S, v2.S[0] // ...............*................................................................................ + // gap // ................................................................................................ + sqrdmulh v21.4S, v21.4S, v2.S[1] // ................*............................................................................... + // gap // ................................................................................................ + mls v20.4S, v16.4S, v8.S[0] // ............*................................................................................... + // gap // ................................................................................................ 
+ sub v16.4S, v4.4S, v11.4S // ..................*............................................................................. + // gap // ................................................................................................ + mul v4.4S, v17.4S, v0.S[2] // ..............................*................................................................. + // gap // ................................................................................................ + sqrdmulh v17.4S, v17.4S, v0.S[3] // ...............................*................................................................ + // gap // ................................................................................................ + sub v11.4S, v23.4S, v10.4S // ................................................*............................................... + // gap // ................................................................................................ + add v23.4S, v23.4S, v10.4S // .................................................*.............................................. + // gap // ................................................................................................ + mls v30.4S, v21.4S, v8.S[0] // .................*.............................................................................. + // gap // ................................................................................................ + mul v21.4S, v16.4S, v2.S[2] // ....................*........................................................................... + // gap // ................................................................................................ + sqrdmulh v16.4S, v16.4S, v2.S[3] // .....................*.......................................................................... + // gap // ................................................................................................ 
+ sub v13.4S, v13.4S, v7.4S // .......................*........................................................................ + // gap // ................................................................................................ + sub v7.4S, v20.4S, v30.4S // .................................*.............................................................. + // gap // ................................................................................................ + add v20.4S, v20.4S, v30.4S // ..................................*............................................................. + // gap // ................................................................................................ + mls v21.4S, v16.4S, v8.S[0] // ......................*......................................................................... + // gap // ................................................................................................ + mul v16.4S, v13.4S, v3.S[0] // .........................*...................................................................... + // gap // ................................................................................................ + mls v4.4S, v17.4S, v8.S[0] // ................................*............................................................... + // gap // ................................................................................................ + sqrdmulh v17.4S, v13.4S, v3.S[1] // ..........................*..................................................................... + // gap // ................................................................................................ + mul v30.4S, v7.4S, v0.S[2] // ...................................*............................................................ + // gap // ................................................................................................ 
+ sqrdmulh v13.4S, v7.4S, v0.S[3] // ....................................*........................................................... + // gap // ................................................................................................ + mul v7.4S, v11.4S, v0.S[0] // ..................................................*............................................. + // gap // ................................................................................................ + sqrdmulh v11.4S, v11.4S, v0.S[1] // ...................................................*............................................ + // gap // ................................................................................................ + mul v10.4S, v23.4S, v25.4S // ................................................................................*............... + // gap // ................................................................................................ + sqrdmulh v23.4S, v23.4S, v26.4S // .................................................................................*.............. + // gap // ................................................................................................ + mls v16.4S, v17.4S, v8.S[0] // ...........................*.................................................................... + // gap // ................................................................................................ + mls v30.4S, v13.4S, v8.S[0] // .....................................*.......................................................... + // gap // ................................................................................................ + sub v17.4S, v19.4S, v6.4S // ......................................*......................................................... + // gap // ................................................................................................ 
+ mls v7.4S, v11.4S, v8.S[0] // ....................................................*........................................... + // gap // ................................................................................................ + sub v11.4S, v21.4S, v16.4S // ...........................................*.................................................... + // gap // ................................................................................................ + mul v13.4S, v17.4S, v1.S[0] // ........................................*....................................................... + // gap // ................................................................................................ + sqrdmulh v17.4S, v17.4S, v1.S[1] // .........................................*...................................................... + // gap // ................................................................................................ + add v16.4S, v21.4S, v16.4S // ............................................*................................................... + // gap // ................................................................................................ + mul v21.4S, v11.4S, v1.S[0] // .............................................*.................................................. + // gap // ................................................................................................ + sqrdmulh v11.4S, v11.4S, v1.S[1] // ..............................................*................................................. + // gap // ................................................................................................ + sub v19.4S, v20.4S, v16.4S // .....................................................*.......................................... + // gap // ................................................................................................ 
+ add v16.4S, v20.4S, v16.4S // ......................................................*......................................... + // gap // ................................................................................................ + mls v13.4S, v17.4S, v8.S[0] // ..........................................*..................................................... + // gap // ................................................................................................ + mls v21.4S, v11.4S, v8.S[0] // ...............................................*................................................ + // gap // ................................................................................................ + mul v20.4S, v19.4S, v0.S[0] // .......................................................*........................................ + // gap // ................................................................................................ + sqrdmulh v17.4S, v19.4S, v0.S[1] // ........................................................*....................................... + // gap // ................................................................................................ + sub v11.4S, v4.4S, v13.4S // ..........................................................*..................................... + // gap // ................................................................................................ + add v4.4S, v4.4S, v13.4S // ...........................................................*.................................... + // gap // ................................................................................................ + sub v13.4S, v30.4S, v21.4S // ...............................................................*................................ + // gap // ................................................................................................ 
+ mls v20.4S, v17.4S, v8.S[0] // .........................................................*...................................... + // gap // ................................................................................................ + mul v17.4S, v11.4S, v0.S[0] // ............................................................*................................... + // gap // ................................................................................................ + sqrdmulh v11.4S, v11.4S, v0.S[1] // .............................................................*.................................. + // gap // ................................................................................................ + add v21.4S, v30.4S, v21.4S // ................................................................*............................... + // gap // ................................................................................................ + mul v30.4S, v13.4S, v0.S[0] // .................................................................*.............................. + // gap // ................................................................................................ + sqrdmulh v13.4S, v13.4S, v0.S[1] // ..................................................................*............................. + // gap // ................................................................................................ + mls v17.4S, v11.4S, v8.S[0] // ..............................................................*................................. + // gap // ................................................................................................ + srshr v11.4S, v7.4S, #23 // ....................................................................*........................... + // gap // ................................................................................................ 
+ srshr v19.4S, v20.4S, #23 // ......................................................................*......................... + // gap // ................................................................................................ + mls v10.4S, v23.4S, v8.S[0] // ..................................................................................*............. + // gap // ................................................................................................ + mls v30.4S, v13.4S, v8.S[0] // ...................................................................*............................ + // gap // ................................................................................................ + mls v7.4S, v11.4S, v8.4S // .....................................................................*.......................... + // gap // ................................................................................................ + mls v20.4S, v19.4S, v8.4S // .......................................................................*........................ + // gap // ................................................................................................ + srshr v23.4S, v17.4S, #23 // ........................................................................*....................... + // gap // ................................................................................................ + srshr v11.4S, v30.4S, #23 // ..........................................................................*..................... + // gap // ................................................................................................ + str q7, [x0, #512] // ............................................................................*................... + // gap // ................................................................................................ 
+ mls v17.4S, v23.4S, v8.4S // .........................................................................*...................... + // gap // ................................................................................................ + mls v30.4S, v11.4S, v8.4S // ...........................................................................*.................... + // gap // ................................................................................................ + str q20, [x0, #640] // .............................................................................*.................. + // gap // ................................................................................................ + mul v23.4S, v16.4S, v25.4S // ...................................................................................*............ + // gap // ................................................................................................ + str q17, [x0, #768] // ..............................................................................*................. + // gap // ................................................................................................ + sqrdmulh v16.4S, v16.4S, v26.4S // ....................................................................................*........... + // gap // ................................................................................................ + str q30, [x0, #896] // ...............................................................................*................ + // gap // ................................................................................................ + mul v20.4S, v4.4S, v25.4S // ......................................................................................*......... + // gap // ................................................................................................ 
+ sqrdmulh v17.4S, v4.4S, v26.4S // .......................................................................................*........ + // gap // ................................................................................................ + mls v23.4S, v16.4S, v8.S[0] // .....................................................................................*.......... + // gap // ................................................................................................ + mul v16.4S, v21.4S, v25.4S // .........................................................................................*...... + // gap // ................................................................................................ + sqrdmulh v21.4S, v21.4S, v26.4S // ..........................................................................................*..... + // gap // ................................................................................................ + mls v20.4S, v17.4S, v8.S[0] // ........................................................................................*....... + // gap // ................................................................................................ + str q10, [x0], #(16) // ............................................................................................*... + // gap // ................................................................................................ + ldr q17, [x0, #0] // e............................................................................................... + // gap // ................................................................................................ + // gap // ................................................................................................ + // gap // ................................................................................................ 
+ mls v16.4S, v21.4S, v8.S[0] // ...........................................................................................*.... + // gap // ................................................................................................ + str q23, [x0, #112] // .............................................................................................*.. + // gap // ................................................................................................ + ldr q23, [x0, #128] // .e.............................................................................................. + // gap // ................................................................................................ + // gap // ................................................................................................ + // gap // ................................................................................................ + str q20, [x0, #240] // ..............................................................................................*. + // gap // ................................................................................................ + ldr q21, [x0, #256] // ..e............................................................................................. + // gap // ................................................................................................ + // gap // ................................................................................................ + // gap // ................................................................................................ + ldr q4, [x0, #512] // ....e........................................................................................... + // gap // ................................................................................................ + // gap // ................................................................................................ 
+ // gap // ................................................................................................ + ldr q11, [x0, #640] // .....e.......................................................................................... + // gap // ................................................................................................ + // gap // ................................................................................................ + // gap // ................................................................................................ + ldr q13, [x0, #768] // ......e......................................................................................... + // gap // ................................................................................................ + // gap // ................................................................................................ + // gap // ................................................................................................ + add v19.4S, v4.4S, v11.4S // ...................e............................................................................ + // gap // ................................................................................................ + ldr q7, [x0, #896] // .......e........................................................................................ + // gap // ................................................................................................ + // gap // ................................................................................................ + // gap // ................................................................................................ + ldr q20, [x0, #384] // ...e............................................................................................ + // gap // ................................................................................................ 
+ // gap // ................................................................................................ + // gap // ................................................................................................ + add v6.4S, v13.4S, v7.4S // ........................e....................................................................... + // gap // ................................................................................................ + str q16, [x0, #368] // ...............................................................................................* + // gap // ................................................................................................ + add v30.4S, v21.4S, v20.4S // ..............e................................................................................. + // gap // ................................................................................................ + add v10.4S, v19.4S, v6.4S // .......................................e........................................................ + // gap // ................................................................................................ + + // original source code + // ldr q9, [x0, #0] // e...............|...............................................................................e............. + // ldr q10, [x0, #(1*(1024/8))] // ...e............|..................................................................................e.......... + // ldr q11, [x0, #(2*(1024/8))] // .....e..........|....................................................................................e........ + // ldr q12, [x0, #(3*(1024/8))] // ...........e....|..........................................................................................e.. + // ldr q13, [x0, #(4*(1024/8))] // ......e.........|.....................................................................................e....... 
+ // ldr q14, [x0, #(5*(1024/8))] // .......e........|......................................................................................e...... + // ldr q15, [x0, #(6*(1024/8))] // ........e.......|.......................................................................................e..... + // ldr q16, [x0, #(7*(1024/8))] // ..........e.....|.........................................................................................e... + // sub v24.4s, v9.4s, v10.4s // ................*............................................................................................. + // add v9.4s, v9.4s, v10.4s // ................|*............................................................................................ + // mul v10.4s, v24.4s, v1.s[2] // ................|..*.......................................................................................... + // sqrdmulh v24.4s, v24.4s, v1.s[3] // ................|...*......................................................................................... + // mls v10.4s, v24.4s, v8.s[0] // ................|........*.................................................................................... + // sub v24.4s, v11.4s, v12.4s // ................|.*........................................................................................... + // add v11.4s, v11.4s, v12.4s // ..............e.|............................................................................................. + // mul v12.4s, v24.4s, v2.s[0] // ................|......*...................................................................................... + // sqrdmulh v24.4s, v24.4s, v2.s[1] // ................|.......*..................................................................................... + // mls v12.4s, v24.4s, v8.s[0] // ................|..............*.............................................................................. 
+ // sub v24.4s, v13.4s, v14.4s // ................|.........*................................................................................... + // add v13.4s, v13.4s, v14.4s // .........e......|........................................................................................e.... + // mul v14.4s, v24.4s, v2.s[2] // ................|...............*............................................................................. + // sqrdmulh v24.4s, v24.4s, v2.s[3] // ................|................*............................................................................ + // mls v14.4s, v24.4s, v8.s[0] // ................|....................*........................................................................ + // sub v24.4s, v15.4s, v16.4s // ................|.................*........................................................................... + // add v15.4s, v15.4s, v16.4s // ............e...|...........................................................................................e. + // mul v16.4s, v24.4s, v3.s[0] // ................|.....................*....................................................................... + // sqrdmulh v24.4s, v24.4s, v3.s[1] // ................|.......................*..................................................................... + // mls v16.4s, v24.4s, v8.s[0] // ................|..............................*.............................................................. + // sub v24.4s, v9.4s, v11.4s // ................|....*........................................................................................ + // add v9.4s, v9.4s, v11.4s // ................|.....*....................................................................................... + // mul v11.4s, v24.4s, v0.s[2] // ................|..........*.................................................................................. 
+ // sqrdmulh v24.4s, v24.4s, v0.s[3] // ................|...........*................................................................................. + // mls v11.4s, v24.4s, v8.s[0] // ................|......................*...................................................................... + // sub v24.4s, v10.4s, v12.4s // ................|..................*.......................................................................... + // add v10.4s, v10.4s, v12.4s // ................|...................*......................................................................... + // mul v12.4s, v24.4s, v0.s[2] // ................|........................*.................................................................... + // sqrdmulh v24.4s, v24.4s, v0.s[3] // ................|.........................*................................................................... + // mls v12.4s, v24.4s, v8.s[0] // ................|...............................*............................................................. + // sub v24.4s, v13.4s, v15.4s // ................|................................*............................................................ + // add v13.4s, v13.4s, v15.4s // ...............e|............................................................................................. + // mul v15.4s, v24.4s, v1.s[0] // ................|...................................*......................................................... + // sqrdmulh v24.4s, v24.4s, v1.s[1] // ................|....................................*........................................................ + // mls v15.4s, v24.4s, v8.s[0] // ................|..........................................*.................................................. + // sub v24.4s, v14.4s, v16.4s // ................|..................................*.......................................................... 
+ // add v14.4s, v14.4s, v16.4s // ................|.....................................*....................................................... + // mul v16.4s, v24.4s, v1.s[0] // ................|......................................*...................................................... + // sqrdmulh v24.4s, v24.4s, v1.s[1] // ................|.......................................*..................................................... + // mls v16.4s, v24.4s, v8.s[0] // ................|...........................................*................................................. + // sub v24.4s, v9.4s, v13.4s // ................|............*................................................................................ + // add v9.4s, v9.4s, v13.4s // ................|.............*............................................................................... + // mul v13.4s, v24.4s, v0.s[0] // ................|..........................*.................................................................. + // sqrdmulh v24.4s, v24.4s, v0.s[1] // ................|...........................*................................................................. + // mls v13.4s, v24.4s, v8.s[0] // ................|.................................*........................................................... + // sub v24.4s, v10.4s, v14.4s // ................|........................................*.................................................... + // add v10.4s, v10.4s, v14.4s // ................|.........................................*................................................... + // mul v14.4s, v24.4s, v0.s[0] // ................|............................................*................................................ + // sqrdmulh v24.4s, v24.4s, v0.s[1] // ................|.............................................*............................................... 
+ // mls v14.4s, v24.4s, v8.s[0] // ................|.................................................*........................................... + // sub v24.4s, v11.4s, v15.4s // ................|..............................................*.............................................. + // add v11.4s, v11.4s, v15.4s // ................|...............................................*............................................. + // mul v15.4s, v24.4s, v0.s[0] // ................|..................................................*.......................................... + // sqrdmulh v24.4s, v24.4s, v0.s[1] // ................|...................................................*......................................... + // mls v15.4s, v24.4s, v8.s[0] // ................|.......................................................*..................................... + // sub v24.4s, v12.4s, v16.4s // ................|................................................*............................................ + // add v12.4s, v12.4s, v16.4s // ................|....................................................*........................................ + // mul v16.4s, v24.4s, v0.s[0] // ................|.....................................................*....................................... + // sqrdmulh v24.4s, v24.4s, v0.s[1] // ................|......................................................*...................................... + // mls v16.4s, v24.4s, v8.s[0] // ................|...........................................................*................................. + // srshr v24.4S, v13.4S, #23 // ................|........................................................*.................................... + // mls v13.4s, v24.4s, v8.4s // ................|............................................................*................................ 
+ // srshr v24.4S, v14.4S, #23 // ................|.........................................................*................................... + // mls v14.4s, v24.4s, v8.4s // ................|.............................................................*............................... + // srshr v24.4S, v15.4S, #23 // ................|..............................................................*.............................. + // mls v15.4s, v24.4s, v8.4s // ................|.................................................................*........................... + // srshr v24.4S, v16.4S, #23 // ................|...............................................................*............................. + // mls v16.4s, v24.4s, v8.4s // ................|..................................................................*.......................... + // str q13, [x0, #(4*(1024/8))] // ................|................................................................*............................ + // str q14, [x0, #(5*(1024/8))] // ................|...................................................................*......................... + // str q15, [x0, #(6*(1024/8))] // ................|.....................................................................*....................... + // str q16, [x0, #(7*(1024/8))] // ................|.......................................................................*..................... + // mul v13.4s, v9.4s, v25.4s // ................|............................*................................................................ + // sqrdmulh v9.4s, v9.4s, v26.4s // ................|.............................*............................................................... + // mls v13.4s, v9.4s, v8.s[0] // ................|..........................................................*.................................. 
+ // mul v14.4s, v10.4s, v25.4s // ................|....................................................................*........................ + // sqrdmulh v10.4s, v10.4s, v26.4s // ................|......................................................................*...................... + // mls v14.4s, v10.4s, v8.s[0] // ................|..........................................................................*.................. + // mul v15.4s, v11.4s, v25.4s // ................|........................................................................*.................... + // sqrdmulh v11.4s, v11.4s, v26.4s // ................|.........................................................................*................... + // mls v15.4s, v11.4s, v8.s[0] // ................|.............................................................................*............... + // mul v16.4s, v12.4s, v25.4s // ................|...........................................................................*................. + // sqrdmulh v12.4s, v12.4s, v26.4s // ................|............................................................................*................ + // mls v16.4s, v12.4s, v8.s[0] // .*..............|................................................................................*............ + // str q13, [x0], #(16) // ................|..............................................................................*.............. + // str q14, [x0, #(-16 + 1*(1024/8))] // ..*.............|.................................................................................*........... + // str q15, [x0, #(-16 + 2*(1024/8))] // ....*...........|...................................................................................*......... 
+ // str q16, [x0, #(-16 + 3*(1024/8))] // .............*..|............................................................................................* + + sub count, count, #1 + cbnz count, layer123_start + sub v16.4S, v4.4S, v11.4S // ..........*......................................................................... + // gap // .................................................................................... + add v12.4S, v17.4S, v23.4S // .*.................................................................................. + // gap // .................................................................................... + sub v29.4S, v21.4S, v20.4S // ..*................................................................................. + // gap // .................................................................................... + mul v21.4S, v16.4S, v2.S[2] // ................*................................................................... + // gap // .................................................................................... + sub v18.4S, v17.4S, v23.4S // *................................................................................... + // gap // .................................................................................... + mul v11.4S, v29.4S, v2.S[0] // .......*............................................................................ + // gap // .................................................................................... + sub v23.4S, v13.4S, v7.4S // ..................*................................................................. + // gap // .................................................................................... + sqrdmulh v16.4S, v16.4S, v2.S[3] // .................*.................................................................. + // gap // .................................................................................... 
+ sqrdmulh v7.4S, v18.4S, v1.S[3] // ....*............................................................................... + // gap // .................................................................................... + mul v20.4S, v23.4S, v3.S[0] // ......................*............................................................. + // gap // .................................................................................... + sqrdmulh v17.4S, v23.4S, v3.S[1] // ........................*........................................................... + // gap // .................................................................................... + sqrdmulh v23.4S, v29.4S, v2.S[1] // ........*........................................................................... + // gap // .................................................................................... + mul v4.4S, v18.4S, v1.S[2] // ...*................................................................................ + // gap // .................................................................................... + mls v21.4S, v16.4S, v8.S[0] // .....................*.............................................................. + // gap // .................................................................................... + mls v20.4S, v17.4S, v8.S[0] // ...............................*.................................................... + // gap // .................................................................................... + mls v11.4S, v23.4S, v8.S[0] // ...............*.................................................................... + // gap // .................................................................................... + mls v4.4S, v7.4S, v8.S[0] // .........*.......................................................................... + // gap // .................................................................................... 
+ sub v13.4S, v12.4S, v30.4S // .....*.............................................................................. + // gap // .................................................................................... + sub v23.4S, v21.4S, v20.4S // ...................................*................................................ + // gap // .................................................................................... + add v21.4S, v21.4S, v20.4S // ......................................*............................................. + // gap // .................................................................................... + sub v17.4S, v4.4S, v11.4S // ...................*................................................................ + // gap // .................................................................................... + sqrdmulh v16.4S, v23.4S, v1.S[1] // ........................................*........................................... + // gap // .................................................................................... + mul v23.4S, v23.4S, v1.S[0] // .......................................*............................................ + // gap // .................................................................................... + sqrdmulh v20.4S, v17.4S, v0.S[3] // ..........................*......................................................... + // gap // .................................................................................... + mul v17.4S, v17.4S, v0.S[2] // .........................*.......................................................... + // gap // .................................................................................... + mul v24.4S, v13.4S, v0.S[2] // ...........*........................................................................ + // gap // .................................................................................... 
+ mls v23.4S, v16.4S, v8.S[0] // ............................................*....................................... + // gap // .................................................................................... + add v16.4S, v4.4S, v11.4S // ....................*............................................................... + // gap // .................................................................................... + mls v17.4S, v20.4S, v8.S[0] // ................................*................................................... + // gap // .................................................................................... + sqrdmulh v18.4S, v13.4S, v0.S[3] // ............*....................................................................... + // gap // .................................................................................... + add v20.4S, v16.4S, v21.4S // ..........................................*......................................... + // gap // .................................................................................... + sub v13.4S, v16.4S, v21.4S // .........................................*.......................................... + // gap // .................................................................................... + add v16.4S, v17.4S, v23.4S // .....................................................*.............................. + // gap // .................................................................................... + sub v23.4S, v17.4S, v23.4S // .................................................*.................................. + // gap // .................................................................................... + sqrdmulh v21.4S, v20.4S, v26.4S // .......................................................................*............ + // gap // .................................................................................... 
+ mul v11.4S, v16.4S, v25.4S // ............................................................................*....... + // gap // .................................................................................... + mul v29.4S, v20.4S, v25.4S // .....................................................................*.............. + // gap // .................................................................................... + sqrdmulh v16.4S, v16.4S, v26.4S // .............................................................................*...... + // gap // .................................................................................... + sub v5.4S, v19.4S, v6.4S // .................................*.................................................. + // gap // .................................................................................... + sqrdmulh v20.4S, v23.4S, v0.S[1] // .......................................................*............................ + // gap // .................................................................................... + mls v29.4S, v21.4S, v8.S[0] // ...........................................................................*........ + // gap // .................................................................................... + mls v11.4S, v16.4S, v8.S[0] // ................................................................................*... + // gap // .................................................................................... + sqrdmulh v16.4S, v5.4S, v1.S[1] // .....................................*.............................................. + // gap // .................................................................................... + mul v7.4S, v5.4S, v1.S[0] // ....................................*............................................... + // gap // .................................................................................... 
+ str q29, [x0, #128] // .................................................................................*.. + // gap // .................................................................................... + sqrdmulh v4.4S, v13.4S, v0.S[1] // ..............................................*..................................... + // gap // .................................................................................... + mls v24.4S, v18.4S, v8.S[0] // .......................*............................................................ + // gap // .................................................................................... + mls v7.4S, v16.4S, v8.S[0] // ...........................................*........................................ + // gap // .................................................................................... + mul v17.4S, v23.4S, v0.S[0] // ......................................................*............................. + // gap // .................................................................................... + add v22.4S, v12.4S, v30.4S // ......*............................................................................. + // gap // .................................................................................... + mul v13.4S, v13.4S, v0.S[0] // .............................................*...................................... + // gap // .................................................................................... + sub v21.4S, v24.4S, v7.4S // ...............................................*.................................... + // gap // .................................................................................... + sub v16.4S, v22.4S, v10.4S // .............*...................................................................... + // gap // .................................................................................... 
+ add v18.4S, v24.4S, v7.4S // ................................................*................................... + // gap // .................................................................................... + sqrdmulh v30.4S, v21.4S, v0.S[1] // ....................................................*............................... + // gap // .................................................................................... + sqrdmulh v19.4S, v16.4S, v0.S[1] // ............................*....................................................... + // gap // .................................................................................... + mul v7.4S, v16.4S, v0.S[0] // ...........................*........................................................ + // gap // .................................................................................... + sqrdmulh v16.4S, v18.4S, v26.4S // ..........................................................................*......... + // gap // .................................................................................... + mul v23.4S, v18.4S, v25.4S // .........................................................................*.......... + // gap // .................................................................................... + mls v17.4S, v20.4S, v8.S[0] // ............................................................*....................... + // gap // .................................................................................... + mls v7.4S, v19.4S, v8.S[0] // ..................................*................................................. + // gap // .................................................................................... + mul v20.4S, v21.4S, v0.S[0] // ...................................................*................................ + // gap // .................................................................................... 
+ mls v23.4S, v16.4S, v8.S[0] // ..............................................................................*..... + // gap // .................................................................................... + add v10.4S, v22.4S, v10.4S // ..............*..................................................................... + // gap // .................................................................................... + srshr v16.4S, v7.4S, #23 // .........................................................*.......................... + // gap // .................................................................................... + srshr v21.4S, v17.4S, #23 // ................................................................*................... + // gap // .................................................................................... + str q23, [x0, #256] // ..................................................................................*. + // gap // .................................................................................... + mls v7.4S, v16.4S, v8.4S // .............................................................*...................... + // gap // .................................................................................... + mls v17.4S, v21.4S, v8.4S // ...................................................................*................ + // gap // .................................................................................... + mls v20.4S, v30.4S, v8.S[0] // ........................................................*........................... + // gap // .................................................................................... + mul v22.4S, v10.4S, v25.4S // .............................*...................................................... + // gap // .................................................................................... 
+ str q7, [x0, #512] // .................................................................*.................. + // gap // .................................................................................... + mls v13.4S, v4.4S, v8.S[0] // ..................................................*................................. + // gap // .................................................................................... + str q17, [x0, #896] // ........................................................................*........... + // gap // .................................................................................... + srshr v16.4S, v20.4S, #23 // ...............................................................*.................... + // gap // .................................................................................... + sqrdmulh v30.4S, v10.4S, v26.4S // ..............................*..................................................... + // gap // .................................................................................... + srshr v21.4S, v13.4S, #23 // ..........................................................*......................... + // gap // .................................................................................... + mls v20.4S, v16.4S, v8.4S // ..................................................................*................. + // gap // .................................................................................... + str q11, [x0, #384] // ...................................................................................* + // gap // .................................................................................... + mls v22.4S, v30.4S, v8.S[0] // ...........................................................*........................ + // gap // .................................................................................... 
+ mls v13.4S, v21.4S, v8.4S // ..............................................................*..................... + // gap // .................................................................................... + str q20, [x0, #768] // ......................................................................*............. + // gap // .................................................................................... + // gap // .................................................................................... + // gap // .................................................................................... + str q22, [x0], #(16) // ...............................................................................*.... + // gap // .................................................................................... + // gap // .................................................................................... + // gap // .................................................................................... + str q13, [x0, #624] // ....................................................................*............... + // gap // .................................................................................... + + // original source code + // sub v16.4S, v17.4S, v23.4S // ....*............................................................................... + // add v23.4S, v17.4S, v23.4S // .*.................................................................................. + // sub v21.4S, v21.4S, v20.4S // ..*................................................................................. + // mul v20.4S, v16.4S, v1.S[2] // ............*....................................................................... + // sqrdmulh v16.4S, v16.4S, v1.S[3] // ........*........................................................................... + // sub v17.4S, v23.4S, v30.4S // .................*.................................................................. 
+ // add v23.4S, v23.4S, v30.4S // .................................................*.................................. + // mul v30.4S, v21.4S, v2.S[0] // .....*.............................................................................. + // sqrdmulh v21.4S, v21.4S, v2.S[1] // ...........*........................................................................ + // mls v20.4S, v16.4S, v8.S[0] // ................*................................................................... + // sub v16.4S, v4.4S, v11.4S // *................................................................................... + // mul v4.4S, v17.4S, v0.S[2] // .........................*.......................................................... + // sqrdmulh v17.4S, v17.4S, v0.S[3] // .............................*...................................................... + // sub v11.4S, v23.4S, v10.4S // ....................................................*............................... + // add v23.4S, v23.4S, v10.4S // ...............................................................*.................... + // mls v30.4S, v21.4S, v8.S[0] // ...............*.................................................................... + // mul v21.4S, v16.4S, v2.S[2] // ...*................................................................................ + // sqrdmulh v16.4S, v16.4S, v2.S[3] // .......*............................................................................ + // sub v13.4S, v13.4S, v7.4S // ......*............................................................................. + // sub v7.4S, v20.4S, v30.4S // ....................*............................................................... + // add v20.4S, v20.4S, v30.4S // ...........................*........................................................ + // mls v21.4S, v16.4S, v8.S[0] // .............*...................................................................... 
+ // mul v16.4S, v13.4S, v3.S[0] // .........*.......................................................................... + // mls v4.4S, v17.4S, v8.S[0] // ..............................................*..................................... + // sqrdmulh v17.4S, v13.4S, v3.S[1] // ..........*......................................................................... + // mul v30.4S, v7.4S, v0.S[2] // ........................*........................................................... + // sqrdmulh v13.4S, v7.4S, v0.S[3] // .......................*............................................................ + // mul v7.4S, v11.4S, v0.S[0] // ........................................................*........................... + // sqrdmulh v11.4S, v11.4S, v0.S[1] // .......................................................*............................ + // mul v10.4S, v23.4S, v25.4S // ......................................................................*............. + // sqrdmulh v23.4S, v23.4S, v26.4S // ...........................................................................*........ + // mls v16.4S, v17.4S, v8.S[0] // ..............*..................................................................... + // mls v30.4S, v13.4S, v8.S[0] // ............................*....................................................... + // sub v17.4S, v19.4S, v6.4S // ......................................*............................................. + // mls v7.4S, v11.4S, v8.S[0] // ............................................................*....................... + // sub v11.4S, v21.4S, v16.4S // ..................*................................................................. + // mul v13.4S, v17.4S, v1.S[0] // ...........................................*........................................ + // sqrdmulh v17.4S, v17.4S, v1.S[1] // ..........................................*......................................... 
+ // add v16.4S, v21.4S, v16.4S // ...................*................................................................ + // mul v21.4S, v11.4S, v1.S[0] // ......................*............................................................. + // sqrdmulh v11.4S, v11.4S, v1.S[1] // .....................*.............................................................. + // sub v19.4S, v20.4S, v16.4S // ...............................*.................................................... + // add v16.4S, v20.4S, v16.4S // ..............................*..................................................... + // mls v13.4S, v17.4S, v8.S[0] // ...............................................*.................................... + // mls v21.4S, v11.4S, v8.S[0] // ..........................*......................................................... + // mul v20.4S, v19.4S, v0.S[0] // ..................................................*................................. + // sqrdmulh v17.4S, v19.4S, v0.S[1] // .............................................*...................................... + // sub v11.4S, v4.4S, v13.4S // ...................................................*................................ + // add v4.4S, v4.4S, v13.4S // .....................................................*.............................. + // sub v13.4S, v30.4S, v21.4S // .................................*.................................................. + // mls v20.4S, v17.4S, v8.S[0] // ........................................................................*........... + // mul v17.4S, v11.4S, v0.S[0] // .............................................................*...................... + // sqrdmulh v11.4S, v11.4S, v0.S[1] // ......................................................*............................. + // add v21.4S, v30.4S, v21.4S // ................................*................................................... 
+ // mul v30.4S, v13.4S, v0.S[0] // ................................................*................................... + // sqrdmulh v13.4S, v13.4S, v0.S[1] // .......................................*............................................ + // mls v17.4S, v11.4S, v8.S[0] // .....................................................................*.............. + // srshr v11.4S, v7.4S, #23 // ................................................................*................... + // srshr v19.4S, v20.4S, #23 // ............................................................................*....... + // mls v10.4S, v23.4S, v8.S[0] // ...............................................................................*.... + // mls v30.4S, v13.4S, v8.S[0] // ...........................................................*........................ + // mls v7.4S, v11.4S, v8.4S // ...................................................................*................ + // mls v20.4S, v19.4S, v8.4S // ................................................................................*... + // srshr v23.4S, v17.4S, #23 // ..........................................................................*......... + // srshr v11.4S, v30.4S, #23 // .................................................................*.................. + // str q7, [x0, #512] // .......................................................................*............ + // mls v17.4S, v23.4S, v8.4S // .............................................................................*...... + // mls v30.4S, v11.4S, v8.4S // ....................................................................*............... + // str q20, [x0, #640] // ...................................................................................* + // mul v23.4S, v16.4S, v25.4S // ....................................*............................................... 
+ // str q17, [x0, #768] // .................................................................................*.. + // sqrdmulh v16.4S, v16.4S, v26.4S // ..................................*................................................. + // str q30, [x0, #896] // .........................................................................*.......... + // mul v20.4S, v4.4S, v25.4S // ..........................................................*......................... + // sqrdmulh v17.4S, v4.4S, v26.4S // .........................................................*.......................... + // mls v23.4S, v16.4S, v8.S[0] // ........................................*........................................... + // mul v16.4S, v21.4S, v25.4S // ...................................*................................................ + // sqrdmulh v21.4S, v21.4S, v26.4S // .....................................*.............................................. + // mls v20.4S, v17.4S, v8.S[0] // ..............................................................*..................... + // str q10, [x0], #(16) // ..................................................................................*. + // mls v16.4S, v21.4S, v8.S[0] // .........................................*.......................................... + // str q23, [x0, #112] // ............................................*....................................... + // str q20, [x0, #240] // ..................................................................*................. + // str q16, [x0, #368] // ..............................................................................*..... 
+ + + pop_stack + ret \ No newline at end of file diff --git a/tests/ntt_dilithium/manual/intt_dilithium_123_45678_manual_ld4_opt_a72.s b/tests/ntt_dilithium/manual/intt_dilithium_123_45678_manual_ld4_opt_a72.s new file mode 100644 index 0000000..87a99b7 --- /dev/null +++ b/tests/ntt_dilithium/manual/intt_dilithium_123_45678_manual_ld4_opt_a72.s @@ -0,0 +1,2395 @@ + +// Needed to provide ASM_LOAD directive +#include <hal_env.h> // NOTE(review): header operand was missing here; hal_env.h presumed, confirm + +// NOTE +// We use a lot of trivial macros to simplify the parsing burden for Slothy +// The macros are not unfolded by Slothy and thus interpreted as instructions, +// which are easier to parse due to e.g. the lack of size specifiers and simpler +// syntax for pre and post increment for loads and stores. + +// Eventually, NeLight should include a proper parser for AArch64, +// but for initial investigations, the below is enough. +xtmp0 .req x10 +xtmp1 .req x11 + +.macro ldr_vo vec, base, offset + ldr qform_\vec, [\base, #\offset] +.endm + +.macro ldr_vi vec, base, inc + ldr qform_\vec, [\base], #\inc +.endm + +.macro str_vo vec, base, offset + str qform_\vec, [\base, #\offset] +.endm +.macro str_vi vec, base, inc + str qform_\vec, [\base], #\inc +.endm +.macro vqrdmulh d,a,b + sqrdmulh \d\().4s, \a\().4s, \b\().4s +.endm +.macro vmls d,a,b + mls \d\().4s, \a\().4s, \b\().4s +.endm +.macro vqrdmulhq d,a,b,i + sqrdmulh \d\().4s, \a\().4s, \b\().s[\i] +.endm +.macro vqdmulhq d,a,b,i + sqdmulh \d\().4s, \a\().4s, \b\().s[\i] // lane selector is .s[i], same as vqrdmulhq, vmulq and vmlsq +.endm +.macro vmulq d,a,b,i + mul \d\().4s, \a\().4s, \b\().s[\i] +.endm +.macro vmlsq d,a,b,i + mls \d\().4s, \a\().4s, \b\().s[\i] +.endm + +.macro mulmodq dst, src, const, idx0, idx1 + vmulq \dst, \src, \const, \idx0 + vqrdmulhq \src, \src, \const, \idx1 // overwrites src with the high-half product + vmlsq \dst, \src, consts, 0 // subtracts src times lane 0 of consts from dst +.endm + +.macro mulmod dst, src, const, const_twisted + mul \dst\().4s, \src\().4s, \const\().4s + vqrdmulh \src, \src, \const_twisted // overwrites src, as in mulmodq + vmlsq \dst, \src, consts, 0 +.endm + +.macro montg_reduce a + srshr tmp.4S, \a\().4S, #23 + vmls \a, tmp,
consts +.endm + +.macro canonical_reduce a, modulus_half, neg_modulus_half, tmp1, tmp2 + cmge \tmp1\().4s, \neg_modulus_half\().4s, \a\().4s + cmge \tmp2\().4s, \a\().4s, \modulus_half\().4s + sub \tmp2\().4s, \tmp1\().4s, \tmp2\().4s + vmls \a, \tmp2, modulus // NOTE(review): no .req for modulus is visible near the top of this file, confirm before use +.endm + +.macro gs_butterfly a, b, root, idx0, idx1 + sub tmp.4s, \a\().4s, \b\().4s + add \a\().4s, \a\().4s, \b\().4s + mulmodq \b, tmp, \root, \idx0, \idx1 +.endm + +.macro mulmod_v dst, src, const, const_twisted + mul \dst\().4s, \src\().4s, \const\().4s // spelled out as in mulmod; the macro prelude has no vmul helper + vqrdmulh \src, \src, \const_twisted + vmls \dst, \src, modulus +.endm + +.macro gs_butterfly_v a, b, root, root_twisted + sub tmp.4s, \a\().4s, \b\().4s + add \a\().4s, \a\().4s, \b\().4s + mulmod \b, tmp, \root, \root_twisted +.endm + +.macro mul_ninv dst0, dst1, dst2, dst3, src0, src1, src2, src3 + mulmod \dst0, \src0, ninv, ninv_tw + mulmod \dst1, \src1, ninv, ninv_tw + mulmod \dst2, \src2, ninv, ninv_tw + mulmod \dst3, \src3, ninv, ninv_tw +.endm + +.macro load_vectors a0, a1, a2, a3, addr + ldr_vo \a0, \addr, (16*0) + ldr_vo \a1, \addr, (16*1) + ldr_vo \a2, \addr, (16*2) + ldr_vo \a3, \addr, (16*3) +.endm + +.macro load_vectors_with_offset a0, a1, a2, a3, addr, offset + ldr_vo \a0, \addr, (16*0 + (\offset)) + ldr_vo \a1, \addr, (16*1 + (\offset)) + ldr_vo \a2, \addr, (16*2 + (\offset)) + ldr_vo \a3, \addr, (16*3 + (\offset)) +.endm + +.macro store_vectors_with_inc a0, a1, a2, a3, addr, inc + str_vi \a0, \addr, \inc + str_vo \a1, \addr, (-(\inc) + 16*1) + str_vo \a2, \addr, (-(\inc) + 16*2) + str_vo \a3, \addr, (-(\inc) + 16*3) +.endm + +.macro vec_to_scalar_matrix out, in + vext \out\()_00, \in\()0, 0 + vext \out\()_01, \in\()0, 1 + vext \out\()_10, \in\()1, 0 + vext \out\()_11, \in\()1, 1 + vext \out\()_20, \in\()2, 0 + vext \out\()_21, \in\()2, 1 + vext \out\()_30, \in\()3, 0 + vext \out\()_31, \in\()3, 1 +.endm + +.macro store_scalar_matrix_with_inc x, addr, inc + str \x\()t_00, [\addr], #( \inc) // post-increment first; the remaining offsets are relative to the advanced pointer + str \x\()t_01, [\addr, #(-(\inc) + 8*1)] + str \x\()t_10, [\addr,
#(-(\inc) + 8*2)] + str \x\()t_11, [\addr, #(-(\inc) + 8*3)] + str \x\()t_20, [\addr, #(-(\inc) + 8*4)] + str \x\()t_21, [\addr, #(-(\inc) + 8*5)] + str \x\()t_30, [\addr, #(-(\inc) + 8*6)] + str \x\()t_31, [\addr, #(-(\inc) + 8*7)] // inc is parenthesized so an expression argument is negated as a whole +.endm + +.macro vext gpr_out, vec_in, lane + umov \gpr_out\(), \vec_in\().d[\lane] +.endm + +.macro load_roots_123 + ldr_vi root0, r_ptr0, 64 + ldr_vo root1, r_ptr0, (-64 + 16) + ldr_vo root2, r_ptr0, (-64 + 32) + ldr_vo root3, r_ptr0, (-64 + 48) +.endm + +.macro load_roots_456 + ldr_vi root0, r_ptr0, 64 + ldr_vo root1, r_ptr0, (-64 + 16) + ldr_vo root2, r_ptr0, (-64 + 32) + ldr_vo root3, r_ptr0, (-64 + 48) +.endm + +.macro load_roots_78_part1 + ldr_vi root0, r_ptr1, (12*16) + ldr_vo root0_tw, r_ptr1, (-12*16 + 1*16) + ldr_vo root1, r_ptr1, (-12*16 + 2*16) + ldr_vo root1_tw, r_ptr1, (-12*16 + 3*16) + ldr_vo root2, r_ptr1, (-12*16 + 4*16) + ldr_vo root2_tw, r_ptr1, (-12*16 + 5*16) +.endm + +.macro load_roots_78_part2 + ldr_vo root0, r_ptr1, (-12*16 + 6*16) + ldr_vo root0_tw, r_ptr1, (-12*16 + 7*16) + ldr_vo root1, r_ptr1, (-12*16 + 8*16) + ldr_vo root1_tw, r_ptr1, (-12*16 + 9*16) + ldr_vo root2, r_ptr1, (-12*16 + 10*16) + ldr_vo root2_tw, r_ptr1, (-12*16 + 11*16) +.endm + +.macro transpose4 data0, data1, data2, data3 + trn1 t0.4s, \data0\().4s, \data1\().4s + trn2 t1.4s, \data0\().4s, \data1\().4s + trn1 t2.4s, \data2\().4s, \data3\().4s + trn2 t3.4s, \data2\().4s, \data3\().4s + + trn2 \data2\().2d, t0.2d, t2.2d + trn2 \data3\().2d, t1.2d, t3.2d + trn1 \data0\().2d, t0.2d, t2.2d + trn1 \data1\().2d, t1.2d, t3.2d +.endm + +.macro transpose_single data_out0, data_out1, data_out2, data_out3, data_in0, data_in1, data_in2, data_in3 + trn1 \data_out0\().4s, \data_in0\().4s, \data_in1\().4s + trn2 \data_out1\().4s, \data_in0\().4s, \data_in1\().4s + trn1 \data_out2\().4s, \data_in2\().4s, \data_in3\().4s + trn2 \data_out3\().4s, \data_in2\().4s, \data_in3\().4s +.endm + +.macro save_gprs // slothy:no-unfold + sub sp, sp, #(16*6) + stp x19, x20, [sp,
#16*0] + stp x19, x20, [sp, #16*0] + stp x21, x22, [sp, #16*1] + stp x23, x24, [sp, #16*2] + stp x25, x26, [sp, #16*3] + stp x27, x28, [sp, #16*4] + stp x29, x30, [sp, #16*5] +.endm + +.macro restore_gprs // slothy:no-unfold + ldp x19, x20, [sp, #16*0] + ldp x21, x22, [sp, #16*1] + ldp x23, x24, [sp, #16*2] + ldp x25, x26, [sp, #16*3] + ldp x27, x28, [sp, #16*4] + ldp x29, x30, [sp, #16*5] + add sp, sp, #(16*6) +.endm + +.macro save_vregs // slothy:no-unfold + sub sp, sp, #(16*4) + stp d8, d9, [sp, #16*0] + stp d10, d11, [sp, #16*1] + stp d12, d13, [sp, #16*2] + stp d14, d15, [sp, #16*3] +.endm + +.macro restore_vregs // slothy:no-unfold + ldp d8, d9, [sp, #16*0] + ldp d10, d11, [sp, #16*1] + ldp d12, d13, [sp, #16*2] + ldp d14, d15, [sp, #16*3] + add sp, sp, #(16*4) +.endm + +#define STACK_SIZE 16 +#define STACK0 0 + +.macro restore a, loc // slothy:no-unfold + ldr \a, [sp, #\loc\()] +.endm +.macro save loc, a // slothy:no-unfold + str \a, [sp, #\loc\()] +.endm +.macro push_stack // slothy:no-unfold + save_gprs + save_vregs + sub sp, sp, #STACK_SIZE +.endm + +.macro pop_stack // slothy:no-unfold + add sp, sp, #STACK_SIZE + restore_vregs + restore_gprs +.endm + +.data +.p2align 4 +roots: +#include "intt_dilithium_123_456_78_twiddles.s" +.text + + .global intt_dilithium_123_45678_manual_ld4_opt_a72 + .global _intt_dilithium_123_45678_manual_ld4_opt_a72 + +.p2align 4 +const_addr: .word 8380417 + .word 0 + .word 0 + .word 0 +ninv_addr: .quad 16382 +ninv_tw_addr: .quad 4197891 +intt_dilithium_123_45678_manual_ld4_opt_a72: +_intt_dilithium_123_45678_manual_ld4_opt_a72: + push_stack + + in .req x0 + inp .req x1 + inpp .req x2 + count .req x3 + r_ptr0 .req x4 + r_ptr1 .req x5 + xtmp .req x6 + + data0 .req v9 + data1 .req v10 + data2 .req v11 + data3 .req v12 + data4 .req v13 + data5 .req v14 + data6 .req v15 + data7 .req v16 + + qform_data0 .req q9 + qform_data1 .req q10 + qform_data2 .req q11 + qform_data3 .req q12 + qform_data4 .req q13 + qform_data5 .req q14 + 
qform_data6 .req q15 + qform_data7 .req q16 + + qform_v0 .req q0 + qform_v1 .req q1 + qform_v2 .req q2 + qform_v3 .req q3 + qform_v4 .req q4 + qform_v5 .req q5 + qform_v6 .req q6 + qform_v7 .req q7 + qform_v8 .req q8 + qform_v9 .req q9 + qform_v10 .req q10 + qform_v11 .req q11 + qform_v12 .req q12 + qform_v13 .req q13 + qform_v14 .req q14 + qform_v15 .req q15 + qform_v16 .req q16 + qform_v17 .req q17 + qform_v18 .req q18 + qform_v19 .req q19 + qform_v20 .req q20 + qform_v21 .req q21 + qform_v22 .req q22 + qform_v23 .req q23 + qform_v24 .req q24 + qform_v25 .req q25 + qform_v26 .req q26 + qform_v27 .req q27 + qform_v28 .req q28 + qform_v29 .req q29 + qform_v30 .req q30 + qform_v31 .req q31 + + x_00 .req x10 + x_01 .req x11 + x_10 .req x12 + x_11 .req x13 + x_20 .req x14 + x_21 .req x15 + x_30 .req x16 + x_31 .req x17 + + xt_00 .req x_00 + xt_01 .req x_20 + xt_10 .req x_10 + xt_11 .req x_30 + xt_20 .req x_01 + xt_21 .req x_21 + xt_30 .req x_11 + xt_31 .req x_31 + + root0 .req v0 + root1 .req v1 + root2 .req v2 + root3 .req v3 + + qform_root0 .req q0 + qform_root1 .req q1 + qform_root2 .req q2 + qform_root3 .req q3 + + tmp .req v24 + t0 .req v25 + t1 .req v26 + t2 .req v27 + t3 .req v28 + tp0 .req v17 + tp1 .req v18 + tp2 .req v19 + tp3 .req v20 + + consts .req v8 + qform_consts .req q8 + + ASM_LOAD(r_ptr0, roots_l345) + ASM_LOAD(r_ptr1, roots_l67) + + ASM_LOAD(xtmp, const_addr) + ld1r {consts.4s}, [xtmp] + save STACK0, in + + restore inp, STACK0 + mov inp, in + add inpp, inp, #64 + mov count, #8 + + root0_tw .req v4 + root1_tw .req v5 + root2_tw .req v6 + root3_tw .req v7 + qform_root0_tw .req q4 + qform_root1_tw .req q5 + qform_root2_tw .req q6 + qform_root3_tw .req q7 + + .p2align 2 + ldr q19, [x1, #48] // .......*.............. + ldr q27, [x1, #32] // ......*............... + // gap // ...................... + ldr q5, [x1, #16] // .....*................ + ldr q20, [x1, #0] // ....*................. + // gap // ...................... 
+ ldr q30, [x5, #64] // ..*................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + trn2 v17.4S, v27.4S, v19.4S // ...........*.......... + trn1 v21.4S, v27.4S, v19.4S // ..........*........... + // gap // ...................... + trn2 v14.4S, v20.4S, v5.4S // .........*............ + trn1 v5.4S, v20.4S, v5.4S // ........*............. + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + ldr q20, [x5, #32] // *..................... + // gap // ...................... + // gap // ...................... + trn1 v24.2D, v5.2D, v21.2D // ............*......... + trn1 v6.2D, v14.2D, v17.2D // .............*........ + // gap // ...................... + trn2 v9.2D, v14.2D, v17.2D // ...............*...... + ldr q17, [x5, #80] // ...*.................. + // gap // ...................... + trn2 v5.2D, v5.2D, v21.2D // ..............*....... + // gap // ...................... + // gap // ...................... + sub v22.4S, v24.4S, v6.4S // ................*..... + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + sub v26.4S, v5.4S, v9.4S // .................*.... + // gap // ...................... + // gap // ...................... + mul v2.4S, v22.4S, v20.4S // ....................*. + add v15.4S, v5.4S, v9.4S // ...................*.. + ldr q20, [x5, #48] // .*.................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + sqrdmulh v13.4S, v26.4S, v17.4S // .....................* + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... 
+ sqrdmulh v28.4S, v22.4S, v20.4S // ..................*... + // gap // ...................... + // gap // ...................... + + // original source code + // ldr q3, [x5, #32] // .........*............ + // ldr q10, [x5, #48] // ...................*.. + // ldr q30, [x5, #64] // ....*................. + // ldr q22, [x5, #80] // .............*........ + // ldr q21, [x1, #0] // ...*.................. + // ldr q31, [x1, #16] // ..*................... + // ldr q15, [x1, #32] // .*.................... + // ldr q29, [x1, #48] // *..................... + // trn1 v13.4S, v21.4S, v31.4S // ........*............. + // trn2 v21.4S, v21.4S, v31.4S // .......*.............. + // trn1 v31.4S, v15.4S, v29.4S // ......*............... + // trn2 v26.4S, v15.4S, v29.4S // .....*................ + // trn1 v24.2D, v13.2D, v31.2D // ..........*........... + // trn1 v6.2D, v21.2D, v26.2D // ...........*.......... + // trn2 v31.2D, v13.2D, v31.2D // ..............*....... + // trn2 v21.2D, v21.2D, v26.2D // ............*......... + // sub v14.4S, v24.4S, v6.4S // ...............*...... + // sub v26.4S, v31.4S, v21.4S // ................*..... + // sqrdmulh v28.4S, v14.4S, v10.4S // .....................* + // add v15.4S, v31.4S, v21.4S // ..................*... + // mul v2.4S, v14.4S, v3.4S // .................*.... + // sqrdmulh v13.4S, v26.4S, v22.4S // ....................*. + + sub count, count, #1 +layer45678_start: + ldr q3, [x5, #224] // ..........................e........................................................................................................................................... + add v17.4S, v24.4S, v6.4S // ...............................*...................................................................................................................................... 
+ ldr q21, [x2, #0] // ............*......................................................................................................................................................... + ldr q14, [x2, #16] // .............*........................................................................................................................................................ + mls v2.4S, v28.4S, v8.S[0] // ..................................*................................................................................................................................... + ldr q24, [x5, #16] // .........................*............................................................................................................................................ + ldr q28, [x2, #32] // ..............*....................................................................................................................................................... + ldr q6, [x5, #96] // ..................................................*................................................................................................................... + // gap // ...................................................................................................................................................................... + mul v31.4S, v26.4S, v30.4S // .....................................*................................................................................................................................ + ldr q26, [x2, #48] // ...............*...................................................................................................................................................... + sub v1.4S, v17.4S, v15.4S // ........................................*............................................................................................................................. 
+ add v17.4S, v17.4S, v15.4S // .........................................*............................................................................................................................ + ldr q15, [x5], #(12*16) // ........................*............................................................................................................................................. + ldr q29, [x5, #-80] // ...................................................*.................................................................................................................. + mls v31.4S, v13.4S, v8.S[0] // .......................................*.............................................................................................................................. + trn1 v13.4S, v21.4S, v14.4S // ................*..................................................................................................................................................... + ldr q16, [x5, #-64] // ....................................................*................................................................................................................. + trn2 v21.4S, v21.4S, v14.4S // .................*.................................................................................................................................................... + ldr q14, [x5, #-16] // .......................................................*.............................................................................................................. + ldr q9, [x5, #-48] // .....................................................*................................................................................................................ + sqrdmulh v0.4S, v1.4S, v24.4S // ...........................................*.......................................................................................................................... 
+ trn1 v30.4S, v28.4S, v26.4S // ..................*................................................................................................................................................... + ldr q7, [x5, #-32] // ......................................................*............................................................................................................... + trn2 v28.4S, v28.4S, v26.4S // ...................*.................................................................................................................................................. + ldr q26, [x4], #64 // ............................................................................................*......................................................................... + // gap // ...................................................................................................................................................................... + mul v1.4S, v1.4S, v15.4S // ..........................................*........................................................................................................................... + ldr q18, [x4, #-48] // .............................................................................................*........................................................................ + // gap // ...................................................................................................................................................................... + sub v25.4S, v2.4S, v31.4S // .............................................*........................................................................................................................ + ldr q12, [x4, #-32] // ..............................................................................................*....................................................................... 
+ // gap // ...................................................................................................................................................................... + add v2.4S, v2.4S, v31.4S // ..............................................*....................................................................................................................... + trn2 v31.2D, v13.2D, v30.2D // ....................*................................................................................................................................................. + ldr q23, [x4, #-16] // ...............................................................................................*...................................................................... + mls v1.4S, v0.4S, v8.S[0] // ............................................*......................................................................................................................... + trn2 v0.2D, v21.2D, v28.2D // .....................*................................................................................................................................................ + ldr q10, [x5, #48] // ...........................e.......................................................................................................................................... + trn1 v13.2D, v13.2D, v30.2D // ......................*............................................................................................................................................... + ldr q30, [x5, #64] // ............................e......................................................................................................................................... + // gap // ...................................................................................................................................................................... 
+ sqrdmulh v24.4S, v25.4S, v24.4S // ................................................*..................................................................................................................... + trn1 v4.4S, v17.4S, v2.4S // ............................................................................*......................................................................................... + ldr q22, [x5, #80] // .............................e........................................................................................................................................ + sub v11.4S, v31.4S, v0.4S // .............................................................*........................................................................................................ + // gap // ...................................................................................................................................................................... + // gap // ...................................................................................................................................................................... + trn2 v17.4S, v17.4S, v2.4S // .............................................................................*........................................................................................ + mul v2.4S, v25.4S, v15.4S // ...............................................*...................................................................................................................... + // gap // ...................................................................................................................................................................... + trn1 v21.2D, v21.2D, v28.2D // .......................*.............................................................................................................................................. 
+ // gap // ...................................................................................................................................................................... + // gap // ...................................................................................................................................................................... + mul v28.4S, v11.4S, v7.4S // ...............................................................*...................................................................................................... + add v31.4S, v31.4S, v0.4S // ..............................................................*....................................................................................................... + // gap // ...................................................................................................................................................................... + // gap // ...................................................................................................................................................................... + // gap // ...................................................................................................................................................................... + // gap // ...................................................................................................................................................................... + mls v2.4S, v24.4S, v8.S[0] // .................................................*.................................................................................................................... + sub v24.4S, v13.4S, v21.4S // ........................................................*............................................................................................................. 
+ // gap // ...................................................................................................................................................................... + add v21.4S, v13.4S, v21.4S // .........................................................*............................................................................................................ + // gap // ...................................................................................................................................................................... + // gap // ...................................................................................................................................................................... + sqrdmulh v14.4S, v11.4S, v14.4S // ................................................................*..................................................................................................... + // gap // ...................................................................................................................................................................... + // gap // ...................................................................................................................................................................... + // gap // ...................................................................................................................................................................... + // gap // ...................................................................................................................................................................... + // gap // ...................................................................................................................................................................... 
+ sqrdmulh v15.4S, v24.4S, v9.4S // ...........................................................*.......................................................................................................... + sub v13.4S, v21.4S, v31.4S // ..................................................................*................................................................................................... + // gap // ...................................................................................................................................................................... + add v21.4S, v21.4S, v31.4S // ...................................................................*.................................................................................................. + // gap // ...................................................................................................................................................................... + // gap // ...................................................................................................................................................................... + mul v24.4S, v24.4S, v16.4S // ..........................................................*........................................................................................................... + trn2 v31.4S, v1.4S, v2.4S // ...............................................................................*...................................................................................... + // gap // ...................................................................................................................................................................... + trn1 v2.4S, v1.4S, v2.4S // ..............................................................................*....................................................................................... 
+ // gap // ...................................................................................................................................................................... + // gap // ...................................................................................................................................................................... + mls v28.4S, v14.4S, v8.S[0] // .................................................................*.................................................................................................... + // gap // ...................................................................................................................................................................... + // gap // ...................................................................................................................................................................... + trn2 v14.2D, v17.2D, v31.2D // .................................................................................*.................................................................................... + // gap // ...................................................................................................................................................................... + // gap // ...................................................................................................................................................................... + mls v24.4S, v15.4S, v8.S[0] // ............................................................*......................................................................................................... + trn2 v1.2D, v4.2D, v2.2D // ................................................................................*..................................................................................... 
+ // gap // ...................................................................................................................................................................... + trn1 v2.2D, v4.2D, v2.2D // ..................................................................................*................................................................................... + // gap // ...................................................................................................................................................................... + // gap // ...................................................................................................................................................................... + trn1 v17.2D, v17.2D, v31.2D // ...................................................................................*.................................................................................. + mul v31.4S, v13.4S, v6.4S // ....................................................................*................................................................................................. + // gap // ...................................................................................................................................................................... + sub v15.4S, v1.4S, v14.4S // .....................................................................................................*................................................................ + // gap // ...................................................................................................................................................................... + // gap // ...................................................................................................................................................................... 
+ sqrdmulh v13.4S, v13.4S, v29.4S // .....................................................................*................................................................................................ + add v14.4S, v1.4S, v14.4S // ......................................................................................................*............................................................... + // gap // ...................................................................................................................................................................... + sub v1.4S, v24.4S, v28.4S // .......................................................................*.............................................................................................. + // gap // ...................................................................................................................................................................... + // gap // ...................................................................................................................................................................... + add v24.4S, v24.4S, v28.4S // ........................................................................*............................................................................................. + mul v28.4S, v15.4S, v12.S[0] // .......................................................................................................*.............................................................. + // gap // ...................................................................................................................................................................... + sub v16.4S, v2.4S, v17.4S // ................................................................................................*..................................................................... 
+ // gap // ...................................................................................................................................................................... + // gap // ...................................................................................................................................................................... + mul v6.4S, v1.4S, v6.4S // .........................................................................*............................................................................................ + add v17.4S, v2.4S, v17.4S // .................................................................................................*.................................................................... + // gap // ...................................................................................................................................................................... + trn1 v2.4S, v21.4S, v24.4S // ....................................................................................*................................................................................. + // gap // ...................................................................................................................................................................... + // gap // ...................................................................................................................................................................... + sqrdmulh v1.4S, v1.4S, v29.4S // ..........................................................................*........................................................................................... + trn2 v21.4S, v21.4S, v24.4S // .....................................................................................*................................................................................ 
+ // gap // ...................................................................................................................................................................... + sub v24.4S, v17.4S, v14.4S // ....................................................................................................................*................................................. + // gap // ...................................................................................................................................................................... + // gap // ...................................................................................................................................................................... + mls v31.4S, v13.4S, v8.S[0] // ......................................................................*............................................................................................... + add v17.4S, v17.4S, v14.4S // .....................................................................................................................*................................................ + // gap // ...................................................................................................................................................................... + // gap // ...................................................................................................................................................................... + // gap // ...................................................................................................................................................................... + // gap // ...................................................................................................................................................................... 
+ sqrdmulh v14.4S, v16.4S, v18.S[3] // ...................................................................................................*.................................................................. + // gap // ...................................................................................................................................................................... + // gap // ...................................................................................................................................................................... + // gap // ...................................................................................................................................................................... + // gap // ...................................................................................................................................................................... + // gap // ...................................................................................................................................................................... + mls v6.4S, v1.4S, v8.S[0] // ...........................................................................*.......................................................................................... + // gap // ...................................................................................................................................................................... + // gap // ...................................................................................................................................................................... + // gap // ...................................................................................................................................................................... 
+ // gap // ...................................................................................................................................................................... + // gap // ...................................................................................................................................................................... + mul v1.4S, v16.4S, v18.S[2] // ..................................................................................................*................................................................... + // gap // ...................................................................................................................................................................... + // gap // ...................................................................................................................................................................... + // gap // ...................................................................................................................................................................... + // gap // ...................................................................................................................................................................... + // gap // ...................................................................................................................................................................... + mls v1.4S, v14.4S, v8.S[0] // ....................................................................................................*................................................................. + // gap // ...................................................................................................................................................................... 
+ // gap // ...................................................................................................................................................................... + trn1 v14.4S, v31.4S, v6.4S // ......................................................................................*............................................................................... + // gap // ...................................................................................................................................................................... + // gap // ...................................................................................................................................................................... + sqrdmulh v15.4S, v15.4S, v12.S[1] // ........................................................................................................*............................................................. + trn2 v6.4S, v31.4S, v6.4S // .......................................................................................*.............................................................................. + // gap // ...................................................................................................................................................................... + // gap // ...................................................................................................................................................................... + // gap // ...................................................................................................................................................................... + // gap // ...................................................................................................................................................................... 
+ sqrdmulh v31.4S, v24.4S, v26.S[3] // .......................................................................................................................*.............................................. + trn2 v29.2D, v2.2D, v14.2D // ........................................................................................*............................................................................. + // gap // ...................................................................................................................................................................... + trn1 v14.2D, v2.2D, v14.2D // ..........................................................................................*........................................................................... + // gap // ...................................................................................................................................................................... + // gap // ...................................................................................................................................................................... + mul v2.4S, v24.4S, v26.S[2] // ......................................................................................................................*............................................... + trn1 v24.2D, v21.2D, v6.2D // ...........................................................................................*.......................................................................... + // gap // ...................................................................................................................................................................... + trn2 v21.2D, v21.2D, v6.2D // .........................................................................................*............................................................................ 
+ // gap // ...................................................................................................................................................................... + // gap // ...................................................................................................................................................................... + mls v28.4S, v15.4S, v8.S[0] // .........................................................................................................*............................................................ + // gap // ...................................................................................................................................................................... + // gap // ...................................................................................................................................................................... + sub v6.4S, v14.4S, v24.4S // ..........................................................................................................*........................................................... + // gap // ...................................................................................................................................................................... + // gap // ...................................................................................................................................................................... + mls v2.4S, v31.4S, v8.S[0] // ........................................................................................................................*............................................. + add v14.4S, v14.4S, v24.4S // ...........................................................................................................*.......................................................... 
+ // gap // ...................................................................................................................................................................... + sub v24.4S, v29.4S, v21.4S // ...............................................................................................................*...................................................... + // gap // ...................................................................................................................................................................... + // gap // ...................................................................................................................................................................... + add v21.4S, v29.4S, v21.4S // ................................................................................................................*..................................................... + mul v31.4S, v6.4S, v12.S[2] // ............................................................................................................*......................................................... + // gap // ...................................................................................................................................................................... + sub v15.4S, v1.4S, v28.4S // .........................................................................................................................*............................................ + // gap // ...................................................................................................................................................................... + // gap // ...................................................................................................................................................................... 
+ sqrdmulh v6.4S, v6.4S, v12.S[3] // .............................................................................................................*........................................................ + add v28.4S, v1.4S, v28.4S // ..........................................................................................................................*........................................... + // gap // ...................................................................................................................................................................... + sub v1.4S, v14.4S, v21.4S // ..............................................................................................................................*....................................... + // gap // ...................................................................................................................................................................... + // gap // ...................................................................................................................................................................... + add v21.4S, v14.4S, v21.4S // ...............................................................................................................................*...................................... + sqrdmulh v14.4S, v24.4S, v23.S[1] // ..................................................................................................................*................................................... + // gap // ...................................................................................................................................................................... + // gap // ...................................................................................................................................................................... 
+ // gap // ...................................................................................................................................................................... + // gap // ...................................................................................................................................................................... + mul v24.4S, v24.4S, v23.S[0] // .................................................................................................................*.................................................... + // gap // ...................................................................................................................................................................... + // gap // ...................................................................................................................................................................... + sub v29.4S, v17.4S, v21.4S // ........................................................................................................................................*............................. + // gap // ...................................................................................................................................................................... + // gap // ...................................................................................................................................................................... + add v17.4S, v17.4S, v21.4S // .........................................................................................................................................*............................ + mls v31.4S, v6.4S, v8.S[0] // ..............................................................................................................*....................................................... 
+ // gap // ...................................................................................................................................................................... + // gap // ...................................................................................................................................................................... + // gap // ...................................................................................................................................................................... + // gap // ...................................................................................................................................................................... + mls v24.4S, v14.4S, v8.S[0] // ...................................................................................................................*.................................................. + // gap // ...................................................................................................................................................................... + // gap // ...................................................................................................................................................................... + str q17, [x1], #(16*4) // ............................................................................................................................................................*......... + // gap // ...................................................................................................................................................................... + // gap // ...................................................................................................................................................................... 
+ mul v17.4S, v15.4S, v26.S[2] // ...........................................................................................................................*.......................................... + // gap // ...................................................................................................................................................................... + // gap // ...................................................................................................................................................................... + // gap // ...................................................................................................................................................................... + // gap // ...................................................................................................................................................................... + // gap // ...................................................................................................................................................................... + mul v21.4S, v1.4S, v18.S[0] // ................................................................................................................................*..................................... + // gap // ...................................................................................................................................................................... + // gap // ...................................................................................................................................................................... + sub v14.4S, v31.4S, v24.4S // ...................................................................................................................................*.................................. 
+ // gap // ...................................................................................................................................................................... + // gap // ...................................................................................................................................................................... + sqrdmulh v6.4S, v1.4S, v18.S[1] // .................................................................................................................................*.................................... + add v24.4S, v31.4S, v24.4S // ....................................................................................................................................*................................. + // gap // ...................................................................................................................................................................... + // gap // ...................................................................................................................................................................... + // gap // ...................................................................................................................................................................... + // gap // ...................................................................................................................................................................... + mul v31.4S, v14.4S, v18.S[0] // .....................................................................................................................................*................................ + // gap // ...................................................................................................................................................................... 
+ // gap // ...................................................................................................................................................................... + sub v1.4S, v28.4S, v24.4S // .............................................................................................................................................*........................ + // gap // ...................................................................................................................................................................... + // gap // ...................................................................................................................................................................... + sqrdmulh v14.4S, v14.4S, v18.S[1] // ......................................................................................................................................*............................... + add v24.4S, v28.4S, v24.4S // ..............................................................................................................................................*....................... + // gap // ...................................................................................................................................................................... + // gap // ...................................................................................................................................................................... + // gap // ...................................................................................................................................................................... + // gap // ...................................................................................................................................................................... 
+ sqrdmulh v28.4S, v15.4S, v26.S[3] // ............................................................................................................................*......................................... + // gap // ...................................................................................................................................................................... + // gap // ...................................................................................................................................................................... + str q24, [x1, #-48] // .............................................................................................................................................................*........ + // gap // ...................................................................................................................................................................... + // gap // ...................................................................................................................................................................... + mls v21.4S, v6.4S, v8.S[0] // ..................................................................................................................................*................................... + // gap // ...................................................................................................................................................................... + // gap // ...................................................................................................................................................................... + // gap // ...................................................................................................................................................................... 
+ // gap // ...................................................................................................................................................................... + // gap // ...................................................................................................................................................................... + mls v31.4S, v14.4S, v8.S[0] // .......................................................................................................................................*.............................. + // gap // ...................................................................................................................................................................... + // gap // ...................................................................................................................................................................... + // gap // ...................................................................................................................................................................... + // gap // ...................................................................................................................................................................... + // gap // ...................................................................................................................................................................... + mls v17.4S, v28.4S, v8.S[0] // .............................................................................................................................*........................................ + // gap // ...................................................................................................................................................................... 
+ // gap // ...................................................................................................................................................................... + sub v14.4S, v2.4S, v21.4S // ..................................................................................................................................................*................... + // gap // ...................................................................................................................................................................... + // gap // ...................................................................................................................................................................... + add v21.4S, v2.4S, v21.4S // ...................................................................................................................................................*.................. + mul v2.4S, v29.4S, v26.S[0] // ..........................................................................................................................................*........................... + // gap // ...................................................................................................................................................................... + // gap // ...................................................................................................................................................................... + // gap // ...................................................................................................................................................................... + // gap // ...................................................................................................................................................................... 
+ sqrdmulh v24.4S, v29.4S, v26.S[1] // ...........................................................................................................................................*.......................... + // gap // ...................................................................................................................................................................... + // gap // ...................................................................................................................................................................... + str q21, [x1, #-32] // ..............................................................................................................................................................*....... + add v21.4S, v17.4S, v31.4S // ........................................................................................................................................................*............. + // gap // ...................................................................................................................................................................... + sub v17.4S, v17.4S, v31.4S // .......................................................................................................................................................*.............. + mul v28.4S, v1.4S, v26.S[0] // ...............................................................................................................................................*...................... + // gap // ...................................................................................................................................................................... + // gap // ...................................................................................................................................................................... 
+ // gap // ...................................................................................................................................................................... + // gap // ...................................................................................................................................................................... + sqrdmulh v6.4S, v1.4S, v26.S[1] // ................................................................................................................................................*..................... + str q21, [x1, #-16] // ...............................................................................................................................................................*...... + add x1, x1, #64 // ....................................................................................................................................................................*. + // gap // ...................................................................................................................................................................... + // gap // ...................................................................................................................................................................... + // gap // ...................................................................................................................................................................... + ldr q21, [x1, #0] // e..................................................................................................................................................................... + ldr q31, [x1, #16] // .e.................................................................................................................................................................... 
+ mul v1.4S, v14.4S, v26.S[0] // ....................................................................................................................................................*................. + ldr q15, [x1, #32] // ..e................................................................................................................................................................... + ldr q29, [x1, #48] // ...e.................................................................................................................................................................. + // gap // ...................................................................................................................................................................... + sqrdmulh v14.4S, v14.4S, v26.S[1] // .....................................................................................................................................................*................ + // gap // ...................................................................................................................................................................... + // gap // ...................................................................................................................................................................... + // gap // ...................................................................................................................................................................... + // gap // ...................................................................................................................................................................... + // gap // ...................................................................................................................................................................... 
+ trn1 v13.4S, v21.4S, v31.4S // ....e................................................................................................................................................................. + mul v16.4S, v17.4S, v26.S[0] // .........................................................................................................................................................*............ + // gap // ...................................................................................................................................................................... + trn2 v21.4S, v21.4S, v31.4S // .....e................................................................................................................................................................ + // gap // ...................................................................................................................................................................... + // gap // ...................................................................................................................................................................... + sqrdmulh v17.4S, v17.4S, v26.S[1] // ..........................................................................................................................................................*........... + trn1 v31.4S, v15.4S, v29.4S // ......e............................................................................................................................................................... + // gap // ...................................................................................................................................................................... + trn2 v26.4S, v15.4S, v29.4S // .......e.............................................................................................................................................................. 
+ // gap // ...................................................................................................................................................................... + // gap // ...................................................................................................................................................................... + mls v2.4S, v24.4S, v8.S[0] // ............................................................................................................................................*......................... + // gap // ...................................................................................................................................................................... + // gap // ...................................................................................................................................................................... + trn1 v24.2D, v13.2D, v31.2D // ..........e........................................................................................................................................................... + // gap // ...................................................................................................................................................................... + // gap // ...................................................................................................................................................................... + mls v28.4S, v6.4S, v8.S[0] // .................................................................................................................................................*.................... + trn1 v6.2D, v21.2D, v26.2D // ...........e.......................................................................................................................................................... 
+ // gap // ...................................................................................................................................................................... + trn2 v31.2D, v13.2D, v31.2D // ........e............................................................................................................................................................. + // gap // ...................................................................................................................................................................... + // gap // ...................................................................................................................................................................... + mls v1.4S, v14.4S, v8.S[0] // ......................................................................................................................................................*............... + trn2 v21.2D, v21.2D, v26.2D // .........e............................................................................................................................................................ + // gap // ...................................................................................................................................................................... + str q2, [x2], #(16*4) // ................................................................................................................................................................*..... + sub v14.4S, v24.4S, v6.4S // ..............................e....................................................................................................................................... + // gap // ...................................................................................................................................................................... 
+ mls v16.4S, v17.4S, v8.S[0] // ...........................................................................................................................................................*.......... + // gap // ...................................................................................................................................................................... + // gap // ...................................................................................................................................................................... + str q28, [x2, #-48] // .................................................................................................................................................................*.... + sub v26.4S, v31.4S, v21.4S // ...................................e.................................................................................................................................. + // gap // ...................................................................................................................................................................... + sqrdmulh v28.4S, v14.4S, v10.4S // .................................e.................................................................................................................................... + add v15.4S, v31.4S, v21.4S // ....................................e................................................................................................................................. + // gap // ...................................................................................................................................................................... + str q1, [x2, #-32] // ..................................................................................................................................................................*... 
+ // gap // ...................................................................................................................................................................... + // gap // ...................................................................................................................................................................... + mul v2.4S, v14.4S, v3.4S // ................................e..................................................................................................................................... + // gap // ...................................................................................................................................................................... + // gap // ...................................................................................................................................................................... + str q16, [x2, #-16] // ...................................................................................................................................................................*.. + add x2, x2, #64 // .....................................................................................................................................................................* + // gap // ...................................................................................................................................................................... + sqrdmulh v13.4S, v26.4S, v22.4S // ......................................e............................................................................................................................... + // gap // ...................................................................................................................................................................... 
+ // gap // ...................................................................................................................................................................... + + // original source code + // ldr q9, [x1, #0] // .......................................................................................................................................e..............................|......................................................................................................................................e............................. + // ldr q10, [x1, #16] // ........................................................................................................................................e.............................|.......................................................................................................................................e............................ + // ldr q11, [x1, #32] // ..........................................................................................................................................e...........................|.........................................................................................................................................e.......................... + // ldr q12, [x1, #48] // ...........................................................................................................................................e..........................|..........................................................................................................................................e......................... 
+ // trn1 v25.4s, v9.4s, v10.4s // .............................................................................................................................................e........................|............................................................................................................................................e....................... + // trn2 v26.4s, v9.4s, v10.4s // ...............................................................................................................................................e......................|..............................................................................................................................................e..................... + // trn1 v27.4s, v11.4s, v12.4s // .................................................................................................................................................e....................|................................................................................................................................................e................... + // trn2 v28.4s, v11.4s, v12.4s // ..................................................................................................................................................e...................|.................................................................................................................................................e.................. + // trn2 v11.2d, v25.2d, v27.2d // .......................................................................................................................................................e..............|......................................................................................................................................................e............. 
+ // trn2 v12.2d, v26.2d, v28.2d // .........................................................................................................................................................e............|........................................................................................................................................................e........... + // trn1 v9.2d, v25.2d, v27.2d // ....................................................................................................................................................e.................|...................................................................................................................................................e................ + // trn1 v10.2d, v26.2d, v28.2d // ......................................................................................................................................................e...............|.....................................................................................................................................................e.............. + // ldr q13, [x2, #0] // ..*...................................................................................................................................................................|.*.................................................................................................................................................................. + // ldr q14, [x2, #16] // ...*..................................................................................................................................................................|..*................................................................................................................................................................. 
+ // ldr q15, [x2, #32] // ......*...............................................................................................................................................................|.....*.............................................................................................................................................................. + // ldr q16, [x2, #48] // .........*............................................................................................................................................................|........*........................................................................................................................................................... + // trn1 v25.4s, v13.4s, v14.4s // ...............*......................................................................................................................................................|..............*..................................................................................................................................................... + // trn2 v26.4s, v13.4s, v14.4s // .................*....................................................................................................................................................|................*................................................................................................................................................... + // trn1 v27.4s, v15.4s, v16.4s // .....................*................................................................................................................................................|....................*............................................................................................................................................... 
+ // trn2 v28.4s, v15.4s, v16.4s // .......................*..............................................................................................................................................|......................*............................................................................................................................................. + // trn2 v15.2d, v25.2d, v27.2d // ..............................*.......................................................................................................................................|.............................*...................................................................................................................................... + // trn2 v16.2d, v26.2d, v28.2d // .................................*....................................................................................................................................|................................*................................................................................................................................... + // trn1 v13.2d, v25.2d, v27.2d // ...................................*..................................................................................................................................|..................................*................................................................................................................................. + // trn1 v14.2d, v26.2d, v28.2d // ...........................................*..........................................................................................................................|..........................................*......................................................................................................................... 
+ // ldr q0, [x5], #(12*16) // ............*.........................................................................................................................................................|...........*........................................................................................................................................................ + // ldr q4, [x5, #(-12*16 + 1*16)] // .....*................................................................................................................................................................|....*............................................................................................................................................................... + // ldr q1, [x5, #(-12*16 + 2*16)] // e.....................................................................................................................................................................e.................................................................................................................................................................... + // ldr q5, [x5, #(-12*16 + 3*16)] // ..................................e...................................................................................................................................|.................................e.................................................................................................................................. + // ldr q2, [x5, #(-12*16 + 4*16)] // ....................................e.................................................................................................................................|...................................e................................................................................................................................ 
+ // ldr q6, [x5, #(-12*16 + 5*16)] // .......................................e..............................................................................................................................|......................................e............................................................................................................................. + // sub v24.4s, v9.4s, v10.4s // ...........................................................................................................................................................e..........|..........................................................................................................................................................e......... + // add v9.4s, v9.4s, v10.4s // .*....................................................................................................................................................................|*................................................................................................................................................................... + // mul v10.4s, v24.4s, v1.4s // ..................................................................................................................................................................e...|.................................................................................................................................................................e.. + // sqrdmulh v24.4s, v24.4s, v5.4s // ...............................................................................................................................................................e......|..............................................................................................................................................................e..... 
+ // mls v10.4s, v24.4s, v8.s[0] // ....*.................................................................................................................................................................|...*................................................................................................................................................................ + // sub v24.4s, v11.4s, v12.4s // ..............................................................................................................................................................e.......|.............................................................................................................................................................e...... + // add v11.4s, v11.4s, v12.4s // ................................................................................................................................................................e.....|...............................................................................................................................................................e.... + // mul v12.4s, v24.4s, v2.4s // ........*.............................................................................................................................................................|.......*............................................................................................................................................................ + // sqrdmulh v24.4s, v24.4s, v6.4s // .....................................................................................................................................................................e|.................................................................................................................................................................... 
+ // mls v12.4s, v24.4s, v8.s[0] // ..............*.......................................................................................................................................................|.............*...................................................................................................................................................... + // sub v24.4s, v9.4s, v11.4s // ..........*...........................................................................................................................................................|.........*.......................................................................................................................................................... + // add v9.4s, v9.4s, v11.4s // ...........*..........................................................................................................................................................|..........*......................................................................................................................................................... + // mul v11.4s, v24.4s, v0.4s // .........................*............................................................................................................................................|........................*........................................................................................................................................... + // sqrdmulh v24.4s, v24.4s, v4.4s // ....................*.................................................................................................................................................|...................*................................................................................................................................................ 
+ // mls v11.4s, v24.4s, v8.s[0] // ................................*.....................................................................................................................................|...............................*.................................................................................................................................... + // sub v24.4s, v10.4s, v12.4s // ...........................*..........................................................................................................................................|..........................*......................................................................................................................................... + // add v10.4s, v10.4s, v12.4s // .............................*........................................................................................................................................|............................*....................................................................................................................................... + // mul v12.4s, v24.4s, v0.4s // ..........................................*...........................................................................................................................|.........................................*.......................................................................................................................... + // sqrdmulh v24.4s, v24.4s, v4.4s // .....................................*................................................................................................................................|....................................*............................................................................................................................... 
+ // mls v12.4s, v24.4s, v8.s[0] // ..............................................*.......................................................................................................................|.............................................*...................................................................................................................... + // ldr q0, [x5, #(-12*16 + 6*16)] // .......*..............................................................................................................................................................|......*............................................................................................................................................................. + // ldr q4, [x5, #(-12*16 + 7*16)] // .............*........................................................................................................................................................|............*....................................................................................................................................................... + // ldr q1, [x5, #(-12*16 + 8*16)] // ................*.....................................................................................................................................................|...............*.................................................................................................................................................... + // ldr q5, [x5, #(-12*16 + 9*16)] // ...................*..................................................................................................................................................|..................*................................................................................................................................................. 
+ // ldr q2, [x5, #(-12*16 + 10*16)] // ......................*...............................................................................................................................................|.....................*.............................................................................................................................................. + // ldr q6, [x5, #(-12*16 + 11*16)] // ..................*...................................................................................................................................................|.................*.................................................................................................................................................. + // sub v24.4s, v13.4s, v14.4s // ...............................................*......................................................................................................................|..............................................*..................................................................................................................... + // add v13.4s, v13.4s, v14.4s // ................................................*.....................................................................................................................|...............................................*.................................................................................................................... + // mul v14.4s, v24.4s, v1.4s // .....................................................*................................................................................................................|....................................................*............................................................................................................... 
+ // sqrdmulh v24.4s, v24.4s, v5.4s // ..................................................*...................................................................................................................|.................................................*.................................................................................................................. + // mls v14.4s, v24.4s, v8.s[0] // ..........................................................*...........................................................................................................|.........................................................*.......................................................................................................... + // sub v24.4s, v15.4s, v16.4s // ........................................*.............................................................................................................................|.......................................*............................................................................................................................ + // add v15.4s, v15.4s, v16.4s // .............................................*........................................................................................................................|............................................*....................................................................................................................... + // mul v16.4s, v24.4s, v2.4s // ............................................*.........................................................................................................................|...........................................*........................................................................................................................ 
+ // sqrdmulh v24.4s, v24.4s, v6.4s // .................................................*....................................................................................................................|................................................*................................................................................................................... + // mls v16.4s, v24.4s, v8.s[0] // ........................................................*.............................................................................................................|.......................................................*............................................................................................................ + // sub v24.4s, v13.4s, v15.4s // ...................................................*..................................................................................................................|..................................................*................................................................................................................. + // add v13.4s, v13.4s, v15.4s // ....................................................*.................................................................................................................|...................................................*................................................................................................................ + // mul v15.4s, v24.4s, v0.4s // ..............................................................*.......................................................................................................|.............................................................*...................................................................................................... 
+ // sqrdmulh v24.4s, v24.4s, v4.4s // ................................................................*.....................................................................................................|...............................................................*.................................................................................................... + // mls v15.4s, v24.4s, v8.s[0] // ............................................................................*.........................................................................................|...........................................................................*........................................................................................ + // sub v24.4s, v14.4s, v16.4s // ..................................................................*...................................................................................................|.................................................................*.................................................................................................. + // add v14.4s, v14.4s, v16.4s // ...................................................................*..................................................................................................|..................................................................*................................................................................................. + // mul v16.4s, v24.4s, v0.4s // ......................................................................*...............................................................................................|.....................................................................*.............................................................................................. 
+ // sqrdmulh v24.4s, v24.4s, v4.4s // .........................................................................*............................................................................................|........................................................................*........................................................................................... + // mls v16.4s, v24.4s, v8.s[0] // ...............................................................................*......................................................................................|..............................................................................*..................................................................................... + // trn1 v25.4s, v9.4s, v10.4s // ......................................*...............................................................................................................................|.....................................*.............................................................................................................................. + // trn2 v26.4s, v9.4s, v10.4s // .........................................*............................................................................................................................|........................................*........................................................................................................................... + // trn1 v27.4s, v11.4s, v12.4s // .......................................................*..............................................................................................................|......................................................*............................................................................................................. 
+ // trn2 v28.4s, v11.4s, v12.4s // ......................................................*...............................................................................................................|.....................................................*.............................................................................................................. + // trn2 v11.2d, v25.2d, v27.2d // ...........................................................*..........................................................................................................|..........................................................*......................................................................................................... + // trn2 v12.2d, v26.2d, v28.2d // .........................................................*............................................................................................................|........................................................*........................................................................................................... + // trn1 v9.2d, v25.2d, v27.2d // ............................................................*.........................................................................................................|...........................................................*........................................................................................................ + // trn1 v10.2d, v26.2d, v28.2d // .............................................................*........................................................................................................|............................................................*....................................................................................................... 
+ // trn1 v25.4s, v13.4s, v14.4s // ........................................................................*.............................................................................................|.......................................................................*............................................................................................ + // trn2 v26.4s, v13.4s, v14.4s // ..........................................................................*...........................................................................................|.........................................................................*.......................................................................................... + // trn1 v27.4s, v15.4s, v16.4s // ..................................................................................*...................................................................................|.................................................................................*.................................................................................. + // trn2 v28.4s, v15.4s, v16.4s // ....................................................................................*.................................................................................|...................................................................................*................................................................................ + // trn2 v15.2d, v25.2d, v27.2d // ......................................................................................*...............................................................................|.....................................................................................*.............................................................................. 
+ // trn2 v16.2d, v26.2d, v28.2d // ..........................................................................................*...........................................................................|.........................................................................................*.......................................................................... + // trn1 v13.2d, v25.2d, v27.2d // .......................................................................................*..............................................................................|......................................................................................*............................................................................. + // trn1 v14.2d, v26.2d, v28.2d // .........................................................................................*............................................................................|........................................................................................*........................................................................... + // ldr q0, [x4], #64 // ........................*.............................................................................................................................................|.......................*............................................................................................................................................ + // ldr q1, [x4, #(-64 + 16)] // ..........................*...........................................................................................................................................|.........................*.......................................................................................................................................... 
+ // ldr q2, [x4, #(-64 + 32)] // ............................*.........................................................................................................................................|...........................*........................................................................................................................................ + // ldr q3, [x4, #(-64 + 48)] // ...............................*......................................................................................................................................|..............................*..................................................................................................................................... + // sub v24.4s, v9.4s, v10.4s // .....................................................................*................................................................................................|....................................................................*............................................................................................... + // add v9.4s, v9.4s, v10.4s // .......................................................................*..............................................................................................|......................................................................*............................................................................................. + // mul v10.4s, v24.4s, v1.s[2] // ................................................................................*.....................................................................................|...............................................................................*.................................................................................... 
+ // sqrdmulh v24.4s, v24.4s, v1.s[3] // ..............................................................................*.......................................................................................|.............................................................................*...................................................................................... + // mls v10.4s, v24.4s, v8.s[0] // .................................................................................*....................................................................................|................................................................................*................................................................................... + // sub v24.4s, v11.4s, v12.4s // ...............................................................*......................................................................................................|..............................................................*..................................................................................................... + // add v11.4s, v11.4s, v12.4s // .................................................................*....................................................................................................|................................................................*................................................................................................... + // mul v12.4s, v24.4s, v2.s[0] // ....................................................................*.................................................................................................|...................................................................*................................................................................................ 
+ // sqrdmulh v24.4s, v24.4s, v2.s[1] // ...................................................................................*..................................................................................|..................................................................................*................................................................................. + // mls v12.4s, v24.4s, v8.s[0] // ...........................................................................................*..........................................................................|..........................................................................................*......................................................................... + // sub v24.4s, v13.4s, v14.4s // ............................................................................................*.........................................................................|...........................................................................................*........................................................................ + // add v13.4s, v13.4s, v14.4s // ..............................................................................................*.......................................................................|.............................................................................................*...................................................................... + // mul v14.4s, v24.4s, v2.s[2] // .................................................................................................*....................................................................|................................................................................................*................................................................... 
+ // sqrdmulh v24.4s, v24.4s, v2.s[3] // ...................................................................................................*..................................................................|..................................................................................................*................................................................. + // mls v14.4s, v24.4s, v8.s[0] // ...........................................................................................................*..........................................................|..........................................................................................................*......................................................... + // sub v24.4s, v15.4s, v16.4s // ...............................................................................................*......................................................................|..............................................................................................*..................................................................... + // add v15.4s, v15.4s, v16.4s // ................................................................................................*.....................................................................|...............................................................................................*.................................................................... + // mul v16.4s, v24.4s, v3.s[0] // ........................................................................................................*.............................................................|.......................................................................................................*............................................................ 
+ // sqrdmulh v24.4s, v24.4s, v3.s[1] // .......................................................................................................*..............................................................|......................................................................................................*............................................................. + // mls v16.4s, v24.4s, v8.s[0] // ............................................................................................................*.........................................................|...........................................................................................................*........................................................ + // sub v24.4s, v9.4s, v11.4s // ...........................................................................*..........................................................................................|..........................................................................*......................................................................................... + // add v9.4s, v9.4s, v11.4s // .............................................................................*........................................................................................|............................................................................*....................................................................................... + // mul v11.4s, v24.4s, v0.s[2] // ........................................................................................*.............................................................................|.......................................................................................*............................................................................ 
+ // sqrdmulh v24.4s, v24.4s, v0.s[3] // .....................................................................................*................................................................................|....................................................................................*............................................................................... + // mls v11.4s, v24.4s, v8.s[0] // .............................................................................................*........................................................................|............................................................................................*....................................................................... + // sub v24.4s, v10.4s, v12.4s // ..................................................................................................*...................................................................|.................................................................................................*.................................................................. + // add v10.4s, v10.4s, v12.4s // ....................................................................................................*.................................................................|...................................................................................................*................................................................ + // mul v12.4s, v24.4s, v0.s[2] // ..............................................................................................................*.......................................................|.............................................................................................................*...................................................... 
+ // sqrdmulh v24.4s, v24.4s, v0.s[3] // .......................................................................................................................*..............................................|......................................................................................................................*............................................. + // mls v12.4s, v24.4s, v8.s[0] // ...........................................................................................................................*..........................................|..........................................................................................................................*......................................... + // sub v24.4s, v13.4s, v15.4s // .....................................................................................................*................................................................|....................................................................................................*............................................................... + // add v13.4s, v13.4s, v15.4s // ......................................................................................................*...............................................................|.....................................................................................................*.............................................................. + // mul v15.4s, v24.4s, v1.s[0] // ...............................................................................................................*......................................................|..............................................................................................................*..................................................... 
+ // sqrdmulh v24.4s, v24.4s, v1.s[1] // .................................................................................................................*....................................................|................................................................................................................*................................................... + // mls v15.4s, v24.4s, v8.s[0] // .........................................................................................................................*............................................|........................................................................................................................*........................................... + // sub v24.4s, v14.4s, v16.4s // ................................................................................................................*.....................................................|...............................................................................................................*.................................................... + // add v14.4s, v14.4s, v16.4s // ..................................................................................................................*...................................................|.................................................................................................................*.................................................. + // mul v16.4s, v24.4s, v1.s[0] // ...................................................................................................................*..................................................|..................................................................................................................*................................................. 
+ // sqrdmulh v24.4s, v24.4s, v1.s[1] // .....................................................................................................................*................................................|....................................................................................................................*............................................... + // mls v16.4s, v24.4s, v8.s[0] // ..........................................................................................................................*...........................................|.........................................................................................................................*.......................................... + // sub v24.4s, v9.4s, v13.4s // .........................................................................................................*............................................................|........................................................................................................*........................................................... + // add v9.4s, v9.4s, v13.4s // ..........................................................................................................*...........................................................|.........................................................................................................*.......................................................... + // mul v13.4s, v24.4s, v0.s[0] // ..............................................................................................................................*.......................................|.............................................................................................................................*...................................... 
+ // sqrdmulh v24.4s, v24.4s, v0.s[1] // ...............................................................................................................................*......................................|..............................................................................................................................*..................................... + // mls v13.4s, v24.4s, v8.s[0] // ...................................................................................................................................................*..................|..................................................................................................................................................*................. + // sub v24.4s, v10.4s, v14.4s // ....................................................................................................................*.................................................|...................................................................................................................*................................................ + // add v10.4s, v10.4s, v14.4s // ......................................................................................................................*...............................................|.....................................................................................................................*.............................................. + // mul v14.4s, v24.4s, v0.s[0] // ...................................................................................................................................*..................................|..................................................................................................................................*................................. 
+ // sqrdmulh v24.4s, v24.4s, v0.s[1] // ....................................................................................................................................*.................................|...................................................................................................................................*................................ + // mls v14.4s, v24.4s, v8.s[0] // .....................................................................................................................................................*................|....................................................................................................................................................*............... + // sub v24.4s, v11.4s, v15.4s // ............................................................................................................................*.........................................|...........................................................................................................................*........................................ + // add v11.4s, v11.4s, v15.4s // .............................................................................................................................*........................................|............................................................................................................................*....................................... + // mul v15.4s, v24.4s, v0.s[0] // .........................................................................................................................................*............................|........................................................................................................................................*........................... 
+ // sqrdmulh v24.4s, v24.4s, v0.s[1] // ............................................................................................................................................*.........................|...........................................................................................................................................*........................ + // mls v15.4s, v24.4s, v8.s[0] // ........................................................................................................................................................*.............|.......................................................................................................................................................*............ + // sub v24.4s, v12.4s, v16.4s // ..................................................................................................................................*...................................|.................................................................................................................................*.................................. + // add v12.4s, v12.4s, v16.4s // .................................................................................................................................*....................................|................................................................................................................................*................................... + // mul v16.4s, v24.4s, v0.s[0] // ..............................................................................................................................................*.......................|.............................................................................................................................................*...................... 
+ // sqrdmulh v24.4s, v24.4s, v0.s[1] // ................................................................................................................................................*.....................|...............................................................................................................................................*.................... + // mls v16.4s, v24.4s, v8.s[0] // ............................................................................................................................................................*.........|...........................................................................................................................................................*........ + // str q9, [x1], #(16*4) // .............................................................................................................*........................................................|............................................................................................................*....................................................... + // str q10, [x1, #(-16*4 + 1*16)] // ........................................................................................................................*.............................................|.......................................................................................................................*............................................ + // str q11, [x1, #(-16*4 + 2*16)] // ................................................................................................................................*.....................................|...............................................................................................................................*.................................... 
+ // str q12, [x1, #(-16*4 + 3*16)] // .....................................................................................................................................*................................|....................................................................................................................................*............................... + // str q13, [x2], #(16*4) // ..........................................................................................................................................................*...........|.........................................................................................................................................................*.......... + // str q14, [x2, #(-16*4 + 1*16)] // .............................................................................................................................................................*........|............................................................................................................................................................*....... + // str q15, [x2, #(-16*4 + 2*16)] // .................................................................................................................................................................*....|................................................................................................................................................................*... + // str q16, [x2, #(-16*4 + 3*16)] // ...................................................................................................................................................................*..|..................................................................................................................................................................*. 
+ // add x1, x1, #64 // ......................................................................................................................................*...............................|.....................................................................................................................................*.............................. + // add x2, x2, #64 // ....................................................................................................................................................................*.|...................................................................................................................................................................* + + sub count, count, #1 + cbnz count, layer45678_start + add v17.4S, v24.4S, v6.4S // *............................................................................................................................................... + mls v2.4S, v28.4S, v8.S[0] // ...*............................................................................................................................................ + ldr q21, [x2, #0] // .*.............................................................................................................................................. + ldr q14, [x2, #16] // ..*............................................................................................................................................. + ldr q3, [x5, #16] // ....*........................................................................................................................................... + // gap // ................................................................................................................................................ + mul v24.4S, v26.4S, v30.4S // .......*........................................................................................................................................ 
+ ldr q28, [x2, #32] // .....*.......................................................................................................................................... + ldr q6, [x5, #96] // ......*......................................................................................................................................... + ldr q31, [x2, #48] // ........*....................................................................................................................................... + sub v26.4S, v17.4S, v15.4S // .........*...................................................................................................................................... + ldr q1, [x5], #(12*16) // ...........*.................................................................................................................................... + add v17.4S, v17.4S, v15.4S // ..........*..................................................................................................................................... + mls v24.4S, v13.4S, v8.S[0] // .............*.................................................................................................................................. + ldr q15, [x5, #-80] // ............*................................................................................................................................... + trn1 v29.4S, v21.4S, v14.4S // ..............*................................................................................................................................. + ldr q13, [x5, #-64] // ...............*................................................................................................................................ + // gap // ................................................................................................................................................ 
+ trn2 v21.4S, v21.4S, v14.4S // ................*............................................................................................................................... + sqrdmulh v14.4S, v26.4S, v3.4S // ...................*............................................................................................................................ + ldr q16, [x5, #-16] // .................*.............................................................................................................................. + trn1 v9.4S, v28.4S, v31.4S // ....................*........................................................................................................................... + ldr q0, [x5, #-48] // ..................*............................................................................................................................. + // gap // ................................................................................................................................................ + trn2 v28.4S, v28.4S, v31.4S // ......................*......................................................................................................................... + mul v31.4S, v26.4S, v1.4S // ........................*....................................................................................................................... + ldr q26, [x5, #-32] // .....................*.......................................................................................................................... + sub v30.4S, v2.4S, v24.4S // ..........................*..................................................................................................................... + ldr q7, [x4], #64 // .......................*........................................................................................................................ 
+ // gap // ................................................................................................................................................ + add v2.4S, v2.4S, v24.4S // ............................*................................................................................................................... + trn2 v24.2D, v29.2D, v9.2D // .............................*.................................................................................................................. + ldr q18, [x4, #-48] // .........................*...................................................................................................................... + mls v31.4S, v14.4S, v8.S[0] // ...............................*................................................................................................................ + ldr q25, [x4, #-32] // ...........................*.................................................................................................................... + trn2 v14.2D, v21.2D, v28.2D // ................................*............................................................................................................... + trn1 v29.2D, v29.2D, v9.2D // .................................*.............................................................................................................. + ldr q9, [x4, #-16] // ..............................*................................................................................................................. + // gap // ................................................................................................................................................ + sqrdmulh v3.4S, v30.4S, v3.4S // ..................................*............................................................................................................. 
+ trn1 v12.4S, v17.4S, v2.4S // ...................................*............................................................................................................ + // gap // ................................................................................................................................................ + sub v23.4S, v24.4S, v14.4S // ....................................*........................................................................................................... + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + mul v1.4S, v30.4S, v1.4S // ......................................*......................................................................................................... + trn2 v17.4S, v17.4S, v2.4S // .....................................*.......................................................................................................... + // gap // ................................................................................................................................................ + trn1 v21.2D, v21.2D, v28.2D // .......................................*........................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ 
+ mul v2.4S, v23.4S, v26.4S // ........................................*....................................................................................................... + add v14.4S, v24.4S, v14.4S // .........................................*...................................................................................................... + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + mls v1.4S, v3.4S, v8.S[0] // ..........................................*..................................................................................................... + sub v3.4S, v29.4S, v21.4S // ...........................................*.................................................................................................... + // gap // ................................................................................................................................................ + add v21.4S, v29.4S, v21.4S // ............................................*................................................................................................... + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ 
+ sqrdmulh v24.4S, v23.4S, v16.4S // .............................................*.................................................................................................. + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + sqrdmulh v28.4S, v3.4S, v0.4S // ..............................................*................................................................................................. + sub v26.4S, v21.4S, v14.4S // ...............................................*................................................................................................ + // gap // ................................................................................................................................................ + trn1 v29.4S, v31.4S, v1.4S // ...................................................*............................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ 
+ mul v3.4S, v3.4S, v13.4S // .................................................*.............................................................................................. + add v21.4S, v21.4S, v14.4S // ................................................*............................................................................................... + // gap // ................................................................................................................................................ + trn2 v14.4S, v31.4S, v1.4S // ..................................................*............................................................................................. + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + mls v2.4S, v24.4S, v8.S[0] // ....................................................*........................................................................................... + trn1 v24.2D, v12.2D, v29.2D // ........................................................*....................................................................................... + // gap // ................................................................................................................................................ + trn2 v31.2D, v12.2D, v29.2D // .......................................................*........................................................................................ + // gap // ................................................................................................................................................ 
+ // gap // ................................................................................................................................................ + mls v3.4S, v28.4S, v8.S[0] // ......................................................*......................................................................................... + trn2 v28.2D, v17.2D, v14.2D // .....................................................*.......................................................................................... + // gap // ................................................................................................................................................ + trn1 v17.2D, v17.2D, v14.2D // .........................................................*...................................................................................... + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + mul v14.4S, v26.4S, v6.4S // ..........................................................*..................................................................................... + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + sub v1.4S, v31.4S, v28.4S // ...........................................................*.................................................................................... + // gap // ................................................................................................................................................ 
+ // gap // ................................................................................................................................................ + sqrdmulh v26.4S, v26.4S, v15.4S // ............................................................*................................................................................... + add v28.4S, v31.4S, v28.4S // .............................................................*.................................................................................. + // gap // ................................................................................................................................................ + sub v31.4S, v3.4S, v2.4S // ..............................................................*................................................................................. + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + add v2.4S, v3.4S, v2.4S // ...............................................................*................................................................................ + mul v3.4S, v1.4S, v25.S[0] // ................................................................*............................................................................... + // gap // ................................................................................................................................................ + sub v29.4S, v24.4S, v17.4S // .................................................................*.............................................................................. 
+ // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + mul v6.4S, v31.4S, v6.4S // ..................................................................*............................................................................. + add v17.4S, v24.4S, v17.4S // ...................................................................*............................................................................ + // gap // ................................................................................................................................................ + trn1 v24.4S, v21.4S, v2.4S // ....................................................................*........................................................................... + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + sqrdmulh v31.4S, v31.4S, v15.4S // .....................................................................*.......................................................................... + trn2 v21.4S, v21.4S, v2.4S // ......................................................................*......................................................................... + // gap // ................................................................................................................................................ 
+ sub v2.4S, v17.4S, v28.4S // .......................................................................*........................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + mls v14.4S, v26.4S, v8.S[0] // ........................................................................*....................................................................... + add v17.4S, v17.4S, v28.4S // .........................................................................*...................................................................... + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + sqrdmulh v28.4S, v29.4S, v18.S[3] // ..........................................................................*..................................................................... + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ 
+ // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + mls v6.4S, v31.4S, v8.S[0] // ...........................................................................*.................................................................... + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + mul v31.4S, v29.4S, v18.S[2] // ............................................................................*................................................................... + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ 
+ // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + mls v31.4S, v28.4S, v8.S[0] // .............................................................................*.................................................................. + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + trn1 v28.4S, v14.4S, v6.4S // ..............................................................................*................................................................. + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + sqrdmulh v26.4S, v1.4S, v25.S[1] // ...............................................................................*................................................................ + trn2 v14.4S, v14.4S, v6.4S // ................................................................................*............................................................... + // gap // ................................................................................................................................................ 
+ // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + sqrdmulh v6.4S, v2.4S, v7.S[3] // .................................................................................*.............................................................. + trn2 v1.2D, v24.2D, v28.2D // ..................................................................................*............................................................. + // gap // ................................................................................................................................................ + trn1 v24.2D, v24.2D, v28.2D // ...................................................................................*............................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + mul v2.4S, v2.4S, v7.S[2] // ....................................................................................*........................................................... + trn1 v28.2D, v21.2D, v14.2D // .....................................................................................*.......................................................... + // gap // ................................................................................................................................................ 
+ trn2 v21.2D, v21.2D, v14.2D // ......................................................................................*......................................................... + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + mls v3.4S, v26.4S, v8.S[0] // .......................................................................................*........................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + sub v14.4S, v24.4S, v28.4S // ........................................................................................*....................................................... + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + mls v2.4S, v6.4S, v8.S[0] // .........................................................................................*...................................................... + add v24.4S, v24.4S, v28.4S // ..........................................................................................*..................................................... + // gap // ................................................................................................................................................ 
+ sub v28.4S, v1.4S, v21.4S // ...........................................................................................*.................................................... + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + add v21.4S, v1.4S, v21.4S // ............................................................................................*................................................... + mul v6.4S, v14.4S, v25.S[2] // .............................................................................................*.................................................. + // gap // ................................................................................................................................................ + sub v26.4S, v31.4S, v3.4S // ..............................................................................................*................................................. + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + sqrdmulh v14.4S, v14.4S, v25.S[3] // ...............................................................................................*................................................ + add v3.4S, v31.4S, v3.4S // ................................................................................................*............................................... 
+ // gap // ................................................................................................................................................ + sub v31.4S, v24.4S, v21.4S // .................................................................................................*.............................................. + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + add v21.4S, v24.4S, v21.4S // ..................................................................................................*............................................. + sqrdmulh v24.4S, v28.4S, v9.S[1] // ...................................................................................................*............................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + mul v28.4S, v28.4S, v9.S[0] // ....................................................................................................*........................................... + // gap // ................................................................................................................................................ 
+ // gap // ................................................................................................................................................ + sub v1.4S, v17.4S, v21.4S // .....................................................................................................*.......................................... + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + add v17.4S, v17.4S, v21.4S // ......................................................................................................*......................................... + mls v6.4S, v14.4S, v8.S[0] // .......................................................................................................*........................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + mls v28.4S, v24.4S, v8.S[0] // ........................................................................................................*....................................... + // gap // ................................................................................................................................................ 
+ // gap // ................................................................................................................................................ + str q17, [x1], #(16*4) // .........................................................................................................*...................................... + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + mul v17.4S, v26.4S, v7.S[2] // ..........................................................................................................*..................................... + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + mul v21.4S, v31.4S, v18.S[0] // ...........................................................................................................*.................................... + // gap // ................................................................................................................................................ 
+ // gap // ................................................................................................................................................ + sub v14.4S, v6.4S, v28.4S // ............................................................................................................*................................... + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + sqrdmulh v24.4S, v31.4S, v18.S[1] // .............................................................................................................*.................................. + add v28.4S, v6.4S, v28.4S // ..............................................................................................................*................................. + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + mul v6.4S, v14.4S, v18.S[0] // ...............................................................................................................*................................ + // gap // ................................................................................................................................................ 
+ // gap // ................................................................................................................................................ + sub v31.4S, v3.4S, v28.4S // ................................................................................................................*............................... + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + sqrdmulh v14.4S, v14.4S, v18.S[1] // .................................................................................................................*.............................. + add v3.4S, v3.4S, v28.4S // ..................................................................................................................*............................. + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + sqrdmulh v28.4S, v26.4S, v7.S[3] // ...................................................................................................................*............................ + // gap // ................................................................................................................................................ 
+ // gap // ................................................................................................................................................ + str q3, [x1, #-48] // ....................................................................................................................*........................... + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + mls v21.4S, v24.4S, v8.S[0] // .....................................................................................................................*.......................... + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + mls v6.4S, v14.4S, v8.S[0] // ......................................................................................................................*......................... + // gap // ................................................................................................................................................ 
+ // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + mls v17.4S, v28.4S, v8.S[0] // .......................................................................................................................*........................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + sub v14.4S, v2.4S, v21.4S // ........................................................................................................................*....................... + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + add v21.4S, v2.4S, v21.4S // .........................................................................................................................*...................... + mul v2.4S, v1.4S, v7.S[0] // ..........................................................................................................................*..................... 
+ // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + sqrdmulh v3.4S, v1.4S, v7.S[1] // ...........................................................................................................................*.................... + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + str q21, [x1, #-32] // ............................................................................................................................*................... + add v21.4S, v17.4S, v6.4S // .............................................................................................................................*.................. + // gap // ................................................................................................................................................ + sub v17.4S, v17.4S, v6.4S // ..............................................................................................................................*................. + mul v24.4S, v31.4S, v7.S[0] // ...............................................................................................................................*................ 
+ // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + str q21, [x1, #-16] // .................................................................................................................................*.............. + add x1, x1, #64 // ..................................................................................................................................*............. + sqrdmulh v21.4S, v31.4S, v7.S[1] // ................................................................................................................................*............... + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + mul v28.4S, v14.4S, v7.S[0] // ...................................................................................................................................*............ + // gap // ................................................................................................................................................ 
+ // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + sqrdmulh v14.4S, v14.4S, v7.S[1] // ....................................................................................................................................*........... + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + mul v6.4S, v17.4S, v7.S[0] // .....................................................................................................................................*.......... + // gap // ................................................................................................................................................ 
+ // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + sqrdmulh v17.4S, v17.4S, v7.S[1] // ......................................................................................................................................*......... + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + mls v2.4S, v3.4S, v8.S[0] // .......................................................................................................................................*........ + // gap // ................................................................................................................................................ 
+ // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + mls v24.4S, v21.4S, v8.S[0] // ........................................................................................................................................*....... + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + mls v28.4S, v14.4S, v8.S[0] // .........................................................................................................................................*...... + // gap // ................................................................................................................................................ 
+ // gap // ................................................................................................................................................ + str q2, [x2], #(16*4) // ..........................................................................................................................................*..... + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + mls v6.4S, v17.4S, v8.S[0] // ...........................................................................................................................................*.... + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + str q24, [x2, #-48] // ............................................................................................................................................*... + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ 
+ // gap // ................................................................................................................................................ + str q28, [x2, #-32] // .............................................................................................................................................*.. + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + str q6, [x2, #-16] // ..............................................................................................................................................*. + add x2, x2, #64 // ...............................................................................................................................................* + // gap // ................................................................................................................................................ + + // original source code + // add v17.4S, v24.4S, v6.4S // *............................................................................................................................................... + // ldr q21, [x2, #0] // ..*............................................................................................................................................. 
+ // ldr q14, [x2, #16] // ...*............................................................................................................................................ + // mls v2.4S, v28.4S, v8.S[0] // .*.............................................................................................................................................. + // ldr q24, [x5, #16] // ....*........................................................................................................................................... + // ldr q28, [x2, #32] // ......*......................................................................................................................................... + // ldr q6, [x5, #96] // .......*........................................................................................................................................ + // mul v31.4S, v26.4S, v30.4S // .....*.......................................................................................................................................... + // ldr q26, [x2, #48] // ........*....................................................................................................................................... + // sub v1.4S, v17.4S, v15.4S // .........*...................................................................................................................................... + // add v17.4S, v17.4S, v15.4S // ...........*.................................................................................................................................... + // ldr q15, [x5], #(12*16) // ..........*..................................................................................................................................... + // ldr q29, [x5, #-80] // .............*.................................................................................................................................. 
+ // mls v31.4S, v13.4S, v8.S[0] // ............*................................................................................................................................... + // trn1 v13.4S, v21.4S, v14.4S // ..............*................................................................................................................................. + // ldr q16, [x5, #-64] // ...............*................................................................................................................................ + // trn2 v21.4S, v21.4S, v14.4S // ................*............................................................................................................................... + // ldr q14, [x5, #-16] // ..................*............................................................................................................................. + // ldr q9, [x5, #-48] // ....................*........................................................................................................................... + // sqrdmulh v0.4S, v1.4S, v24.4S // .................*.............................................................................................................................. + // trn1 v30.4S, v28.4S, v26.4S // ...................*............................................................................................................................ + // ldr q7, [x5, #-32] // .......................*........................................................................................................................ + // trn2 v28.4S, v28.4S, v26.4S // .....................*.......................................................................................................................... + // ldr q26, [x4], #64 // .........................*...................................................................................................................... 
+ // mul v1.4S, v1.4S, v15.4S // ......................*......................................................................................................................... + // ldr q18, [x4, #-48] // ............................*................................................................................................................... + // sub v25.4S, v2.4S, v31.4S // ........................*....................................................................................................................... + // ldr q12, [x4, #-32] // ..............................*................................................................................................................. + // add v2.4S, v2.4S, v31.4S // ..........................*..................................................................................................................... + // trn2 v31.2D, v13.2D, v30.2D // ...........................*.................................................................................................................... + // ldr q23, [x4, #-16] // .................................*.............................................................................................................. + // mls v1.4S, v0.4S, v8.S[0] // .............................*.................................................................................................................. + // trn2 v0.2D, v21.2D, v28.2D // ...............................*................................................................................................................ + // trn1 v13.2D, v13.2D, v30.2D // ................................*............................................................................................................... + // sqrdmulh v24.4S, v25.4S, v24.4S // ..................................*............................................................................................................. 
+ // trn1 v4.4S, v17.4S, v2.4S // ...................................*............................................................................................................ + // sub v11.4S, v31.4S, v0.4S // ....................................*........................................................................................................... + // trn2 v17.4S, v17.4S, v2.4S // ......................................*......................................................................................................... + // mul v2.4S, v25.4S, v15.4S // .....................................*.......................................................................................................... + // trn1 v21.2D, v21.2D, v28.2D // .......................................*........................................................................................................ + // mul v28.4S, v11.4S, v7.4S // ........................................*....................................................................................................... + // add v31.4S, v31.4S, v0.4S // .........................................*...................................................................................................... + // mls v2.4S, v24.4S, v8.S[0] // ..........................................*..................................................................................................... + // sub v24.4S, v13.4S, v21.4S // ...........................................*.................................................................................................... + // add v21.4S, v13.4S, v21.4S // ............................................*................................................................................................... + // sqrdmulh v14.4S, v11.4S, v14.4S // .............................................*.................................................................................................. 
+ // sqrdmulh v15.4S, v24.4S, v9.4S // ..............................................*................................................................................................. + // sub v13.4S, v21.4S, v31.4S // ...............................................*................................................................................................ + // add v21.4S, v21.4S, v31.4S // ..................................................*............................................................................................. + // mul v24.4S, v24.4S, v16.4S // .................................................*.............................................................................................. + // trn2 v31.4S, v1.4S, v2.4S // ...................................................*............................................................................................ + // trn1 v2.4S, v1.4S, v2.4S // ................................................*............................................................................................... + // mls v28.4S, v14.4S, v8.S[0] // ....................................................*........................................................................................... + // trn2 v14.2D, v17.2D, v31.2D // ........................................................*....................................................................................... + // mls v24.4S, v15.4S, v8.S[0] // .......................................................*........................................................................................ + // trn2 v1.2D, v4.2D, v2.2D // ......................................................*......................................................................................... + // trn1 v2.2D, v4.2D, v2.2D // .....................................................*.......................................................................................... 
+ // trn1 v17.2D, v17.2D, v31.2D // .........................................................*...................................................................................... + // mul v31.4S, v13.4S, v6.4S // ..........................................................*..................................................................................... + // sub v15.4S, v1.4S, v14.4S // ...........................................................*.................................................................................... + // sqrdmulh v13.4S, v13.4S, v29.4S // ............................................................*................................................................................... + // add v14.4S, v1.4S, v14.4S // .............................................................*.................................................................................. + // sub v1.4S, v24.4S, v28.4S // ..............................................................*................................................................................. + // add v24.4S, v24.4S, v28.4S // ...............................................................*................................................................................ + // mul v28.4S, v15.4S, v12.S[0] // ................................................................*............................................................................... + // sub v16.4S, v2.4S, v17.4S // .................................................................*.............................................................................. + // mul v6.4S, v1.4S, v6.4S // ..................................................................*............................................................................. + // add v17.4S, v2.4S, v17.4S // ...................................................................*............................................................................ 
+ // trn1 v2.4S, v21.4S, v24.4S // ....................................................................*........................................................................... + // sqrdmulh v1.4S, v1.4S, v29.4S // .....................................................................*.......................................................................... + // trn2 v21.4S, v21.4S, v24.4S // ......................................................................*......................................................................... + // sub v24.4S, v17.4S, v14.4S // .......................................................................*........................................................................ + // mls v31.4S, v13.4S, v8.S[0] // ........................................................................*....................................................................... + // add v17.4S, v17.4S, v14.4S // .........................................................................*...................................................................... + // sqrdmulh v14.4S, v16.4S, v18.S[3] // ..........................................................................*..................................................................... + // mls v6.4S, v1.4S, v8.S[0] // ...........................................................................*.................................................................... + // mul v1.4S, v16.4S, v18.S[2] // ............................................................................*................................................................... + // mls v1.4S, v14.4S, v8.S[0] // .............................................................................*.................................................................. + // trn1 v14.4S, v31.4S, v6.4S // ..............................................................................*................................................................. 
+ // sqrdmulh v15.4S, v15.4S, v12.S[1] // ...............................................................................*................................................................ + // trn2 v6.4S, v31.4S, v6.4S // ................................................................................*............................................................... + // sqrdmulh v31.4S, v24.4S, v26.S[3] // .................................................................................*.............................................................. + // trn2 v29.2D, v2.2D, v14.2D // ..................................................................................*............................................................. + // trn1 v14.2D, v2.2D, v14.2D // ...................................................................................*............................................................ + // mul v2.4S, v24.4S, v26.S[2] // ....................................................................................*........................................................... + // trn1 v24.2D, v21.2D, v6.2D // .....................................................................................*.......................................................... + // trn2 v21.2D, v21.2D, v6.2D // ......................................................................................*......................................................... + // mls v28.4S, v15.4S, v8.S[0] // .......................................................................................*........................................................ + // sub v6.4S, v14.4S, v24.4S // ........................................................................................*....................................................... + // mls v2.4S, v31.4S, v8.S[0] // .........................................................................................*...................................................... 
+ // add v14.4S, v14.4S, v24.4S // ..........................................................................................*..................................................... + // sub v24.4S, v29.4S, v21.4S // ...........................................................................................*.................................................... + // add v21.4S, v29.4S, v21.4S // ............................................................................................*................................................... + // mul v31.4S, v6.4S, v12.S[2] // .............................................................................................*.................................................. + // sub v15.4S, v1.4S, v28.4S // ..............................................................................................*................................................. + // sqrdmulh v6.4S, v6.4S, v12.S[3] // ...............................................................................................*................................................ + // add v28.4S, v1.4S, v28.4S // ................................................................................................*............................................... + // sub v1.4S, v14.4S, v21.4S // .................................................................................................*.............................................. + // add v21.4S, v14.4S, v21.4S // ..................................................................................................*............................................. + // sqrdmulh v14.4S, v24.4S, v23.S[1] // ...................................................................................................*............................................ + // mul v24.4S, v24.4S, v23.S[0] // ....................................................................................................*........................................... 
+ // sub v29.4S, v17.4S, v21.4S // .....................................................................................................*.......................................... + // add v17.4S, v17.4S, v21.4S // ......................................................................................................*......................................... + // mls v31.4S, v6.4S, v8.S[0] // .......................................................................................................*........................................ + // mls v24.4S, v14.4S, v8.S[0] // ........................................................................................................*....................................... + // str q17, [x1], #(16*4) // .........................................................................................................*...................................... + // mul v17.4S, v15.4S, v26.S[2] // ..........................................................................................................*..................................... + // mul v21.4S, v1.4S, v18.S[0] // ...........................................................................................................*.................................... + // sub v14.4S, v31.4S, v24.4S // ............................................................................................................*................................... + // sqrdmulh v6.4S, v1.4S, v18.S[1] // .............................................................................................................*.................................. + // add v24.4S, v31.4S, v24.4S // ..............................................................................................................*................................. + // mul v31.4S, v14.4S, v18.S[0] // ...............................................................................................................*................................ 
+ // sub v1.4S, v28.4S, v24.4S // ................................................................................................................*............................... + // sqrdmulh v14.4S, v14.4S, v18.S[1] // .................................................................................................................*.............................. + // add v24.4S, v28.4S, v24.4S // ..................................................................................................................*............................. + // sqrdmulh v28.4S, v15.4S, v26.S[3] // ...................................................................................................................*............................ + // str q24, [x1, #-48] // ....................................................................................................................*........................... + // mls v21.4S, v6.4S, v8.S[0] // .....................................................................................................................*.......................... + // mls v31.4S, v14.4S, v8.S[0] // ......................................................................................................................*......................... + // mls v17.4S, v28.4S, v8.S[0] // .......................................................................................................................*........................ + // sub v14.4S, v2.4S, v21.4S // ........................................................................................................................*....................... + // add v21.4S, v2.4S, v21.4S // .........................................................................................................................*...................... + // mul v2.4S, v29.4S, v26.S[0] // ..........................................................................................................................*..................... 
+ // sqrdmulh v24.4S, v29.4S, v26.S[1] // ...........................................................................................................................*.................... + // str q21, [x1, #-32] // ............................................................................................................................*................... + // add v21.4S, v17.4S, v31.4S // .............................................................................................................................*.................. + // sub v17.4S, v17.4S, v31.4S // ..............................................................................................................................*................. + // mul v28.4S, v1.4S, v26.S[0] // ...............................................................................................................................*................ + // sqrdmulh v6.4S, v1.4S, v26.S[1] // ..................................................................................................................................*............. + // str q21, [x1, #-16] // ................................................................................................................................*............... + // add x1, x1, #64 // .................................................................................................................................*.............. + // mul v1.4S, v14.4S, v26.S[0] // ...................................................................................................................................*............ + // sqrdmulh v14.4S, v14.4S, v26.S[1] // ....................................................................................................................................*........... + // mul v16.4S, v17.4S, v26.S[0] // .....................................................................................................................................*.......... 
+ // sqrdmulh v17.4S, v17.4S, v26.S[1] // ......................................................................................................................................*......... + // mls v2.4S, v24.4S, v8.S[0] // .......................................................................................................................................*........ + // mls v28.4S, v6.4S, v8.S[0] // ........................................................................................................................................*....... + // mls v1.4S, v14.4S, v8.S[0] // .........................................................................................................................................*...... + // str q2, [x2], #(16*4) // ..........................................................................................................................................*..... + // mls v16.4S, v17.4S, v8.S[0] // ...........................................................................................................................................*.... + // str q28, [x2, #-48] // ............................................................................................................................................*... + // str q1, [x2, #-32] // .............................................................................................................................................*.. + // str q16, [x2, #-16] // ..............................................................................................................................................*. 
+ // add x2, x2, #64 // ...............................................................................................................................................* + + +// ----------------------------------------------------------------------------- + + ninv .req v25 + ninv_tw .req v26 + + ASM_LOAD(xtmp, ninv_addr) + ld1r {ninv.4s}, [xtmp] + ASM_LOAD(xtmp, ninv_tw_addr) + ld1r {ninv_tw.4s}, [xtmp] + + mov count, #8 + ASM_LOAD(r_ptr0, roots_l012) + load_roots_123 + + .p2align 2 + ldr q21, [x0, #768] // .*........... + ldr q17, [x0, #896] // ...*......... + // gap // ............. + ldr q6, [x0, #256] // *............ + // gap // ............. + // gap // ............. + ldr q29, [x0, #384] // ..*.......... + // gap // ............. + // gap // ............. + ldr q9, [x0, #512] // ....*........ + // gap // ............. + // gap // ............. + add v23.4S, v21.4S, v17.4S // .......*..... + sub v21.4S, v21.4S, v17.4S // ......*...... + ldr q7, [x0, #640] // .....*....... + // gap // ............. + // gap // ............. + // gap // ............. + // gap // ............. + // gap // ............. + // gap // ............. + sqrdmulh v17.4S, v21.4S, v3.S[1] // .........*... + // gap // ............. + // gap // ............. + sub v30.4S, v9.4S, v7.4S // ........*.... + // gap // ............. + // gap // ............. + mul v10.4S, v21.4S, v3.S[0] // ..........*.. + // gap // ............. + // gap // ............. + // gap // ............. + // gap // ............. + // gap // ............. + sqrdmulh v18.4S, v30.4S, v2.S[3] // ...........*. + // gap // ............. + // gap // ............. + // gap // ............. + // gap // ............. + // gap // ............. + mls v10.4S, v17.4S, v8.S[0] // ............* + // gap // ............. + // gap // ............. + + // original source code + // ldr q6, [x0, #256] // ..*.......... + // ldr q15, [x0, #768] // *............ + // ldr q29, [x0, #384] // ...*......... 
+ // ldr q13, [x0, #896] // .*........... + // ldr q9, [x0, #512] // ....*........ + // ldr q7, [x0, #640] // .......*..... + // sub v18.4S, v15.4S, v13.4S // ......*...... + // add v23.4S, v15.4S, v13.4S // .....*....... + // sub v30.4S, v9.4S, v7.4S // .........*... + // sqrdmulh v21.4S, v18.4S, v3.S[1] // ........*.... + // mul v10.4S, v18.4S, v3.S[0] // ..........*.. + // sqrdmulh v18.4S, v30.4S, v2.S[3] // ...........*. + // mls v10.4S, v21.4S, v8.S[0] // ............* + + sub count, count, #1 +layer123_start: + ldr q17, [x0, #0] // *............................................................................................... + ldr q21, [x0, #128] // .*.............................................................................................. + sub v14.4S, v6.4S, v29.4S // .............*.................................................................................. + mul v24.4S, v30.4S, v2.S[2] // ....................*........................................................................... + add v28.4S, v6.4S, v29.4S // ..............*................................................................................. + ldr q6, [x0, #272] // ..e............................................................................................. + add v31.4S, v9.4S, v7.4S // ...................*............................................................................ + ldr q15, [x0, #784] // ......e......................................................................................... + ldr q29, [x0, #400] // ...e............................................................................................ + ldr q13, [x0, #912] // .......e........................................................................................ + mul v16.4S, v14.4S, v2.S[0] // ...............*................................................................................ 
+ ldr q9, [x0, #528] // ....e........................................................................................... + sub v30.4S, v17.4S, v21.4S // ........*....................................................................................... + ldr q7, [x0, #656] // .....e.......................................................................................... + // gap // ................................................................................................ + add v17.4S, v17.4S, v21.4S // .........*...................................................................................... + sqrdmulh v21.4S, v14.4S, v2.S[1] // ................*............................................................................... + // gap // ................................................................................................ + sub v14.4S, v31.4S, v23.4S // ......................................*......................................................... + // gap // ................................................................................................ + // gap // ................................................................................................ + mls v24.4S, v18.4S, v8.S[0] // ......................*......................................................................... + sub v18.4S, v15.4S, v13.4S // .......................e........................................................................ + // gap // ................................................................................................ + sub v12.4S, v17.4S, v28.4S // ............................*................................................................... + // gap // ................................................................................................ + // gap // ................................................................................................ 
+ add v17.4S, v17.4S, v28.4S // .............................*.................................................................. + mul v28.4S, v30.4S, v1.S[2] // ..........*..................................................................................... + // gap // ................................................................................................ + add v31.4S, v31.4S, v23.4S // .......................................*........................................................ + // gap // ................................................................................................ + // gap // ................................................................................................ + add v23.4S, v15.4S, v13.4S // ........................e....................................................................... + mls v16.4S, v21.4S, v8.S[0] // .................*.............................................................................. + // gap // ................................................................................................ + sub v21.4S, v24.4S, v10.4S // ...........................................*.................................................... + // gap // ................................................................................................ + // gap // ................................................................................................ + sqrdmulh v15.4S, v30.4S, v1.S[3] // ...........*.................................................................................... + add v24.4S, v24.4S, v10.4S // ............................................*................................................... + // gap // ................................................................................................ + sub v13.4S, v17.4S, v31.4S // ................................................*............................................... 
+ // gap // ................................................................................................ + // gap // ................................................................................................ + add v17.4S, v17.4S, v31.4S // .................................................*.............................................. + mul v31.4S, v12.4S, v0.S[2] // ..............................*................................................................. + // gap // ................................................................................................ + sub v30.4S, v9.4S, v7.4S // ..................e............................................................................. + // gap // ................................................................................................ + // gap // ................................................................................................ + sqrdmulh v12.4S, v12.4S, v0.S[3] // ...............................*................................................................ + // gap // ................................................................................................ + // gap // ................................................................................................ + // gap // ................................................................................................ + // gap // ................................................................................................ + // gap // ................................................................................................ + mul v10.4S, v14.4S, v1.S[0] // ........................................*....................................................... + // gap // ................................................................................................ + // gap // ................................................................................................ 
+ // gap // ................................................................................................ + // gap // ................................................................................................ + // gap // ................................................................................................ + sqrdmulh v14.4S, v14.4S, v1.S[1] // .........................................*...................................................... + // gap // ................................................................................................ + // gap // ................................................................................................ + // gap // ................................................................................................ + // gap // ................................................................................................ + // gap // ................................................................................................ + mls v28.4S, v15.4S, v8.S[0] // ............*................................................................................... + // gap // ................................................................................................ + // gap // ................................................................................................ + // gap // ................................................................................................ + // gap // ................................................................................................ + // gap // ................................................................................................ + mls v31.4S, v12.4S, v8.S[0] // ................................*............................................................... + // gap // ................................................................................................ 
+ // gap // ................................................................................................ + // gap // ................................................................................................ + // gap // ................................................................................................ + // gap // ................................................................................................ + mls v10.4S, v14.4S, v8.S[0] // ..........................................*..................................................... + // gap // ................................................................................................ + // gap // ................................................................................................ + sub v14.4S, v28.4S, v16.4S // .................................*.............................................................. + // gap // ................................................................................................ + // gap // ................................................................................................ + add v28.4S, v28.4S, v16.4S // ..................................*............................................................. + mul v15.4S, v21.4S, v1.S[0] // .............................................*.................................................. + // gap // ................................................................................................ + // gap // ................................................................................................ + // gap // ................................................................................................ + // gap // ................................................................................................ + mul v16.4S, v14.4S, v0.S[2] // ...................................*............................................................ 
+ // gap // ................................................................................................ + // gap // ................................................................................................ + sub v12.4S, v28.4S, v24.4S // .....................................................*.......................................... + // gap // ................................................................................................ + // gap // ................................................................................................ + add v24.4S, v28.4S, v24.4S // ......................................................*......................................... + sqrdmulh v14.4S, v14.4S, v0.S[3] // ....................................*........................................................... + // gap // ................................................................................................ + sub v28.4S, v31.4S, v10.4S // ..........................................................*..................................... + // gap // ................................................................................................ + // gap // ................................................................................................ + sqrdmulh v21.4S, v21.4S, v1.S[1] // ..............................................*................................................. + add v31.4S, v31.4S, v10.4S // ...........................................................*.................................... + // gap // ................................................................................................ + // gap // ................................................................................................ + // gap // ................................................................................................ + // gap // ................................................................................................ 
+ mul v10.4S, v13.4S, v0.S[0] // ..................................................*............................................. + // gap // ................................................................................................ + // gap // ................................................................................................ + // gap // ................................................................................................ + // gap // ................................................................................................ + // gap // ................................................................................................ + mls v16.4S, v14.4S, v8.S[0] // .....................................*.......................................................... + // gap // ................................................................................................ + // gap // ................................................................................................ + // gap // ................................................................................................ + // gap // ................................................................................................ + // gap // ................................................................................................ + mls v15.4S, v21.4S, v8.S[0] // ...............................................*................................................ + // gap // ................................................................................................ + // gap // ................................................................................................ + // gap // ................................................................................................ + // gap // ................................................................................................ 
+ // gap // ................................................................................................ + sqrdmulh v21.4S, v13.4S, v0.S[1] // ...................................................*............................................ + // gap // ................................................................................................ + // gap // ................................................................................................ + // gap // ................................................................................................ + // gap // ................................................................................................ + // gap // ................................................................................................ + mul v14.4S, v12.4S, v0.S[0] // .......................................................*........................................ + // gap // ................................................................................................ + // gap // ................................................................................................ + sub v13.4S, v16.4S, v15.4S // ...............................................................*................................ + // gap // ................................................................................................ + // gap // ................................................................................................ + add v15.4S, v16.4S, v15.4S // ................................................................*............................... + mul v16.4S, v17.4S, v25.4S // ................................................................................*............... + // gap // ................................................................................................ + // gap // ................................................................................................ 
+ // gap // ................................................................................................ + // gap // ................................................................................................ + sqrdmulh v17.4S, v17.4S, v26.4S // .................................................................................*.............. + // gap // ................................................................................................ + // gap // ................................................................................................ + // gap // ................................................................................................ + // gap // ................................................................................................ + // gap // ................................................................................................ + mls v10.4S, v21.4S, v8.S[0] // ....................................................*........................................... + // gap // ................................................................................................ + // gap // ................................................................................................ + // gap // ................................................................................................ + // gap // ................................................................................................ + // gap // ................................................................................................ + sqrdmulh v21.4S, v12.4S, v0.S[1] // ........................................................*....................................... + // gap // ................................................................................................ + // gap // ................................................................................................ 
+ // gap // ................................................................................................ + // gap // ................................................................................................ + // gap // ................................................................................................ + mul v12.4S, v28.4S, v0.S[0] // ............................................................*................................... + // gap // ................................................................................................ + // gap // ................................................................................................ + srshr v4.4S, v10.4S, #23 // ....................................................................*........................... + // gap // ................................................................................................ + // gap // ................................................................................................ + sqrdmulh v28.4S, v28.4S, v0.S[1] // .............................................................*.................................. + // gap // ................................................................................................ + // gap // ................................................................................................ + // gap // ................................................................................................ + // gap // ................................................................................................ + // gap // ................................................................................................ + mls v14.4S, v21.4S, v8.S[0] // .........................................................*...................................... + // gap // ................................................................................................ 
+ // gap // ................................................................................................ + // gap // ................................................................................................ + // gap // ................................................................................................ + // gap // ................................................................................................ + mul v21.4S, v13.4S, v0.S[0] // .................................................................*.............................. + // gap // ................................................................................................ + // gap // ................................................................................................ + // gap // ................................................................................................ + // gap // ................................................................................................ + // gap // ................................................................................................ + mls v12.4S, v28.4S, v8.S[0] // ..............................................................*................................. + // gap // ................................................................................................ + // gap // ................................................................................................ + srshr v28.4S, v14.4S, #23 // ......................................................................*......................... + // gap // ................................................................................................ + // gap // ................................................................................................ + sqrdmulh v13.4S, v13.4S, v0.S[1] // ..................................................................*............................. 
+ // gap // ................................................................................................ + // gap // ................................................................................................ + // gap // ................................................................................................ + // gap // ................................................................................................ + // gap // ................................................................................................ + mls v10.4S, v4.4S, v8.4S // .....................................................................*.......................... + // gap // ................................................................................................ + // gap // ................................................................................................ + srshr v4.4S, v12.4S, #23 // ........................................................................*....................... + // gap // ................................................................................................ + // gap // ................................................................................................ + mls v14.4S, v28.4S, v8.4S // .......................................................................*........................ + // gap // ................................................................................................ + // gap // ................................................................................................ + // gap // ................................................................................................ + // gap // ................................................................................................ + // gap // ................................................................................................ 
+ mls v21.4S, v13.4S, v8.S[0] // ...................................................................*............................ + // gap // ................................................................................................ + // gap // ................................................................................................ + str q10, [x0, #512] // ............................................................................*................... + // gap // ................................................................................................ + // gap // ................................................................................................ + mls v12.4S, v4.4S, v8.4S // .........................................................................*...................... + // gap // ................................................................................................ + // gap // ................................................................................................ + str q14, [x0, #640] // .............................................................................*.................. + // gap // ................................................................................................ + // gap // ................................................................................................ + mls v16.4S, v17.4S, v8.S[0] // ..................................................................................*............. + // gap // ................................................................................................ + // gap // ................................................................................................ + srshr v17.4S, v21.4S, #23 // ..........................................................................*..................... + // gap // ................................................................................................ 
+ // gap // ................................................................................................ + mul v14.4S, v24.4S, v25.4S // ...................................................................................*............ + // gap // ................................................................................................ + // gap // ................................................................................................ + str q12, [x0, #768] // ..............................................................................*................. + // gap // ................................................................................................ + // gap // ................................................................................................ + mls v21.4S, v17.4S, v8.4S // ...........................................................................*.................... + // gap // ................................................................................................ + // gap // ................................................................................................ + str q16, [x0], #(16) // ............................................................................................*... + // gap // ................................................................................................ + // gap // ................................................................................................ + sqrdmulh v17.4S, v24.4S, v26.4S // ....................................................................................*........... + // gap // ................................................................................................ + // gap // ................................................................................................ + // gap // ................................................................................................ 
+ // gap // ................................................................................................ + // gap // ................................................................................................ + mul v24.4S, v31.4S, v25.4S // ......................................................................................*......... + // gap // ................................................................................................ + // gap // ................................................................................................ + str q21, [x0, #880] // ...............................................................................*................ + // gap // ................................................................................................ + // gap // ................................................................................................ + sqrdmulh v21.4S, v31.4S, v26.4S // .......................................................................................*........ + // gap // ................................................................................................ + // gap // ................................................................................................ + // gap // ................................................................................................ + // gap // ................................................................................................ + // gap // ................................................................................................ + mls v14.4S, v17.4S, v8.S[0] // .....................................................................................*.......... + // gap // ................................................................................................ + // gap // ................................................................................................ 
+ // gap // ................................................................................................ + // gap // ................................................................................................ + // gap // ................................................................................................ + mul v17.4S, v15.4S, v25.4S // .........................................................................................*...... + // gap // ................................................................................................ + // gap // ................................................................................................ + // gap // ................................................................................................ + // gap // ................................................................................................ + // gap // ................................................................................................ + sqrdmulh v28.4S, v15.4S, v26.4S // ..........................................................................................*..... + // gap // ................................................................................................ + // gap // ................................................................................................ + str q14, [x0, #112] // .............................................................................................*.. + // gap // ................................................................................................ + // gap // ................................................................................................ + mls v24.4S, v21.4S, v8.S[0] // ........................................................................................*....... + // gap // ................................................................................................ 
+ // gap // ................................................................................................ + // gap // ................................................................................................ + // gap // ................................................................................................ + // gap // ................................................................................................ + sqrdmulh v21.4S, v18.4S, v3.S[1] // ..........................e..................................................................... + // gap // ................................................................................................ + // gap // ................................................................................................ + // gap // ................................................................................................ + // gap // ................................................................................................ + // gap // ................................................................................................ + mls v17.4S, v28.4S, v8.S[0] // ...........................................................................................*.... + // gap // ................................................................................................ + // gap // ................................................................................................ + str q24, [x0, #240] // ..............................................................................................*. + // gap // ................................................................................................ + // gap // ................................................................................................ + mul v10.4S, v18.4S, v3.S[0] // .........................e...................................................................... 
+ // gap // ................................................................................................ + // gap // ................................................................................................ + // gap // ................................................................................................ + // gap // ................................................................................................ + // gap // ................................................................................................ + sqrdmulh v18.4S, v30.4S, v2.S[3] // .....................e.......................................................................... + // gap // ................................................................................................ + // gap // ................................................................................................ + str q17, [x0, #368] // ...............................................................................................* + // gap // ................................................................................................ + // gap // ................................................................................................ + mls v10.4S, v21.4S, v8.S[0] // ...........................e.................................................................... + // gap // ................................................................................................ + // gap // ................................................................................................ + + // original source code + // ldr q9, [x0, #0] // ...........................................................................................*.............................................................................................. 
+ // ldr q10, [x0, #(1*(1024/8))] // ...........................................................................................|*............................................................................................. + // ldr q11, [x0, #(2*(1024/8))] // e..........................................................................................|....e......................................................................................... + // ldr q12, [x0, #(3*(1024/8))] // ...e.......................................................................................|.......e...................................................................................... + // ldr q13, [x0, #(4*(1024/8))] // ......e....................................................................................|..........e................................................................................... + // ldr q14, [x0, #(5*(1024/8))] // ........e..................................................................................|............e................................................................................. + // ldr q15, [x0, #(6*(1024/8))] // ..e........................................................................................|......e....................................................................................... + // ldr q16, [x0, #(7*(1024/8))] // ....e......................................................................................|........e..................................................................................... + // sub v24.4s, v9.4s, v10.4s // .......*...................................................................................|...........*.................................................................................. 
+ // add v9.4s, v9.4s, v10.4s // .........*.................................................................................|.............*................................................................................ + // mul v10.4s, v24.4s, v1.s[2] // ................*..........................................................................|....................*......................................................................... + // sqrdmulh v24.4s, v24.4s, v1.s[3] // .....................*.....................................................................|.........................*.................................................................... + // mls v10.4s, v24.4s, v8.s[0] // ..............................*............................................................|..................................*........................................................... + // sub v24.4s, v11.4s, v12.4s // ...........................................................................................|.*............................................................................................ + // add v11.4s, v11.4s, v12.4s // ...........................................................................................|...*.......................................................................................... + // mul v12.4s, v24.4s, v2.s[0] // .....*.....................................................................................|.........*.................................................................................... + // sqrdmulh v24.4s, v24.4s, v2.s[1] // ..........*................................................................................|..............*............................................................................... 
+ // mls v12.4s, v24.4s, v8.s[0] // ...................*.......................................................................|.......................*...................................................................... + // sub v24.4s, v13.4s, v14.4s // ..........................e................................................................|..............................e............................................................... + // add v13.4s, v13.4s, v14.4s // .*.........................................................................................|.....*........................................................................................ + // mul v14.4s, v24.4s, v2.s[2] // ...........................................................................................|..*........................................................................................... + // sqrdmulh v24.4s, v24.4s, v2.s[3] // ........................................................................................e..|............................................................................................e. + // mls v14.4s, v24.4s, v8.s[0] // ............*..............................................................................|................*............................................................................. + // sub v24.4s, v15.4s, v16.4s // .............e.............................................................................|.................e............................................................................ + // add v15.4s, v15.4s, v16.4s // ..................e........................................................................|......................e....................................................................... 
+ // mul v16.4s, v24.4s, v3.s[0] // .......................................................................................e...|...........................................................................................e.. + // sqrdmulh v24.4s, v24.4s, v3.s[1] // ....................................................................................e......|........................................................................................e..... + // mls v16.4s, v24.4s, v8.s[0] // ..........................................................................................e|.............................................................................................. + // sub v24.4s, v9.4s, v11.4s // ..............*............................................................................|..................*........................................................................... + // add v9.4s, v9.4s, v11.4s // ...............*...........................................................................|...................*.......................................................................... + // mul v11.4s, v24.4s, v0.s[2] // .........................*.................................................................|.............................*................................................................ + // sqrdmulh v24.4s, v24.4s, v0.s[3] // ...........................*...............................................................|...............................*.............................................................. + // mls v11.4s, v24.4s, v8.s[0] // ...............................*...........................................................|...................................*.......................................................... 
+ // sub v24.4s, v10.4s, v12.4s // .................................*.........................................................|.....................................*........................................................ + // add v10.4s, v10.4s, v12.4s // ..................................*........................................................|......................................*....................................................... + // mul v12.4s, v24.4s, v0.s[2] // ....................................*......................................................|........................................*..................................................... + // sqrdmulh v24.4s, v24.4s, v0.s[3] // .......................................*...................................................|...........................................*.................................................. + // mls v12.4s, v24.4s, v8.s[0] // ............................................*..............................................|................................................*............................................. + // sub v24.4s, v13.4s, v15.4s // ...........*...............................................................................|...............*.............................................................................. + // add v13.4s, v13.4s, v15.4s // .................*.........................................................................|.....................*........................................................................ + // mul v15.4s, v24.4s, v1.s[0] // ............................*..............................................................|................................*............................................................. 
+ // sqrdmulh v24.4s, v24.4s, v1.s[1] // .............................*.............................................................|.................................*............................................................ + // mls v15.4s, v24.4s, v8.s[0] // ................................*..........................................................|....................................*......................................................... + // sub v24.4s, v14.4s, v16.4s // ....................*......................................................................|........................*..................................................................... + // add v14.4s, v14.4s, v16.4s // ......................*....................................................................|..........................*................................................................... + // mul v16.4s, v24.4s, v1.s[0] // ...................................*.......................................................|.......................................*...................................................... + // sqrdmulh v24.4s, v24.4s, v1.s[1] // .........................................*.................................................|.............................................*................................................ + // mls v16.4s, v24.4s, v8.s[0] // .............................................*.............................................|.................................................*............................................ + // sub v24.4s, v9.4s, v13.4s // .......................*...................................................................|...........................*.................................................................. 
+ // add v9.4s, v9.4s, v13.4s // ........................*..................................................................|............................*................................................................. + // mul v13.4s, v24.4s, v0.s[0] // ...........................................*...............................................|...............................................*.............................................. + // sqrdmulh v24.4s, v24.4s, v0.s[1] // ..............................................*............................................|..................................................*........................................... + // mls v13.4s, v24.4s, v8.s[0] // ....................................................*......................................|........................................................*..................................... + // sub v24.4s, v10.4s, v14.4s // .....................................*.....................................................|.........................................*.................................................... + // add v10.4s, v10.4s, v14.4s // ......................................*....................................................|..........................................*................................................... + // mul v14.4s, v24.4s, v0.s[0] // ...............................................*...........................................|...................................................*.......................................... + // sqrdmulh v24.4s, v24.4s, v0.s[1] // .....................................................*.....................................|.........................................................*.................................... 
+ // mls v14.4s, v24.4s, v8.s[0] // .........................................................*.................................|.............................................................*................................ + // sub v24.4s, v11.4s, v15.4s // ........................................*..................................................|............................................*................................................. + // add v11.4s, v11.4s, v15.4s // ..........................................*................................................|..............................................*............................................... + // mul v15.4s, v24.4s, v0.s[0] // ......................................................*....................................|..........................................................*................................... + // sqrdmulh v24.4s, v24.4s, v0.s[1] // ........................................................*..................................|............................................................*................................. + // mls v15.4s, v24.4s, v8.s[0] // ...........................................................*...............................|...............................................................*.............................. + // sub v24.4s, v12.4s, v16.4s // ................................................*..........................................|....................................................*......................................... + // add v12.4s, v12.4s, v16.4s // .................................................*.........................................|.....................................................*........................................ 
+ // mul v16.4s, v24.4s, v0.s[0] // ..........................................................*................................|..............................................................*............................... + // sqrdmulh v24.4s, v24.4s, v0.s[1] // .............................................................*.............................|.................................................................*............................ + // mls v16.4s, v24.4s, v8.s[0] // .................................................................*.........................|.....................................................................*........................ + // srshr v24.4S, v13.4S, #23 // .......................................................*...................................|...........................................................*.................................. + // mls v13.4s, v24.4s, v8.4s // ..............................................................*............................|..................................................................*........................... + // srshr v24.4S, v14.4S, #23 // ............................................................*..............................|................................................................*............................. + // mls v14.4s, v24.4s, v8.4s // ................................................................*..........................|....................................................................*......................... + // srshr v24.4S, v15.4S, #23 // ...............................................................*...........................|...................................................................*.......................... + // mls v15.4s, v24.4s, v8.4s // ...................................................................*.......................|.......................................................................*...................... 
+ // srshr v24.4S, v16.4S, #23 // ......................................................................*....................|..........................................................................*................... + // mls v16.4s, v24.4s, v8.4s // .........................................................................*.................|.............................................................................*................ + // str q13, [x0, #(4*(1024/8))] // ..................................................................*........................|......................................................................*....................... + // str q14, [x0, #(5*(1024/8))] // ....................................................................*......................|........................................................................*..................... + // str q15, [x0, #(6*(1024/8))] // ........................................................................*..................|............................................................................*................. + // str q16, [x0, #(7*(1024/8))] // .............................................................................*.............|.................................................................................*............ + // mul v13.4s, v9.4s, v25.4s // ..................................................*........................................|......................................................*....................................... + // sqrdmulh v9.4s, v9.4s, v26.4s // ...................................................*.......................................|.......................................................*...................................... 
+ // mls v13.4s, v9.4s, v8.s[0] // .....................................................................*.....................|.........................................................................*.................... + // mul v14.4s, v10.4s, v25.4s // .......................................................................*...................|...........................................................................*.................. + // sqrdmulh v10.4s, v10.4s, v26.4s // ...........................................................................*...............|...............................................................................*.............. + // mls v14.4s, v10.4s, v8.s[0] // ...............................................................................*...........|...................................................................................*.......... + // mul v15.4s, v11.4s, v25.4s // ............................................................................*..............|................................................................................*............. + // sqrdmulh v11.4s, v11.4s, v26.4s // ..............................................................................*............|..................................................................................*........... + // mls v15.4s, v11.4s, v8.s[0] // ...................................................................................*.......|.......................................................................................*...... + // mul v16.4s, v12.4s, v25.4s // ................................................................................*..........|....................................................................................*......... 
+ // sqrdmulh v12.4s, v12.4s, v26.4s // .................................................................................*.........|.....................................................................................*........ + // mls v16.4s, v12.4s, v8.s[0] // .....................................................................................*.....|.........................................................................................*.... + // str q13, [x0], #(16) // ..........................................................................*................|..............................................................................*............... + // str q14, [x0, #(-16 + 1*(1024/8))] // ..................................................................................*........|......................................................................................*....... + // str q15, [x0, #(-16 + 2*(1024/8))] // ......................................................................................*....|..........................................................................................*... + // str q16, [x0, #(-16 + 3*(1024/8))] // .........................................................................................*.|.............................................................................................* + + sub count, count, #1 + cbnz count, layer123_start + sub v28.4S, v6.4S, v29.4S // ..*................................................................................ + mul v17.4S, v30.4S, v2.S[2] // ...*............................................................................... + ldr q21, [x0, #0] // *.................................................................................. + add v14.4S, v6.4S, v29.4S // ....*.............................................................................. + ldr q24, [x0, #128] // .*................................................................................. 
+ // gap // ................................................................................... + add v6.4S, v9.4S, v7.4S // .....*............................................................................. + mls v17.4S, v18.4S, v8.S[0] // ...........*....................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + mul v31.4S, v28.4S, v2.S[0] // ......*............................................................................ + // gap // ................................................................................... + // gap // ................................................................................... + sub v15.4S, v6.4S, v23.4S // ..........*........................................................................ + // gap // ................................................................................... + // gap // ................................................................................... + add v6.4S, v6.4S, v23.4S // ...............*................................................................... + sqrdmulh v28.4S, v28.4S, v2.S[1] // .........*......................................................................... + // gap // ................................................................................... + sub v29.4S, v17.4S, v10.4S // .................*................................................................. + // gap // ................................................................................... + // gap // ................................................................................... 
+ add v17.4S, v17.4S, v10.4S // ...................*............................................................... + mul v13.4S, v15.4S, v1.S[0] // ........................*.......................................................... + // gap // ................................................................................... + add v16.4S, v21.4S, v24.4S // ........*.......................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + sub v21.4S, v21.4S, v24.4S // .......*........................................................................... + sqrdmulh v24.4S, v15.4S, v1.S[1] // .........................*......................................................... + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + mls v31.4S, v28.4S, v8.S[0] // ................*.................................................................. + sub v28.4S, v16.4S, v14.4S // ............*...................................................................... + // gap // ................................................................................... + add v14.4S, v16.4S, v14.4S // .............*..................................................................... + // gap // ................................................................................... + // gap // ................................................................................... 
+ mul v15.4S, v21.4S, v1.S[2] // ..............*.................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + sqrdmulh v21.4S, v21.4S, v1.S[3] // ..................*................................................................ + sub v16.4S, v14.4S, v6.4S // ....................*.............................................................. + // gap // ................................................................................... + add v14.4S, v14.4S, v6.4S // .....................*............................................................. + // gap // ................................................................................... + // gap // ................................................................................... + mul v6.4S, v28.4S, v0.S[2] // ......................*............................................................ + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + sqrdmulh v28.4S, v28.4S, v0.S[3] // .......................*........................................................... 
+ // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + mul v9.4S, v29.4S, v1.S[0] // ...............................*................................................... + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + sqrdmulh v29.4S, v29.4S, v1.S[1] // .....................................*............................................. + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + mls v15.4S, v21.4S, v8.S[0] // ..........................*........................................................ + // gap // ................................................................................... + // gap // ................................................................................... 
+ // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + mls v13.4S, v24.4S, v8.S[0] // ............................*...................................................... + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + mls v6.4S, v28.4S, v8.S[0] // ...........................*....................................................... + // gap // ................................................................................... + // gap // ................................................................................... + add v21.4S, v15.4S, v31.4S // ..............................*.................................................... + // gap // ................................................................................... + // gap // ................................................................................... + sub v24.4S, v15.4S, v31.4S // .............................*..................................................... + mul v28.4S, v16.4S, v0.S[0] // .......................................*........................................... + // gap // ................................................................................... + // gap // ................................................................................... 
+ // gap // ................................................................................... + // gap // ................................................................................... + mls v9.4S, v29.4S, v8.S[0] // .........................................*......................................... + sub v31.4S, v21.4S, v17.4S // .................................*................................................. + // gap // ................................................................................... + add v17.4S, v21.4S, v17.4S // ..................................*................................................ + // gap // ................................................................................... + // gap // ................................................................................... + mul v21.4S, v24.4S, v0.S[2] // ................................*.................................................. + sub v15.4S, v6.4S, v13.4S // ....................................*.............................................. + // gap // ................................................................................... + add v6.4S, v6.4S, v13.4S // ......................................*............................................ + // gap // ................................................................................... + // gap // ................................................................................... + sqrdmulh v24.4S, v24.4S, v0.S[3] // ...................................*............................................... + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... 
+ // gap // ................................................................................... + sqrdmulh v29.4S, v16.4S, v0.S[1] // ..........................................*........................................ + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + mul v13.4S, v31.4S, v0.S[0] // ...........................................*....................................... + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + mls v21.4S, v24.4S, v8.S[0] // ........................................*.......................................... + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... 
+ mul v24.4S, v14.4S, v25.4S // ..............................................*.................................... + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + sqrdmulh v14.4S, v14.4S, v26.4S // ...............................................*................................... + // gap // ................................................................................... + // gap // ................................................................................... + sub v16.4S, v21.4S, v9.4S // ............................................*...................................... + // gap // ................................................................................... + // gap // ................................................................................... + mls v28.4S, v29.4S, v8.S[0] // ................................................*.................................. + add v21.4S, v21.4S, v9.4S // .............................................*..................................... + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + sqrdmulh v31.4S, v31.4S, v0.S[1] // .................................................*................................. 
+ // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + mul v29.4S, v15.4S, v0.S[0] // ..................................................*................................ + // gap // ................................................................................... + // gap // ................................................................................... + srshr v9.4S, v28.4S, #23 // ...................................................*............................... + // gap // ................................................................................... + // gap // ................................................................................... + sqrdmulh v15.4S, v15.4S, v0.S[1] // ....................................................*.............................. + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + mls v13.4S, v31.4S, v8.S[0] // .....................................................*............................. + // gap // ................................................................................... 
+ // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + mul v31.4S, v16.4S, v0.S[0] // ......................................................*............................ + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + mls v29.4S, v15.4S, v8.S[0] // .......................................................*........................... + // gap // ................................................................................... + // gap // ................................................................................... + srshr v15.4S, v13.4S, #23 // ........................................................*.......................... + // gap // ................................................................................... + // gap // ................................................................................... + sqrdmulh v16.4S, v16.4S, v0.S[1] // .........................................................*......................... + // gap // ................................................................................... + // gap // ................................................................................... 
+ // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + mls v28.4S, v9.4S, v8.4S // ..........................................................*........................ + // gap // ................................................................................... + // gap // ................................................................................... + srshr v9.4S, v29.4S, #23 // ...........................................................*....................... + // gap // ................................................................................... + // gap // ................................................................................... + mls v13.4S, v15.4S, v8.4S // ............................................................*...................... + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + mls v31.4S, v16.4S, v8.S[0] // .............................................................*..................... + // gap // ................................................................................... + // gap // ................................................................................... + str q28, [x0, #512] // ..............................................................*.................... 
+ // gap // ................................................................................... + // gap // ................................................................................... + mls v29.4S, v9.4S, v8.4S // ...............................................................*................... + // gap // ................................................................................... + // gap // ................................................................................... + str q13, [x0, #640] // ................................................................*.................. + // gap // ................................................................................... + // gap // ................................................................................... + mls v24.4S, v14.4S, v8.S[0] // .................................................................*................. + // gap // ................................................................................... + // gap // ................................................................................... + srshr v14.4S, v31.4S, #23 // ..................................................................*................ + // gap // ................................................................................... + // gap // ................................................................................... + mul v28.4S, v17.4S, v25.4S // ...................................................................*............... + // gap // ................................................................................... + // gap // ................................................................................... + str q29, [x0, #768] // ....................................................................*.............. + // gap // ................................................................................... 
+ // gap // ................................................................................... + mls v31.4S, v14.4S, v8.4S // .....................................................................*............. + // gap // ................................................................................... + // gap // ................................................................................... + str q24, [x0], #(16) // ......................................................................*............ + // gap // ................................................................................... + // gap // ................................................................................... + sqrdmulh v17.4S, v17.4S, v26.4S // .......................................................................*........... + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + mul v14.4S, v6.4S, v25.4S // ........................................................................*.......... + // gap // ................................................................................... + // gap // ................................................................................... + str q31, [x0, #880] // .........................................................................*......... + // gap // ................................................................................... + // gap // ................................................................................... 
+ sqrdmulh v24.4S, v6.4S, v26.4S // ..........................................................................*........ + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + mls v28.4S, v17.4S, v8.S[0] // ...........................................................................*....... + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + sqrdmulh v17.4S, v21.4S, v26.4S // .............................................................................*..... + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + mul v21.4S, v21.4S, v25.4S // ............................................................................*...... 
+ // gap // ................................................................................... + // gap // ................................................................................... + str q28, [x0, #112] // ..............................................................................*.... + // gap // ................................................................................... + // gap // ................................................................................... + mls v14.4S, v24.4S, v8.S[0] // ...............................................................................*... + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + mls v21.4S, v17.4S, v8.S[0] // ................................................................................*.. + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... 
+ str q14, [x0, #240] // .................................................................................*. + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + str q21, [x0, #368] // ..................................................................................* + // gap // ................................................................................... + // gap // ................................................................................... + + // original source code + // ldr q17, [x0, #0] // ..*................................................................................ + // ldr q21, [x0, #128] // ....*.............................................................................. + // sub v14.4S, v6.4S, v29.4S // *.................................................................................. + // mul v24.4S, v30.4S, v2.S[2] // .*................................................................................. + // add v28.4S, v6.4S, v29.4S // ...*............................................................................... + // add v31.4S, v9.4S, v7.4S // .....*............................................................................. + // mul v16.4S, v14.4S, v2.S[0] // .......*........................................................................... + // sub v30.4S, v17.4S, v21.4S // ...............*................................................................... + // add v17.4S, v17.4S, v21.4S // ..............*.................................................................... 
+ // sqrdmulh v21.4S, v14.4S, v2.S[1] // ..........*........................................................................ + // sub v14.4S, v31.4S, v23.4S // ........*.......................................................................... + // mls v24.4S, v18.4S, v8.S[0] // ......*............................................................................ + // sub v12.4S, v17.4S, v28.4S // ..................*................................................................ + // add v17.4S, v17.4S, v28.4S // ...................*............................................................... + // mul v28.4S, v30.4S, v1.S[2] // ....................*.............................................................. + // add v31.4S, v31.4S, v23.4S // .........*......................................................................... + // mls v16.4S, v21.4S, v8.S[0] // .................*................................................................. + // sub v21.4S, v24.4S, v10.4S // ...........*....................................................................... + // sqrdmulh v15.4S, v30.4S, v1.S[3] // .....................*............................................................. + // add v24.4S, v24.4S, v10.4S // ............*...................................................................... + // sub v13.4S, v17.4S, v31.4S // ......................*............................................................ + // add v17.4S, v17.4S, v31.4S // .......................*........................................................... + // mul v31.4S, v12.4S, v0.S[2] // ........................*.......................................................... + // sqrdmulh v12.4S, v12.4S, v0.S[3] // .........................*......................................................... + // mul v10.4S, v14.4S, v1.S[0] // .............*..................................................................... 
+ // sqrdmulh v14.4S, v14.4S, v1.S[1] // ................*.................................................................. + // mls v28.4S, v15.4S, v8.S[0] // ............................*...................................................... + // mls v31.4S, v12.4S, v8.S[0] // ..............................*.................................................... + // mls v10.4S, v14.4S, v8.S[0] // .............................*..................................................... + // sub v14.4S, v28.4S, v16.4S // ................................*.................................................. + // add v28.4S, v28.4S, v16.4S // ...............................*................................................... + // mul v15.4S, v21.4S, v1.S[0] // ..........................*........................................................ + // mul v16.4S, v14.4S, v0.S[2] // .....................................*............................................. + // sub v12.4S, v28.4S, v24.4S // ...................................*............................................... + // add v24.4S, v28.4S, v24.4S // ....................................*.............................................. + // sqrdmulh v14.4S, v14.4S, v0.S[3] // ........................................*.......................................... + // sub v28.4S, v31.4S, v10.4S // ......................................*............................................ + // sqrdmulh v21.4S, v21.4S, v1.S[1] // ...........................*....................................................... + // add v31.4S, v31.4S, v10.4S // .......................................*........................................... + // mul v10.4S, v13.4S, v0.S[0] // .................................*................................................. + // mls v16.4S, v14.4S, v8.S[0] // ...........................................*....................................... 
+ // mls v15.4S, v21.4S, v8.S[0] // ..................................*................................................ + // sqrdmulh v21.4S, v13.4S, v0.S[1] // .........................................*......................................... + // mul v14.4S, v12.4S, v0.S[0] // ..........................................*........................................ + // sub v13.4S, v16.4S, v15.4S // ..............................................*.................................... + // add v15.4S, v16.4S, v15.4S // ................................................*.................................. + // mul v16.4S, v17.4S, v25.4S // ............................................*...................................... + // sqrdmulh v17.4S, v17.4S, v26.4S // .............................................*..................................... + // mls v10.4S, v21.4S, v8.S[0] // ...............................................*................................... + // sqrdmulh v21.4S, v12.4S, v0.S[1] // .................................................*................................. + // mul v12.4S, v28.4S, v0.S[0] // ..................................................*................................ + // srshr v4.4S, v10.4S, #23 // ...................................................*............................... + // sqrdmulh v28.4S, v28.4S, v0.S[1] // ....................................................*.............................. + // mls v14.4S, v21.4S, v8.S[0] // .....................................................*............................. + // mul v21.4S, v13.4S, v0.S[0] // ......................................................*............................ + // mls v12.4S, v28.4S, v8.S[0] // .......................................................*........................... + // srshr v28.4S, v14.4S, #23 // ........................................................*.......................... 
+ // sqrdmulh v13.4S, v13.4S, v0.S[1] // .........................................................*......................... + // mls v10.4S, v4.4S, v8.4S // ..........................................................*........................ + // srshr v4.4S, v12.4S, #23 // ...........................................................*....................... + // mls v14.4S, v28.4S, v8.4S // ............................................................*...................... + // mls v21.4S, v13.4S, v8.S[0] // .............................................................*..................... + // str q10, [x0, #512] // ..............................................................*.................... + // mls v12.4S, v4.4S, v8.4S // ...............................................................*................... + // str q14, [x0, #640] // ................................................................*.................. + // mls v16.4S, v17.4S, v8.S[0] // .................................................................*................. + // srshr v17.4S, v21.4S, #23 // ..................................................................*................ + // mul v14.4S, v24.4S, v25.4S // ...................................................................*............... + // str q12, [x0, #768] // ....................................................................*.............. + // mls v21.4S, v17.4S, v8.4S // .....................................................................*............. + // str q16, [x0], #(16) // ......................................................................*............ + // sqrdmulh v17.4S, v24.4S, v26.4S // .......................................................................*........... + // mul v24.4S, v31.4S, v25.4S // ........................................................................*.......... + // str q21, [x0, #880] // .........................................................................*......... 
+ // sqrdmulh v21.4S, v31.4S, v26.4S // ..........................................................................*........ + // mls v14.4S, v17.4S, v8.S[0] // ...........................................................................*....... + // mul v17.4S, v15.4S, v25.4S // .............................................................................*..... + // sqrdmulh v28.4S, v15.4S, v26.4S // ............................................................................*...... + // str q14, [x0, #112] // ..............................................................................*.... + // mls v24.4S, v21.4S, v8.S[0] // ...............................................................................*... + // mls v17.4S, v28.4S, v8.S[0] // ................................................................................*.. + // str q24, [x0, #240] // .................................................................................*. + // str q17, [x0, #368] // ..................................................................................* + + + pop_stack + ret \ No newline at end of file diff --git a/tests/ntt_dilithium/manual/intt_dilithium_123_45678_manual_ld4_opt_m1_firestorm.s b/tests/ntt_dilithium/manual/intt_dilithium_123_45678_manual_ld4_opt_m1_firestorm.s new file mode 100644 index 0000000..9d7b69c --- /dev/null +++ b/tests/ntt_dilithium/manual/intt_dilithium_123_45678_manual_ld4_opt_m1_firestorm.s @@ -0,0 +1,2292 @@ + +// Needed to provide ASM_LOAD directive +#include + +// NOTE +// We use a lot of trivial macros to simplify the parsing burden for Slothy +// The macros are not unfolded by Slothy and thus interpreted as instructions, +// which are easier to parse due to e.g. the lack of size specifiers and simpler +// syntax for pre and post increment for loads and stores. + +// Eventually, NeLight should include a proper parser for AArch64, +// but for initial investigations, the below is enough. 
+// Scratch general-purpose register aliases.
+xtmp0 .req x10
+xtmp1 .req x11
+
+// ---------------------------------------------------------------------------
+// Thin load/store wrappers. Each takes a vector alias name and maps it to the
+// matching q-register through the qform_<name> aliases defined further down.
+// ---------------------------------------------------------------------------
+
+// ldr_vo: load qform_<vec> from [base + offset].
+.macro ldr_vo vec, base, offset
+    ldr qform_\vec, [\base, #\offset]
+.endm
+
+// ldr_vi: load qform_<vec> from [base], then post-increment base by inc.
+.macro ldr_vi vec, base, inc
+    ldr qform_\vec, [\base], #\inc
+.endm
+
+// str_vo: store qform_<vec> to [base + offset].
+.macro str_vo vec, base, offset
+    str qform_\vec, [\base, #\offset]
+.endm
+// str_vi: store qform_<vec> to [base], then post-increment base by inc.
+.macro str_vi vec, base, inc
+    str qform_\vec, [\base], #\inc
+.endm
+
+// ---------------------------------------------------------------------------
+// Arithmetic wrappers on four 32-bit lanes. The "q" suffixed variants take the
+// last operand by element (lane i of b) instead of as a full vector.
+// ---------------------------------------------------------------------------
+
+// vqrdmulh: d = sqrdmulh(a, b), vector by vector.
+.macro vqrdmulh d,a,b
+    sqrdmulh \d\().4s, \a\().4s, \b\().4s
+.endm
+// vmls: d -= a * b, vector by vector.
+.macro vmls d,a,b
+    mls \d\().4s, \a\().4s, \b\().4s
+.endm
+// vqrdmulhq: d = sqrdmulh(a, lane i of b).
+.macro vqrdmulhq d,a,b,i
+    sqrdmulh \d\().4s, \a\().4s, \b\().s[\i]
+.endm
+// vqdmulhq: d = sqdmulh(a, lane i of b). The lane operand is written in the
+// same .s[i] form as the other by-element wrappers.
+.macro vqdmulhq d,a,b,i
+    sqdmulh \d\().4s, \a\().4s, \b\().s[\i]
+.endm
+// vmulq: d = a * (lane i of b).
+.macro vmulq d,a,b,i
+    mul \d\().4s, \a\().4s, \b\().s[\i]
+.endm
+// vmlsq: d -= a * (lane i of b).
+.macro vmlsq d,a,b,i
+    mls \d\().4s, \a\().4s, \b\().s[\i]
+.endm
+
+// mulmodq: dst = src * const[idx0] - sqrdmulh(src, const[idx1]) * consts[0].
+// Overwrites src with the sqrdmulh intermediate.
+// NOTE(review): const[idx1] is presumably the precomputed "twisted" companion
+// of const[idx0], and consts[0] the modulus - confirm against the twiddle table.
+.macro mulmodq dst, src, const, idx0, idx1
+    vmulq \dst, \src, \const, \idx0
+    vqrdmulhq \src, \src, \const, \idx1
+    vmlsq \dst, \src, consts, 0
+.endm
+
+// mulmod: same computation as mulmodq, but the constant and its twisted
+// companion are passed as two full vectors. Overwrites src.
+.macro mulmod dst, src, const, const_twisted
+    mul \dst\().4s, \src\().4s, \const\().4s
+    vqrdmulh \src, \src, \const_twisted
+    vmlsq \dst, \src, consts, 0
+.endm
+
+// montg_reduce: a -= srshr(a, 23) * consts. Clobbers tmp.
+.macro montg_reduce a
+    srshr tmp.4S, \a\().4S, #23
+    vmls \a, tmp, consts
+.endm
+
+// canonical_reduce: subtracts modulus from lanes that are >= modulus_half and
+// adds it to lanes that are <= neg_modulus_half. Clobbers tmp1 and tmp2.
+// NOTE(review): no register alias named "modulus" is defined in the visible
+// part of this file - confirm before using this macro.
+.macro canonical_reduce a, modulus_half, neg_modulus_half, tmp1, tmp2
+    cmge \tmp1\().4s, \neg_modulus_half\().4s, \a\().4s
+    cmge \tmp2\().4s, \a\().4s, \modulus_half\().4s
+    sub \tmp2\().4s, \tmp1\().4s, \tmp2\().4s
+    vmls \a, \tmp2, modulus
+.endm
+
+// gs_butterfly: (a, b) <- (a + b, (a - b) * root[idx0]) with the product
+// reduced by mulmodq. Clobbers tmp.
+.macro gs_butterfly a, b, root, idx0, idx1
+    sub tmp.4s, \a\().4s, \b\().4s
+    add \a\().4s, \a\().4s, \b\().4s
+    mulmodq \b, tmp, \root, \idx0, \idx1
+.endm
+
+// mulmod_v: vector-by-vector modular multiplication. Overwrites src.
+// NOTE(review): this expands to "vmul" and to the alias "modulus"; neither is
+// defined in the visible part of this file, so confirm they exist before
+// invoking this macro.
+.macro mulmod_v dst, src, const, const_twisted
+    vmul \dst, \src, \const
+    vqrdmulh \src, \src, \const_twisted
+    vmls \dst, \src, modulus
+.endm
+
+// gs_butterfly_v: like gs_butterfly, but root and its twisted companion are
+// full vectors and the product goes through mulmod. Clobbers tmp.
+.macro gs_butterfly_v a, b, root, root_twisted
+    sub tmp.4s, \a\().4s, \b\().4s
+    add \a\().4s, \a\().4s, \b\().4s
+    mulmod \b, tmp, \root, \root_twisted
+.endm
+
+// mul_ninv: dstN = mulmod(srcN, ninv, ninv_tw) for N = 0..3. Overwrites the
+// four src vectors.
+// NOTE(review): the register aliases ninv / ninv_tw are not defined in the
+// visible part of this file - confirm before using this macro.
+.macro mul_ninv dst0, dst1, dst2, dst3, src0, src1, src2, src3
+    mulmod \dst0, \src0, ninv, ninv_tw
+    mulmod \dst1, \src1, ninv, ninv_tw
+    mulmod \dst2, \src2, ninv, ninv_tw
+    mulmod \dst3, \src3, ninv, ninv_tw
+.endm
+
+// load_vectors: load four consecutive 16-byte vectors starting at addr.
+.macro load_vectors a0, a1, a2, a3, addr
+    ldr_vo \a0, \addr, (16*0)
+    ldr_vo \a1, \addr, (16*1)
+    ldr_vo \a2, \addr, (16*2)
+    ldr_vo \a3, \addr, (16*3)
+.endm
+
+// load_vectors_with_offset: as load_vectors, starting at addr + offset.
+.macro load_vectors_with_offset a0, a1, a2, a3, addr, offset
+    ldr_vo \a0, \addr, (16*0 + (\offset))
+    ldr_vo \a1, \addr, (16*1 + (\offset))
+    ldr_vo \a2, \addr, (16*2 + (\offset))
+    ldr_vo \a3, \addr, (16*3 + (\offset))
+.endm
+
+// store_vectors_with_inc: store four consecutive 16-byte vectors at the
+// current addr and advance addr by inc. The first store post-increments, so
+// the remaining offsets subtract inc again.
+.macro store_vectors_with_inc a0, a1, a2, a3, addr, inc
+    str_vi \a0, \addr, \inc
+    str_vo \a1, \addr, (-(\inc) + 16*1)
+    str_vo \a2, \addr, (-(\inc) + 16*2)
+    str_vo \a3, \addr, (-(\inc) + 16*3)
+.endm
+
+// vec_to_scalar_matrix: copy both 64-bit halves of vectors <in>0..<in>3 into
+// the general-purpose registers <out>_00 .. <out>_31 (via vext below).
+.macro vec_to_scalar_matrix out, in
+    vext \out\()_00, \in\()0, 0
+    vext \out\()_01, \in\()0, 1
+    vext \out\()_10, \in\()1, 0
+    vext \out\()_11, \in\()1, 1
+    vext \out\()_20, \in\()2, 0
+    vext \out\()_21, \in\()2, 1
+    vext \out\()_30, \in\()3, 0
+    vext \out\()_31, \in\()3, 1
+.endm
+
+// store_scalar_matrix_with_inc: store the eight registers <x>t_00 .. <x>t_31
+// at 8-byte strides from the current addr and advance addr by inc. The first
+// store post-increments, so the remaining offsets subtract inc again.
+.macro store_scalar_matrix_with_inc x, addr, inc
+    str \x\()t_00, [\addr], #( \inc)
+    str \x\()t_01, [\addr, #(-\inc + 8*1)]
+    str \x\()t_10, [\addr, #(-\inc + 8*2)]
+    str \x\()t_11, [\addr, #(-\inc + 8*3)]
+    str \x\()t_20, [\addr, #(-\inc + 8*4)]
+    str \x\()t_21, [\addr, #(-\inc + 8*5)]
+    str \x\()t_30, [\addr, #(-\inc + 8*6)]
+    str \x\()t_31, [\addr, #(-\inc + 8*7)]
+.endm
+
+// vext: move 64-bit lane "lane" of vec_in into the register gpr_out.
+.macro vext gpr_out, vec_in, lane
+    umov \gpr_out\(), \vec_in\().d[\lane]
+.endm
+
+// load_roots_123: load root0..root3 from r_ptr0 and advance r_ptr0 by 64.
+.macro load_roots_123
+    ldr_vi root0, r_ptr0, 64
+    ldr_vo root1, r_ptr0, (-64 + 16)
+    ldr_vo root2, r_ptr0, (-64 + 32)
+    ldr_vo root3, r_ptr0, (-64 + 48)
+.endm
+
+// load_roots_456: same expansion as load_roots_123.
+.macro load_roots_456
+    ldr_vi root0, r_ptr0, 64
+    ldr_vo root1, r_ptr0, (-64 + 16)
+    ldr_vo root2, r_ptr0, (-64 + 32)
+    ldr_vo root3, r_ptr0, (-64 + 48)
+.endm
+
+// load_roots_78_part1: load root0/root0_tw, root1/root1_tw, root2/root2_tw
+// from the first six 16-byte slots at r_ptr1 and advance r_ptr1 by 12*16.
+.macro load_roots_78_part1
+    ldr_vi root0, r_ptr1, (12*16)
+    ldr_vo root0_tw, r_ptr1, (-12*16 + 1*16)
+    ldr_vo root1, r_ptr1, (-12*16 + 2*16)
+    ldr_vo root1_tw, r_ptr1, (-12*16 + 3*16)
+    ldr_vo root2, r_ptr1, (-12*16 + 4*16)
+    ldr_vo root2_tw, r_ptr1, (-12*16 + 5*16)
+.endm
+
+// load_roots_78_part2: load slots 6..11 of the same 12-slot group. It does not
+// move r_ptr1; the offsets are relative to the pointer as already advanced by
+// load_roots_78_part1.
+.macro load_roots_78_part2
+    ldr_vo root0, r_ptr1, (-12*16 + 6*16)
+    ldr_vo root0_tw, r_ptr1, (-12*16 + 7*16)
+    ldr_vo root1, r_ptr1, (-12*16 + 8*16)
+    ldr_vo root1_tw, r_ptr1, (-12*16 + 9*16)
+    ldr_vo root2, r_ptr1, (-12*16 + 10*16)
+    ldr_vo root2_tw, r_ptr1, (-12*16 + 11*16)
+.endm
+
+// transpose4: in-place 4x4 transpose of 32-bit elements held in data0..data3.
+// Clobbers t0..t3.
+.macro transpose4 data0, data1, data2, data3
+    trn1 t0.4s, \data0\().4s, \data1\().4s
+    trn2 t1.4s, \data0\().4s, \data1\().4s
+    trn1 t2.4s, \data2\().4s, \data3\().4s
+    trn2 t3.4s, \data2\().4s, \data3\().4s
+
+    trn2 \data2\().2d, t0.2d, t2.2d
+    trn2 \data3\().2d, t1.2d, t3.2d
+    trn1 \data0\().2d, t0.2d, t2.2d
+    trn1 \data1\().2d, t1.2d, t3.2d
+.endm
+
+// transpose_single: only the 32-bit trn1/trn2 interleave stage of transpose4,
+// written to separate output registers.
+.macro transpose_single data_out0, data_out1, data_out2, data_out3, data_in0, data_in1, data_in2, data_in3
+    trn1 \data_out0\().4s, \data_in0\().4s, \data_in1\().4s
+    trn2 \data_out1\().4s, \data_in0\().4s, \data_in1\().4s
+    trn1 \data_out2\().4s, \data_in2\().4s, \data_in3\().4s
+    trn2 \data_out3\().4s, \data_in2\().4s, \data_in3\().4s
+.endm
+
+// save_gprs: reserve 16*6 bytes of stack and store x19..x30 pairwise.
+.macro save_gprs // slothy:no-unfold
+    sub sp, sp, #(16*6)
+    stp x19, x20, [sp, #16*0]
+    stp x21, x22, [sp, #16*1]
+    stp x23, x24, [sp, #16*2]
+    stp x25, x26, [sp, #16*3]
+    stp x27, x28, [sp, #16*4]
+    stp x29, x30, [sp, #16*5]
+.endm
+
+// restore_gprs: reload x19..x30 and release the area reserved by save_gprs.
+.macro restore_gprs // slothy:no-unfold
+    ldp x19, x20, [sp, #16*0]
+    ldp x21, x22, [sp, #16*1]
+    ldp x23, x24, [sp, #16*2]
+    ldp x25, x26, [sp, #16*3]
+    ldp x27, x28, [sp, #16*4]
+    ldp x29, x30, [sp, #16*5]
+    add sp, sp, #(16*6)
+.endm
+
+// save_vregs: reserve 16*4 bytes of stack and store d8..d15 pairwise.
+.macro save_vregs // slothy:no-unfold
+    sub sp, sp, #(16*4)
+    stp d8, d9, [sp, #16*0]
+    stp d10, d11, [sp, #16*1]
+    stp d12, d13, [sp, #16*2]
+    stp d14, d15, [sp, #16*3]
+.endm
+
+// restore_vregs: reload d8..d15 and release the area reserved by save_vregs.
+.macro restore_vregs // slothy:no-unfold
+    ldp d8, d9, [sp, #16*0]
+    ldp d10, d11, [sp, #16*1]
+    ldp d12, d13, [sp, #16*2]
+    ldp d14, d15, [sp, #16*3]
+    add sp, sp, #(16*4)
+.endm
+
+// Extra scratch stack reserved by push_stack (bytes) and the offset of its
+// first slot.
+#define STACK_SIZE 16
+#define STACK0 0
+
+// restore: load register a from stack slot loc. Note the argument order is
+// (register, slot) here but (slot, register) for save.
+.macro restore a, loc // slothy:no-unfold
+    ldr \a, [sp, #\loc\()]
+.endm
+// save: store register a to stack slot loc.
+.macro save loc, a // slothy:no-unfold
+    str \a, [sp, #\loc\()]
+.endm
+// push_stack: function prologue - save x19..x30 and d8..d15, then reserve
+// STACK_SIZE bytes of scratch space.
+.macro push_stack // slothy:no-unfold
+    save_gprs
+    save_vregs
+    sub sp, sp, #STACK_SIZE
+.endm
+
+// pop_stack: function epilogue - undo push_stack in reverse order.
+.macro pop_stack // slothy:no-unfold
+    add sp, sp, #STACK_SIZE
+    restore_vregs
+    restore_gprs
+.endm
+
+// Twiddle table, pulled in from a separate file into the data section.
+.data
+.p2align 4
+roots:
+#include "intt_dilithium_123_456_78_twiddles.s"
+.text
+
+    .global intt_dilithium_123_45678_manual_ld4_opt_m1_firestorm
+    .global _intt_dilithium_123_45678_manual_ld4_opt_m1_firestorm
+
+// Constants. The function body broadcasts the first word at const_addr into
+// the consts vector.
+// NOTE(review): ninv_addr / ninv_tw_addr are presumably the scaling constant
+// used by mul_ninv and its twisted companion - confirm against their users.
+.p2align 4
+const_addr: .word 8380417
+    .word 0
+    .word 0
+    .word 0
+ninv_addr: .quad 16382
+ninv_tw_addr: .quad 4197891
+intt_dilithium_123_45678_manual_ld4_opt_m1_firestorm:
+_intt_dilithium_123_45678_manual_ld4_opt_m1_firestorm:
+    push_stack
+
+    // Pointer and counter aliases.
+    in .req x0
+    inp .req x1
+    inpp .req x2
+    count .req x3
+    r_ptr0 .req x4
+    r_ptr1 .req x5
+    xtmp .req x6
+
+    // Vector aliases data0..data7.
+    data0 .req v9
+    data1 .req v10
+    data2 .req v11
+    data3 .req v12
+    data4 .req v13
+    data5 .req v14
+    data6 .req v15
+    data7 .req v16
+
+    // q-register views of the aliases above, consumed by the ldr_*/str_*
+    // wrappers.
+    qform_data0 .req q9
+    qform_data1 .req q10
+    qform_data2 .req q11
+    qform_data3 .req q12
+    qform_data4 .req q13
+    qform_data5 .req q14
+    qform_data6 .req q15
+    qform_data7 .req q16
+
+    // q-register views of the raw vector register names v0..v31.
+    qform_v0 .req q0
+    qform_v1 .req q1
+    qform_v2 .req q2
+    qform_v3 .req q3
+    qform_v4 .req q4
+    qform_v5 .req q5
+    qform_v6 .req q6
+    qform_v7 .req q7
+    qform_v8 .req q8
+    qform_v9 .req q9
+    qform_v10 .req q10
+    qform_v11 .req q11
+    qform_v12 .req q12
+    qform_v13 .req q13
+    qform_v14 .req q14
+    qform_v15 .req q15
+    qform_v16 .req q16
+    qform_v17 .req q17
+    qform_v18 .req q18
+    qform_v19 .req q19
+    qform_v20 .req q20
+    qform_v21 .req q21
+    qform_v22 .req q22
+    qform_v23 .req q23
+    qform_v24 .req q24
+    qform_v25 .req q25
+    qform_v26 .req q26
+    qform_v27 .req q27
+    qform_v28 .req q28
+    qform_v29 .req q29
+    qform_v30 .req q30
+    qform_v31 .req q31
+
+    // General-purpose registers named x_<row><half>. x_00 and x_01 share
+    // x10 / x11 with xtmp0 / xtmp1.
+    x_00 .req x10
+    x_01 .req x11
+    x_10 .req x12
+    x_11 .req x13
+    x_20 .req x14
+    x_21 .req x15
+    x_30 .req x16
+    x_31 .req x17
+ + xt_00 .req x_00 + xt_01 .req x_20 + xt_10 .req x_10 + xt_11 .req x_30 + xt_20 .req x_01 + xt_21 .req x_21 + xt_30 .req x_11 + xt_31 .req x_31 + + root0 .req v0 + root1 .req v1 + root2 .req v2 + root3 .req v3 + + qform_root0 .req q0 + qform_root1 .req q1 + qform_root2 .req q2 + qform_root3 .req q3 + + tmp .req v24 + t0 .req v25 + t1 .req v26 + t2 .req v27 + t3 .req v28 + tp0 .req v17 + tp1 .req v18 + tp2 .req v19 + tp3 .req v20 + + consts .req v8 + qform_consts .req q8 + + ASM_LOAD(r_ptr0, roots_l345) + ASM_LOAD(r_ptr1, roots_l67) + + ASM_LOAD(xtmp, const_addr) + ld1r {consts.4s}, [xtmp] + save STACK0, in + + restore inp, STACK0 + mov inp, in + add inpp, inp, #64 + mov count, #8 + + root0_tw .req v4 + root1_tw .req v5 + root2_tw .req v6 + root3_tw .req v7 + qform_root0_tw .req q4 + qform_root1_tw .req q5 + qform_root2_tw .req q6 + qform_root3_tw .req q7 + + .p2align 2 + ldr q30, [x5, #16] // *..... + ldr q18, [x5, #128] // .*.... + ldr q0, [x1, #48] // ..*... + // gap // ...... + // gap // ...... + // gap // ...... + // gap // ...... + // gap // ...... + ldr q20, [x1, #32] // ...*.. + ldr q1, [x1, #16] // ....*. + ldr q27, [x1, #0] // .....* + // gap // ...... + // gap // ...... + // gap // ...... + // gap // ...... + // gap // ...... + + // original source code + // ldr q30, [x5, #16] // *..... + // ldr q18, [x5, #128] // .*.... + // ldr q0, [x1, #48] // ..*... + // ldr q20, [x1, #32] // ...*.. + // ldr q1, [x1, #16] // ....*. + // ldr q27, [x1, #0] // .....* + + sub count, count, #1 +layer45678_start: + // gap // ...................................................................................................................................................................... + ldr q13, [x2, #48] // ...............*...................................................................................................................................................... 
+ ldr q19, [x2, #16] // .............*........................................................................................................................................................ + trn1 v25.4S, v20.4S, v0.4S // ......*............................................................................................................................................................... + // gap // ...................................................................................................................................................................... + // gap // ...................................................................................................................................................................... + // gap // ...................................................................................................................................................................... + // gap // ...................................................................................................................................................................... + ldr q5, [x2, #0] // ............*......................................................................................................................................................... + ldr q31, [x2, #32] // ..............*....................................................................................................................................................... + trn1 v7.4S, v27.4S, v1.4S // ....*................................................................................................................................................................. + trn2 v27.4S, v27.4S, v1.4S // .....*................................................................................................................................................................ 
+ // gap // ...................................................................................................................................................................... + // gap // ...................................................................................................................................................................... + // gap // ...................................................................................................................................................................... + // gap // ...................................................................................................................................................................... + ldr q1, [x5, #176] // .......................................................*.............................................................................................................. + trn2 v14.4S, v20.4S, v0.4S // .......*.............................................................................................................................................................. + ldr q12, [x5, #144] // .....................................................*................................................................................................................ + // gap // ...................................................................................................................................................................... + // gap // ...................................................................................................................................................................... + // gap // ...................................................................................................................................................................... 
+ // gap // ...................................................................................................................................................................... + // gap // ...................................................................................................................................................................... + trn2 v10.2D, v7.2D, v25.2D // ........*............................................................................................................................................................. + trn1 v29.2D, v7.2D, v25.2D // ..........*........................................................................................................................................................... + ldr q11, [x5, #80] // .............................*........................................................................................................................................ + ldr q9, [x5, #32] // ..........................*........................................................................................................................................... + // gap // ...................................................................................................................................................................... + // gap // ...................................................................................................................................................................... + // gap // ...................................................................................................................................................................... + // gap // ...................................................................................................................................................................... 
+ ldr q28, [x5, #64] // ............................*......................................................................................................................................... + ldr q22, [x5, #48] // ...........................*.......................................................................................................................................... + trn2 v24.2D, v27.2D, v14.2D // .........*............................................................................................................................................................ + trn1 v21.2D, v27.2D, v14.2D // ...........*.......................................................................................................................................................... + // gap // ...................................................................................................................................................................... + // gap // ...................................................................................................................................................................... + // gap // ...................................................................................................................................................................... + // gap // ...................................................................................................................................................................... + trn1 v14.4S, v31.4S, v13.4S // ..................*................................................................................................................................................... + trn2 v15.4S, v31.4S, v13.4S // ...................*.................................................................................................................................................. 
+ trn2 v13.4S, v5.4S, v19.4S // .................*.................................................................................................................................................... + trn1 v3.4S, v5.4S, v19.4S // ................*..................................................................................................................................................... + ldr q5, [x5, #160] // ......................................................*............................................................................................................... + // gap // ...................................................................................................................................................................... + // gap // ...................................................................................................................................................................... + // gap // ...................................................................................................................................................................... + sub v2.4S, v29.4S, v21.4S // ..............................*....................................................................................................................................... + sub v16.4S, v10.4S, v24.4S // ...................................*.................................................................................................................................. + // gap // ...................................................................................................................................................................... + // gap // ...................................................................................................................................................................... 
+ // gap // ...................................................................................................................................................................... + // gap // ...................................................................................................................................................................... + // gap // ...................................................................................................................................................................... + // gap // ...................................................................................................................................................................... + trn1 v0.2D, v3.2D, v14.2D // ......................*............................................................................................................................................... + trn2 v23.2D, v3.2D, v14.2D // ....................*................................................................................................................................................. + trn2 v6.2D, v13.2D, v15.2D // .....................*................................................................................................................................................ + trn1 v26.2D, v13.2D, v15.2D // .......................*.............................................................................................................................................. + ldr q31, [x5, #112] // ...................................................*.................................................................................................................. + // gap // ...................................................................................................................................................................... 
+ // gap // ...................................................................................................................................................................... + // gap // ...................................................................................................................................................................... + mul v27.4S, v16.4S, v28.4S // .....................................*................................................................................................................................ + mul v14.4S, v2.4S, v9.4S // ................................*..................................................................................................................................... + sqrdmulh v13.4S, v2.4S, v22.4S // .................................*.................................................................................................................................... + // gap // ...................................................................................................................................................................... + // gap // ...................................................................................................................................................................... + // gap // ...................................................................................................................................................................... + // gap // ...................................................................................................................................................................... + // gap // ...................................................................................................................................................................... 
+ add v15.4S, v0.4S, v26.4S // .........................................................*............................................................................................................ + add v9.4S, v29.4S, v21.4S // ...............................*...................................................................................................................................... + sub v19.4S, v0.4S, v26.4S // ........................................................*............................................................................................................. + ldr q20, [x5], #(12*16) // ........................*............................................................................................................................................. + sub v28.4S, v23.4S, v6.4S // .............................................................*........................................................................................................ + // gap // ...................................................................................................................................................................... + // gap // ...................................................................................................................................................................... + // gap // ...................................................................................................................................................................... + sqrdmulh v25.4S, v16.4S, v11.4S // ......................................*............................................................................................................................... + add v2.4S, v10.4S, v24.4S // ....................................*................................................................................................................................. 
+ add v16.4S, v23.4S, v6.4S // ..............................................................*....................................................................................................... + ldr q10, [x4, #48] // ...............................................................................................*...................................................................... + // gap // ...................................................................................................................................................................... + // gap // ...................................................................................................................................................................... + // gap // ...................................................................................................................................................................... + // gap // ...................................................................................................................................................................... + mul v21.4S, v28.4S, v5.4S // ...............................................................*...................................................................................................... + sqrdmulh v3.4S, v28.4S, v1.4S // ................................................................*..................................................................................................... + sqrdmulh v4.4S, v19.4S, v12.4S // ...........................................................*.......................................................................................................... + mul v22.4S, v19.4S, v18.4S // ..........................................................*........................................................................................................... 
+ // gap // ...................................................................................................................................................................... + // gap // ...................................................................................................................................................................... + // gap // ...................................................................................................................................................................... + // gap // ...................................................................................................................................................................... + add v5.4S, v15.4S, v16.4S // ...................................................................*.................................................................................................. + sub v15.4S, v15.4S, v16.4S // ..................................................................*................................................................................................... + ldr q26, [x5, #-96] // ..................................................*................................................................................................................... + // gap // ...................................................................................................................................................................... + // gap // ...................................................................................................................................................................... + // gap // ...................................................................................................................................................................... 
+ // gap // ...................................................................................................................................................................... + // gap // ...................................................................................................................................................................... + mls v27.4S, v25.4S, v8.S[0] // .......................................*.............................................................................................................................. + mls v14.4S, v13.4S, v8.S[0] // ..................................*................................................................................................................................... + sub v12.4S, v9.4S, v2.4S // ........................................*............................................................................................................................. + // gap // ...................................................................................................................................................................... + // gap // ...................................................................................................................................................................... + // gap // ...................................................................................................................................................................... + // gap // ...................................................................................................................................................................... + // gap // ...................................................................................................................................................................... 
+ mls v22.4S, v4.4S, v8.S[0] // ............................................................*......................................................................................................... + mls v21.4S, v3.4S, v8.S[0] // .................................................................*.................................................................................................... + // gap // ...................................................................................................................................................................... + // gap // ...................................................................................................................................................................... + // gap // ...................................................................................................................................................................... + // gap // ...................................................................................................................................................................... + // gap // ...................................................................................................................................................................... + // gap // ...................................................................................................................................................................... + add v28.4S, v9.4S, v2.4S // .........................................*............................................................................................................................ + mul v19.4S, v12.4S, v20.4S // ..........................................*........................................................................................................................... 
+ sqrdmulh v11.4S, v12.4S, v30.4S // ...........................................*.......................................................................................................................... + // gap // ...................................................................................................................................................................... + // gap // ...................................................................................................................................................................... + // gap // ...................................................................................................................................................................... + // gap // ...................................................................................................................................................................... + // gap // ...................................................................................................................................................................... + // gap // ...................................................................................................................................................................... + // gap // ...................................................................................................................................................................... + add v0.4S, v14.4S, v27.4S // ..............................................*....................................................................................................................... + sub v6.4S, v14.4S, v27.4S // .............................................*........................................................................................................................ 
+ sqrdmulh v14.4S, v15.4S, v31.4S // .....................................................................*................................................................................................ + // gap // ...................................................................................................................................................................... + // gap // ...................................................................................................................................................................... + // gap // ...................................................................................................................................................................... + sub v29.4S, v22.4S, v21.4S // .......................................................................*.............................................................................................. + add v1.4S, v22.4S, v21.4S // ........................................................................*............................................................................................. + mul v9.4S, v15.4S, v26.4S // ....................................................................*................................................................................................. + // gap // ...................................................................................................................................................................... + // gap // ...................................................................................................................................................................... + // gap // ...................................................................................................................................................................... 
+ // gap // ...................................................................................................................................................................... + // gap // ...................................................................................................................................................................... + trn1 v24.4S, v28.4S, v0.4S // ............................................................................*......................................................................................... + mul v25.4S, v6.4S, v20.4S // ...............................................*...................................................................................................................... + sqrdmulh v17.4S, v6.4S, v30.4S // ................................................*..................................................................................................................... + // gap // ...................................................................................................................................................................... + mls v19.4S, v11.4S, v8.S[0] // ............................................*......................................................................................................................... + // gap // ...................................................................................................................................................................... + // gap // ...................................................................................................................................................................... + // gap // ...................................................................................................................................................................... 
+ trn2 v22.4S, v5.4S, v1.4S // .....................................................................................*................................................................................ + mul v4.4S, v29.4S, v26.4S // .........................................................................*............................................................................................ + sqrdmulh v26.4S, v29.4S, v31.4S // ..........................................................................*........................................................................................... + // gap // ...................................................................................................................................................................... + // gap // ...................................................................................................................................................................... + // gap // ...................................................................................................................................................................... + // gap // ...................................................................................................................................................................... + // gap // ...................................................................................................................................................................... + trn2 v21.4S, v28.4S, v0.4S // .............................................................................*........................................................................................ + // gap // ...................................................................................................................................................................... 
+ // gap // ...................................................................................................................................................................... + // gap // ...................................................................................................................................................................... + // gap // ...................................................................................................................................................................... + // gap // ...................................................................................................................................................................... + // gap // ...................................................................................................................................................................... + // gap // ...................................................................................................................................................................... + mls v25.4S, v17.4S, v8.S[0] // .................................................*.................................................................................................................... + // gap // ...................................................................................................................................................................... + // gap // ...................................................................................................................................................................... + // gap // ...................................................................................................................................................................... + // gap // ...................................................................................................................................................................... 
+ // gap // ...................................................................................................................................................................... + // gap // ...................................................................................................................................................................... + // gap // ...................................................................................................................................................................... + mls v4.4S, v26.4S, v8.S[0] // ...........................................................................*.......................................................................................... + // gap // ...................................................................................................................................................................... + mls v9.4S, v14.4S, v8.S[0] // ......................................................................*............................................................................................... + // gap // ...................................................................................................................................................................... + // gap // ...................................................................................................................................................................... + // gap // ...................................................................................................................................................................... + // gap // ...................................................................................................................................................................... 
+ // gap // ...................................................................................................................................................................... + trn1 v7.4S, v5.4S, v1.4S // ....................................................................................*................................................................................. + // gap // ...................................................................................................................................................................... + // gap // ...................................................................................................................................................................... + // gap // ...................................................................................................................................................................... + // gap // ...................................................................................................................................................................... + // gap // ...................................................................................................................................................................... + // gap // ...................................................................................................................................................................... + // gap // ...................................................................................................................................................................... + trn1 v30.4S, v19.4S, v25.4S // ..............................................................................*....................................................................................... 
+ trn2 v11.4S, v19.4S, v25.4S // ...............................................................................*...................................................................................... + ldr q25, [x4, #16] // .............................................................................................*........................................................................ + // gap // ...................................................................................................................................................................... + // gap // ...................................................................................................................................................................... + // gap // ...................................................................................................................................................................... + // gap // ...................................................................................................................................................................... + // gap // ...................................................................................................................................................................... + trn2 v27.4S, v9.4S, v4.4S // .......................................................................................*.............................................................................. + trn1 v0.4S, v9.4S, v4.4S // ......................................................................................*............................................................................... + // gap // ...................................................................................................................................................................... 
+ // gap // ...................................................................................................................................................................... + // gap // ...................................................................................................................................................................... + // gap // ...................................................................................................................................................................... + // gap // ...................................................................................................................................................................... + // gap // ...................................................................................................................................................................... + trn2 v14.2D, v24.2D, v30.2D // ................................................................................*..................................................................................... + trn2 v28.2D, v21.2D, v11.2D // .................................................................................*.................................................................................... + trn1 v29.2D, v24.2D, v30.2D // ..................................................................................*................................................................................... + trn1 v17.2D, v21.2D, v11.2D // ...................................................................................*.................................................................................. + ldr q11, [x4, #32] // ..............................................................................................*....................................................................... 
+ // gap // ...................................................................................................................................................................... + // gap // ...................................................................................................................................................................... + // gap // ...................................................................................................................................................................... + trn1 v1.2D, v7.2D, v0.2D // ..........................................................................................*........................................................................... + trn1 v12.2D, v22.2D, v27.2D // ...........................................................................................*.......................................................................... + trn2 v30.2D, v7.2D, v0.2D // ........................................................................................*............................................................................. + trn2 v13.2D, v22.2D, v27.2D // .........................................................................................*............................................................................ + // gap // ...................................................................................................................................................................... + // gap // ...................................................................................................................................................................... + // gap // ...................................................................................................................................................................... 
+ // gap // ...................................................................................................................................................................... + sub v31.4S, v14.4S, v28.4S // .....................................................................................................*................................................................ + sub v20.4S, v29.4S, v17.4S // ................................................................................................*..................................................................... + add v3.4S, v14.4S, v28.4S // ......................................................................................................*............................................................... + // gap // ...................................................................................................................................................................... + // gap // ...................................................................................................................................................................... + // gap // ...................................................................................................................................................................... + // gap // ...................................................................................................................................................................... + // gap // ...................................................................................................................................................................... + sub v18.4S, v1.4S, v12.4S // ..........................................................................................................*........................................................... 
+ sub v9.4S, v30.4S, v13.4S // ...............................................................................................................*...................................................... + add v0.4S, v1.4S, v12.4S // ...........................................................................................................*.......................................................... + // gap // ...................................................................................................................................................................... + // gap // ...................................................................................................................................................................... + // gap // ...................................................................................................................................................................... + // gap // ...................................................................................................................................................................... + // gap // ...................................................................................................................................................................... + mul v22.4S, v31.4S, v11.S[0] // .......................................................................................................*.............................................................. + sqrdmulh v4.4S, v31.4S, v11.S[1] // ........................................................................................................*............................................................. + mul v6.4S, v20.4S, v25.S[2] // ..................................................................................................*................................................................... 
+ sqrdmulh v27.4S, v20.4S, v25.S[3] // ...................................................................................................*.................................................................. + // gap // ...................................................................................................................................................................... + // gap // ...................................................................................................................................................................... + // gap // ...................................................................................................................................................................... + // gap // ...................................................................................................................................................................... + mul v16.4S, v18.4S, v11.S[2] // ............................................................................................................*......................................................... + mul v26.4S, v9.4S, v10.S[0] // .................................................................................................................*.................................................... + sqrdmulh v20.4S, v18.4S, v11.S[3] // .............................................................................................................*........................................................ + sqrdmulh v18.4S, v9.4S, v10.S[1] // ..................................................................................................................*................................................... + // gap // ...................................................................................................................................................................... 
+ // gap // ...................................................................................................................................................................... + // gap // ...................................................................................................................................................................... + // gap // ...................................................................................................................................................................... + add v14.4S, v29.4S, v17.4S // .................................................................................................*.................................................................... + add v10.4S, v30.4S, v13.4S // ................................................................................................................*..................................................... + // gap // ...................................................................................................................................................................... + // gap // ...................................................................................................................................................................... + // gap // ...................................................................................................................................................................... + // gap // ...................................................................................................................................................................... + // gap // ...................................................................................................................................................................... 
+ // gap // ...................................................................................................................................................................... + mls v22.4S, v4.4S, v8.S[0] // .........................................................................................................*............................................................ + // gap // ...................................................................................................................................................................... + // gap // ...................................................................................................................................................................... + // gap // ...................................................................................................................................................................... + // gap // ...................................................................................................................................................................... + // gap // ...................................................................................................................................................................... + // gap // ...................................................................................................................................................................... + // gap // ...................................................................................................................................................................... + sub v30.4S, v14.4S, v3.4S // ....................................................................................................................*................................................. 
+ mls v6.4S, v27.4S, v8.S[0] // ....................................................................................................*................................................................. + mls v26.4S, v18.4S, v8.S[0] // ...................................................................................................................*.................................................. + mls v16.4S, v20.4S, v8.S[0] // ..............................................................................................................*....................................................... + ldr q18, [x4], #64 // ............................................................................................*......................................................................... + // gap // ...................................................................................................................................................................... + // gap // ...................................................................................................................................................................... + // gap // ...................................................................................................................................................................... + sub v13.4S, v0.4S, v10.4S // ..............................................................................................................................*....................................... + // gap // ...................................................................................................................................................................... + add v3.4S, v14.4S, v3.4S // .....................................................................................................................*................................................ 
+ // gap // ...................................................................................................................................................................... + // gap // ...................................................................................................................................................................... + // gap // ...................................................................................................................................................................... + // gap // ...................................................................................................................................................................... + // gap // ...................................................................................................................................................................... + add v31.4S, v0.4S, v10.4S // ...............................................................................................................................*...................................... + // gap // ...................................................................................................................................................................... + // gap // ...................................................................................................................................................................... + // gap // ...................................................................................................................................................................... + // gap // ...................................................................................................................................................................... + // gap // ...................................................................................................................................................................... 
+ // gap // ...................................................................................................................................................................... + // gap // ...................................................................................................................................................................... + sub v17.4S, v16.4S, v26.4S // ...................................................................................................................................*.................................. + sub v29.4S, v6.4S, v22.4S // .........................................................................................................................*............................................ + mul v9.4S, v13.4S, v25.S[0] // ................................................................................................................................*..................................... + sqrdmulh v14.4S, v13.4S, v25.S[1] // .................................................................................................................................*.................................... + // gap // ...................................................................................................................................................................... + // gap // ...................................................................................................................................................................... + // gap // ...................................................................................................................................................................... + // gap // ...................................................................................................................................................................... 
+ mul v10.4S, v30.4S, v18.S[2] // ......................................................................................................................*............................................... + sqrdmulh v19.4S, v30.4S, v18.S[3] // .......................................................................................................................*.............................................. + ldr q30, [x5, #16] // .........................e............................................................................................................................................ + sub v1.4S, v3.4S, v31.4S // ........................................................................................................................................*............................. + // gap // ...................................................................................................................................................................... + // gap // ...................................................................................................................................................................... + // gap // ...................................................................................................................................................................... + // gap // ...................................................................................................................................................................... + sqrdmulh v0.4S, v17.4S, v25.S[1] // ......................................................................................................................................*............................... + mul v25.4S, v17.4S, v25.S[0] // .....................................................................................................................................*................................ 
+ mul v27.4S, v29.4S, v18.S[2] // ...........................................................................................................................*.......................................... + sqrdmulh v20.4S, v29.4S, v18.S[3] // ............................................................................................................................*......................................... + // gap // ...................................................................................................................................................................... + // gap // ...................................................................................................................................................................... + // gap // ...................................................................................................................................................................... + // gap // ...................................................................................................................................................................... + mls v9.4S, v14.4S, v8.S[0] // ..................................................................................................................................*................................... + add v3.4S, v3.4S, v31.4S // .........................................................................................................................................*............................ + mul v4.4S, v1.4S, v18.S[0] // ..........................................................................................................................................*........................... + add v7.4S, v6.4S, v22.4S // ..........................................................................................................................*........................................... 
+ // gap // ...................................................................................................................................................................... + // gap // ...................................................................................................................................................................... + // gap // ...................................................................................................................................................................... + // gap // ...................................................................................................................................................................... + mls v10.4S, v19.4S, v8.S[0] // ........................................................................................................................*............................................. + sqrdmulh v24.4S, v1.4S, v18.S[1] // ...........................................................................................................................................*.......................... + add v19.4S, v16.4S, v26.4S // ....................................................................................................................................*................................. + // gap // ...................................................................................................................................................................... + // gap // ...................................................................................................................................................................... + // gap // ...................................................................................................................................................................... 
+ // gap // ...................................................................................................................................................................... + // gap // ...................................................................................................................................................................... + mls v25.4S, v0.4S, v8.S[0] // .......................................................................................................................................*.............................. + mls v27.4S, v20.4S, v8.S[0] // .............................................................................................................................*........................................ + str q3, [x1], #(16*4) // ............................................................................................................................................................*......... + // gap // ...................................................................................................................................................................... + // gap // ...................................................................................................................................................................... + // gap // ...................................................................................................................................................................... + // gap // ...................................................................................................................................................................... + // gap // ...................................................................................................................................................................... 
+ add v2.4S, v7.4S, v19.4S // ..............................................................................................................................................*....................... + sub v13.4S, v7.4S, v19.4S // .............................................................................................................................................*........................ + // gap // ...................................................................................................................................................................... + // gap // ...................................................................................................................................................................... + // gap // ...................................................................................................................................................................... + // gap // ...................................................................................................................................................................... + // gap // ...................................................................................................................................................................... + // gap // ...................................................................................................................................................................... + sub v5.4S, v10.4S, v9.4S // ..................................................................................................................................................*................... + add v19.4S, v10.4S, v9.4S // ...................................................................................................................................................*.................. 
+ mls v4.4S, v24.4S, v8.S[0] // ............................................................................................................................................*......................... + // gap // ...................................................................................................................................................................... + // gap // ...................................................................................................................................................................... + // gap // ...................................................................................................................................................................... + // gap // ...................................................................................................................................................................... + // gap // ...................................................................................................................................................................... + sub v9.4S, v27.4S, v25.4S // .......................................................................................................................................................*.............. + add v31.4S, v27.4S, v25.4S // ........................................................................................................................................................*............. + str q2, [x1, #-48] // .............................................................................................................................................................*........ + sqrdmulh v14.4S, v13.4S, v18.S[1] // ................................................................................................................................................*..................... 
+ // gap // ...................................................................................................................................................................... + // gap // ...................................................................................................................................................................... + // gap // ...................................................................................................................................................................... + // gap // ...................................................................................................................................................................... + mul v17.4S, v13.4S, v18.S[0] // ...............................................................................................................................................*...................... + str q19, [x1, #-32] // ..............................................................................................................................................................*....... + mul v19.4S, v5.4S, v18.S[0] // ....................................................................................................................................................*................. + // gap // ...................................................................................................................................................................... + // gap // ...................................................................................................................................................................... + // gap // ...................................................................................................................................................................... 
+ // gap // ...................................................................................................................................................................... + // gap // ...................................................................................................................................................................... + str q31, [x1, #-16] // ...............................................................................................................................................................*...... + sqrdmulh v31.4S, v5.4S, v18.S[1] // .....................................................................................................................................................*................ + mul v28.4S, v9.4S, v18.S[0] // .........................................................................................................................................................*............ + sqrdmulh v5.4S, v9.4S, v18.S[1] // ..........................................................................................................................................................*........... + ldr q18, [x5, #128] // ....................................................e................................................................................................................. + add x1, x1, #64 // ....................................................................................................................................................................*. + // gap // ...................................................................................................................................................................... + // gap // ...................................................................................................................................................................... 
+ str q4, [x2], #(16*4) // ................................................................................................................................................................*..... + // gap // ...................................................................................................................................................................... + // gap // ...................................................................................................................................................................... + // gap // ...................................................................................................................................................................... + // gap // ...................................................................................................................................................................... + // gap // ...................................................................................................................................................................... + // gap // ...................................................................................................................................................................... + ldr q0, [x1, #48] // ...e.................................................................................................................................................................. + mls v17.4S, v14.4S, v8.S[0] // .................................................................................................................................................*.................... + // gap // ...................................................................................................................................................................... 
+ // gap // ...................................................................................................................................................................... + // gap // ...................................................................................................................................................................... + // gap // ...................................................................................................................................................................... + // gap // ...................................................................................................................................................................... + // gap // ...................................................................................................................................................................... + // gap // ...................................................................................................................................................................... + mls v19.4S, v31.4S, v8.S[0] // ......................................................................................................................................................*............... + mls v28.4S, v5.4S, v8.S[0] // ...........................................................................................................................................................*.......... + // gap // ...................................................................................................................................................................... + // gap // ...................................................................................................................................................................... 
+ // gap // ...................................................................................................................................................................... + // gap // ...................................................................................................................................................................... + ldr q20, [x1, #32] // ..e................................................................................................................................................................... + ldr q1, [x1, #16] // .e.................................................................................................................................................................... + ldr q27, [x1, #0] // e..................................................................................................................................................................... + // gap // ...................................................................................................................................................................... + // gap // ...................................................................................................................................................................... + // gap // ...................................................................................................................................................................... + // gap // ...................................................................................................................................................................... + // gap // ...................................................................................................................................................................... 
+ // gap // ...................................................................................................................................................................... + // gap // ...................................................................................................................................................................... + str q17, [x2, #-48] // .................................................................................................................................................................*.... + // gap // ...................................................................................................................................................................... + // gap // ...................................................................................................................................................................... + // gap // ...................................................................................................................................................................... + // gap // ...................................................................................................................................................................... + // gap // ...................................................................................................................................................................... + // gap // ...................................................................................................................................................................... + // gap // ...................................................................................................................................................................... + str q19, [x2, #-32] // ..................................................................................................................................................................*... 
+ str q28, [x2, #-16] // ...................................................................................................................................................................*.. + add x2, x2, #64 // .....................................................................................................................................................................* + // gap // ...................................................................................................................................................................... + // gap // ...................................................................................................................................................................... + // gap // ...................................................................................................................................................................... + // gap // ...................................................................................................................................................................... + // gap // ...................................................................................................................................................................... + + // original source code + // ldr q9, [x1, #0] // .........................................e....|................................................................................................................................................................e.... + // ldr q10, [x1, #16] // ........................................e.....|...............................................................................................................................................................e..... 
+ // ldr q11, [x1, #32] // .......................................e......|..............................................................................................................................................................e...... + // ldr q12, [x1, #48] // ...................................e..........|..........................................................................................................................................................e.......... + // trn1 v25.4s, v9.4s, v10.4s // ..............................................|....*................................................................................................................................................................ + // trn2 v26.4s, v9.4s, v10.4s // ..............................................|.....*............................................................................................................................................................... + // trn1 v27.4s, v11.4s, v12.4s // ..............................................|.*................................................................................................................................................................... + // trn2 v28.4s, v11.4s, v12.4s // ..............................................|.......*............................................................................................................................................................. + // trn2 v11.2d, v25.2d, v27.2d // ..............................................|.........*........................................................................................................................................................... + // trn2 v12.2d, v26.2d, v28.2d // ..............................................|...............*..................................................................................................................................................... 
+ // trn1 v9.2d, v25.2d, v27.2d // ..............................................|..........*.......................................................................................................................................................... + // trn1 v10.2d, v26.2d, v28.2d // ..............................................|................*.................................................................................................................................................... + // ldr q13, [x2, #0] // ..............................................|..*.................................................................................................................................................................. + // ldr q14, [x2, #16] // ..............................................|*.................................................................................................................................................................... + // ldr q15, [x2, #32] // ..............................................|...*................................................................................................................................................................. + // ldr q16, [x2, #48] // ..............................................*..................................................................................................................................................................... + // trn1 v25.4s, v13.4s, v14.4s // ..............................................|....................*................................................................................................................................................ + // trn2 v26.4s, v13.4s, v14.4s // ..............................................|...................*................................................................................................................................................. 
+ // trn1 v27.4s, v15.4s, v16.4s // ..............................................|.................*................................................................................................................................................... + // trn2 v28.4s, v15.4s, v16.4s // ..............................................|..................*.................................................................................................................................................. + // trn2 v15.2d, v25.2d, v27.2d // ..............................................|.........................*........................................................................................................................................... + // trn2 v16.2d, v26.2d, v28.2d // ..............................................|..........................*.......................................................................................................................................... + // trn1 v13.2d, v25.2d, v27.2d // ..............................................|........................*............................................................................................................................................ + // trn1 v14.2d, v26.2d, v28.2d // ..............................................|...........................*......................................................................................................................................... + // ldr q0, [x5], #(12*16) // ..............................................|...................................*................................................................................................................................. + // ldr q4, [x5, #(-12*16 + 1*16)] // e.............................................|.......................................................................................................................e............................................. 
+ // ldr q1, [x5, #(-12*16 + 2*16)] // ..............................................|............*........................................................................................................................................................ + // ldr q5, [x5, #(-12*16 + 3*16)] // ..............................................|..............*...................................................................................................................................................... + // ldr q2, [x5, #(-12*16 + 4*16)] // ..............................................|.............*....................................................................................................................................................... + // ldr q6, [x5, #(-12*16 + 5*16)] // ..............................................|...........*......................................................................................................................................................... + // sub v24.4s, v9.4s, v10.4s // ..............................................|......................*.............................................................................................................................................. + // add v9.4s, v9.4s, v10.4s // ..............................................|.................................*................................................................................................................................... + // mul v10.4s, v24.4s, v1.4s // ..............................................|..............................*...................................................................................................................................... + // sqrdmulh v24.4s, v24.4s, v5.4s // ..............................................|...............................*..................................................................................................................................... 
+ // mls v10.4s, v24.4s, v8.s[0] // ..............................................|.................................................*................................................................................................................... + // sub v24.4s, v11.4s, v12.4s // ..............................................|.......................*............................................................................................................................................. + // add v11.4s, v11.4s, v12.4s // ..............................................|......................................*.............................................................................................................................. + // mul v12.4s, v24.4s, v2.4s // ..............................................|.............................*....................................................................................................................................... + // sqrdmulh v24.4s, v24.4s, v6.4s // ..............................................|.....................................*............................................................................................................................... + // mls v12.4s, v24.4s, v8.s[0] // ..............................................|................................................*.................................................................................................................... + // sub v24.4s, v9.4s, v11.4s // ..............................................|..................................................*.................................................................................................................. + // add v9.4s, v9.4s, v11.4s // ..............................................|.....................................................*............................................................................................................... 
+ // mul v11.4s, v24.4s, v0.4s // ..............................................|......................................................*.............................................................................................................. + // sqrdmulh v24.4s, v24.4s, v4.4s // ..............................................|.......................................................*............................................................................................................. + // mls v11.4s, v24.4s, v8.s[0] // ..............................................|.................................................................*................................................................................................... + // sub v24.4s, v10.4s, v12.4s // ..............................................|.........................................................*........................................................................................................... + // add v10.4s, v10.4s, v12.4s // ..............................................|........................................................*............................................................................................................ + // mul v12.4s, v24.4s, v0.4s // ..............................................|...............................................................*..................................................................................................... + // sqrdmulh v24.4s, v24.4s, v4.4s // ..............................................|................................................................*.................................................................................................... + // mls v12.4s, v24.4s, v8.s[0] // ..............................................|......................................................................*.............................................................................................. 
+ // ldr q0, [x5, #(-12*16 + 6*16)] // ..............................................|...............................................*..................................................................................................................... + // ldr q4, [x5, #(-12*16 + 7*16)] // ..............................................|............................*........................................................................................................................................ + // ldr q1, [x5, #(-12*16 + 8*16)] // ................................e.............|.......................................................................................................................................................e............. + // ldr q5, [x5, #(-12*16 + 9*16)] // ..............................................|........*............................................................................................................................................................ + // ldr q2, [x5, #(-12*16 + 10*16)] // ..............................................|.....................*............................................................................................................................................... + // ldr q6, [x5, #(-12*16 + 11*16)] // ..............................................|......*.............................................................................................................................................................. + // sub v24.4s, v13.4s, v14.4s // ..............................................|..................................*.................................................................................................................................. 
+ // add v13.4s, v13.4s, v14.4s // ..............................................|................................*.................................................................................................................................... + // mul v14.4s, v24.4s, v1.4s // ..............................................|............................................*........................................................................................................................ + // sqrdmulh v24.4s, v24.4s, v5.4s // ..............................................|...........................................*......................................................................................................................... + // mls v14.4s, v24.4s, v8.s[0] // ..............................................|...................................................*................................................................................................................. + // sub v24.4s, v15.4s, v16.4s // ..............................................|....................................*................................................................................................................................ + // add v15.4s, v15.4s, v16.4s // ..............................................|.......................................*............................................................................................................................. + // mul v16.4s, v24.4s, v2.4s // ..............................................|.........................................*........................................................................................................................... + // sqrdmulh v24.4s, v24.4s, v6.4s // ..............................................|..........................................*.......................................................................................................................... 
+ // mls v16.4s, v24.4s, v8.s[0] // ..............................................|....................................................*................................................................................................................ + // sub v24.4s, v13.4s, v15.4s // ..............................................|..............................................*...................................................................................................................... + // add v13.4s, v13.4s, v15.4s // ..............................................|.............................................*....................................................................................................................... + // mul v15.4s, v24.4s, v0.4s // ..............................................|.............................................................*....................................................................................................... + // sqrdmulh v24.4s, v24.4s, v4.4s // ..............................................|..........................................................*.......................................................................................................... + // mls v15.4s, v24.4s, v8.s[0] // ..............................................|........................................................................*............................................................................................ + // sub v24.4s, v14.4s, v16.4s // ..............................................|...........................................................*......................................................................................................... + // add v14.4s, v14.4s, v16.4s // ..............................................|............................................................*........................................................................................................ 
+ // mul v16.4s, v24.4s, v0.4s // ..............................................|...................................................................*................................................................................................. + // sqrdmulh v24.4s, v24.4s, v4.4s // ..............................................|....................................................................*................................................................................................ + // mls v16.4s, v24.4s, v8.s[0] // ..............................................|.......................................................................*............................................................................................. + // trn1 v25.4s, v9.4s, v10.4s // ..............................................|..............................................................*...................................................................................................... + // trn2 v26.4s, v9.4s, v10.4s // ..............................................|.....................................................................*............................................................................................... + // trn1 v27.4s, v11.4s, v12.4s // ..............................................|..........................................................................*.......................................................................................... + // trn2 v28.4s, v11.4s, v12.4s // ..............................................|...........................................................................*......................................................................................... + // trn2 v11.2d, v25.2d, v27.2d // ..............................................|...............................................................................*..................................................................................... 
+ // trn2 v12.2d, v26.2d, v28.2d // ..............................................|................................................................................*.................................................................................... + // trn1 v9.2d, v25.2d, v27.2d // ..............................................|.................................................................................*................................................................................... + // trn1 v10.2d, v26.2d, v28.2d // ..............................................|..................................................................................*.................................................................................. + // trn1 v25.4s, v13.4s, v14.4s // ..............................................|.........................................................................*........................................................................................... + // trn2 v26.4s, v13.4s, v14.4s // ..............................................|..................................................................*.................................................................................................. + // trn1 v27.4s, v15.4s, v16.4s // ..............................................|..............................................................................*...................................................................................... + // trn2 v28.4s, v15.4s, v16.4s // ..............................................|.............................................................................*....................................................................................... + // trn2 v15.2d, v25.2d, v27.2d // ..............................................|......................................................................................*.............................................................................. 
+ // trn2 v16.2d, v26.2d, v28.2d // ..............................................|.......................................................................................*............................................................................. + // trn1 v13.2d, v25.2d, v27.2d // ..............................................|....................................................................................*................................................................................ + // trn1 v14.2d, v26.2d, v28.2d // ..............................................|.....................................................................................*............................................................................... + // ldr q0, [x4], #64 // ..............................................|.............................................................................................................*....................................................... + // ldr q1, [x4, #(-64 + 16)] // ..............................................|............................................................................*........................................................................................ + // ldr q2, [x4, #(-64 + 32)] // ..............................................|...................................................................................*................................................................................. + // ldr q3, [x4, #(-64 + 48)] // ..............................................|........................................*............................................................................................................................ + // sub v24.4s, v9.4s, v10.4s // ..............................................|.........................................................................................*........................................................................... 
+ // add v9.4s, v9.4s, v10.4s // ..............................................|......................................................................................................*.............................................................. + // mul v10.4s, v24.4s, v1.s[2] // ..............................................|................................................................................................*.................................................................... + // sqrdmulh v24.4s, v24.4s, v1.s[3] // ..............................................|.................................................................................................*................................................................... + // mls v10.4s, v24.4s, v8.s[0] // ..............................................|..........................................................................................................*.......................................................... + // sub v24.4s, v11.4s, v12.4s // ..............................................|........................................................................................*............................................................................ + // add v11.4s, v11.4s, v12.4s // ..............................................|..........................................................................................*.......................................................................... + // mul v12.4s, v24.4s, v2.s[0] // ..............................................|..............................................................................................*...................................................................... + // sqrdmulh v24.4s, v24.4s, v2.s[1] // ..............................................|...............................................................................................*..................................................................... 
+ // mls v12.4s, v24.4s, v8.s[0] // ..............................................|........................................................................................................*............................................................ + // sub v24.4s, v13.4s, v14.4s // ..............................................|...........................................................................................*......................................................................... + // add v13.4s, v13.4s, v14.4s // ..............................................|.............................................................................................*....................................................................... + // mul v14.4s, v24.4s, v2.s[2] // ..............................................|..................................................................................................*.................................................................. + // sqrdmulh v24.4s, v24.4s, v2.s[3] // ..............................................|....................................................................................................*................................................................ + // mls v14.4s, v24.4s, v8.s[0] // ..............................................|............................................................................................................*........................................................ + // sub v24.4s, v15.4s, v16.4s // ..............................................|............................................................................................*........................................................................ + // add v15.4s, v15.4s, v16.4s // ..............................................|.......................................................................................................*............................................................. 
+ // mul v16.4s, v24.4s, v3.s[0] // ..............................................|...................................................................................................*................................................................. + // sqrdmulh v24.4s, v24.4s, v3.s[1] // ..............................................|.....................................................................................................*............................................................... + // mls v16.4s, v24.4s, v8.s[0] // ..............................................|...........................................................................................................*......................................................... + // sub v24.4s, v9.4s, v11.4s // ..............................................|.........................................................................................................*........................................................... + // add v9.4s, v9.4s, v11.4s // ..............................................|...............................................................................................................*..................................................... + // mul v11.4s, v24.4s, v0.s[2] // ..............................................|.....................................................................................................................*............................................... + // sqrdmulh v24.4s, v24.4s, v0.s[3] // ..............................................|......................................................................................................................*.............................................. + // mls v11.4s, v24.4s, v8.s[0] // ..........*...................................|.................................................................................................................................*................................... 
+ // sub v24.4s, v10.4s, v12.4s // ..............................................|..................................................................................................................*.................................................. + // add v10.4s, v10.4s, v12.4s // .........*....................................|................................................................................................................................*.................................... + // mul v12.4s, v24.4s, v0.s[2] // ....*.........................................|...........................................................................................................................*......................................... + // sqrdmulh v24.4s, v24.4s, v0.s[3] // .....*........................................|............................................................................................................................*........................................ + // mls v12.4s, v24.4s, v8.s[0] // ..............*...............................|.....................................................................................................................................*............................... + // sub v24.4s, v13.4s, v15.4s // ..............................................|..............................................................................................................*...................................................... + // add v13.4s, v13.4s, v15.4s // ..............................................|................................................................................................................*.................................................... + // mul v15.4s, v24.4s, v1.s[0] // ..............................................|...................................................................................................................*................................................. 
+ // sqrdmulh v24.4s, v24.4s, v1.s[1] // ..............................................|....................................................................................................................*................................................ + // mls v15.4s, v24.4s, v8.s[0] // ......*.......................................|.............................................................................................................................*....................................... + // sub v24.4s, v14.4s, v16.4s // ..............................................|.................................................................................................................*................................................... + // add v14.4s, v14.4s, v16.4s // ............*.................................|...................................................................................................................................*................................. + // mul v16.4s, v24.4s, v1.s[0] // ...*..........................................|..........................................................................................................................*.......................................... + // sqrdmulh v24.4s, v24.4s, v1.s[1] // ..*...........................................|.........................................................................................................................*........................................... + // mls v16.4s, v24.4s, v8.s[0] // .............*................................|....................................................................................................................................*................................ + // sub v24.4s, v9.4s, v13.4s // .*............................................|........................................................................................................................*............................................ 
+ // add v9.4s, v9.4s, v13.4s // .......*......................................|..............................................................................................................................*...................................... + // mul v13.4s, v24.4s, v0.s[0] // ........*.....................................|...............................................................................................................................*..................................... + // sqrdmulh v24.4s, v24.4s, v0.s[1] // ...........*..................................|..................................................................................................................................*.................................. + // mls v13.4s, v24.4s, v8.s[0] // ....................*.........................|...........................................................................................................................................*......................... + // sub v24.4s, v10.4s, v14.4s // .................*............................|........................................................................................................................................*............................ + // add v10.4s, v10.4s, v14.4s // ................*.............................|.......................................................................................................................................*............................. + // mul v14.4s, v24.4s, v0.s[0] // .........................*....................|................................................................................................................................................*.................... + // sqrdmulh v24.4s, v24.4s, v0.s[1] // ........................*.....................|...............................................................................................................................................*..................... 
+ // mls v14.4s, v24.4s, v8.s[0] // ....................................*.........|...........................................................................................................................................................*......... + // sub v24.4s, v11.4s, v15.4s // ..................*...........................|.........................................................................................................................................*........................... + // add v11.4s, v11.4s, v15.4s // ...................*..........................|..........................................................................................................................................*.......................... + // mul v15.4s, v24.4s, v0.s[0] // ...........................*..................|..................................................................................................................................................*.................. + // sqrdmulh v24.4s, v24.4s, v0.s[1] // .............................*................|....................................................................................................................................................*................ + // mls v15.4s, v24.4s, v8.s[0] // .....................................*........|............................................................................................................................................................*........ + // sub v24.4s, v12.4s, v16.4s // .....................*........................|............................................................................................................................................*........................ + // add v12.4s, v12.4s, v16.4s // ......................*.......................|.............................................................................................................................................*....................... 
+ // mul v16.4s, v24.4s, v0.s[0] // ..............................*...............|.....................................................................................................................................................*............... + // sqrdmulh v24.4s, v24.4s, v0.s[1] // ...............................*..............|......................................................................................................................................................*.............. + // mls v16.4s, v24.4s, v8.s[0] // ......................................*.......|.............................................................................................................................................................*....... + // str q9, [x1], #(16*4) // ...............*..............................|......................................................................................................................................*.............................. + // str q10, [x1, #(-16*4 + 1*16)] // .......................*......................|..............................................................................................................................................*...................... + // str q11, [x1, #(-16*4 + 2*16)] // ..........................*...................|.................................................................................................................................................*................... + // str q12, [x1, #(-16*4 + 3*16)] // ............................*.................|...................................................................................................................................................*................. + // str q13, [x2], #(16*4) // ..................................*...........|.........................................................................................................................................................*........... 
+ // str q14, [x2, #(-16*4 + 1*16)] // ..........................................*...|.................................................................................................................................................................*... + // str q15, [x2, #(-16*4 + 2*16)] // ...........................................*..|..................................................................................................................................................................*.. + // str q16, [x2, #(-16*4 + 3*16)] // ............................................*.|...................................................................................................................................................................*. + // add x1, x1, #64 // .................................*............|........................................................................................................................................................*............ + // add x2, x2, #64 // .............................................*|....................................................................................................................................................................* + + sub count, count, #1 + cbnz count, layer45678_start + trn1 v10.4S, v20.4S, v0.4S // ..*............................................................................................................................................................. + trn2 v22.4S, v20.4S, v0.4S // ........*....................................................................................................................................................... + trn1 v26.4S, v27.4S, v1.4S // .....*.......................................................................................................................................................... 
+ trn2 v7.4S, v27.4S, v1.4S // ......*......................................................................................................................................................... + ldr q27, [x2, #16] // .*.............................................................................................................................................................. + ldr q13, [x2, #0] // ...*............................................................................................................................................................ + ldr q15, [x2, #48] // *............................................................................................................................................................... + // gap // ................................................................................................................................................................ + ldr q24, [x2, #32] // ....*........................................................................................................................................................... + ldr q6, [x5, #80] // ............*................................................................................................................................................... + ldr q11, [x5, #176] // .......*........................................................................................................................................................ + // gap // ................................................................................................................................................................ + // gap // ................................................................................................................................................................ + // gap // ................................................................................................................................................................ 
+ // gap // ................................................................................................................................................................ + // gap // ................................................................................................................................................................ + trn2 v23.2D, v26.2D, v10.2D // ..........*..................................................................................................................................................... + trn1 v10.2D, v26.2D, v10.2D // ...........*.................................................................................................................................................... + trn2 v26.2D, v7.2D, v22.2D // ................*............................................................................................................................................... + trn1 v22.2D, v7.2D, v22.2D // .................*.............................................................................................................................................. + ldr q7, [x5, #32] // .............*.................................................................................................................................................. + ldr q12, [x5, #64] // ..............*................................................................................................................................................. + ldr q31, [x5, #48] // ...............*................................................................................................................................................ + // gap // ................................................................................................................................................................ 
+ ldr q5, [x5, #144] // .........*...................................................................................................................................................... + ldr q9, [x5, #160] // ......................*......................................................................................................................................... + ldr q16, [x5, #112] // .............................*.................................................................................................................................. + // gap // ................................................................................................................................................................ + // gap // ................................................................................................................................................................ + // gap // ................................................................................................................................................................ + // gap // ................................................................................................................................................................ + // gap // ................................................................................................................................................................ + trn2 v20.4S, v13.4S, v27.4S // ....................*........................................................................................................................................... + trn1 v27.4S, v13.4S, v27.4S // .....................*.......................................................................................................................................... 
+ sub v13.4S, v23.4S, v26.4S // ........................*....................................................................................................................................... + sub v4.4S, v10.4S, v22.4S // .......................*........................................................................................................................................ + ldr q17, [x5], #(12*16) // ....................................*........................................................................................................................... + ldr q1, [x5, #-96] // ................................................*............................................................................................................... + ldr q19, [x4, #48] // .........................................*...................................................................................................................... + // gap // ................................................................................................................................................................ + trn2 v21.4S, v24.4S, v15.4S // ...................*............................................................................................................................................ + trn1 v15.4S, v24.4S, v15.4S // ..................*............................................................................................................................................. + add v26.4S, v23.4S, v26.4S // .......................................*........................................................................................................................ + add v10.4S, v10.4S, v22.4S // ..................................*............................................................................................................................. 
+ ldr q22, [x4, #32] // ....................................................................................*........................................................................... + ldr q24, [x4, #16] // .............................................................................*.................................................................................. + ldr q23, [x4], #64 // ..............................................................................................................*................................................. + // gap // ................................................................................................................................................................ + sqrdmulh v6.4S, v13.4S, v6.4S // ......................................*......................................................................................................................... + mul v7.4S, v4.4S, v7.4S // ...............................*................................................................................................................................ + mul v13.4S, v13.4S, v12.4S // ..............................*................................................................................................................................. + sqrdmulh v12.4S, v4.4S, v31.4S // ................................*............................................................................................................................... + // gap // ................................................................................................................................................................ + // gap // ................................................................................................................................................................ 
+ // gap // ................................................................................................................................................................ + // gap // ................................................................................................................................................................ + trn2 v31.2D, v20.2D, v21.2D // ...........................*.................................................................................................................................... + trn1 v20.2D, v20.2D, v21.2D // ............................*................................................................................................................................... + trn2 v4.2D, v27.2D, v15.2D // ..........................*..................................................................................................................................... + trn1 v27.2D, v27.2D, v15.2D // .........................*...................................................................................................................................... + // gap // ................................................................................................................................................................ + // gap // ................................................................................................................................................................ + // gap // ................................................................................................................................................................ + // gap // ................................................................................................................................................................ 
+ sub v15.4S, v10.4S, v26.4S // ...................................................*............................................................................................................ + add v10.4S, v10.4S, v26.4S // ......................................................*......................................................................................................... + // gap // ................................................................................................................................................................ + // gap // ................................................................................................................................................................ + // gap // ................................................................................................................................................................ + // gap // ................................................................................................................................................................ + // gap // ................................................................................................................................................................ + // gap // ................................................................................................................................................................ + mls v13.4S, v6.4S, v8.S[0] // .................................................*.............................................................................................................. + mls v7.4S, v12.4S, v8.S[0] // ..................................................*............................................................................................................. 
+ sub v26.4S, v27.4S, v20.4S // ...................................*............................................................................................................................ + sub v6.4S, v4.4S, v31.4S // .....................................*.......................................................................................................................... + // gap // ................................................................................................................................................................ + // gap // ................................................................................................................................................................ + // gap // ................................................................................................................................................................ + // gap // ................................................................................................................................................................ + add v12.4S, v4.4S, v31.4S // ........................................*....................................................................................................................... + add v27.4S, v27.4S, v20.4S // .................................*.............................................................................................................................. + mul v31.4S, v15.4S, v17.4S // .......................................................*........................................................................................................ + sqrdmulh v15.4S, v15.4S, v30.4S // ........................................................*....................................................................................................... 
+ // gap // ................................................................................................................................................................ + // gap // ................................................................................................................................................................ + // gap // ................................................................................................................................................................ + // gap // ................................................................................................................................................................ + mul v18.4S, v26.4S, v18.4S // .............................................*.................................................................................................................. + sqrdmulh v11.4S, v6.4S, v11.4S // ...........................................*.................................................................................................................... + sqrdmulh v26.4S, v26.4S, v5.4S // ............................................*................................................................................................................... + mul v6.4S, v6.4S, v9.4S // ..........................................*..................................................................................................................... + // gap // ................................................................................................................................................................ + // gap // ................................................................................................................................................................ + // gap // ................................................................................................................................................................ 
+ // gap // ................................................................................................................................................................ + sub v5.4S, v7.4S, v13.4S // ..........................................................*..................................................................................................... + add v7.4S, v7.4S, v13.4S // .........................................................*...................................................................................................... + sub v13.4S, v27.4S, v12.4S // ...............................................*................................................................................................................ + add v27.4S, v27.4S, v12.4S // ..............................................*................................................................................................................. + // gap // ................................................................................................................................................................ + // gap // ................................................................................................................................................................ + // gap // ................................................................................................................................................................ + // gap // ................................................................................................................................................................ + mls v31.4S, v15.4S, v8.S[0] // ..................................................................*............................................................................................. + // gap // ................................................................................................................................................................ 
+ // gap // ................................................................................................................................................................ + // gap // ................................................................................................................................................................ + // gap // ................................................................................................................................................................ + // gap // ................................................................................................................................................................ + // gap // ................................................................................................................................................................ + // gap // ................................................................................................................................................................ + sqrdmulh v30.4S, v5.4S, v30.4S // .................................................................*.............................................................................................. + mls v18.4S, v26.4S, v8.S[0] // ....................................................*........................................................................................................... + mls v6.4S, v11.4S, v8.S[0] // .....................................................*.......................................................................................................... + sqrdmulh v26.4S, v13.4S, v16.4S // ...........................................................*.................................................................................................... + // gap // ................................................................................................................................................................ 
+ // gap // ................................................................................................................................................................ + // gap // ................................................................................................................................................................ + // gap // ................................................................................................................................................................ + mul v15.4S, v5.4S, v17.4S // ................................................................*............................................................................................... + trn1 v11.4S, v10.4S, v7.4S // ...............................................................*................................................................................................ + trn2 v10.4S, v10.4S, v7.4S // ......................................................................*......................................................................................... + mul v7.4S, v13.4S, v1.4S // ..............................................................*................................................................................................. + // gap // ................................................................................................................................................................ + // gap // ................................................................................................................................................................ + // gap // ................................................................................................................................................................ + // gap // ................................................................................................................................................................ 
+ // gap // ................................................................................................................................................................ + // gap // ................................................................................................................................................................ + // gap // ................................................................................................................................................................ + // gap // ................................................................................................................................................................ + // gap // ................................................................................................................................................................ + // gap // ................................................................................................................................................................ + // gap // ................................................................................................................................................................ + // gap // ................................................................................................................................................................ + sub v13.4S, v18.4S, v6.4S // ............................................................*................................................................................................... + add v18.4S, v18.4S, v6.4S // .............................................................*.................................................................................................. + // gap // ................................................................................................................................................................ 
+ // gap // ................................................................................................................................................................ + // gap // ................................................................................................................................................................ + // gap // ................................................................................................................................................................ + // gap // ................................................................................................................................................................ + // gap // ................................................................................................................................................................ + mls v15.4S, v30.4S, v8.S[0] // .......................................................................*........................................................................................ + mls v7.4S, v26.4S, v8.S[0] // .........................................................................*...................................................................................... + // gap // ................................................................................................................................................................ + // gap // ................................................................................................................................................................ + // gap // ................................................................................................................................................................ + // gap // ................................................................................................................................................................ 
+ // gap // ................................................................................................................................................................ + // gap // ................................................................................................................................................................ + sqrdmulh v26.4S, v13.4S, v16.4S // .....................................................................*.......................................................................................... + mul v30.4S, v13.4S, v1.4S // ....................................................................*........................................................................................... + trn2 v13.4S, v27.4S, v18.4S // ...................................................................*............................................................................................ + trn1 v18.4S, v27.4S, v18.4S // ..........................................................................*..................................................................................... + // gap // ................................................................................................................................................................ + // gap // ................................................................................................................................................................ + // gap // ................................................................................................................................................................ + // gap // ................................................................................................................................................................ + // gap // ................................................................................................................................................................ 
+ // gap // ................................................................................................................................................................ + // gap // ................................................................................................................................................................ + // gap // ................................................................................................................................................................ + // gap // ................................................................................................................................................................ + // gap // ................................................................................................................................................................ + // gap // ................................................................................................................................................................ + // gap // ................................................................................................................................................................ + trn1 v27.4S, v31.4S, v15.4S // ...........................................................................*.................................................................................... + trn2 v15.4S, v31.4S, v15.4S // ............................................................................*................................................................................... + // gap // ................................................................................................................................................................ + // gap // ................................................................................................................................................................ 
+ // gap // ................................................................................................................................................................ + // gap // ................................................................................................................................................................ + // gap // ................................................................................................................................................................ + // gap // ................................................................................................................................................................ + mls v30.4S, v26.4S, v8.S[0] // ........................................................................*....................................................................................... + // gap // ................................................................................................................................................................ + // gap // ................................................................................................................................................................ + // gap // ................................................................................................................................................................ + // gap // ................................................................................................................................................................ + // gap // ................................................................................................................................................................ + // gap // ................................................................................................................................................................ 
+ // gap // ................................................................................................................................................................ + trn2 v26.2D, v11.2D, v27.2D // ................................................................................*............................................................................... + trn1 v27.2D, v11.2D, v27.2D // ..................................................................................*............................................................................. + trn2 v6.2D, v10.2D, v15.2D // .................................................................................*.............................................................................. + trn1 v10.2D, v10.2D, v15.2D // ...................................................................................*............................................................................ + // gap // ................................................................................................................................................................ + // gap // ................................................................................................................................................................ + // gap // ................................................................................................................................................................ + // gap // ................................................................................................................................................................ + // gap // ................................................................................................................................................................ + // gap // ................................................................................................................................................................ 
+ // gap // ................................................................................................................................................................ + // gap // ................................................................................................................................................................ + // gap // ................................................................................................................................................................ + // gap // ................................................................................................................................................................ + // gap // ................................................................................................................................................................ + // gap // ................................................................................................................................................................ + trn2 v15.4S, v7.4S, v30.4S // ..............................................................................*................................................................................. + trn1 v7.4S, v7.4S, v30.4S // ...............................................................................*................................................................................ + sub v30.4S, v26.4S, v6.4S // .........................................................................................*...................................................................... + sub v11.4S, v27.4S, v10.4S // ..........................................................................................*..................................................................... + // gap // ................................................................................................................................................................ 
+ // gap // ................................................................................................................................................................ + // gap // ................................................................................................................................................................ + // gap // ................................................................................................................................................................ + add v26.4S, v26.4S, v6.4S // ...........................................................................................*.................................................................... + add v10.4S, v27.4S, v10.4S // .......................................................................................................*........................................................ + // gap // ................................................................................................................................................................ + // gap // ................................................................................................................................................................ + // gap // ................................................................................................................................................................ + // gap // ................................................................................................................................................................ + // gap // ................................................................................................................................................................ + // gap // ................................................................................................................................................................ 
+ trn1 v27.2D, v13.2D, v15.2D // ......................................................................................*......................................................................... + trn2 v13.2D, v13.2D, v15.2D // ........................................................................................*....................................................................... + trn1 v15.2D, v18.2D, v7.2D // .....................................................................................*.......................................................................... + trn2 v18.2D, v18.2D, v7.2D // .......................................................................................*........................................................................ + // gap // ................................................................................................................................................................ + // gap // ................................................................................................................................................................ + // gap // ................................................................................................................................................................ + // gap // ................................................................................................................................................................ + mul v7.4S, v30.4S, v22.S[0] // ...............................................................................................*................................................................ + sqrdmulh v30.4S, v30.4S, v22.S[1] // ................................................................................................*............................................................... 
+ mul v6.4S, v11.4S, v24.S[2] // .................................................................................................*.............................................................. + sqrdmulh v11.4S, v11.4S, v24.S[3] // ..................................................................................................*............................................................. + // gap // ................................................................................................................................................................ + // gap // ................................................................................................................................................................ + // gap // ................................................................................................................................................................ + // gap // ................................................................................................................................................................ + sub v12.4S, v15.4S, v27.4S // ............................................................................................*................................................................... + add v27.4S, v15.4S, v27.4S // ..............................................................................................*................................................................. + sub v15.4S, v18.4S, v13.4S // .............................................................................................*.................................................................. + add v18.4S, v18.4S, v13.4S // ........................................................................................................*....................................................... 
+ // gap // ................................................................................................................................................................ + // gap // ................................................................................................................................................................ + // gap // ................................................................................................................................................................ + // gap // ................................................................................................................................................................ + sub v13.4S, v10.4S, v26.4S // ..........................................................................................................*..................................................... + add v10.4S, v10.4S, v26.4S // ................................................................................................................*............................................... + // gap // ................................................................................................................................................................ + // gap // ................................................................................................................................................................ + // gap // ................................................................................................................................................................ + // gap // ................................................................................................................................................................ + // gap // ................................................................................................................................................................ 
+ // gap // ................................................................................................................................................................ + mul v26.4S, v15.4S, v19.S[0] // ....................................................................................................*........................................................... + sqrdmulh v15.4S, v15.4S, v19.S[1] // ......................................................................................................*......................................................... + mul v31.4S, v12.4S, v22.S[2] // ...................................................................................................*............................................................ + sqrdmulh v22.4S, v12.4S, v22.S[3] // .....................................................................................................*.......................................................... + // gap // ................................................................................................................................................................ + // gap // ................................................................................................................................................................ + // gap // ................................................................................................................................................................ + // gap // ................................................................................................................................................................ + mls v7.4S, v30.4S, v8.S[0] // .........................................................................................................*...................................................... 
+ mls v6.4S, v11.4S, v8.S[0] // ...........................................................................................................*.................................................... + sub v30.4S, v27.4S, v18.4S // ...............................................................................................................*................................................ + add v18.4S, v27.4S, v18.4S // .................................................................................................................*.............................................. + // gap // ................................................................................................................................................................ + // gap // ................................................................................................................................................................ + // gap // ................................................................................................................................................................ + // gap // ................................................................................................................................................................ + mul v11.4S, v13.4S, v23.S[2] // ......................................................................................................................*......................................... + sqrdmulh v27.4S, v13.4S, v23.S[3] // .......................................................................................................................*........................................ + // gap // ................................................................................................................................................................ 
+ // gap // ................................................................................................................................................................ + // gap // ................................................................................................................................................................ + // gap // ................................................................................................................................................................ + // gap // ................................................................................................................................................................ + // gap // ................................................................................................................................................................ + mls v26.4S, v15.4S, v8.S[0] // ............................................................................................................*................................................... + mls v31.4S, v22.4S, v8.S[0] // .............................................................................................................*.................................................. + mul v22.4S, v30.4S, v24.S[0] // ....................................................................................................................*........................................... + sqrdmulh v30.4S, v30.4S, v24.S[1] // .....................................................................................................................*.......................................... + // gap // ................................................................................................................................................................ + // gap // ................................................................................................................................................................ 
+ // gap // ................................................................................................................................................................ + // gap // ................................................................................................................................................................ + add v13.4S, v10.4S, v18.4S // ..............................................................................................................................*................................. + sub v10.4S, v10.4S, v18.4S // ........................................................................................................................*....................................... + sub v18.4S, v6.4S, v7.4S // ...................................................................................................................*............................................ + add v7.4S, v6.4S, v7.4S // ................................................................................................................................*............................... + // gap // ................................................................................................................................................................ + // gap // ................................................................................................................................................................ + // gap // ................................................................................................................................................................ + // gap // ................................................................................................................................................................ + mls v11.4S, v27.4S, v8.S[0] // .................................................................................................................................*.............................. 
+ // gap // ................................................................................................................................................................ + // gap // ................................................................................................................................................................ + // gap // ................................................................................................................................................................ + // gap // ................................................................................................................................................................ + // gap // ................................................................................................................................................................ + // gap // ................................................................................................................................................................ + // gap // ................................................................................................................................................................ + mls v22.4S, v30.4S, v8.S[0] // .............................................................................................................................*.................................. + str q13, [x1], #(16*4) // ......................................................................................................................................*......................... + mul v13.4S, v18.4S, v23.S[2] // ...........................................................................................................................*.................................... + sqrdmulh v18.4S, v18.4S, v23.S[3] // ............................................................................................................................*................................... 
+ sub v30.4S, v31.4S, v26.4S // ..................................................................................................................*............................................. + // gap // ................................................................................................................................................................ + // gap // ................................................................................................................................................................ + // gap // ................................................................................................................................................................ + add v26.4S, v31.4S, v26.4S // ...................................................................................................................................*............................ + mul v15.4S, v10.4S, v23.S[0] // ...............................................................................................................................*................................ + sqrdmulh v10.4S, v10.4S, v23.S[1] // ..................................................................................................................................*............................. + // gap // ................................................................................................................................................................ + // gap // ................................................................................................................................................................ + // gap // ................................................................................................................................................................ + // gap // ................................................................................................................................................................ 
+ // gap // ................................................................................................................................................................ + sqrdmulh v6.4S, v30.4S, v24.S[1] // .........................................................................................................................*...................................... + mul v27.4S, v30.4S, v24.S[0] // ..........................................................................................................................*..................................... + // gap // ................................................................................................................................................................ + // gap // ................................................................................................................................................................ + // gap // ................................................................................................................................................................ + // gap // ................................................................................................................................................................ + // gap // ................................................................................................................................................................ + // gap // ................................................................................................................................................................ + add v30.4S, v7.4S, v26.4S // .......................................................................................................................................*........................ + sub v26.4S, v7.4S, v26.4S // ........................................................................................................................................*....................... 
+ mls v13.4S, v18.4S, v8.S[0] // .....................................................................................................................................*.......................... + sub v18.4S, v11.4S, v22.4S // .........................................................................................................................................*...................... + // gap // ................................................................................................................................................................ + // gap // ................................................................................................................................................................ + // gap // ................................................................................................................................................................ + // gap // ................................................................................................................................................................ + add v22.4S, v11.4S, v22.4S // ..........................................................................................................................................*..................... + mls v15.4S, v10.4S, v8.S[0] // ...........................................................................................................................................*.................... + // gap // ................................................................................................................................................................ + // gap // ................................................................................................................................................................ + // gap // ................................................................................................................................................................ 
+ // gap // ................................................................................................................................................................ + // gap // ................................................................................................................................................................ + // gap // ................................................................................................................................................................ + mls v27.4S, v6.4S, v8.S[0] // ....................................................................................................................................*........................... + str q30, [x1, #-48] // ..............................................................................................................................................*................. + sqrdmulh v10.4S, v26.4S, v23.S[1] // ...............................................................................................................................................*................ + mul v26.4S, v26.4S, v23.S[0] // ................................................................................................................................................*............... + mul v7.4S, v18.4S, v23.S[0] // ..................................................................................................................................................*............. + // gap // ................................................................................................................................................................ + // gap // ................................................................................................................................................................ 
+ // gap // ................................................................................................................................................................ + sqrdmulh v18.4S, v18.4S, v23.S[1] // ....................................................................................................................................................*........... + str q22, [x1, #-32] // .................................................................................................................................................*.............. + // gap // ................................................................................................................................................................ + // gap // ................................................................................................................................................................ + // gap // ................................................................................................................................................................ + // gap // ................................................................................................................................................................ + // gap // ................................................................................................................................................................ + // gap // ................................................................................................................................................................ + str q15, [x2], #(16*4) // ........................................................................................................................................................*....... + // gap // ................................................................................................................................................................ 
+ // gap // ................................................................................................................................................................ + // gap // ................................................................................................................................................................ + // gap // ................................................................................................................................................................ + // gap // ................................................................................................................................................................ + // gap // ................................................................................................................................................................ + // gap // ................................................................................................................................................................ + sub v30.4S, v13.4S, v27.4S // ............................................................................................................................................*................... + add v22.4S, v13.4S, v27.4S // .............................................................................................................................................*.................. + mls v26.4S, v10.4S, v8.S[0] // .........................................................................................................................................................*...... + // gap // ................................................................................................................................................................ + // gap // ................................................................................................................................................................ 
+ // gap // ................................................................................................................................................................ + // gap // ................................................................................................................................................................ + // gap // ................................................................................................................................................................ + mls v7.4S, v18.4S, v8.S[0] // ..........................................................................................................................................................*..... + // gap // ................................................................................................................................................................ + // gap // ................................................................................................................................................................ + // gap // ................................................................................................................................................................ + // gap // ................................................................................................................................................................ + // gap // ................................................................................................................................................................ + // gap // ................................................................................................................................................................ + // gap // ................................................................................................................................................................ 
+ str q22, [x1, #-16] // ...................................................................................................................................................*............ + add x1, x1, #64 // .......................................................................................................................................................*........ + mul v10.4S, v30.4S, v23.S[0] // .....................................................................................................................................................*.......... + sqrdmulh v18.4S, v30.4S, v23.S[1] // ......................................................................................................................................................*......... + // gap // ................................................................................................................................................................ + // gap // ................................................................................................................................................................ + // gap // ................................................................................................................................................................ + // gap // ................................................................................................................................................................ + str q26, [x2, #-48] // ............................................................................................................................................................*... + // gap // ................................................................................................................................................................ + // gap // ................................................................................................................................................................ 
+ // gap // ................................................................................................................................................................ + // gap // ................................................................................................................................................................ + // gap // ................................................................................................................................................................ + // gap // ................................................................................................................................................................ + // gap // ................................................................................................................................................................ + str q7, [x2, #-32] // .............................................................................................................................................................*.. + // gap // ................................................................................................................................................................ + // gap // ................................................................................................................................................................ + // gap // ................................................................................................................................................................ + // gap // ................................................................................................................................................................ + // gap // ................................................................................................................................................................ 
+ // gap // ................................................................................................................................................................ + // gap // ................................................................................................................................................................ + mls v10.4S, v18.4S, v8.S[0] // ...........................................................................................................................................................*.... + // gap // ................................................................................................................................................................ + // gap // ................................................................................................................................................................ + // gap // ................................................................................................................................................................ + // gap // ................................................................................................................................................................ + // gap // ................................................................................................................................................................ + // gap // ................................................................................................................................................................ + // gap // ................................................................................................................................................................ + // gap // ................................................................................................................................................................ 
+ // gap // ................................................................................................................................................................ + // gap // ................................................................................................................................................................ + // gap // ................................................................................................................................................................ + // gap // ................................................................................................................................................................ + // gap // ................................................................................................................................................................ + // gap // ................................................................................................................................................................ + // gap // ................................................................................................................................................................ + // gap // ................................................................................................................................................................ + // gap // ................................................................................................................................................................ + // gap // ................................................................................................................................................................ + // gap // ................................................................................................................................................................ 
+ // gap // ................................................................................................................................................................ + // gap // ................................................................................................................................................................ + // gap // ................................................................................................................................................................ + // gap // ................................................................................................................................................................ + str q10, [x2, #-16] // ..............................................................................................................................................................*. + add x2, x2, #64 // ...............................................................................................................................................................* + // gap // ................................................................................................................................................................ + // gap // ................................................................................................................................................................ + // gap // ................................................................................................................................................................ + // gap // ................................................................................................................................................................ + // gap // ................................................................................................................................................................ 
+ // gap // ................................................................................................................................................................ + + // original source code + // ldr q13, [x2, #48] // ......*......................................................................................................................................................... + // ldr q19, [x2, #16] // ....*........................................................................................................................................................... + // trn1 v25.4S, v20.4S, v0.4S // *............................................................................................................................................................... + // ldr q5, [x2, #0] // .....*.......................................................................................................................................................... + // ldr q31, [x2, #32] // .......*........................................................................................................................................................ + // trn1 v7.4S, v27.4S, v1.4S // ..*............................................................................................................................................................. + // trn2 v27.4S, v27.4S, v1.4S // ...*............................................................................................................................................................ + // ldr q1, [x5, #176] // .........*...................................................................................................................................................... + // trn2 v14.4S, v20.4S, v0.4S // .*.............................................................................................................................................................. 
+ // ldr q12, [x5, #144] // .................*.............................................................................................................................................. + // trn2 v10.2D, v7.2D, v25.2D // ..........*..................................................................................................................................................... + // trn1 v29.2D, v7.2D, v25.2D // ...........*.................................................................................................................................................... + // ldr q11, [x5, #80] // ........*....................................................................................................................................................... + // ldr q9, [x5, #32] // ..............*................................................................................................................................................. + // ldr q28, [x5, #64] // ...............*................................................................................................................................................ + // ldr q22, [x5, #48] // ................*............................................................................................................................................... + // trn2 v24.2D, v27.2D, v14.2D // ............*................................................................................................................................................... + // trn1 v21.2D, v27.2D, v14.2D // .............*.................................................................................................................................................. + // trn1 v14.4S, v31.4S, v13.4S // ............................*................................................................................................................................... 
+ // trn2 v15.4S, v31.4S, v13.4S // ...........................*.................................................................................................................................... + // trn2 v13.4S, v5.4S, v19.4S // ....................*........................................................................................................................................... + // trn1 v3.4S, v5.4S, v19.4S // .....................*.......................................................................................................................................... + // ldr q5, [x5, #160] // ..................*............................................................................................................................................. + // sub v2.4S, v29.4S, v21.4S // .......................*........................................................................................................................................ + // sub v16.4S, v10.4S, v24.4S // ......................*......................................................................................................................................... + // trn1 v0.2D, v3.2D, v14.2D // .........................................*...................................................................................................................... + // trn2 v23.2D, v3.2D, v14.2D // ........................................*....................................................................................................................... + // trn2 v6.2D, v13.2D, v15.2D // ......................................*......................................................................................................................... + // trn1 v26.2D, v13.2D, v15.2D // .......................................*........................................................................................................................ 
+ // ldr q31, [x5, #112] // ...................*............................................................................................................................................ + // mul v27.4S, v16.4S, v28.4S // ....................................*........................................................................................................................... + // mul v14.4S, v2.4S, v9.4S // ...................................*............................................................................................................................ + // sqrdmulh v13.4S, v2.4S, v22.4S // .....................................*.......................................................................................................................... + // add v15.4S, v0.4S, v26.4S // .................................................*.............................................................................................................. + // add v9.4S, v29.4S, v21.4S // ..............................*................................................................................................................................. + // sub v19.4S, v0.4S, v26.4S // ..............................................*................................................................................................................. + // ldr q20, [x5], #(12*16) // ........................*....................................................................................................................................... + // sub v28.4S, v23.4S, v6.4S // ...............................................*................................................................................................................ + // sqrdmulh v25.4S, v16.4S, v11.4S // ..................................*............................................................................................................................. 
+ // add v2.4S, v10.4S, v24.4S // .............................*.................................................................................................................................. + // add v16.4S, v23.4S, v6.4S // ................................................*............................................................................................................... + // ldr q10, [x4, #48] // ..........................*..................................................................................................................................... + // mul v21.4S, v28.4S, v5.4S // .......................................................*........................................................................................................ + // sqrdmulh v3.4S, v28.4S, v1.4S // .....................................................*.......................................................................................................... + // sqrdmulh v4.4S, v19.4S, v12.4S // ......................................................*......................................................................................................... + // mul v22.4S, v19.4S, v18.4S // ....................................................*........................................................................................................... + // add v5.4S, v15.4S, v16.4S // ...........................................................*.................................................................................................... + // sub v15.4S, v15.4S, v16.4S // ..........................................................*..................................................................................................... + // ldr q26, [x5, #-96] // .........................*...................................................................................................................................... 
+ // mls v27.4S, v25.4S, v8.S[0] // ............................................*................................................................................................................... + // mls v14.4S, v13.4S, v8.S[0] // .............................................*.................................................................................................................. + // sub v12.4S, v9.4S, v2.4S // ..........................................*..................................................................................................................... + // mls v22.4S, v4.4S, v8.S[0] // ..............................................................*................................................................................................. + // mls v21.4S, v3.4S, v8.S[0] // ...............................................................*................................................................................................ + // add v28.4S, v9.4S, v2.4S // ...........................................*.................................................................................................................... + // mul v19.4S, v12.4S, v20.4S // ..................................................*............................................................................................................. + // sqrdmulh v11.4S, v12.4S, v30.4S // ...................................................*............................................................................................................ + // add v0.4S, v14.4S, v27.4S // .........................................................*...................................................................................................... + // sub v6.4S, v14.4S, v27.4S // ........................................................*....................................................................................................... 
+ // sqrdmulh v14.4S, v15.4S, v31.4S // ................................................................*............................................................................................... + // sub v29.4S, v22.4S, v21.4S // .....................................................................*.......................................................................................... + // add v1.4S, v22.4S, v21.4S // ......................................................................*......................................................................................... + // mul v9.4S, v15.4S, v26.4S // ....................................................................*........................................................................................... + // trn1 v24.4S, v28.4S, v0.4S // ..................................................................*............................................................................................. + // mul v25.4S, v6.4S, v20.4S // .................................................................*.............................................................................................. + // sqrdmulh v17.4S, v6.4S, v30.4S // .............................................................*.................................................................................................. + // mls v19.4S, v11.4S, v8.S[0] // ............................................................*................................................................................................... + // trn2 v22.4S, v5.4S, v1.4S // ...........................................................................*.................................................................................... + // mul v4.4S, v29.4S, v26.4S // ..........................................................................*..................................................................................... 
+ // sqrdmulh v26.4S, v29.4S, v31.4S // .........................................................................*...................................................................................... + // trn2 v21.4S, v28.4S, v0.4S // ...................................................................*............................................................................................ + // mls v25.4S, v17.4S, v8.S[0] // .......................................................................*........................................................................................ + // mls v4.4S, v26.4S, v8.S[0] // ...............................................................................*................................................................................ + // mls v9.4S, v14.4S, v8.S[0] // ........................................................................*....................................................................................... + // trn1 v7.4S, v5.4S, v1.4S // ............................................................................*................................................................................... + // trn1 v30.4S, v19.4S, v25.4S // .............................................................................*.................................................................................. + // trn2 v11.4S, v19.4S, v25.4S // ..............................................................................*................................................................................. + // ldr q25, [x4, #16] // ................................*............................................................................................................................... + // trn2 v27.4S, v9.4S, v4.4S // ....................................................................................*........................................................................... 
+ // trn1 v0.4S, v9.4S, v4.4S // .....................................................................................*.......................................................................... + // trn2 v14.2D, v24.2D, v30.2D // ................................................................................*............................................................................... + // trn2 v28.2D, v21.2D, v11.2D // ..................................................................................*............................................................................. + // trn1 v29.2D, v24.2D, v30.2D // .................................................................................*.............................................................................. + // trn1 v17.2D, v21.2D, v11.2D // ...................................................................................*............................................................................ + // ldr q11, [x4, #32] // ...............................*................................................................................................................................ + // trn1 v1.2D, v7.2D, v0.2D // ............................................................................................*................................................................... + // trn1 v12.2D, v22.2D, v27.2D // ..........................................................................................*..................................................................... + // trn2 v30.2D, v7.2D, v0.2D // .............................................................................................*.................................................................. + // trn2 v13.2D, v22.2D, v27.2D // ...........................................................................................*.................................................................... 
+ // sub v31.4S, v14.4S, v28.4S // ......................................................................................*......................................................................... + // sub v20.4S, v29.4S, v17.4S // .......................................................................................*........................................................................ + // add v3.4S, v14.4S, v28.4S // ........................................................................................*....................................................................... + // sub v18.4S, v1.4S, v12.4S // ..................................................................................................*............................................................. + // sub v9.4S, v30.4S, v13.4S // ....................................................................................................*........................................................... + // add v0.4S, v1.4S, v12.4S // ...................................................................................................*............................................................ + // mul v22.4S, v31.4S, v11.S[0] // ..............................................................................................*................................................................. + // sqrdmulh v4.4S, v31.4S, v11.S[1] // ...............................................................................................*................................................................ + // mul v6.4S, v20.4S, v25.S[2] // ................................................................................................*............................................................... + // sqrdmulh v27.4S, v20.4S, v25.S[3] // .................................................................................................*.............................................................. 
+ // mul v16.4S, v18.4S, v11.S[2] // ..........................................................................................................*..................................................... + // mul v26.4S, v9.4S, v10.S[0] // ........................................................................................................*....................................................... + // sqrdmulh v20.4S, v18.4S, v11.S[3] // ...........................................................................................................*.................................................... + // sqrdmulh v18.4S, v9.4S, v10.S[1] // .........................................................................................................*...................................................... + // add v14.4S, v29.4S, v17.4S // .........................................................................................*...................................................................... + // add v10.4S, v30.4S, v13.4S // .....................................................................................................*.......................................................... + // mls v22.4S, v4.4S, v8.S[0] // ............................................................................................................*................................................... + // sub v30.4S, v14.4S, v3.4S // ......................................................................................................*......................................................... + // mls v6.4S, v27.4S, v8.S[0] // .............................................................................................................*.................................................. + // mls v26.4S, v18.4S, v8.S[0] // ..................................................................................................................*............................................. 
+ // mls v16.4S, v20.4S, v8.S[0] // ...................................................................................................................*............................................ + // ldr q18, [x4], #64 // .................................*.............................................................................................................................. + // sub v13.4S, v0.4S, v10.4S // ..............................................................................................................*................................................. + // add v3.4S, v14.4S, v3.4S // .......................................................................................................*........................................................ + // add v31.4S, v0.4S, v10.4S // ...............................................................................................................*................................................ + // sub v17.4S, v16.4S, v26.4S // ...............................................................................................................................*................................ + // sub v29.4S, v6.4S, v22.4S // ........................................................................................................................*....................................... + // mul v9.4S, v13.4S, v25.S[0] // ....................................................................................................................*........................................... + // sqrdmulh v14.4S, v13.4S, v25.S[1] // .....................................................................................................................*.......................................... + // mul v10.4S, v30.4S, v18.S[2] // ................................................................................................................*............................................... 
+ // sqrdmulh v19.4S, v30.4S, v18.S[3] // .................................................................................................................*.............................................. + // sub v1.4S, v3.4S, v31.4S // .......................................................................................................................*........................................ + // sqrdmulh v0.4S, v17.4S, v25.S[1] // ...................................................................................................................................*............................ + // mul v25.4S, v17.4S, v25.S[0] // ....................................................................................................................................*........................... + // mul v27.4S, v29.4S, v18.S[2] // .............................................................................................................................*.................................. + // sqrdmulh v20.4S, v29.4S, v18.S[3] // ..............................................................................................................................*................................. + // mls v9.4S, v14.4S, v8.S[0] // ...........................................................................................................................*.................................... + // add v3.4S, v3.4S, v31.4S // ......................................................................................................................*......................................... + // mul v4.4S, v1.4S, v18.S[0] // .................................................................................................................................*.............................. + // add v7.4S, v6.4S, v22.4S // .........................................................................................................................*...................................... 
+ // mls v10.4S, v19.4S, v8.S[0] // ..........................................................................................................................*..................................... + // sqrdmulh v24.4S, v1.4S, v18.S[1] // ..................................................................................................................................*............................. + // add v19.4S, v16.4S, v26.4S // ................................................................................................................................*............................... + // mls v25.4S, v0.4S, v8.S[0] // ...........................................................................................................................................*.................... + // mls v27.4S, v20.4S, v8.S[0] // .......................................................................................................................................*........................ + // str q3, [x1], #(16*4) // ............................................................................................................................*................................... + // add v2.4S, v7.4S, v19.4S // .....................................................................................................................................*.......................... + // sub v13.4S, v7.4S, v19.4S // ......................................................................................................................................*......................... + // sub v5.4S, v10.4S, v9.4S // ........................................................................................................................................*....................... + // add v19.4S, v10.4S, v9.4S // .........................................................................................................................................*...................... 
+ // mls v4.4S, v24.4S, v8.S[0] // ..........................................................................................................................................*..................... + // sub v9.4S, v27.4S, v25.4S // ...................................................................................................................................................*............ + // add v31.4S, v27.4S, v25.4S // ....................................................................................................................................................*........... + // str q2, [x1, #-48] // ............................................................................................................................................*................... + // sqrdmulh v14.4S, v13.4S, v18.S[1] // .............................................................................................................................................*.................. + // mul v17.4S, v13.4S, v18.S[0] // ..............................................................................................................................................*................. + // str q19, [x1, #-32] // .................................................................................................................................................*.............. + // mul v19.4S, v5.4S, v18.S[0] // ...............................................................................................................................................*................ + // str q31, [x1, #-16] // .......................................................................................................................................................*........ + // sqrdmulh v31.4S, v5.4S, v18.S[1] // ................................................................................................................................................*............... 
+ // mul v28.4S, v9.4S, v18.S[0] // .........................................................................................................................................................*...... + // sqrdmulh v5.4S, v9.4S, v18.S[1] // ..........................................................................................................................................................*..... + // add x1, x1, #64 // ........................................................................................................................................................*....... + // str q4, [x2], #(16*4) // ..................................................................................................................................................*............. + // mls v17.4S, v14.4S, v8.S[0] // .....................................................................................................................................................*.......... + // mls v19.4S, v31.4S, v8.S[0] // ......................................................................................................................................................*......... + // mls v28.4S, v5.4S, v8.S[0] // .............................................................................................................................................................*.. + // str q17, [x2, #-48] // ...........................................................................................................................................................*.... + // str q19, [x2, #-32] // ............................................................................................................................................................*... + // str q28, [x2, #-16] // ..............................................................................................................................................................*. 
+ // add x2, x2, #64 // ...............................................................................................................................................................* + + +// ----------------------------------------------------------------------------- + + ninv .req v25 + ninv_tw .req v26 + + ASM_LOAD(xtmp, ninv_addr) + ld1r {ninv.4s}, [xtmp] + ASM_LOAD(xtmp, ninv_tw_addr) + ld1r {ninv_tw.4s}, [xtmp] + + mov count, #8 + ASM_LOAD(r_ptr0, roots_l012) + load_roots_123 + + .p2align 2 + ldr q18, [x0, #384] // .*.................................................. + ldr q10, [x0, #256] // ..*................................................. + ldr q22, [x0, #640] // *................................................... + // gap // .................................................... + // gap // .................................................... + // gap // .................................................... + // gap // .................................................... + // gap // .................................................... + ldr q30, [x0, #512] // ...*................................................ + ldr q27, [x0, #896] // ....*............................................... + ldr q13, [x0, #768] // .....*.............................................. + // gap // .................................................... + // gap // .................................................... + // gap // .................................................... + // gap // .................................................... + // gap // .................................................... + ldr q15, [x0, #0] // ......*............................................. + ldr q24, [x0, #128] // .......*............................................ + // gap // .................................................... + // gap // .................................................... + // gap // .................................................... 
+ // gap // .................................................... + // gap // .................................................... + // gap // .................................................... + // gap // .................................................... + // gap // .................................................... + // gap // .................................................... + // gap // .................................................... + // gap // .................................................... + // gap // .................................................... + // gap // .................................................... + // gap // .................................................... + sub v6.4S, v10.4S, v18.4S // ...............*.................................... + add v10.4S, v10.4S, v18.4S // ........*........................................... + // gap // .................................................... + // gap // .................................................... + // gap // .................................................... + // gap // .................................................... + // gap // .................................................... + // gap // .................................................... + sub v7.4S, v13.4S, v27.4S // .........*.......................................... + sub v18.4S, v30.4S, v22.4S // ............*....................................... + add v22.4S, v30.4S, v22.4S // ...........*........................................ + add v30.4S, v13.4S, v27.4S // ..........*......................................... + // gap // .................................................... + // gap // .................................................... + // gap // .................................................... + // gap // .................................................... + sub v27.4S, v15.4S, v24.4S // .............*...................................... 
+ add v13.4S, v15.4S, v24.4S // ..............*..................................... + mul v15.4S, v6.4S, v2.S[0] // .......................*............................ + sqrdmulh v24.4S, v6.4S, v2.S[1] // .........................*.......................... + // gap // .................................................... + // gap // .................................................... + // gap // .................................................... + // gap // .................................................... + mul v6.4S, v7.4S, v3.S[0] // ..................*................................. + sqrdmulh v7.4S, v7.4S, v3.S[1] // ...................*................................ + sub v11.4S, v22.4S, v30.4S // ................*................................... + mul v23.4S, v18.4S, v2.S[2] // .................*.................................. + // gap // .................................................... + // gap // .................................................... + // gap // .................................................... + // gap // .................................................... + sqrdmulh v18.4S, v18.4S, v2.S[3] // .....................*.............................. + sqrdmulh v12.4S, v27.4S, v1.S[3] // ......................*............................. + mul v27.4S, v27.4S, v1.S[2] // ........................*........................... + sub v31.4S, v13.4S, v10.4S // ....................*............................... + // gap // .................................................... + // gap // .................................................... + // gap // .................................................... + // gap // .................................................... + add v10.4S, v13.4S, v10.4S // ........................................*........... + mls v15.4S, v24.4S, v8.S[0] // ..............................*..................... + sqrdmulh v13.4S, v11.4S, v1.S[1] // ..........................*......................... 
+ mul v24.4S, v11.4S, v1.S[0] // ............................*....................... + // gap // .................................................... + // gap // .................................................... + // gap // .................................................... + // gap // .................................................... + add v22.4S, v22.4S, v30.4S // .........................................*.......... + mls v6.4S, v7.4S, v8.S[0] // .............................*...................... + sqrdmulh v7.4S, v31.4S, v0.S[3] // ...............................*.................... + mul v30.4S, v31.4S, v0.S[2] // .................................*.................. + // gap // .................................................... + // gap // .................................................... + // gap // .................................................... + // gap // .................................................... + mls v23.4S, v18.4S, v8.S[0] // ...........................*........................ + mls v27.4S, v12.4S, v8.S[0] // ................................*................... + // gap // .................................................... + // gap // .................................................... + // gap // .................................................... + // gap // .................................................... + // gap // .................................................... + // gap // .................................................... + sub v16.4S, v10.4S, v22.4S // ................................................*... + add v17.4S, v10.4S, v22.4S // .................................................*.. + mls v24.4S, v13.4S, v8.S[0] // ....................................*............... + // gap // .................................................... + // gap // .................................................... + // gap // .................................................... 
+ // gap // .................................................... + // gap // .................................................... + mls v30.4S, v7.4S, v8.S[0] // .......................................*............ + // gap // .................................................... + // gap // .................................................... + // gap // .................................................... + // gap // .................................................... + // gap // .................................................... + // gap // .................................................... + // gap // .................................................... + sub v10.4S, v27.4S, v15.4S // .....................................*.............. + add v22.4S, v27.4S, v15.4S // ......................................*............. + add v7.4S, v23.4S, v6.4S // ..................................*................. + sub v13.4S, v23.4S, v6.4S // ...................................*................ + // gap // .................................................... + // gap // .................................................... + // gap // .................................................... + // gap // .................................................... + // gap // .................................................... + // gap // .................................................... + // gap // .................................................... + // gap // .................................................... + // gap // .................................................... + // gap // .................................................... + // gap // .................................................... + // gap // .................................................... + sqrdmulh v19.4S, v10.4S, v0.S[3] // ...........................................*........ + mul v27.4S, v10.4S, v0.S[2] // .............................................*...... 
+ mul v6.4S, v13.4S, v1.S[0] // ..........................................*......... + sub v20.4S, v22.4S, v7.4S // ............................................*....... + // gap // .................................................... + // gap // .................................................... + // gap // .................................................... + // gap // .................................................... + sub v12.4S, v30.4S, v24.4S // ..................................................*. + add v18.4S, v30.4S, v24.4S // ...................................................* + add v21.4S, v22.4S, v7.4S // ..............................................*..... + sqrdmulh v30.4S, v13.4S, v1.S[1] // ...............................................*.... + // gap // .................................................... + // gap // .................................................... + // gap // .................................................... + // gap // .................................................... + + // original source code + // ldr q23, [x0, #640] // ..*................................................. + // ldr q22, [x0, #384] // *................................................... + // ldr q5, [x0, #256] // .*.................................................. + // ldr q20, [x0, #512] // ...*................................................ + // ldr q28, [x0, #896] // ....*............................................... + // ldr q21, [x0, #768] // .....*.............................................. + // ldr q9, [x0, #0] // ......*............................................. + // ldr q16, [x0, #128] // .......*............................................ + // add v19.4S, v5.4S, v22.4S // .........*.......................................... + // sub v15.4S, v21.4S, v28.4S // ..........*......................................... + // add v4.4S, v21.4S, v28.4S // .............*...................................... 
+ // add v6.4S, v20.4S, v23.4S // ............*....................................... + // sub v28.4S, v20.4S, v23.4S // ...........*........................................ + // sub v23.4S, v9.4S, v16.4S // ..............*..................................... + // add v12.4S, v9.4S, v16.4S // ...............*.................................... + // sub v9.4S, v5.4S, v22.4S // ........*........................................... + // sub v20.4S, v6.4S, v4.4S // ....................*............................... + // mul v27.4S, v28.4S, v2.S[2] // .....................*.............................. + // mul v13.4S, v15.4S, v3.S[0] // ..................*................................. + // sqrdmulh v21.4S, v15.4S, v3.S[1] // ...................*................................ + // sub v5.4S, v12.4S, v19.4S // .........................*.......................... + // sqrdmulh v15.4S, v28.4S, v2.S[3] // ......................*............................. + // sqrdmulh v28.4S, v23.4S, v1.S[3] // .......................*............................ + // mul v30.4S, v9.4S, v2.S[0] // ................*................................... + // mul v7.4S, v23.4S, v1.S[2] // ........................*........................... + // sqrdmulh v23.4S, v9.4S, v2.S[1] // .................*.................................. + // sqrdmulh v9.4S, v20.4S, v1.S[1] // ............................*....................... + // mls v27.4S, v15.4S, v8.S[0] // ..................................*................. + // mul v24.4S, v20.4S, v1.S[0] // .............................*...................... + // mls v13.4S, v21.4S, v8.S[0] // ...............................*.................... + // mls v30.4S, v23.4S, v8.S[0] // ...........................*........................ + // sqrdmulh v23.4S, v5.4S, v0.S[3] // ................................*................... + // mls v7.4S, v28.4S, v8.S[0] // ...................................*................ 
+ // mul v15.4S, v5.4S, v0.S[2] // .................................*.................. + // add v28.4S, v27.4S, v13.4S // ..........................................*......... + // sub v13.4S, v27.4S, v13.4S // ...........................................*........ + // mls v24.4S, v9.4S, v8.S[0] // ......................................*............. + // sub v9.4S, v7.4S, v30.4S // ........................................*........... + // add v7.4S, v7.4S, v30.4S // .........................................*.......... + // mls v15.4S, v23.4S, v8.S[0] // .......................................*............ + // add v12.4S, v12.4S, v19.4S // ..........................*......................... + // add v5.4S, v6.4S, v4.4S // ..............................*..................... + // mul v6.4S, v13.4S, v1.S[0] // ..............................................*..... + // sqrdmulh v19.4S, v9.4S, v0.S[3] // ............................................*....... + // sub v20.4S, v7.4S, v28.4S // ...............................................*.... + // mul v27.4S, v9.4S, v0.S[2] // .............................................*...... + // add v21.4S, v7.4S, v28.4S // ..................................................*. + // sqrdmulh v30.4S, v13.4S, v1.S[1] // ...................................................* + // sub v16.4S, v12.4S, v5.4S // ....................................*............... + // add v17.4S, v12.4S, v5.4S // .....................................*.............. + // sub v12.4S, v15.4S, v24.4S // ................................................*... + // add v18.4S, v15.4S, v24.4S // .................................................*.. + + sub count, count, #1 +layer123_start: + sqrdmulh v13.4S, v21.4S, v26.4S // ....................................................................................*........... + // gap // ................................................................................................ 
+ ldr q23, [x0, #656] // .....e.......................................................................................... + ldr q22, [x0, #400] // ...e............................................................................................ + sqrdmulh v15.4S, v20.4S, v0.S[1] // ........................................................*....................................... + mul v7.4S, v21.4S, v25.4S // ...................................................................................*............ + mul v10.4S, v20.4S, v0.S[0] // .......................................................*........................................ + ldr q5, [x0, #272] // ..e............................................................................................. + mul v14.4S, v16.4S, v0.S[0] // ..................................................*............................................. + mls v6.4S, v30.4S, v8.S[0] // ...............................................*................................................ + ldr q20, [x0, #528] // ....e........................................................................................... + ldr q28, [x0, #912] // .......e........................................................................................ + // gap // ................................................................................................ + ldr q21, [x0, #784] // ......e......................................................................................... + mls v27.4S, v19.4S, v8.S[0] // .....................................*.......................................................... + sqrdmulh v4.4S, v16.4S, v0.S[1] // ...................................................*............................................ + ldr q9, [x0, #16] // e............................................................................................... + ldr q16, [x0, #144] // .e.............................................................................................. 
+ mul v11.4S, v17.4S, v25.4S // ................................................................................*............... + // gap // ................................................................................................ + // gap // ................................................................................................ + mul v31.4S, v18.4S, v25.4S // ......................................................................................*......... + sqrdmulh v30.4S, v12.4S, v0.S[1] // .............................................................*.................................. + sqrdmulh v24.4S, v18.4S, v26.4S // .......................................................................................*........ + mul v18.4S, v12.4S, v0.S[0] // ............................................................*................................... + // gap // ................................................................................................ + // gap // ................................................................................................ + // gap // ................................................................................................ + // gap // ................................................................................................ + mls v7.4S, v13.4S, v8.S[0] // .....................................................................................*.......... + sqrdmulh v17.4S, v17.4S, v26.4S // .................................................................................*.............. + mls v10.4S, v15.4S, v8.S[0] // .........................................................*...................................... + add v19.4S, v5.4S, v22.4S // ..............e................................................................................. + mls v14.4S, v4.4S, v8.S[0] // ....................................................*........................................... 
+ sub v13.4S, v27.4S, v6.4S // ...............................................................*................................ + // gap // ................................................................................................ + // gap // ................................................................................................ + // gap // ................................................................................................ + add v29.4S, v27.4S, v6.4S // ................................................................*............................... + // gap // ................................................................................................ + sub v15.4S, v21.4S, v28.4S // .......................e........................................................................ + add v4.4S, v21.4S, v28.4S // ........................e....................................................................... + // gap // ................................................................................................ + // gap // ................................................................................................ + // gap // ................................................................................................ + // gap // ................................................................................................ + add v6.4S, v20.4S, v23.4S // ...................e............................................................................ + sub v28.4S, v20.4S, v23.4S // ..................e............................................................................. + sub v23.4S, v9.4S, v16.4S // ........e....................................................................................... + add v12.4S, v9.4S, v16.4S // .........e...................................................................................... 
+ sub v9.4S, v5.4S, v22.4S // .............e.................................................................................. + // gap // ................................................................................................ + mul v22.4S, v13.4S, v0.S[0] // .................................................................*.............................. + // gap // ................................................................................................ + // gap // ................................................................................................ + // gap // ................................................................................................ + sub v20.4S, v6.4S, v4.4S // ......................................e......................................................... + // gap // ................................................................................................ + mul v27.4S, v28.4S, v2.S[2] // ....................e........................................................................... + // gap // ................................................................................................ + // gap // ................................................................................................ + // gap // ................................................................................................ + sqrdmulh v16.4S, v13.4S, v0.S[1] // ..................................................................*............................. + mul v13.4S, v15.4S, v3.S[0] // .........................e...................................................................... + sqrdmulh v21.4S, v15.4S, v3.S[1] // ..........................e..................................................................... + sub v5.4S, v12.4S, v19.4S // ............................e................................................................... 
+ sqrdmulh v15.4S, v28.4S, v2.S[3] // .....................e.......................................................................... + // gap // ................................................................................................ + // gap // ................................................................................................ + // gap // ................................................................................................ + str q7, [x0, #128] // .............................................................................................*.. + mls v18.4S, v30.4S, v8.S[0] // ..............................................................*................................. + sqrdmulh v28.4S, v23.4S, v1.S[3] // ...........e.................................................................................... + mul v30.4S, v9.4S, v2.S[0] // ...............e................................................................................ + mul v7.4S, v23.4S, v1.S[2] // ..........e..................................................................................... + // gap // ................................................................................................ + // gap // ................................................................................................ + // gap // ................................................................................................ + // gap // ................................................................................................ + sqrdmulh v23.4S, v9.4S, v2.S[1] // ................e............................................................................... + sqrdmulh v9.4S, v20.4S, v1.S[1] // .........................................e...................................................... + mls v22.4S, v16.4S, v8.S[0] // ...................................................................*............................ 
+ // gap // ................................................................................................ + srshr v16.4S, v14.4S, #23 // ....................................................................*........................... + // gap // ................................................................................................ + // gap // ................................................................................................ + // gap // ................................................................................................ + mls v31.4S, v24.4S, v8.S[0] // ........................................................................................*....... + mls v27.4S, v15.4S, v8.S[0] // ......................e......................................................................... + mul v24.4S, v20.4S, v1.S[0] // ........................................e....................................................... + mls v13.4S, v21.4S, v8.S[0] // ...........................e.................................................................... + // gap // ................................................................................................ + // gap // ................................................................................................ + // gap // ................................................................................................ + // gap // ................................................................................................ + srshr v21.4S, v10.4S, #23 // ......................................................................*......................... + // gap // ................................................................................................ + // gap // ................................................................................................ + srshr v20.4S, v18.4S, #23 // ........................................................................*....................... 
+ // gap // ................................................................................................ + // gap // ................................................................................................ + mls v30.4S, v23.4S, v8.S[0] // .................e.............................................................................. + sqrdmulh v23.4S, v5.4S, v0.S[3] // ...............................e................................................................ + mls v7.4S, v28.4S, v8.S[0] // ............e................................................................................... + mul v15.4S, v5.4S, v0.S[2] // ..............................e................................................................. + srshr v5.4S, v22.4S, #23 // ..........................................................................*..................... + str q31, [x0, #256] // ..............................................................................................*. + mul v31.4S, v29.4S, v25.4S // .........................................................................................*...... + // gap // ................................................................................................ + // gap // ................................................................................................ + // gap // ................................................................................................ + sqrdmulh v29.4S, v29.4S, v26.4S // ..........................................................................................*..... + add v28.4S, v27.4S, v13.4S // ............................................e................................................... + mls v11.4S, v17.4S, v8.S[0] // ..................................................................................*............. + // gap // ................................................................................................ 
+ // gap // ................................................................................................ + // gap // ................................................................................................ + // gap // ................................................................................................ + mls v10.4S, v21.4S, v8.4S // .......................................................................*........................ + sub v13.4S, v27.4S, v13.4S // ...........................................e.................................................... + // gap // ................................................................................................ + mls v24.4S, v9.4S, v8.S[0] // ..........................................e..................................................... + mls v14.4S, v16.4S, v8.4S // .....................................................................*.......................... + // gap // ................................................................................................ + // gap // ................................................................................................ + mls v18.4S, v20.4S, v8.4S // .........................................................................*...................... + // gap // ................................................................................................ + sub v9.4S, v7.4S, v30.4S // .................................e.............................................................. + add v7.4S, v7.4S, v30.4S // ..................................e............................................................. + mls v31.4S, v29.4S, v8.S[0] // ...........................................................................................*.... + // gap // ................................................................................................ 
+ // gap // ................................................................................................ + // gap // ................................................................................................ + // gap // ................................................................................................ + mls v22.4S, v5.4S, v8.4S // ...........................................................................*.................... + mls v15.4S, v23.4S, v8.S[0] // ................................e............................................................... + add v12.4S, v12.4S, v19.4S // .............................e.................................................................. + str q10, [x0, #640] // .............................................................................*.................. + // gap // ................................................................................................ + str q11, [x0], #(16) // ............................................................................................*... + add v5.4S, v6.4S, v4.4S // .......................................e........................................................ + // gap // ................................................................................................ + mul v6.4S, v13.4S, v1.S[0] // .............................................e.................................................. + sqrdmulh v19.4S, v9.4S, v0.S[3] // ....................................e........................................................... + str q14, [x0, #496] // ............................................................................*................... + sub v20.4S, v7.4S, v28.4S // .....................................................e.......................................... + mul v27.4S, v9.4S, v0.S[2] // ...................................e............................................................ 
+ add v21.4S, v7.4S, v28.4S // ......................................................e......................................... + // gap // ................................................................................................ + // gap // ................................................................................................ + sqrdmulh v30.4S, v13.4S, v1.S[1] // ..............................................e................................................. + str q18, [x0, #752] // ..............................................................................*................. + sub v16.4S, v12.4S, v5.4S // ................................................e............................................... + add v17.4S, v12.4S, v5.4S // .................................................e.............................................. + str q31, [x0, #368] // ...............................................................................................* + str q22, [x0, #880] // ...............................................................................*................ + // gap // ................................................................................................ + // gap // ................................................................................................ + sub v12.4S, v15.4S, v24.4S // ..........................................................e..................................... + add v18.4S, v15.4S, v24.4S // ...........................................................e.................................... + + // original source code + // ldr q9, [x0, #0] // .............e.................................................................................|.............e............................................................................... 
+ // ldr q10, [x0, #(1*(1024/8))] // ..............e................................................................................|..............e.............................................................................. + // ldr q11, [x0, #(2*(1024/8))] // .....e.........................................................................................|.....e....................................................................................... + // ldr q12, [x0, #(3*(1024/8))] // .e.............................................................................................|.e........................................................................................... + // ldr q13, [x0, #(4*(1024/8))] // ........e......................................................................................|........e.................................................................................... + // ldr q14, [x0, #(5*(1024/8))] // e..............................................................................................|e............................................................................................ + // ldr q15, [x0, #(6*(1024/8))] // ..........e....................................................................................|..........e.................................................................................. + // ldr q16, [x0, #(7*(1024/8))] // .........e.....................................................................................|.........e................................................................................... + // sub v24.4s, v9.4s, v10.4s // ...............................e...............................................................|...............................e............................................................. 
+ // add v9.4s, v9.4s, v10.4s // ................................e..............................................................|................................e............................................................ + // mul v10.4s, v24.4s, v1.s[2] // ..............................................e................................................|..............................................e.............................................. + // sqrdmulh v24.4s, v24.4s, v1.s[3] // ............................................e..................................................|............................................e................................................ + // mls v10.4s, v24.4s, v8.s[0] // ...........................................................e...................................|...........................................................e................................. + // sub v24.4s, v11.4s, v12.4s // .................................e.............................................................|.................................e........................................................... + // add v11.4s, v11.4s, v12.4s // .......................e.......................................................................|.......................e..................................................................... + // mul v12.4s, v24.4s, v2.s[0] // .............................................e.................................................|.............................................e............................................... + // sqrdmulh v24.4s, v24.4s, v2.s[1] // ...............................................e...............................................|...............................................e............................................. 
+ // mls v12.4s, v24.4s, v8.s[0] // .........................................................e.....................................|.........................................................e................................... + // sub v24.4s, v13.4s, v14.4s // ..............................e................................................................|..............................e.............................................................. + // add v13.4s, v13.4s, v14.4s // .............................e.................................................................|.............................e............................................................... + // mul v14.4s, v24.4s, v2.s[2] // ....................................e..........................................................|....................................e........................................................ + // sqrdmulh v24.4s, v24.4s, v2.s[3] // .........................................e.....................................................|.........................................e................................................... + // mls v14.4s, v24.4s, v8.s[0] // ....................................................e..........................................|....................................................e........................................ + // sub v24.4s, v15.4s, v16.4s // ...........................e...................................................................|...........................e................................................................. + // add v15.4s, v15.4s, v16.4s // ............................e..................................................................|............................e................................................................ 
+ // mul v16.4s, v24.4s, v3.s[0] // ......................................e........................................................|......................................e...................................................... + // sqrdmulh v24.4s, v24.4s, v3.s[1] // .......................................e.......................................................|.......................................e..................................................... + // mls v16.4s, v24.4s, v8.s[0] // ......................................................e........................................|......................................................e...................................... + // sub v24.4s, v9.4s, v11.4s // ........................................e......................................................|........................................e.................................................... + // add v9.4s, v9.4s, v11.4s // .............................................................................e.................|.............................................................................e............... + // mul v11.4s, v24.4s, v0.s[2] // ............................................................e..................................|............................................................e................................ + // sqrdmulh v24.4s, v24.4s, v0.s[3] // ..........................................................e....................................|..........................................................e.................................. + // mls v11.4s, v24.4s, v8.s[0] // ............................................................................e..................|............................................................................e................ 
+ // sub v24.4s, v10.4s, v12.4s // ........................................................................e......................|........................................................................e.................... + // add v10.4s, v10.4s, v12.4s // .........................................................................e.....................|.........................................................................e................... + // mul v12.4s, v24.4s, v0.s[2] // .....................................................................................e.........|.....................................................................................e....... + // sqrdmulh v24.4s, v24.4s, v0.s[3] // ..................................................................................e............|..................................................................................e.......... + // mls v12.4s, v24.4s, v8.s[0] // ...........*...................................................................................|...........*................................................................................. + // sub v24.4s, v13.4s, v15.4s // ...................................e...........................................................|...................................e......................................................... + // add v13.4s, v13.4s, v15.4s // ................................................................................e..............|................................................................................e............ + // mul v15.4s, v24.4s, v1.s[0] // .....................................................e.........................................|.....................................................e....................................... 
+ // sqrdmulh v24.4s, v24.4s, v1.s[1] // ................................................e..............................................|................................................e............................................ + // mls v15.4s, v24.4s, v8.s[0] // .....................................................................e.........................|.....................................................................e....................... + // sub v24.4s, v14.4s, v16.4s // ....................................................................e..........................|....................................................................e........................ + // add v14.4s, v14.4s, v16.4s // .................................................................e.............................|.................................................................e........................... + // mul v16.4s, v24.4s, v1.s[0] // .................................................................................e.............|.................................................................................e........... + // sqrdmulh v24.4s, v24.4s, v1.s[1] // .......................................................................................e.......|.......................................................................................e..... + // mls v16.4s, v24.4s, v8.s[0] // .......*.......................................................................................|.......*..................................................................................... + // sub v24.4s, v9.4s, v13.4s // .........................................................................................e.....|.........................................................................................e... 
+ // add v9.4s, v9.4s, v13.4s // ..........................................................................................e....|..........................................................................................e.. + // mul v13.4s, v24.4s, v0.s[0] // ......*........................................................................................|......*...................................................................................... + // sqrdmulh v24.4s, v24.4s, v0.s[1] // ............*..................................................................................|............*................................................................................ + // mls v13.4s, v24.4s, v8.s[0] // ........................*......................................................................|........................*.................................................................... + // sub v24.4s, v10.4s, v14.4s // ....................................................................................e..........|....................................................................................e........ + // add v10.4s, v10.4s, v14.4s // ......................................................................................e........|......................................................................................e...... + // mul v14.4s, v24.4s, v0.s[0] // ....*..........................................................................................|....*........................................................................................ + // sqrdmulh v24.4s, v24.4s, v0.s[1] // ..*............................................................................................|..*.......................................................................................... 
+ // mls v14.4s, v24.4s, v8.s[0] // ......................*........................................................................|......................*...................................................................... + // sub v24.4s, v11.4s, v15.4s // .............................................................................................e.|............................................................................................. + // add v11.4s, v11.4s, v15.4s // ..............................................................................................e|............................................................................................. + // mul v15.4s, v24.4s, v0.s[0] // ...................*...........................................................................|...................*......................................................................... + // sqrdmulh v24.4s, v24.4s, v0.s[1] // .................*.............................................................................|.................*........................................................................... + // mls v15.4s, v24.4s, v8.s[0] // ...........................................*...................................................|...........................................*................................................. + // sub v24.4s, v12.4s, v16.4s // .........................*.....................................................................|.........................*................................................................... + // add v12.4s, v12.4s, v16.4s // ..........................*....................................................................|..........................*.................................................................. 
+ // mul v16.4s, v24.4s, v0.s[0] // ..................................*............................................................|..................................*.......................................................... + // sqrdmulh v24.4s, v24.4s, v0.s[1] // .....................................*.........................................................|.....................................*....................................................... + // mls v16.4s, v24.4s, v8.s[0] // .................................................*.............................................|.................................................*........................................... + // srshr v24.4S, v13.4S, #23 // ..................................................*............................................|..................................................*.......................................... + // mls v13.4s, v24.4s, v8.4s // ......................................................................*........................|......................................................................*...................... + // srshr v24.4S, v14.4S, #23 // .......................................................*.......................................|.......................................................*..................................... + // mls v14.4s, v24.4s, v8.4s // ...................................................................*...........................|...................................................................*......................... + // srshr v24.4S, v15.4S, #23 // ........................................................*......................................|........................................................*.................................... 
+ // mls v15.4s, v24.4s, v8.4s // .......................................................................*.......................|.......................................................................*..................... + // srshr v24.4S, v16.4S, #23 // .............................................................*.................................|.............................................................*............................... + // mls v16.4s, v24.4s, v8.4s // ...........................................................................*...................|...........................................................................*................. + // str q13, [x0, #(4*(1024/8))] // ...................................................................................*...........|...................................................................................*......... + // str q14, [x0, #(5*(1024/8))] // ..............................................................................*................|..............................................................................*.............. + // str q15, [x0, #(6*(1024/8))] // ........................................................................................*......|........................................................................................*.... + // str q16, [x0, #(7*(1024/8))] // ............................................................................................*..|............................................................................................* + // mul v13.4s, v9.4s, v25.4s // ...............*...............................................................................|...............*............................................................................. 
+ // sqrdmulh v9.4s, v9.4s, v26.4s // .....................*.........................................................................|.....................*....................................................................... + // mls v13.4s, v9.4s, v8.s[0] // ..................................................................*............................|..................................................................*.......................... + // mul v14.4s, v10.4s, v25.4s // ...*...........................................................................................|...*......................................................................................... + // sqrdmulh v10.4s, v10.4s, v26.4s // ...............................................................................................*............................................................................................. + // mls v14.4s, v10.4s, v8.s[0] // ....................*..........................................................................|....................*........................................................................ + // mul v15.4s, v11.4s, v25.4s // ................*..............................................................................|................*............................................................................ + // sqrdmulh v11.4s, v11.4s, v26.4s // ..................*............................................................................|..................*.......................................................................... + // mls v15.4s, v11.4s, v8.s[0] // ...................................................*...........................................|...................................................*......................................... 
+ // mul v16.4s, v12.4s, v25.4s // ...............................................................*...............................|...............................................................*............................. + // sqrdmulh v12.4s, v12.4s, v26.4s // ................................................................*..............................|................................................................*............................ + // mls v16.4s, v12.4s, v8.s[0] // ..........................................................................*....................|..........................................................................*.................. + // str q13, [x0], #(16) // ...............................................................................*...............|...............................................................................*............. + // str q14, [x0, #(-16 + 1*(1024/8))] // ..........................................*....................................................|..........................................*.................................................. + // str q15, [x0, #(-16 + 2*(1024/8))] // ..............................................................*................................|..............................................................*.............................. + // str q16, [x0, #(-16 + 3*(1024/8))] // ...........................................................................................*...|...........................................................................................*. + + sub count, count, #1 + cbnz count, layer123_start + sqrdmulh v9.4S, v18.4S, v26.4S // ...........*................................ + mls v6.4S, v30.4S, v8.S[0] // .....*...................................... + mls v27.4S, v19.4S, v8.S[0] // ......*..................................... + // gap // ............................................ 
+ // gap // ............................................ + // gap // ............................................ + // gap // ............................................ + // gap // ............................................ + sqrdmulh v30.4S, v17.4S, v26.4S // ..............*............................. + // gap // ............................................ + mul v5.4S, v20.4S, v0.S[0] // ...*........................................ + sqrdmulh v31.4S, v20.4S, v0.S[1] // .*.......................................... + mul v11.4S, v16.4S, v0.S[0] // ....*....................................... + // gap // ............................................ + // gap // ............................................ + // gap // ............................................ + sqrdmulh v24.4S, v16.4S, v0.S[1] // .......*.................................... + // gap // ............................................ + // gap // ............................................ + // gap // ............................................ + mul v23.4S, v18.4S, v25.4S // .........*.................................. + // gap // ............................................ + // gap // ............................................ + // gap // ............................................ + // gap // ............................................ + sub v16.4S, v27.4S, v6.4S // .................*.......................... + mul v29.4S, v12.4S, v0.S[0] // ............*............................... + // gap // ............................................ + // gap // ............................................ + // gap // ............................................ + // gap // ............................................ + // gap // ............................................ + add v13.4S, v27.4S, v6.4S // ..................*......................... + sqrdmulh v14.4S, v12.4S, v0.S[1] // ..........*................................. 
+ mls v5.4S, v31.4S, v8.S[0] // ...............*............................ + // gap // ............................................ + // gap // ............................................ + // gap // ............................................ + // gap // ............................................ + // gap // ............................................ + mul v27.4S, v16.4S, v0.S[0] // ...................*........................ + // gap // ............................................ + // gap // ............................................ + sqrdmulh v31.4S, v16.4S, v0.S[1] // ....................*....................... + // gap // ............................................ + // gap // ............................................ + // gap // ............................................ + // gap // ............................................ + mul v12.4S, v13.4S, v25.4S // ..............................*............. + sqrdmulh v20.4S, v13.4S, v26.4S // ...............................*............ + mul v4.4S, v17.4S, v25.4S // ........*................................... + // gap // ............................................ + // gap // ............................................ + // gap // ............................................ + // gap // ............................................ + // gap // ............................................ + mls v11.4S, v24.4S, v8.S[0] // ................*........................... + mls v29.4S, v14.4S, v8.S[0] // ......................*..................... + srshr v13.4S, v5.4S, #23 // ..........................*................. + // gap // ............................................ + // gap // ............................................ + // gap // ............................................ + // gap // ............................................ + // gap // ............................................ + mls v27.4S, v31.4S, v8.S[0] // .......................*.................... 
+ mul v31.4S, v21.4S, v25.4S // ..*......................................... + mls v23.4S, v9.4S, v8.S[0] // .........................*.................. + sqrdmulh v9.4S, v21.4S, v26.4S // *........................................... + // gap // ............................................ + // gap // ............................................ + // gap // ............................................ + // gap // ............................................ + mls v4.4S, v30.4S, v8.S[0] // ................................*........... + // gap // ............................................ + // gap // ............................................ + // gap // ............................................ + // gap // ............................................ + // gap // ............................................ + // gap // ............................................ + // gap // ............................................ + mls v5.4S, v13.4S, v8.4S // .................................*.......... + srshr v13.4S, v11.4S, #23 // ........................*................... + srshr v30.4S, v29.4S, #23 // ...........................*................ + // gap // ............................................ + // gap // ............................................ + // gap // ............................................ + // gap // ............................................ + // gap // ............................................ + mls v12.4S, v20.4S, v8.S[0] // ....................................*....... + mls v31.4S, v9.4S, v8.S[0] // .............*.............................. + str q23, [x0, #256] // .............................*.............. + srshr v6.4S, v27.4S, #23 // ............................*............... + // gap // ............................................ + // gap // ............................................ + // gap // ............................................ + // gap // ............................................ 
+ str q4, [x0], #(16) // .......................................*.... + // gap // ............................................ + // gap // ............................................ + // gap // ............................................ + // gap // ............................................ + // gap // ............................................ + // gap // ............................................ + // gap // ............................................ + mls v11.4S, v13.4S, v8.4S // ..................................*......... + mls v29.4S, v30.4S, v8.4S // ...................................*........ + str q5, [x0, #624] // ......................................*..... + // gap // ............................................ + // gap // ............................................ + // gap // ............................................ + // gap // ............................................ + // gap // ............................................ + mls v27.4S, v6.4S, v8.4S // .....................................*...... + str q31, [x0, #112] // .....................*...................... + // gap // ............................................ + // gap // ............................................ + // gap // ............................................ + // gap // ............................................ + // gap // ............................................ + // gap // ............................................ + str q12, [x0, #368] // ..........................................*. + // gap // ............................................ + // gap // ............................................ + // gap // ............................................ + // gap // ............................................ + // gap // ............................................ + // gap // ............................................ + // gap // ............................................ 
+ str q11, [x0, #496] // ........................................*... + str q29, [x0, #752] // .........................................*.. + // gap // ............................................ + // gap // ............................................ + // gap // ............................................ + // gap // ............................................ + // gap // ............................................ + // gap // ............................................ + str q27, [x0, #880] // ...........................................* + // gap // ............................................ + // gap // ............................................ + // gap // ............................................ + // gap // ............................................ + // gap // ............................................ + // gap // ............................................ + // gap // ............................................ + + // original source code + // sqrdmulh v13.4S, v21.4S, v26.4S // .........................*.................. + // sqrdmulh v15.4S, v20.4S, v0.S[1] // .....*...................................... + // mul v7.4S, v21.4S, v25.4S // .......................*.................... + // mul v10.4S, v20.4S, v0.S[0] // ....*....................................... + // mul v14.4S, v16.4S, v0.S[0] // ......*..................................... + // mls v6.4S, v30.4S, v8.S[0] // .*.......................................... + // mls v27.4S, v19.4S, v8.S[0] // ..*......................................... + // sqrdmulh v4.4S, v16.4S, v0.S[1] // .......*.................................... + // mul v11.4S, v17.4S, v25.4S // ..................*......................... + // mul v31.4S, v18.4S, v25.4S // ........*................................... + // sqrdmulh v30.4S, v12.4S, v0.S[1] // ............*............................... + // sqrdmulh v24.4S, v18.4S, v26.4S // *........................................... 
+ // mul v18.4S, v12.4S, v0.S[0] // ..........*................................. + // mls v7.4S, v13.4S, v8.S[0] // ...............................*............ + // sqrdmulh v17.4S, v17.4S, v26.4S // ...*........................................ + // mls v10.4S, v15.4S, v8.S[0] // .............*.............................. + // mls v14.4S, v4.4S, v8.S[0] // ...................*........................ + // sub v13.4S, v27.4S, v6.4S // .........*.................................. + // add v29.4S, v27.4S, v6.4S // ...........*................................ + // mul v22.4S, v13.4S, v0.S[0] // ..............*............................. + // sqrdmulh v16.4S, v13.4S, v0.S[1] // ...............*............................ + // str q7, [x0, #128] // .......................................*.... + // mls v18.4S, v30.4S, v8.S[0] // ....................*....................... + // mls v22.4S, v16.4S, v8.S[0] // ......................*..................... + // srshr v16.4S, v14.4S, #23 // ............................*............... + // mls v31.4S, v24.4S, v8.S[0] // ........................*................... + // srshr v21.4S, v10.4S, #23 // .....................*...................... + // srshr v20.4S, v18.4S, #23 // .............................*.............. + // srshr v5.4S, v22.4S, #23 // .................................*.......... + // str q31, [x0, #256] // ................................*........... + // mul v31.4S, v29.4S, v25.4S // ................*........................... + // sqrdmulh v29.4S, v29.4S, v26.4S // .................*.......................... + // mls v11.4S, v17.4S, v8.S[0] // ..........................*................. + // mls v10.4S, v21.4S, v8.4S // ...........................*................ + // mls v14.4S, v16.4S, v8.4S // ...................................*........ + // mls v18.4S, v20.4S, v8.4S // ....................................*....... + // mls v31.4S, v29.4S, v8.S[0] // ..............................*............. 
+ // mls v22.4S, v5.4S, v8.4S // ......................................*..... + // str q10, [x0, #640] // .....................................*...... + // str q11, [x0], #(16) // ..................................*......... + // str q14, [x0, #496] // .........................................*.. + // str q18, [x0, #752] // ..........................................*. + // str q31, [x0, #368] // ........................................*... + // str q22, [x0, #880] // ...........................................* + + + pop_stack + ret \ No newline at end of file diff --git a/tests/ntt_dilithium/manual/intt_dilithium_123_45678_opt_a55.s b/tests/ntt_dilithium/manual/intt_dilithium_123_45678_opt_a55.s new file mode 100644 index 0000000..7caf605 --- /dev/null +++ b/tests/ntt_dilithium/manual/intt_dilithium_123_45678_opt_a55.s @@ -0,0 +1,2038 @@ + +// Needed to provide ASM_LOAD directive +#include + +// NOTE +// We use a lot of trivial macros to simplify the parsing burden for Slothy +// The macros are not unfolded by Slothy and thus interpreted as instructions, +// which are easier to parse due to e.g. the lack of size specifiers and simpler +// syntax for pre and post increment for loads and stores. + +// Eventually, NeLight should include a proper parser for AArch64, +// but for initial investigations, the below is enough. 
+// Scratch general-purpose register aliases.
+xtmp0 .req x10
+xtmp1 .req x11
+
+// ---------------------------------------------------------------------------
+// Load/store wrappers.
+// `vec` is a vector alias name; the macro prepends `qform_` to obtain the
+// matching Q-register alias (those aliases are declared with .req inside the
+// function that uses these macros).
+// ---------------------------------------------------------------------------
+
+// Load qform_<vec> from [base + offset]; base is left unchanged.
+.macro ldr_vo vec, base, offset
+    ldr qform_\vec, [\base, #\offset]
+.endm
+
+// Load qform_<vec> from [base], then post-increment base by inc.
+.macro ldr_vi vec, base, inc
+    ldr qform_\vec, [\base], #\inc
+.endm
+
+// Store qform_<vec> to [base + offset]; base is left unchanged.
+.macro str_vo vec, base, offset
+    str qform_\vec, [\base, #\offset]
+.endm
+// Store qform_<vec> to [base], then post-increment base by inc.
+.macro str_vi vec, base, inc
+    str qform_\vec, [\base], #\inc
+.endm
+
+// ---------------------------------------------------------------------------
+// Thin wrappers around 4x32-bit NEON arithmetic.
+// The `...q` variants take a lane index `i` and use lane i of `b` as the
+// second operand.
+// ---------------------------------------------------------------------------
+
+// d = sqrdmulh(a, b), vector by vector.
+.macro vqrdmulh d,a,b
+    sqrdmulh \d\().4s, \a\().4s, \b\().4s
+.endm
+// d -= a * b, vector by vector.
+.macro vmls d,a,b
+    mls \d\().4s, \a\().4s, \b\().4s
+.endm
+// d = sqrdmulh(a, b[i]).
+.macro vqrdmulhq d,a,b,i
+    sqrdmulh \d\().4s, \a\().4s, \b\().s[\i]
+.endm
+// d = sqdmulh(a, b[i]).
+// The lane operand is written `.s[i]`, matching the other by-element
+// wrappers in this file.
+.macro vqdmulhq d,a,b,i
+    sqdmulh \d\().4s, \a\().4s, \b\().s[\i]
+.endm
+// d = a * b[i].
+.macro vmulq d,a,b,i
+    mul \d\().4s, \a\().4s, \b\().s[\i]
+.endm
+// d -= a * b[i].
+.macro vmlsq d,a,b,i
+    mls \d\().4s, \a\().4s, \b\().s[\i]
+.endm
+
+// ---------------------------------------------------------------------------
+// Modular arithmetic building blocks.
+// `consts` and `tmp` are register aliases declared by the function that uses
+// these macros; lane 0 of `consts` is used as the multiplier in the final
+// multiply-subtract.
+// ---------------------------------------------------------------------------
+
+// dst = src * const[idx0] - sqrdmulh(src, const[idx1]) * consts[0].
+// Overwrites src with the sqrdmulh result.
+.macro mulmodq dst, src, const, idx0, idx1
+    vmulq \dst, \src, \const, \idx0
+    vqrdmulhq \src, \src, \const, \idx1
+    vmlsq \dst, \src, consts, 0
+.endm
+
+// Vector-by-vector form of mulmodq:
+// dst = src * const - sqrdmulh(src, const_twisted) * consts[0].
+// Overwrites src with the sqrdmulh result.
+.macro mulmod dst, src, const, const_twisted
+    mul \dst\().4s, \src\().4s, \const\().4s
+    vqrdmulh \src, \src, \const_twisted
+    vmlsq \dst, \src, consts, 0
+.endm
+
+// tmp = rounding arithmetic shift right of a by 23; a -= tmp * consts.
+// Clobbers tmp.
+.macro montg_reduce a
+    srshr tmp.4S, \a\().4S, #23
+    vmls \a, tmp, consts
+.endm
+
+// tmp1 = (neg_modulus_half >= a), tmp2 = (a >= modulus_half) as lane masks;
+// tmp2 = tmp1 - tmp2; a -= tmp2 * modulus.
+// NOTE(review): relies on a register alias named `modulus`, which is not
+// declared in the visible part of this file -- confirm before using.
+.macro canonical_reduce a, modulus_half, neg_modulus_half, tmp1, tmp2
+    cmge \tmp1\().4s, \neg_modulus_half\().4s, \a\().4s
+    cmge \tmp2\().4s, \a\().4s, \modulus_half\().4s
+    sub \tmp2\().4s, \tmp1\().4s, \tmp2\().4s
+    vmls \a, \tmp2, modulus
+.endm
+
+// Butterfly with by-lane root:
+// tmp = a - b; a = a + b; b = mulmodq(tmp, root[idx0], root[idx1]).
+// Clobbers tmp.
+.macro gs_butterfly a, b, root, idx0, idx1
+    sub tmp.4s, \a\().4s, \b\().4s
+    add \a\().4s, \a\().4s, \b\().4s
+    mulmodq \b, tmp, \root, \idx0, \idx1
+.endm
+
+// NOTE(review): this macro calls `vmul` and uses a `modulus` alias; neither
+// is defined in the visible part of this file -- confirm before using.
+.macro mulmod_v dst, src, const, const_twisted
+    vmul \dst, \src, \const
+    vqrdmulh \src, \src, \const_twisted
+    vmls \dst, \src, modulus
+.endm
+
+// Butterfly with full-vector roots:
+// tmp = a - b; a = a + b; b = mulmod(tmp, root, root_twisted).
+// Clobbers tmp.
+.macro gs_butterfly_v a, b, root, root_twisted
+    sub tmp.4s, \a\().4s, \b\().4s
+    add \a\().4s, \a\().4s, \b\().4s
+    mulmod \b, tmp, \root, \root_twisted
+.endm
+
+// dstK = mulmod(srcK, ninv, ninv_tw) for K = 0..3; each srcK is overwritten.
+// NOTE(review): expects `ninv` / `ninv_tw` register aliases to be declared by
+// the caller -- they are not declared in the visible part of this file.
+.macro mul_ninv dst0, dst1, dst2, dst3, src0, src1, src2, src3
+    mulmod \dst0, \src0, ninv, ninv_tw
+    mulmod \dst1, \src1, ninv, ninv_tw
+    mulmod \dst2, \src2, ninv, ninv_tw
+    mulmod \dst3, \src3, ninv, ninv_tw
+.endm
+
+// ---------------------------------------------------------------------------
+// Bulk loads and stores.
+// ---------------------------------------------------------------------------
+
+// Load four consecutive 16-byte vectors starting at [addr].
+.macro load_vectors a0, a1, a2, a3, addr
+    ldr_vo \a0, \addr, (16*0)
+    ldr_vo \a1, \addr, (16*1)
+    ldr_vo \a2, \addr, (16*2)
+    ldr_vo \a3, \addr, (16*3)
+.endm
+
+// Load four consecutive 16-byte vectors starting at [addr + offset].
+.macro load_vectors_with_offset a0, a1, a2, a3, addr, offset
+    ldr_vo \a0, \addr, (16*0 + (\offset))
+    ldr_vo \a1, \addr, (16*1 + (\offset))
+    ldr_vo \a2, \addr, (16*2 + (\offset))
+    ldr_vo \a3, \addr, (16*3 + (\offset))
+.endm
+
+// Store four consecutive 16-byte vectors at the original [addr] and advance
+// addr by inc. The first store post-increments addr, so the remaining three
+// use offsets relative to the already-advanced pointer.
+.macro store_vectors_with_inc a0, a1, a2, a3, addr, inc
+    str_vi \a0, \addr, \inc
+    str_vo \a1, \addr, (-(\inc) + 16*1)
+    str_vo \a2, \addr, (-(\inc) + 16*2)
+    str_vo \a3, \addr, (-(\inc) + 16*3)
+.endm
+
+// Move both 64-bit halves of vectors <in>0..<in>3 into the eight
+// general-purpose registers <out>_00 .. <out>_31 (see `vext`).
+.macro vec_to_scalar_matrix out, in
+    vext \out\()_00, \in\()0, 0
+    vext \out\()_01, \in\()0, 1
+    vext \out\()_10, \in\()1, 0
+    vext \out\()_11, \in\()1, 1
+    vext \out\()_20, \in\()2, 0
+    vext \out\()_21, \in\()2, 1
+    vext \out\()_30, \in\()3, 0
+    vext \out\()_31, \in\()3, 1
+.endm
+
+// Store the eight registers <x>t_00 .. <x>t_31 to consecutive 8-byte slots at
+// the original [addr] and advance addr by inc. As in store_vectors_with_inc,
+// the first store post-increments addr. `inc` is parenthesized before
+// negation so that an expression argument is negated as a whole.
+.macro store_scalar_matrix_with_inc x, addr, inc
+    str \x\()t_00, [\addr], #( \inc)
+    str \x\()t_01, [\addr, #(-(\inc) + 8*1)]
+    str \x\()t_10, [\addr, #(-(\inc) + 8*2)]
+    str \x\()t_11, [\addr, #(-(\inc) + 8*3)]
+    str \x\()t_20, [\addr, #(-(\inc) + 8*4)]
+    str \x\()t_21, [\addr, #(-(\inc) + 8*5)]
+    str \x\()t_30, [\addr, #(-(\inc) + 8*6)]
+    str \x\()t_31, [\addr, #(-(\inc) + 8*7)]
+.endm
+
+// gpr_out = 64-bit lane `lane` of vec_in.
+.macro vext gpr_out, vec_in, lane
+    umov \gpr_out\(), \vec_in\().d[\lane]
+.endm
+
+// ---------------------------------------------------------------------------
+// Twiddle-factor loads.
+// ---------------------------------------------------------------------------
+
+// Load root0..root3 from [r_ptr0] and advance r_ptr0 by 64 bytes.
+.macro load_roots_123
+    ldr_vi root0, r_ptr0, 64
+    ldr_vo root1, r_ptr0, (-64 + 16)
+    ldr_vo root2, r_ptr0, (-64 + 32)
+    ldr_vo root3, r_ptr0, (-64 + 48)
+.endm
+
+// Same instruction sequence as load_roots_123.
+.macro load_roots_456
+    ldr_vi root0, r_ptr0, 64
+    ldr_vo root1, r_ptr0, (-64 + 16)
+    ldr_vo root2, r_ptr0, (-64 + 32)
+    ldr_vo root3, r_ptr0, (-64 + 48)
+.endm
+
+// Load vectors 0..5 of a 12-vector (192-byte) group at [r_ptr1] into
+// root0/root0_tw, root1/root1_tw, root2/root2_tw, and advance r_ptr1 past the
+// whole group.
+.macro load_roots_78_part1
+    ldr_vi root0, r_ptr1, (12*16)
+    ldr_vo root0_tw, r_ptr1, (-12*16 + 1*16)
+    ldr_vo root1, r_ptr1, (-12*16 + 2*16)
+    ldr_vo root1_tw, r_ptr1, (-12*16 + 3*16)
+    ldr_vo root2, r_ptr1, (-12*16 + 4*16)
+    ldr_vo root2_tw, r_ptr1, (-12*16 + 5*16)
+.endm
+
+// Load vectors 6..11 of the group that load_roots_78_part1 just stepped over
+// (offsets are relative to the already-advanced r_ptr1).
+.macro load_roots_78_part2
+    ldr_vo root0, r_ptr1, (-12*16 + 6*16)
+    ldr_vo root0_tw, r_ptr1, (-12*16 + 7*16)
+    ldr_vo root1, r_ptr1, (-12*16 + 8*16)
+    ldr_vo root1_tw, r_ptr1, (-12*16 + 9*16)
+    ldr_vo root2, r_ptr1, (-12*16 + 10*16)
+    ldr_vo root2_tw, r_ptr1, (-12*16 + 11*16)
+.endm
+
+// ---------------------------------------------------------------------------
+// Transposition helpers.
+// ---------------------------------------------------------------------------
+
+// In-place 4x4 transpose of 32-bit elements across data0..data3:
+// 32-bit trn1/trn2 into t0..t3, then 64-bit trn1/trn2 back into data0..data3.
+// Clobbers the t0..t3 aliases.
+.macro transpose4 data0, data1, data2, data3
+    trn1 t0.4s, \data0\().4s, \data1\().4s
+    trn2 t1.4s, \data0\().4s, \data1\().4s
+    trn1 t2.4s, \data2\().4s, \data3\().4s
+    trn2 t3.4s, \data2\().4s, \data3\().4s
+
+    trn2 \data2\().2d, t0.2d, t2.2d
+    trn2 \data3\().2d, t1.2d, t3.2d
+    trn1 \data0\().2d, t0.2d, t2.2d
+    trn1 \data1\().2d, t1.2d, t3.2d
+.endm
+
+// Only the 32-bit trn1/trn2 stage, written to separate output registers.
+.macro transpose_single data_out0, data_out1, data_out2, data_out3, data_in0, data_in1, data_in2, data_in3
+    trn1 \data_out0\().4s, \data_in0\().4s, \data_in1\().4s
+    trn2 \data_out1\().4s, \data_in0\().4s, \data_in1\().4s
+    trn1 \data_out2\().4s, \data_in2\().4s, \data_in3\().4s
+    trn2 \data_out3\().4s, \data_in2\().4s, \data_in3\().4s
+.endm
+
+// ---------------------------------------------------------------------------
+// Register save/restore.
+// ---------------------------------------------------------------------------
+
+// Reserve 96 bytes of stack and store x19..x30 there as six pairs.
+.macro save_gprs // slothy:no-unfold
+    sub sp, sp, #(16*6)
+    stp x19, x20, [sp, #16*0]
+    stp x21, x22, [sp, #16*1]
+    stp x23, x24, [sp, #16*2]
+    stp x25, x26, [sp, #16*3]
+    stp x27, x28, [sp, #16*4]
+    stp x29, x30, [sp, #16*5]
+.endm
+
+// Reload x19..x30 from the area written by save_gprs and release it.
+.macro restore_gprs // slothy:no-unfold
+    ldp x19, x20, [sp, #16*0]
+    ldp x21, x22, [sp, #16*1]
+    ldp x23, x24, [sp, #16*2]
+    ldp x25, x26, [sp, #16*3]
+    ldp x27, x28, [sp, #16*4]
+    ldp x29, x30, [sp, #16*5]
+    add sp, sp, #(16*6)
+.endm
+
+// Reserve 64 bytes of stack and store d8..d15 there as four pairs.
+.macro save_vregs // slothy:no-unfold
+    sub sp, sp, #(16*4)
+    stp d8, d9, [sp, #16*0]
+    stp d10, d11, [sp, #16*1]
+    stp d12, d13, [sp, #16*2]
+    stp d14, d15, [sp, #16*3]
+.endm
+
+// Reload d8..d15 from the area written by save_vregs and release it.
+.macro restore_vregs // slothy:no-unfold
+    ldp d8, d9, [sp, #16*0]
+    ldp d10, d11, [sp, #16*1]
+    ldp d12, d13, [sp, #16*2]
+    ldp d14, d15, [sp, #16*3]
+    add sp, sp, #(16*4)
+.endm
+
+// STACK_SIZE: bytes of extra stack reserved by push_stack.
+// STACK0: byte offset from sp passed as `loc` to the save/restore macros.
+#define STACK_SIZE 16
+#define STACK0 0
+
+.macro restore a, loc //
slothy:no-unfold + ldr \a, [sp, #\loc\()] +.endm +.macro save loc, a // slothy:no-unfold + str \a, [sp, #\loc\()] +.endm +.macro push_stack // slothy:no-unfold + save_gprs + save_vregs + sub sp, sp, #STACK_SIZE +.endm + +.macro pop_stack // slothy:no-unfold + add sp, sp, #STACK_SIZE + restore_vregs + restore_gprs +.endm + +.data +.p2align 4 +roots: +#include "intt_dilithium_123_456_78_twiddles.s" +.text + + .global intt_dilithium_123_45678_opt_a55 + .global _intt_dilithium_123_45678_opt_a55 + +.p2align 4 +const_addr: .word 8380417 + .word 0 + .word 0 + .word 0 +ninv_addr: .quad 16382 +ninv_tw_addr: .quad 4197891 +intt_dilithium_123_45678_opt_a55: +_intt_dilithium_123_45678_opt_a55: + push_stack + + in .req x0 + inp .req x1 + inpp .req x2 + count .req x3 + r_ptr0 .req x4 + r_ptr1 .req x5 + xtmp .req x6 + + data0 .req v9 + data1 .req v10 + data2 .req v11 + data3 .req v12 + data4 .req v13 + data5 .req v14 + data6 .req v15 + data7 .req v16 + + qform_data0 .req q9 + qform_data1 .req q10 + qform_data2 .req q11 + qform_data3 .req q12 + qform_data4 .req q13 + qform_data5 .req q14 + qform_data6 .req q15 + qform_data7 .req q16 + + qform_v0 .req q0 + qform_v1 .req q1 + qform_v2 .req q2 + qform_v3 .req q3 + qform_v4 .req q4 + qform_v5 .req q5 + qform_v6 .req q6 + qform_v7 .req q7 + qform_v8 .req q8 + qform_v9 .req q9 + qform_v10 .req q10 + qform_v11 .req q11 + qform_v12 .req q12 + qform_v13 .req q13 + qform_v14 .req q14 + qform_v15 .req q15 + qform_v16 .req q16 + qform_v17 .req q17 + qform_v18 .req q18 + qform_v19 .req q19 + qform_v20 .req q20 + qform_v21 .req q21 + qform_v22 .req q22 + qform_v23 .req q23 + qform_v24 .req q24 + qform_v25 .req q25 + qform_v26 .req q26 + qform_v27 .req q27 + qform_v28 .req q28 + qform_v29 .req q29 + qform_v30 .req q30 + qform_v31 .req q31 + + x_00 .req x10 + x_01 .req x11 + x_10 .req x12 + x_11 .req x13 + x_20 .req x14 + x_21 .req x15 + x_30 .req x16 + x_31 .req x17 + + xt_00 .req x_00 + xt_01 .req x_20 + xt_10 .req x_10 + xt_11 .req x_30 + 
xt_20 .req x_01 + xt_21 .req x_21 + xt_30 .req x_11 + xt_31 .req x_31 + + root0 .req v0 + root1 .req v1 + root2 .req v2 + root3 .req v3 + + qform_root0 .req q0 + qform_root1 .req q1 + qform_root2 .req q2 + qform_root3 .req q3 + + tmp .req v24 + t0 .req v25 + t1 .req v26 + t2 .req v27 + t3 .req v28 + tp0 .req v17 + tp1 .req v18 + tp2 .req v19 + tp3 .req v20 + + consts .req v8 + qform_consts .req q8 + + ASM_LOAD(r_ptr0, roots_l345) + ASM_LOAD(r_ptr1, roots_l67) + + ASM_LOAD(xtmp, const_addr) + ld1r {consts.4s}, [xtmp] + save STACK0, in + + restore inp, STACK0 + mov inp, in + add inpp, inp, #64 + mov count, #8 + + root0_tw .req v4 + root1_tw .req v5 + root2_tw .req v6 + root3_tw .req v7 + qform_root0_tw .req q4 + qform_root1_tw .req q5 + qform_root2_tw .req q6 + qform_root3_tw .req q7 + + .p2align 2 + ld4 {v16.4S, v17.4S, v18.4S, v19.4S}, [x2] // .............*.......................................................................... + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ 
+ // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + ldr q0, [x5, #144] // .........*.............................................................................. + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + sub v21.4S, v16.4S, v17.4S // .......................*................................................................ + // gap // ........................................................................................ + ldr q23, [x5, #160] // ..........*............................................................................. + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + sqrdmulh v2.4S, v21.4S, v0.4S // .....................................*.................................................. + // gap // ........................................................................................ 
+ sub v0.4S, v18.4S, v19.4S // ......................................*................................................. + // gap // ........................................................................................ + ldr q26, [x5, #176] // ...........*............................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + mul v9.4S, v0.4S, v23.4S // ..........................................*............................................. + // gap // ........................................................................................ + add v7.4S, v16.4S, v17.4S // ...........................*............................................................ + // gap // ........................................................................................ + sqrdmulh v0.4S, v0.4S, v26.4S // ...........................................*............................................ + // gap // ........................................................................................ + ldr q1, [x5], #(12*16) // *....................................................................................... + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + add v4.4S, v18.4S, v19.4S // .......................................*................................................ + // gap // ........................................................................................ 
+ mls v9.4S, v0.4S, v8.S[0] // ...............................................*........................................ + // gap // ........................................................................................ + ld4 {v11.4S, v12.4S, v13.4S, v14.4S}, [x1] // ............*........................................................................... + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ 
+ // gap // ........................................................................................ + ldr q3, [x5, #-80] // .......*................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + ldr q10, [x5, #-144] // ...*.................................................................................... + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + ldr q6, [x5, #-128] // ....*................................................................................... + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + sub v16.4S, v13.4S, v14.4S // ................*....................................................................... + // gap // ........................................................................................ + ldr q0, [x5, #-112] // .....*.................................................................................. + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ 
+ mul v17.4S, v16.4S, v6.4S // ...................*.................................................................... + // gap // ........................................................................................ + add v26.4S, v11.4S, v12.4S // ...............*........................................................................ + // gap // ........................................................................................ + sqrdmulh v16.4S, v16.4S, v0.4S // ....................*................................................................... + // gap // ........................................................................................ + add v0.4S, v13.4S, v14.4S // .....................*.................................................................. + // gap // ........................................................................................ + ldr q28, [x5, #-176] // .*...................................................................................... + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + mls v17.4S, v16.4S, v8.S[0] // ........................*............................................................... + // gap // ........................................................................................ + sub v23.4S, v26.4S, v0.4S // .........................*.............................................................. + // gap // ........................................................................................ + sub v11.4S, v11.4S, v12.4S // ..............*......................................................................... + // gap // ........................................................................................ 
+ add v6.4S, v26.4S, v0.4S // ..........................*............................................................. + // gap // ........................................................................................ + sqrdmulh v16.4S, v23.4S, v28.4S // ..............................*......................................................... + // gap // ........................................................................................ + sqrdmulh v30.4S, v11.4S, v10.4S // ..................*..................................................................... + // gap // ........................................................................................ + ldr q25, [x5, #-160] // ..*..................................................................................... + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + mul v13.4S, v23.4S, v1.4S // .............................*.......................................................... + // gap // ........................................................................................ + sub v20.4S, v7.4S, v4.4S // ............................................*........................................... + // gap // ........................................................................................ + mul v0.4S, v11.4S, v25.4S // .................*...................................................................... + // gap // ........................................................................................ + ldr q18, [x5, #-64] // ........*............................................................................... + // gap // ........................................................................................ 
+ // gap // ........................................................................................ + // gap // ........................................................................................ + sqrdmulh v26.4S, v20.4S, v3.4S // .................................................*...................................... + // gap // ........................................................................................ + mls v0.4S, v30.4S, v8.S[0] // ......................*................................................................. + // gap // ........................................................................................ + mul v11.4S, v21.4S, v18.4S // ...................................*.................................................... + // gap // ........................................................................................ + ldr q10, [x5, #-96] // ......*................................................................................. + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + sub v30.4S, v0.4S, v17.4S // ............................*........................................................... + // gap // ........................................................................................ + mls v11.4S, v2.4S, v8.S[0] // .........................................*.............................................. + // gap // ........................................................................................ + add v21.4S, v0.4S, v17.4S // .................................*...................................................... + // gap // ........................................................................................ 
+ sqrdmulh v2.4S, v30.4S, v28.4S // ................................*....................................................... + // gap // ........................................................................................ + mul v25.4S, v30.4S, v1.4S // ...............................*........................................................ + // gap // ........................................................................................ + sub v0.4S, v11.4S, v9.4S // ...................................................*.................................... + // gap // ........................................................................................ + trn2 v17.4S, v6.4S, v21.4S // ..............................................*......................................... + // gap // ........................................................................................ + mls v13.4S, v16.4S, v8.S[0] // ..................................*..................................................... + // gap // ........................................................................................ + mls v25.4S, v2.4S, v8.S[0] // ....................................*................................................... + // gap // ........................................................................................ + sqrdmulh v16.4S, v0.4S, v3.4S // .......................................................*................................ + // gap // ........................................................................................ + mul v23.4S, v0.4S, v10.4S // ......................................................*................................. + // gap // ........................................................................................ + trn1 v30.4S, v6.4S, v21.4S // ........................................*............................................... 
+ // gap // ........................................................................................ + trn2 v2.4S, v13.4S, v25.4S // ........................................................*............................... + // gap // ........................................................................................ + trn1 v21.4S, v13.4S, v25.4S // ..................................................*..................................... + // gap // ........................................................................................ + mul v25.4S, v20.4S, v10.4S // ................................................*....................................... + // gap // ........................................................................................ + trn2 v20.2D, v17.2D, v2.2D // ............................................................*........................... + // gap // ........................................................................................ + trn1 v0.2D, v17.2D, v2.2D // .............................................................*.......................... + // gap // ........................................................................................ + trn2 v17.2D, v30.2D, v21.2D // .........................................................*.............................. + // gap // ........................................................................................ + mls v25.4S, v26.4S, v8.S[0] // .....................................................*.................................. + // gap // ........................................................................................ + mls v23.4S, v16.4S, v8.S[0] // ...........................................................*............................ + // gap // ........................................................................................ + add v26.4S, v7.4S, v4.4S // .............................................*.......................................... 
+ // gap // ........................................................................................ + add v4.4S, v11.4S, v9.4S // ....................................................*................................... + // gap // ........................................................................................ + trn1 v30.2D, v30.2D, v21.2D // ..........................................................*............................. + // gap // ........................................................................................ + trn2 v16.4S, v25.4S, v23.4S // .................................................................*...................... + // gap // ........................................................................................ + trn2 v2.4S, v26.4S, v4.4S // ................................................................*....................... + // gap // ........................................................................................ + trn1 v26.4S, v26.4S, v4.4S // ..............................................................*......................... + // gap // ........................................................................................ + trn1 v23.4S, v25.4S, v23.4S // ...............................................................*........................ + // gap // ........................................................................................ + trn2 v4.2D, v2.2D, v16.2D // ....................................................................*................... + // gap // ........................................................................................ + trn1 v16.2D, v2.2D, v16.2D // .....................................................................*.................. + // gap // ........................................................................................ + trn1 v21.2D, v26.2D, v23.2D // ...................................................................*.................... 
+ // gap // ........................................................................................ + trn2 v25.2D, v26.2D, v23.2D // ..................................................................*..................... + // gap // ........................................................................................ + add v23.4S, v21.4S, v16.4S // .......................................................................*................ + // gap // ........................................................................................ + add v2.4S, v25.4S, v4.4S // ......................................................................*................. + // gap // ........................................................................................ + add v11.4S, v30.4S, v0.4S // ........................................................................*............... + // gap // ........................................................................................ + add v13.4S, v17.4S, v20.4S // .........................................................................*.............. + // gap // ........................................................................................ + add v19.4S, v23.4S, v2.4S // ...........................................................................*............ + // gap // ........................................................................................ + sub v7.4S, v23.4S, v2.4S // ..........................................................................*............. + // gap // ........................................................................................ + add v23.4S, v11.4S, v13.4S // ............................................................................*........... + // gap // ........................................................................................ + ldr q3, [x4], #64 // .............................................................................*.......... 
+ // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + sub v2.4S, v23.4S, v19.4S // ..............................................................................*......... + // gap // ........................................................................................ + ldr q10, [x4, #-48] // ...............................................................................*........ + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + mul v18.4S, v2.4S, v3.S[0] // ................................................................................*....... + // gap // ........................................................................................ + sqrdmulh v26.4S, v2.4S, v3.S[1] // .................................................................................*...... + // gap // ........................................................................................ + add v2.4S, v23.4S, v19.4S // ..................................................................................*..... + // gap // ........................................................................................ + ldr q1, [x4, #-32] // ...................................................................................*.... + // gap // ........................................................................................ + // gap // ........................................................................................ 
+ // gap // ........................................................................................ + mls v18.4S, v26.4S, v8.S[0] // ....................................................................................*... + // gap // ........................................................................................ + str q2, [x1], #(16*4) // .....................................................................................*.. + // gap // ........................................................................................ + ldr q6, [x4, #-16] // ......................................................................................*. + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + str q18, [x2], #(16*4) // .......................................................................................* + // gap // ........................................................................................ + + // original source code + // ldr q2, [x5], #(12*16) // ..........*............................................................................. + // ldr q21, [x5, #-176] // .......................*................................................................ + // ldr q0, [x5, #-160] // ..............................*......................................................... + // ldr q16, [x5, #-144] // ...............*........................................................................ + // ldr q23, [x5, #-128] // ................*....................................................................... + // ldr q26, [x5, #-112] // ..................*..................................................................... 
+ // ldr q20, [x5, #-96] // ......................................*................................................. + // ldr q17, [x5, #-80] // ..............*......................................................................... + // ldr q30, [x5, #-64] // ..................................*..................................................... + // ldr q25, [x5, #-48] // .*...................................................................................... + // ldr q11, [x5, #-32] // ...*.................................................................................... + // ldr q7, [x5, #-16] // ......*................................................................................. + // ld4 {v12.4S, v13.4S, v14.4S, v15.4S}, [x1] // .............*.......................................................................... + // ld4 {v3.4S, v4.4S, v5.4S, v6.4S}, [x2] // *....................................................................................... + // sub v10.4S, v12.4S, v13.4S // ..........................*............................................................. + // add v13.4S, v12.4S, v13.4S // ....................*................................................................... + // sub v19.4S, v14.4S, v15.4S // .................*...................................................................... + // mul v0.4S, v10.4S, v0.4S // .................................*...................................................... + // sqrdmulh v16.4S, v10.4S, v16.4S // .............................*.......................................................... + // mul v23.4S, v19.4S, v23.4S // ...................*.................................................................... + // sqrdmulh v26.4S, v19.4S, v26.4S // .....................*.................................................................. + // add v10.4S, v14.4S, v15.4S // ......................*................................................................. 
+ // mls v0.4S, v16.4S, v8.S[0] // ....................................*................................................... + // sub v16.4S, v3.4S, v4.4S // ..*..................................................................................... + // mls v23.4S, v26.4S, v8.S[0] // ........................*............................................................... + // sub v26.4S, v13.4S, v10.4S // .........................*.............................................................. + // add v13.4S, v13.4S, v10.4S // ...........................*............................................................ + // add v4.4S, v3.4S, v4.4S // ........*............................................................................... + // sub v3.4S, v0.4S, v23.4S // .......................................*................................................ + // mul v10.4S, v26.4S, v2.4S // ...............................*........................................................ + // sqrdmulh v26.4S, v26.4S, v21.4S // ............................*........................................................... + // mul v2.4S, v3.4S, v2.4S // ...........................................*............................................ + // sqrdmulh v21.4S, v3.4S, v21.4S // ..........................................*............................................. + // add v0.4S, v0.4S, v23.4S // .........................................*.............................................. + // mls v10.4S, v26.4S, v8.S[0] // ..............................................*......................................... + // mul v23.4S, v16.4S, v30.4S // .....................................*.................................................. + // mls v2.4S, v21.4S, v8.S[0] // ...............................................*........................................ + // sqrdmulh v16.4S, v16.4S, v25.4S // ....*................................................................................... 
+ // sub v21.4S, v5.4S, v6.4S // .....*.................................................................................. + // add v26.4S, v5.4S, v6.4S // ...........*............................................................................ + // trn1 v30.4S, v13.4S, v0.4S // ..................................................*..................................... + // mls v23.4S, v16.4S, v8.S[0] // ........................................*............................................... + // mul v16.4S, v21.4S, v11.4S // .......*................................................................................ + // sqrdmulh v21.4S, v21.4S, v7.4S // .........*.............................................................................. + // sub v25.4S, v4.4S, v26.4S // ................................*....................................................... + // add v26.4S, v4.4S, v26.4S // ...........................................................*............................ + // trn2 v0.4S, v13.4S, v0.4S // .............................................*.......................................... + // mls v16.4S, v21.4S, v8.S[0] // ............*........................................................................... + // mul v21.4S, v25.4S, v20.4S // .....................................................*.................................. + // sqrdmulh v4.4S, v25.4S, v17.4S // ...................................*.................................................... + // trn1 v25.4S, v10.4S, v2.4S // ....................................................*................................... + // sub v11.4S, v23.4S, v16.4S // ............................................*........................................... + // add v16.4S, v23.4S, v16.4S // ............................................................*........................... + // mls v21.4S, v4.4S, v8.S[0] // .........................................................*.............................. 
+ // mul v23.4S, v11.4S, v20.4S // .................................................*...................................... + // sqrdmulh v20.4S, v11.4S, v17.4S // ................................................*....................................... + // trn2 v2.4S, v10.4S, v2.4S // ...................................................*.................................... + // trn2 v17.2D, v30.2D, v25.2D // ........................................................*............................... + // trn1 v30.2D, v30.2D, v25.2D // .............................................................*.......................... + // mls v23.4S, v20.4S, v8.S[0] // ..........................................................*............................. + // trn2 v20.2D, v0.2D, v2.2D // ......................................................*................................. + // trn1 v0.2D, v0.2D, v2.2D // .......................................................*................................ + // trn1 v2.4S, v26.4S, v16.4S // ................................................................*....................... + // trn1 v4.4S, v21.4S, v23.4S // .................................................................*...................... + // trn2 v16.4S, v26.4S, v16.4S // ...............................................................*........................ + // trn2 v23.4S, v21.4S, v23.4S // ..............................................................*......................... + // trn2 v25.2D, v2.2D, v4.2D // .....................................................................*.................. + // trn1 v21.2D, v2.2D, v4.2D // ....................................................................*................... + // trn2 v4.2D, v16.2D, v23.2D // ..................................................................*..................... + // trn1 v16.2D, v16.2D, v23.2D // ...................................................................*.................... 
+ // add v2.4S, v25.4S, v4.4S // .......................................................................*................ + // add v23.4S, v21.4S, v16.4S // ......................................................................*................. + // add v11.4S, v30.4S, v0.4S // ........................................................................*............... + // add v13.4S, v17.4S, v20.4S // .........................................................................*.............. + // sub v7.4S, v23.4S, v2.4S // ...........................................................................*............ + // add v2.4S, v23.4S, v2.4S // ..........................................................................*............. + // add v23.4S, v11.4S, v13.4S // ............................................................................*........... + // ldr q3, [x4], #64 // .............................................................................*.......... + // sub v26.4S, v23.4S, v2.4S // ..............................................................................*......... + // ldr q10, [x4, #-48] // ...............................................................................*........ + // mul v19.4S, v26.4S, v3.S[0] // ................................................................................*....... + // sqrdmulh v26.4S, v26.4S, v3.S[1] // .................................................................................*...... + // add v2.4S, v23.4S, v2.4S // ..................................................................................*..... + // ldr q1, [x4, #-32] // ...................................................................................*.... + // mls v19.4S, v26.4S, v8.S[0] // ....................................................................................*... + // str q2, [x1], #(16*4) // .....................................................................................*.. 
+ // ldr q6, [x4, #-16] // ......................................................................................*. + // str q19, [x2], #(16*4) // .......................................................................................* + + sub count, count, #1 +layer45678_start: + sub v0.4S, v30.4S, v0.4S // ..........................................................................*..................................................................... + // gap // ................................................................................................................................................ + sub v2.4S, v17.4S, v20.4S // ...............................................................................*................................................................ + // gap // ................................................................................................................................................ + sub v16.4S, v21.4S, v16.4S // ....................................................................................*........................................................... + // gap // ................................................................................................................................................ + mul v23.4S, v0.4S, v10.S[2] // ............................................................................*................................................................... + // gap // ................................................................................................................................................ + sqrdmulh v0.4S, v0.4S, v10.S[3] // .............................................................................*.................................................................. + // gap // ................................................................................................................................................ 
+ mul v21.4S, v2.4S, v1.S[0] // .................................................................................*.............................................................. + // gap // ................................................................................................................................................ + sqrdmulh v2.4S, v2.4S, v1.S[1] // ..................................................................................*............................................................. + // gap // ................................................................................................................................................ + mul v26.4S, v16.4S, v1.S[2] // ......................................................................................*......................................................... + // gap // ................................................................................................................................................ + sqrdmulh v16.4S, v16.4S, v1.S[3] // .......................................................................................*........................................................ + // gap // ................................................................................................................................................ + mls v23.4S, v0.4S, v8.S[0] // ..............................................................................*................................................................. + // gap // ................................................................................................................................................ + mls v21.4S, v2.4S, v8.S[0] // ...................................................................................*............................................................ 
+ // gap // ................................................................................................................................................ + sub v0.4S, v25.4S, v4.4S // .........................................................................................*...................................................... + // gap // ................................................................................................................................................ + mls v26.4S, v16.4S, v8.S[0] // ........................................................................................*....................................................... + // gap // ................................................................................................................................................ + sub v2.4S, v11.4S, v13.4S // ..............................................................................................*................................................. + // gap // ................................................................................................................................................ + mul v16.4S, v0.4S, v6.S[0] // ...........................................................................................*.................................................... + // gap // ................................................................................................................................................ + sqrdmulh v0.4S, v0.4S, v6.S[1] // ............................................................................................*................................................... + // gap // ................................................................................................................................................ 
+ mul v20.4S, v2.4S, v3.S[2] // ................................................................................................*............................................... + // gap // ................................................................................................................................................ + sqrdmulh v2.4S, v2.4S, v3.S[3] // .................................................................................................*.............................................. + // gap // ................................................................................................................................................ + sub v17.4S, v23.4S, v21.4S // ...................................................................................................*............................................ + // gap // ................................................................................................................................................ + add v23.4S, v23.4S, v21.4S // ....................................................................................................*........................................... + // gap // ................................................................................................................................................ + mls v16.4S, v0.4S, v8.S[0] // .............................................................................................*.................................................. + // gap // ................................................................................................................................................ + mls v20.4S, v2.4S, v8.S[0] // ..................................................................................................*............................................. 
+ // gap // ................................................................................................................................................ + mul v0.4S, v17.4S, v3.S[2] // .....................................................................................................*.......................................... + // gap // ................................................................................................................................................ + sqrdmulh v2.4S, v17.4S, v3.S[3] // ......................................................................................................*......................................... + // gap // ................................................................................................................................................ + mul v21.4S, v7.4S, v10.S[0] // ..........................................................................................................*..................................... + // gap // ................................................................................................................................................ + sub v17.4S, v26.4S, v16.4S // .............................................................................................................*.................................. + // gap // ................................................................................................................................................ + add v16.4S, v26.4S, v16.4S // ..............................................................................................................*................................. + // gap // ................................................................................................................................................ 
+ mls v0.4S, v2.4S, v8.S[0] // .......................................................................................................*........................................ + // gap // ................................................................................................................................................ + sqrdmulh v2.4S, v7.4S, v10.S[1] // ...........................................................................................................*.................................... + // gap // ................................................................................................................................................ + mul v26.4S, v17.4S, v10.S[0] // ...............................................................................................................*................................ + // gap // ................................................................................................................................................ + sqrdmulh v17.4S, v17.4S, v10.S[1] // ................................................................................................................*............................... + // gap // ................................................................................................................................................ + sub v30.4S, v23.4S, v16.4S // .......................................................................................................................*........................ + // gap // ................................................................................................................................................ + mls v21.4S, v2.4S, v8.S[0] // ............................................................................................................*................................... 
+ // gap // ................................................................................................................................................ + add v2.4S, v23.4S, v16.4S // ........................................................................................................................*....................... + // gap // ................................................................................................................................................ + mls v26.4S, v17.4S, v8.S[0] // .................................................................................................................*.............................. + // gap // ................................................................................................................................................ + mul v16.4S, v30.4S, v3.S[0] // .........................................................................................................................*...................... + // gap // ................................................................................................................................................ + sqrdmulh v23.4S, v30.4S, v3.S[1] // ..........................................................................................................................*..................... + // gap // ................................................................................................................................................ + sub v17.4S, v20.4S, v21.4S // ............................................................................................................................*................... + // gap // ................................................................................................................................................ 
+ add v21.4S, v20.4S, v21.4S // .............................................................................................................................*.................. + // gap // ................................................................................................................................................ + sub v20.4S, v0.4S, v26.4S // .................................................................................................................................*.............. + // gap // ................................................................................................................................................ + mls v16.4S, v23.4S, v8.S[0] // ...........................................................................................................................*.................... + // gap // ................................................................................................................................................ + mul v23.4S, v17.4S, v3.S[0] // ..............................................................................................................................*................. + // gap // ................................................................................................................................................ + sqrdmulh v17.4S, v17.4S, v3.S[1] // ...............................................................................................................................*................ + // gap // ................................................................................................................................................ + add v0.4S, v0.4S, v26.4S // ..................................................................................................................................*............. 
+ // gap // ................................................................................................................................................ + mul v26.4S, v20.4S, v3.S[0] // ...................................................................................................................................*............ + // gap // ................................................................................................................................................ + sqrdmulh v20.4S, v20.4S, v3.S[1] // ....................................................................................................................................*........... + // gap // ................................................................................................................................................ + mls v23.4S, v17.4S, v8.S[0] // ................................................................................................................................*............... + // gap // ................................................................................................................................................ + str q2, [x1, #-48] // .......................................................................................................................................*........ + // gap // ................................................................................................................................................ + ldr q2, [x5], #(12*16) // ..e............................................................................................................................................. + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ 
+ // gap // ................................................................................................................................................ + mls v26.4S, v20.4S, v8.S[0] // .....................................................................................................................................*.......... + // gap // ................................................................................................................................................ + str q21, [x1, #-32] // ........................................................................................................................................*....... + // gap // ................................................................................................................................................ + ldr q21, [x5, #-176] // ...e............................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + str q0, [x1, #-16] // .........................................................................................................................................*...... + add x1, x1, #64 // ..............................................................................................................................................*. + ldr q0, [x5, #-160] // ....e........................................................................................................................................... 
+ // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + str q16, [x2, #-48] // ...........................................................................................................................................*.... + // gap // ................................................................................................................................................ + ldr q16, [x5, #-144] // .....e.......................................................................................................................................... + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + str q23, [x2, #-32] // ............................................................................................................................................*... + // gap // ................................................................................................................................................ + ldr q23, [x5, #-128] // ......e......................................................................................................................................... 
+ // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + str q26, [x2, #-16] // .............................................................................................................................................*.. + add x2, x2, #64 // ...............................................................................................................................................* + ldr q26, [x5, #-112] // .......e........................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + ldr q20, [x5, #-96] // ............................e................................................................................................................... + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ 
+ // gap // ................................................................................................................................................ + ldr q17, [x5, #-80] // .............................e.................................................................................................................. + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + ldr q30, [x5, #-64] // ..............................e................................................................................................................. + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + ldr q25, [x5, #-48] // ...............................e................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ 
+ // gap // ................................................................................................................................................ + ldr q11, [x5, #-32] // ................................e............................................................................................................... + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + ldr q7, [x5, #-16] // .................................e.............................................................................................................. + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + ld4 {v12.4S, v13.4S, v14.4S, v15.4S}, [x1] // e............................................................................................................................................... + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ 
+ // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ 
+ // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + ld4 {v3.4S, v4.4S, v5.4S, v6.4S}, [x2] // .e.............................................................................................................................................. + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ 
+ // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + sub v10.4S, v12.4S, v13.4S // ........e....................................................................................................................................... + // gap // ................................................................................................................................................ + add v13.4S, v12.4S, v13.4S // .........e...................................................................................................................................... 
+ // gap // ................................................................................................................................................ + sub v19.4S, v14.4S, v15.4S // .............e.................................................................................................................................. + // gap // ................................................................................................................................................ + mul v0.4S, v10.4S, v0.4S // ..........e..................................................................................................................................... + // gap // ................................................................................................................................................ + sqrdmulh v16.4S, v10.4S, v16.4S // ...........e.................................................................................................................................... + // gap // ................................................................................................................................................ + mul v23.4S, v19.4S, v23.4S // ...............e................................................................................................................................ + // gap // ................................................................................................................................................ + sqrdmulh v26.4S, v19.4S, v26.4S // ................e............................................................................................................................... + // gap // ................................................................................................................................................ 
+ add v10.4S, v14.4S, v15.4S // ..............e................................................................................................................................. + // gap // ................................................................................................................................................ + mls v0.4S, v16.4S, v8.S[0] // ............e................................................................................................................................... + // gap // ................................................................................................................................................ + sub v16.4S, v3.4S, v4.4S // ..................................e............................................................................................................. + // gap // ................................................................................................................................................ + mls v23.4S, v26.4S, v8.S[0] // .................e.............................................................................................................................. + // gap // ................................................................................................................................................ + sub v26.4S, v13.4S, v10.4S // ..................e............................................................................................................................. + // gap // ................................................................................................................................................ + add v13.4S, v13.4S, v10.4S // ...................e............................................................................................................................ 
+ // gap // ................................................................................................................................................ + add v4.4S, v3.4S, v4.4S // ...................................e............................................................................................................ + // gap // ................................................................................................................................................ + sub v3.4S, v0.4S, v23.4S // .......................e........................................................................................................................ + // gap // ................................................................................................................................................ + mul v10.4S, v26.4S, v2.4S // ....................e........................................................................................................................... + // gap // ................................................................................................................................................ + sqrdmulh v26.4S, v26.4S, v21.4S // .....................e.......................................................................................................................... + // gap // ................................................................................................................................................ + mul v2.4S, v3.4S, v2.4S // .........................e...................................................................................................................... + // gap // ................................................................................................................................................ 
+ sqrdmulh v21.4S, v3.4S, v21.4S // ..........................e..................................................................................................................... + // gap // ................................................................................................................................................ + add v0.4S, v0.4S, v23.4S // ........................e....................................................................................................................... + // gap // ................................................................................................................................................ + mls v10.4S, v26.4S, v8.S[0] // ......................e......................................................................................................................... + // gap // ................................................................................................................................................ + mul v23.4S, v16.4S, v30.4S // ....................................e........................................................................................................... + // gap // ................................................................................................................................................ + mls v2.4S, v21.4S, v8.S[0] // ...........................e.................................................................................................................... + // gap // ................................................................................................................................................ + sqrdmulh v16.4S, v16.4S, v25.4S // .....................................e.......................................................................................................... 
+ // gap // ................................................................................................................................................ + sub v21.4S, v5.4S, v6.4S // .......................................e........................................................................................................ + // gap // ................................................................................................................................................ + add v26.4S, v5.4S, v6.4S // ........................................e....................................................................................................... + // gap // ................................................................................................................................................ + trn1 v30.4S, v13.4S, v0.4S // ......................................................e......................................................................................... + // gap // ................................................................................................................................................ + mls v23.4S, v16.4S, v8.S[0] // ......................................e......................................................................................................... + // gap // ................................................................................................................................................ + mul v16.4S, v21.4S, v11.4S // .........................................e...................................................................................................... + // gap // ................................................................................................................................................ 
+ sqrdmulh v21.4S, v21.4S, v7.4S // ..........................................e..................................................................................................... + // gap // ................................................................................................................................................ + sub v25.4S, v4.4S, v26.4S // ............................................e................................................................................................... + // gap // ................................................................................................................................................ + add v26.4S, v4.4S, v26.4S // .............................................e.................................................................................................. + // gap // ................................................................................................................................................ + trn2 v0.4S, v13.4S, v0.4S // .......................................................e........................................................................................ + // gap // ................................................................................................................................................ + mls v16.4S, v21.4S, v8.S[0] // ...........................................e.................................................................................................... + // gap // ................................................................................................................................................ + mul v21.4S, v25.4S, v20.4S // ..............................................e................................................................................................. 
+ // gap // ................................................................................................................................................ + sqrdmulh v4.4S, v25.4S, v17.4S // ...............................................e................................................................................................ + // gap // ................................................................................................................................................ + trn1 v25.4S, v10.4S, v2.4S // ........................................................e....................................................................................... + // gap // ................................................................................................................................................ + sub v11.4S, v23.4S, v16.4S // .................................................e.............................................................................................. + // gap // ................................................................................................................................................ + add v16.4S, v23.4S, v16.4S // ..................................................e............................................................................................. + // gap // ................................................................................................................................................ + mls v21.4S, v4.4S, v8.S[0] // ................................................e............................................................................................... + // gap // ................................................................................................................................................ 
+ mul v23.4S, v11.4S, v20.4S // ...................................................e............................................................................................ + // gap // ................................................................................................................................................ + sqrdmulh v20.4S, v11.4S, v17.4S // ....................................................e........................................................................................... + // gap // ................................................................................................................................................ + trn2 v2.4S, v10.4S, v2.4S // .........................................................e...................................................................................... + // gap // ................................................................................................................................................ + trn2 v17.2D, v30.2D, v25.2D // ..........................................................e..................................................................................... + // gap // ................................................................................................................................................ + trn1 v30.2D, v30.2D, v25.2D // ............................................................e................................................................................... + // gap // ................................................................................................................................................ + mls v23.4S, v20.4S, v8.S[0] // .....................................................e.......................................................................................... 
+ // gap // ................................................................................................................................................ + trn2 v20.2D, v0.2D, v2.2D // ...........................................................e.................................................................................... + // gap // ................................................................................................................................................ + trn1 v0.2D, v0.2D, v2.2D // .............................................................e.................................................................................. + // gap // ................................................................................................................................................ + trn1 v2.4S, v26.4S, v16.4S // ..............................................................e................................................................................. + // gap // ................................................................................................................................................ + trn1 v4.4S, v21.4S, v23.4S // ................................................................e............................................................................... + // gap // ................................................................................................................................................ + trn2 v16.4S, v26.4S, v16.4S // ...............................................................e................................................................................ + // gap // ................................................................................................................................................ 
+ trn2 v23.4S, v21.4S, v23.4S // .................................................................e.............................................................................. + // gap // ................................................................................................................................................ + trn2 v25.2D, v2.2D, v4.2D // ..................................................................e............................................................................. + // gap // ................................................................................................................................................ + trn1 v21.2D, v2.2D, v4.2D // ....................................................................e........................................................................... + // gap // ................................................................................................................................................ + trn2 v4.2D, v16.2D, v23.2D // ...................................................................e............................................................................ + // gap // ................................................................................................................................................ + trn1 v16.2D, v16.2D, v23.2D // .....................................................................e.......................................................................... + // gap // ................................................................................................................................................ + add v2.4S, v25.4S, v4.4S // ..........................................................................................e..................................................... 
+ // gap // ................................................................................................................................................ + add v23.4S, v21.4S, v16.4S // .....................................................................................e.......................................................... + // gap // ................................................................................................................................................ + add v11.4S, v30.4S, v0.4S // ...........................................................................e.................................................................... + // gap // ................................................................................................................................................ + add v13.4S, v17.4S, v20.4S // ................................................................................e............................................................... + // gap // ................................................................................................................................................ + sub v7.4S, v23.4S, v2.4S // ........................................................................................................e....................................... + // gap // ................................................................................................................................................ + add v2.4S, v23.4S, v2.4S // .........................................................................................................e...................................... + // gap // ................................................................................................................................................ + add v23.4S, v11.4S, v13.4S // ...............................................................................................e................................................ 
+ // gap // ................................................................................................................................................ + ldr q3, [x4], #64 // ......................................................................e......................................................................... + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + sub v26.4S, v23.4S, v2.4S // ..................................................................................................................e............................. + // gap // ................................................................................................................................................ + ldr q10, [x4, #-48] // .......................................................................e........................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + mul v19.4S, v26.4S, v3.S[0] // ....................................................................................................................e........................... 
+ // gap // ................................................................................................................................................ + sqrdmulh v26.4S, v26.4S, v3.S[1] // .....................................................................................................................e.......................... + // gap // ................................................................................................................................................ + add v2.4S, v23.4S, v2.4S // ...................................................................................................................e............................ + // gap // ................................................................................................................................................ + ldr q1, [x4, #-32] // ........................................................................e....................................................................... + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + mls v19.4S, v26.4S, v8.S[0] // ......................................................................................................................e......................... + // gap // ................................................................................................................................................ + str q2, [x1], #(16*4) // ......................................................................................................................................e......... 
+ // gap // ................................................................................................................................................ + ldr q6, [x4, #-16] // .........................................................................e...................................................................... + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + str q19, [x2], #(16*4) // ..........................................................................................................................................e..... + // gap // ................................................................................................................................................ + + // original source code + // ld4 {v9.4S, v10.4S, v11.4S, v12.4S}, [x1] // ....................e...........................................................................|............................................................ + // ld4 {v13.4S, v14.4S, v15.4S, v16.4S}, [x2] // .....................e..........................................................................|............................................................ + // ldr q0, [x5], #(12*16) // e...............................................................................................|...............................................e............ + // ldr q4, [x5, #(-12*16 + 1*16)] // ...e............................................................................................|..................................................e......... 
+ // ldr q1, [x5, #(-12*16 + 2*16)] // ......e.........................................................................................|.....................................................e...... + // ldr q5, [x5, #(-12*16 + 3*16)] // ........e.......................................................................................|.......................................................e.... + // ldr q2, [x5, #(-12*16 + 4*16)] // ..........e.....................................................................................|.........................................................e.. + // ldr q6, [x5, #(-12*16 + 5*16)] // .............e..................................................................................|............................................................ + // sub v24.4s, v9.4s, v10.4s // ......................e.........................................................................|............................................................ + // add v9.4s, v9.4s, v10.4s // .......................e........................................................................|............................................................ + // mul v10.4s, v24.4s, v1.4s // .........................e......................................................................|............................................................ + // sqrdmulh v24.4s, v24.4s, v5.4s // ..........................e.....................................................................|............................................................ + // mls v10.4s, v24.4s, v8.s[0] // ..............................e.................................................................|............................................................ + // sub v24.4s, v11.4s, v12.4s // ........................e.......................................................................|............................................................ 
+ // add v11.4s, v11.4s, v12.4s // .............................e..................................................................|............................................................ + // mul v12.4s, v24.4s, v2.4s // ...........................e....................................................................|............................................................ + // sqrdmulh v24.4s, v24.4s, v6.4s // ............................e...................................................................|............................................................ + // mls v12.4s, v24.4s, v8.s[0] // ................................e...............................................................|............................................................ + // sub v24.4s, v9.4s, v11.4s // .................................e..............................................................|............................................................ + // add v9.4s, v9.4s, v11.4s // ..................................e.............................................................|............................................................ + // mul v11.4s, v24.4s, v0.4s // .....................................e..........................................................|............................................................ + // sqrdmulh v24.4s, v24.4s, v4.4s // ......................................e.........................................................|............................................................ + // mls v11.4s, v24.4s, v8.s[0] // ..........................................e.....................................................|............................................................ + // sub v24.4s, v10.4s, v12.4s // ....................................e...........................................................|............................................................ 
+ // add v10.4s, v10.4s, v12.4s // .........................................e......................................................|............................................................ + // mul v12.4s, v24.4s, v0.4s // .......................................e........................................................|............................................................ + // sqrdmulh v24.4s, v24.4s, v4.4s // ........................................e.......................................................|............................................................ + // mls v12.4s, v24.4s, v8.s[0] // ............................................e...................................................|............................................................ + // ldr q0, [x5, #(-12*16 + 6*16)] // ..............e.................................................................................|............................................................ + // ldr q4, [x5, #(-12*16 + 7*16)] // ...............e................................................................................|............................................................ + // ldr q1, [x5, #(-12*16 + 8*16)] // ................e...............................................................................|............................................................ + // ldr q5, [x5, #(-12*16 + 9*16)] // .................e..............................................................................|............................................................ + // ldr q2, [x5, #(-12*16 + 10*16)] // ..................e.............................................................................|............................................................ + // ldr q6, [x5, #(-12*16 + 11*16)] // ...................e............................................................................|............................................................ 
+ // sub v24.4s, v13.4s, v14.4s // ...............................e................................................................|............................................................ + // add v13.4s, v13.4s, v14.4s // ...................................e............................................................|............................................................ + // mul v14.4s, v24.4s, v1.4s // ...........................................e....................................................|............................................................ + // sqrdmulh v24.4s, v24.4s, v5.4s // .............................................e..................................................|............................................................ + // mls v14.4s, v24.4s, v8.s[0] // .................................................e..............................................|............................................................ + // sub v24.4s, v15.4s, v16.4s // ..............................................e.................................................|............................................................ + // add v15.4s, v15.4s, v16.4s // ...............................................e................................................|............................................................ + // mul v16.4s, v24.4s, v2.4s // ..................................................e.............................................|............................................................ + // sqrdmulh v24.4s, v24.4s, v6.4s // ...................................................e............................................|............................................................ + // mls v16.4s, v24.4s, v8.s[0] // .......................................................e........................................|............................................................ 
+ // sub v24.4s, v13.4s, v15.4s // ....................................................e...........................................|............................................................ + // add v13.4s, v13.4s, v15.4s // .....................................................e..........................................|............................................................ + // mul v15.4s, v24.4s, v0.4s // ........................................................e.......................................|............................................................ + // sqrdmulh v24.4s, v24.4s, v4.4s // .........................................................e......................................|............................................................ + // mls v15.4s, v24.4s, v8.s[0] // .............................................................e..................................|............................................................ + // sub v24.4s, v14.4s, v16.4s // ...........................................................e....................................|............................................................ + // add v14.4s, v14.4s, v16.4s // ............................................................e...................................|............................................................ + // mul v16.4s, v24.4s, v0.4s // ..............................................................e.................................|............................................................ + // sqrdmulh v24.4s, v24.4s, v4.4s // ...............................................................e................................|............................................................ + // mls v16.4s, v24.4s, v8.s[0] // ...................................................................e............................|............................................................ 
+ // trn1 v25.4s, v9.4s, v10.4s // ................................................e...............................................|............................................................ + // trn2 v26.4s, v9.4s, v10.4s // ......................................................e.........................................|............................................................ + // trn1 v27.4s, v11.4s, v12.4s // ..........................................................e.....................................|............................................................ + // trn2 v28.4s, v11.4s, v12.4s // ................................................................e...............................|............................................................ + // trn2 v11.2d, v25.2d, v27.2d // .................................................................e..............................|............................................................ + // trn2 v12.2d, v26.2d, v28.2d // ....................................................................e...........................|............................................................ + // trn1 v9.2d, v25.2d, v27.2d // ..................................................................e.............................|............................................................ + // trn1 v10.2d, v26.2d, v28.2d // .....................................................................e..........................|............................................................ + // trn1 v25.4s, v13.4s, v14.4s // ......................................................................e.........................|............................................................ + // trn2 v26.4s, v13.4s, v14.4s // ........................................................................e.......................|............................................................ 
+ // trn1 v27.4s, v15.4s, v16.4s // .......................................................................e........................|............................................................ + // trn2 v28.4s, v15.4s, v16.4s // .........................................................................e......................|............................................................ + // trn2 v15.2d, v25.2d, v27.2d // ..........................................................................e.....................|............................................................ + // trn2 v16.2d, v26.2d, v28.2d // ............................................................................e...................|............................................................ + // trn1 v13.2d, v25.2d, v27.2d // ...........................................................................e....................|............................................................ + // trn1 v14.2d, v26.2d, v28.2d // .............................................................................e..................|............................................................ + // ldr q0, [x4], #64 // .....................................................................................e..........|............................................................ + // ldr q1, [x4, #(-64 + 16)] // .......................................................................................e........|............................................................ + // ldr q2, [x4, #(-64 + 32)] // ...........................................................................................e....|............................................................ + // ldr q3, [x4, #(-64 + 48)] // ..............................................................................................e.|............................................................ 
+ // sub v24.4s, v9.4s, v10.4s // ................................................................................................*............................................................ + // add v9.4s, v9.4s, v10.4s // ................................................................................e...............|............................................................ + // mul v10.4s, v24.4s, v1.s[2] // ................................................................................................|..*......................................................... + // sqrdmulh v24.4s, v24.4s, v1.s[3] // ................................................................................................|...*........................................................ + // mls v10.4s, v24.4s, v8.s[0] // ................................................................................................|........*................................................... + // sub v24.4s, v11.4s, v12.4s // ................................................................................................|*........................................................... + // add v11.4s, v11.4s, v12.4s // .................................................................................e..............|............................................................ + // mul v12.4s, v24.4s, v2.s[0] // ................................................................................................|....*....................................................... + // sqrdmulh v24.4s, v24.4s, v2.s[1] // ................................................................................................|.....*...................................................... + // mls v12.4s, v24.4s, v8.s[0] // ................................................................................................|.........*.................................................. 
+ // sub v24.4s, v13.4s, v14.4s // ................................................................................................|.*.......................................................... + // add v13.4s, v13.4s, v14.4s // ...............................................................................e................|............................................................ + // mul v14.4s, v24.4s, v2.s[2] // ................................................................................................|......*..................................................... + // sqrdmulh v24.4s, v24.4s, v2.s[3] // ................................................................................................|.......*.................................................... + // mls v14.4s, v24.4s, v8.s[0] // ................................................................................................|...........*................................................ + // sub v24.4s, v15.4s, v16.4s // ................................................................................................|..........*................................................. + // add v15.4s, v15.4s, v16.4s // ..............................................................................e.................|............................................................ + // mul v16.4s, v24.4s, v3.s[0] // ................................................................................................|.............*.............................................. + // sqrdmulh v24.4s, v24.4s, v3.s[1] // ................................................................................................|..............*............................................. + // mls v16.4s, v24.4s, v8.s[0] // ................................................................................................|...................*........................................ 
+ // sub v24.4s, v9.4s, v11.4s // ................................................................................................|............*............................................... + // add v9.4s, v9.4s, v11.4s // ....................................................................................e...........|............................................................ + // mul v11.4s, v24.4s, v0.s[2] // ................................................................................................|...............*............................................ + // sqrdmulh v24.4s, v24.4s, v0.s[3] // ................................................................................................|................*........................................... + // mls v11.4s, v24.4s, v8.s[0] // ................................................................................................|....................*....................................... + // sub v24.4s, v10.4s, v12.4s // ................................................................................................|.................*.......................................... + // add v10.4s, v10.4s, v12.4s // ................................................................................................|..................*......................................... + // mul v12.4s, v24.4s, v0.s[2] // ................................................................................................|.....................*...................................... + // sqrdmulh v24.4s, v24.4s, v0.s[3] // ................................................................................................|......................*..................................... + // mls v12.4s, v24.4s, v8.s[0] // ................................................................................................|..........................*................................. 
+ // sub v24.4s, v13.4s, v15.4s // ..................................................................................e.............|............................................................ + // add v13.4s, v13.4s, v15.4s // ...................................................................................e............|............................................................ + // mul v15.4s, v24.4s, v1.s[0] // ................................................................................................|.......................*.................................... + // sqrdmulh v24.4s, v24.4s, v1.s[1] // ................................................................................................|...........................*................................ + // mls v15.4s, v24.4s, v8.s[0] // ................................................................................................|...............................*............................ + // sub v24.4s, v14.4s, v16.4s // ................................................................................................|........................*................................... + // add v14.4s, v14.4s, v16.4s // ................................................................................................|.........................*.................................. + // mul v16.4s, v24.4s, v1.s[0] // ................................................................................................|............................*............................... + // sqrdmulh v24.4s, v24.4s, v1.s[1] // ................................................................................................|.............................*.............................. + // mls v16.4s, v24.4s, v8.s[0] // ................................................................................................|.................................*.......................... 
+ // sub v24.4s, v9.4s, v13.4s // ......................................................................................e.........|............................................................ + // add v9.4s, v9.4s, v13.4s // ..........................................................................................e.....|............................................................ + // mul v13.4s, v24.4s, v0.s[0] // ........................................................................................e.......|............................................................ + // sqrdmulh v24.4s, v24.4s, v0.s[1] // .........................................................................................e......|............................................................ + // mls v13.4s, v24.4s, v8.s[0] // ............................................................................................e...|............................................................ + // sub v24.4s, v10.4s, v14.4s // ................................................................................................|..............................*............................. + // add v10.4s, v10.4s, v14.4s // ................................................................................................|................................*........................... + // mul v14.4s, v24.4s, v0.s[0] // ................................................................................................|..................................*......................... + // sqrdmulh v24.4s, v24.4s, v0.s[1] // ................................................................................................|...................................*........................ + // mls v14.4s, v24.4s, v8.s[0] // ................................................................................................|.......................................*.................... 
+ // sub v24.4s, v11.4s, v15.4s // ................................................................................................|....................................*....................... + // add v11.4s, v11.4s, v15.4s // ................................................................................................|.....................................*...................... + // mul v15.4s, v24.4s, v0.s[0] // ................................................................................................|........................................*................... + // sqrdmulh v24.4s, v24.4s, v0.s[1] // ................................................................................................|.........................................*.................. + // mls v15.4s, v24.4s, v8.s[0] // ................................................................................................|.............................................*.............. + // sub v24.4s, v12.4s, v16.4s // ................................................................................................|......................................*..................... + // add v12.4s, v12.4s, v16.4s // ................................................................................................|..........................................*................. + // mul v16.4s, v24.4s, v0.s[0] // ................................................................................................|...........................................*................ + // sqrdmulh v24.4s, v24.4s, v0.s[1] // ................................................................................................|............................................*............... + // mls v16.4s, v24.4s, v8.s[0] // .*..............................................................................................|................................................*........... 
+ // str q9, [x1], #(16*4) // .............................................................................................e..|............................................................ + // str q10, [x1, #(-16*4 + 1*16)] // ................................................................................................|..............................................*............. + // str q11, [x1, #(-16*4 + 2*16)] // ..*.............................................................................................|.................................................*.......... + // str q12, [x1, #(-16*4 + 3*16)] // ....*...........................................................................................|...................................................*........ + // str q13, [x2], #(16*4) // ...............................................................................................e|............................................................ + // str q14, [x2, #(-16*4 + 1*16)] // .......*........................................................................................|......................................................*..... + // str q15, [x2, #(-16*4 + 2*16)] // .........*......................................................................................|........................................................*... + // str q16, [x2, #(-16*4 + 3*16)] // ...........*....................................................................................|..........................................................*. + // add x1, x1, #64 // .....*..........................................................................................|....................................................*....... 
+ // add x2, x2, #64 // ............*...................................................................................|...........................................................* + + sub count, count, #1 + cbnz count, layer45678_start + sub v29.4S, v21.4S, v16.4S // ..*..................................................... + // gap // ........................................................ + sub v19.4S, v25.4S, v4.4S // ...........*............................................ + // gap // ........................................................ + sub v26.4S, v17.4S, v20.4S // .*...................................................... + // gap // ........................................................ + mul v21.4S, v29.4S, v1.S[2] // .......*................................................ + // gap // ........................................................ + sqrdmulh v16.4S, v29.4S, v1.S[3] // ........*............................................... + // gap // ........................................................ + sqrdmulh v23.4S, v19.4S, v6.S[1] // ...............*........................................ + // gap // ........................................................ + sub v29.4S, v30.4S, v0.4S // *....................................................... + // gap // ........................................................ + mul v0.4S, v19.4S, v6.S[0] // ..............*......................................... + // gap // ........................................................ + sqrdmulh v20.4S, v26.4S, v1.S[1] // ......*................................................. + // gap // ........................................................ + mul v4.4S, v29.4S, v10.S[2] // ...*.................................................... + // gap // ........................................................ + sqrdmulh v27.4S, v29.4S, v10.S[3] // ....*................................................... 
+ // gap // ........................................................ + mul v6.4S, v26.4S, v1.S[0] // .....*.................................................. + // gap // ........................................................ + mls v21.4S, v16.4S, v8.S[0] // ............*........................................... + // gap // ........................................................ + mls v0.4S, v23.4S, v8.S[0] // ....................*................................... + // gap // ........................................................ + mls v4.4S, v27.4S, v8.S[0] // .........*.............................................. + // gap // ........................................................ + mls v6.4S, v20.4S, v8.S[0] // ..........*............................................. + // gap // ........................................................ + sub v20.4S, v11.4S, v13.4S // .............*.......................................... + // gap // ........................................................ + sub v2.4S, v21.4S, v0.4S // .........................*.............................. + // gap // ........................................................ + add v17.4S, v21.4S, v0.4S // ..........................*............................. + // gap // ........................................................ + sub v23.4S, v4.4S, v6.4S // ..................*..................................... + // gap // ........................................................ + mul v26.4S, v2.4S, v10.S[0] // .............................*.......................... + // gap // ........................................................ + sqrdmulh v30.4S, v2.4S, v10.S[1] // ..............................*......................... + // gap // ........................................................ + sqrdmulh v0.4S, v23.4S, v3.S[3] // .......................*................................ + // gap // ........................................................ 
+ mul v1.4S, v23.4S, v3.S[2] // ......................*................................. + // gap // ........................................................ + add v14.4S, v4.4S, v6.4S // ...................*.................................... + // gap // ........................................................ + sqrdmulh v11.4S, v20.4S, v3.S[3] // .................*...................................... + // gap // ........................................................ + mls v26.4S, v30.4S, v8.S[0] // ..................................*..................... + // gap // ........................................................ + mls v1.4S, v0.4S, v8.S[0] // ...........................*............................ + // gap // ........................................................ + mul v30.4S, v7.4S, v10.S[0] // ........................*............................... + // gap // ........................................................ + sub v21.4S, v14.4S, v17.4S // ...............................*........................ + // gap // ........................................................ + sqrdmulh v25.4S, v7.4S, v10.S[1] // ............................*........................... + // gap // ........................................................ + sub v16.4S, v1.4S, v26.4S // .......................................*................ + // gap // ........................................................ + mul v23.4S, v21.4S, v3.S[0] // ...................................*.................... + // gap // ........................................................ + mul v13.4S, v20.4S, v3.S[2] // ................*....................................... + // gap // ........................................................ + sqrdmulh v2.4S, v16.4S, v3.S[1] // .............................................*.......... + // gap // ........................................................ 
+ mul v0.4S, v16.4S, v3.S[0] // ............................................*........... + // gap // ........................................................ + sqrdmulh v20.4S, v21.4S, v3.S[1] // ....................................*................... + // gap // ........................................................ + mls v30.4S, v25.4S, v8.S[0] // ................................*....................... + // gap // ........................................................ + mls v13.4S, v11.4S, v8.S[0] // .....................*.................................. + // gap // ........................................................ + mls v0.4S, v2.4S, v8.S[0] // ................................................*....... + // gap // ........................................................ + add v2.4S, v1.4S, v26.4S // ...........................................*............ + // gap // ........................................................ + mls v23.4S, v20.4S, v8.S[0] // ........................................*............... + // gap // ........................................................ + sub v16.4S, v13.4S, v30.4S // .....................................*.................. + // gap // ........................................................ + str q0, [x2, #-16] // ......................................................*. + // gap // ........................................................ + add v0.4S, v13.4S, v30.4S // ......................................*................. + // gap // ........................................................ + str q23, [x2, #-48] // ....................................................*... + // gap // ........................................................ + sqrdmulh v23.4S, v16.4S, v3.S[1] // ..........................................*............. + // gap // ........................................................ + mul v30.4S, v16.4S, v3.S[0] // .........................................*.............. 
+ // gap // ........................................................ + str q0, [x1, #-32] // .................................................*...... + // gap // ........................................................ + add v0.4S, v14.4S, v17.4S // .................................*...................... + // gap // ........................................................ + str q2, [x1, #-16] // ..................................................*..... + // gap // ........................................................ + mls v30.4S, v23.4S, v8.S[0] // ..............................................*......... + // gap // ........................................................ + str q0, [x1, #-48] // ...............................................*........ + add x1, x1, #64 // ...................................................*.... + // gap // ........................................................ + // gap // ........................................................ + // gap // ........................................................ + // gap // ........................................................ + str q30, [x2, #-32] // .....................................................*.. + add x2, x2, #64 // .......................................................* + + // original source code + // sub v0.4S, v30.4S, v0.4S // ......*................................................. + // sub v2.4S, v17.4S, v20.4S // ..*..................................................... + // sub v16.4S, v21.4S, v16.4S // *....................................................... + // mul v23.4S, v0.4S, v10.S[2] // .........*.............................................. + // sqrdmulh v0.4S, v0.4S, v10.S[3] // ..........*............................................. + // mul v21.4S, v2.4S, v1.S[0] // ...........*............................................ + // sqrdmulh v2.4S, v2.4S, v1.S[1] // ........*............................................... 
+ // mul v26.4S, v16.4S, v1.S[2] // ...*.................................................... + // sqrdmulh v16.4S, v16.4S, v1.S[3] // ....*................................................... + // mls v23.4S, v0.4S, v8.S[0] // ..............*......................................... + // mls v21.4S, v2.4S, v8.S[0] // ...............*........................................ + // sub v0.4S, v25.4S, v4.4S // .*...................................................... + // mls v26.4S, v16.4S, v8.S[0] // ............*........................................... + // sub v2.4S, v11.4S, v13.4S // ................*....................................... + // mul v16.4S, v0.4S, v6.S[0] // .......*................................................ + // sqrdmulh v0.4S, v0.4S, v6.S[1] // .....*.................................................. + // mul v20.4S, v2.4S, v3.S[2] // .................................*...................... + // sqrdmulh v2.4S, v2.4S, v3.S[3] // .........................*.............................. + // sub v17.4S, v23.4S, v21.4S // ...................*.................................... + // add v23.4S, v23.4S, v21.4S // ........................*............................... + // mls v16.4S, v0.4S, v8.S[0] // .............*.......................................... + // mls v20.4S, v2.4S, v8.S[0] // ......................................*................. + // mul v0.4S, v17.4S, v3.S[2] // .......................*................................ + // sqrdmulh v2.4S, v17.4S, v3.S[3] // ......................*................................. + // mul v21.4S, v7.4S, v10.S[0] // ............................*........................... + // sub v17.4S, v26.4S, v16.4S // .................*...................................... + // add v16.4S, v26.4S, v16.4S // ..................*..................................... + // mls v0.4S, v2.4S, v8.S[0] // ...........................*............................ 
+ // sqrdmulh v2.4S, v7.4S, v10.S[1] // ..............................*......................... + // mul v26.4S, v17.4S, v10.S[0] // ....................*................................... + // sqrdmulh v17.4S, v17.4S, v10.S[1] // .....................*.................................. + // sub v30.4S, v23.4S, v16.4S // .............................*.......................... + // mls v21.4S, v2.4S, v8.S[0] // .....................................*.................. + // add v2.4S, v23.4S, v16.4S // .................................................*...... + // mls v26.4S, v17.4S, v8.S[0] // ..........................*............................. + // mul v16.4S, v30.4S, v3.S[0] // ................................*....................... + // sqrdmulh v23.4S, v30.4S, v3.S[1] // ....................................*................... + // sub v17.4S, v20.4S, v21.4S // ..........................................*............. + // add v21.4S, v20.4S, v21.4S // ............................................*........... + // sub v20.4S, v0.4S, v26.4S // ...............................*........................ + // mls v16.4S, v23.4S, v8.S[0] // .........................................*.............. + // mul v23.4S, v17.4S, v3.S[0] // ...............................................*........ + // sqrdmulh v17.4S, v17.4S, v3.S[1] // ..............................................*......... + // add v0.4S, v0.4S, v26.4S // ........................................*............... + // mul v26.4S, v20.4S, v3.S[0] // ...................................*.................... + // sqrdmulh v20.4S, v20.4S, v3.S[1] // ..................................*..................... + // mls v23.4S, v17.4S, v8.S[0] // ...................................................*.... + // str q2, [x1, #-48] // ....................................................*... + // mls v26.4S, v20.4S, v8.S[0] // .......................................*................ 
+ // str q21, [x1, #-32] // ................................................*....... + // str q0, [x1, #-16] // ..................................................*..... + // add x1, x1, #64 // .....................................................*.. + // str q16, [x2, #-48] // .............................................*.......... + // str q23, [x2, #-32] // ......................................................*. + // str q26, [x2, #-16] // ...........................................*............ + // add x2, x2, #64 // .......................................................* + + +// ----------------------------------------------------------------------------- + + ninv .req v25 + ninv_tw .req v26 + + ASM_LOAD(xtmp, ninv_addr) + ld1r {ninv.4s}, [xtmp] + ASM_LOAD(xtmp, ninv_tw_addr) + ld1r {ninv_tw.4s}, [xtmp] + + mov count, #8 + ASM_LOAD(r_ptr0, roots_l012) + load_roots_123 + + .p2align 2 + ldr q13, [x0, #768] // .....*...... + // gap // ............ + // gap // ............ + // gap // ............ + ldr q7, [x0, #896] // .......*.... + // gap // ............ + // gap // ............ + // gap // ............ + ldr q4, [x0, #512] // ...*........ + // gap // ............ + // gap // ............ + // gap // ............ + ldr q11, [x0, #640] // ....*....... + // gap // ............ + // gap // ............ + // gap // ............ + ldr q20, [x0, #384] // ........*... + // gap // ............ + // gap // ............ + // gap // ............ + ldr q21, [x0, #256] // ..*......... + // gap // ............ + // gap // ............ + // gap // ............ + add v19.4S, v4.4S, v11.4S // ......*..... + // gap // ............ + add v6.4S, v13.4S, v7.4S // .........*.. + // gap // ............ + ldr q17, [x0, #0] // *........... + // gap // ............ + // gap // ............ + // gap // ............ + add v30.4S, v21.4S, v20.4S // ..........*. + // gap // ............ + add v10.4S, v19.4S, v6.4S // ...........* + // gap // ............ 
+ ldr q23, [x0, #128] // .*.......... + // gap // ............ + + // original source code + // ldr q17, [x0, #0] // ........*... + // ldr q23, [x0, #128] // ...........* + // ldr q21, [x0, #256] // .....*...... + // ldr q4, [x0, #512] // ..*......... + // ldr q11, [x0, #640] // ...*........ + // ldr q13, [x0, #768] // *........... + // add v19.4S, v4.4S, v11.4S // ......*..... + // ldr q7, [x0, #896] // .*.......... + // ldr q20, [x0, #384] // ....*....... + // add v6.4S, v13.4S, v7.4S // .......*.... + // add v30.4S, v21.4S, v20.4S // .........*.. + // add v10.4S, v19.4S, v6.4S // ..........*. + + sub count, count, #1 +layer123_start: + sub v16.4S, v17.4S, v23.4S // ........*....................................................................................... + // gap // ................................................................................................ + add v23.4S, v17.4S, v23.4S // .........*...................................................................................... + // gap // ................................................................................................ + sub v21.4S, v21.4S, v20.4S // .............*.................................................................................. + // gap // ................................................................................................ + mul v20.4S, v16.4S, v1.S[2] // ..........*..................................................................................... + // gap // ................................................................................................ + sqrdmulh v16.4S, v16.4S, v1.S[3] // ...........*.................................................................................... + // gap // ................................................................................................ + sub v17.4S, v23.4S, v30.4S // ............................*................................................................... 
+ // gap // ................................................................................................ + add v23.4S, v23.4S, v30.4S // .............................*.................................................................. + // gap // ................................................................................................ + mul v30.4S, v21.4S, v2.S[0] // ...............*................................................................................ + // gap // ................................................................................................ + sqrdmulh v21.4S, v21.4S, v2.S[1] // ................*............................................................................... + // gap // ................................................................................................ + mls v20.4S, v16.4S, v8.S[0] // ............*................................................................................... + // gap // ................................................................................................ + sub v16.4S, v4.4S, v11.4S // ..................*............................................................................. + // gap // ................................................................................................ + mul v4.4S, v17.4S, v0.S[2] // ..............................*................................................................. + // gap // ................................................................................................ + sqrdmulh v17.4S, v17.4S, v0.S[3] // ...............................*................................................................ + // gap // ................................................................................................ + sub v11.4S, v23.4S, v10.4S // ................................................*............................................... 
+ // gap // ................................................................................................ + add v23.4S, v23.4S, v10.4S // .................................................*.............................................. + // gap // ................................................................................................ + mls v30.4S, v21.4S, v8.S[0] // .................*.............................................................................. + // gap // ................................................................................................ + mul v21.4S, v16.4S, v2.S[2] // ....................*........................................................................... + // gap // ................................................................................................ + sqrdmulh v16.4S, v16.4S, v2.S[3] // .....................*.......................................................................... + // gap // ................................................................................................ + sub v13.4S, v13.4S, v7.4S // .......................*........................................................................ + // gap // ................................................................................................ + sub v7.4S, v20.4S, v30.4S // .................................*.............................................................. + // gap // ................................................................................................ + add v20.4S, v20.4S, v30.4S // ..................................*............................................................. + // gap // ................................................................................................ + mls v21.4S, v16.4S, v8.S[0] // ......................*......................................................................... 
+ // gap // ................................................................................................ + mul v16.4S, v13.4S, v3.S[0] // .........................*...................................................................... + // gap // ................................................................................................ + mls v4.4S, v17.4S, v8.S[0] // ................................*............................................................... + // gap // ................................................................................................ + sqrdmulh v17.4S, v13.4S, v3.S[1] // ..........................*..................................................................... + // gap // ................................................................................................ + mul v30.4S, v7.4S, v0.S[2] // ...................................*............................................................ + // gap // ................................................................................................ + sqrdmulh v13.4S, v7.4S, v0.S[3] // ....................................*........................................................... + // gap // ................................................................................................ + mul v7.4S, v11.4S, v0.S[0] // ..................................................*............................................. + // gap // ................................................................................................ + sqrdmulh v11.4S, v11.4S, v0.S[1] // ...................................................*............................................ + // gap // ................................................................................................ + mul v10.4S, v23.4S, v25.4S // ................................................................................*............... 
+ // gap // ................................................................................................ + sqrdmulh v23.4S, v23.4S, v26.4S // .................................................................................*.............. + // gap // ................................................................................................ + mls v16.4S, v17.4S, v8.S[0] // ...........................*.................................................................... + // gap // ................................................................................................ + mls v30.4S, v13.4S, v8.S[0] // .....................................*.......................................................... + // gap // ................................................................................................ + sub v17.4S, v19.4S, v6.4S // ......................................*......................................................... + // gap // ................................................................................................ + mls v7.4S, v11.4S, v8.S[0] // ....................................................*........................................... + // gap // ................................................................................................ + sub v11.4S, v21.4S, v16.4S // ...........................................*.................................................... + // gap // ................................................................................................ + mul v13.4S, v17.4S, v1.S[0] // ........................................*....................................................... + // gap // ................................................................................................ + sqrdmulh v17.4S, v17.4S, v1.S[1] // .........................................*...................................................... 
+ // gap // ................................................................................................ + add v16.4S, v21.4S, v16.4S // ............................................*................................................... + // gap // ................................................................................................ + mul v21.4S, v11.4S, v1.S[0] // .............................................*.................................................. + // gap // ................................................................................................ + sqrdmulh v11.4S, v11.4S, v1.S[1] // ..............................................*................................................. + // gap // ................................................................................................ + sub v19.4S, v20.4S, v16.4S // .....................................................*.......................................... + // gap // ................................................................................................ + add v16.4S, v20.4S, v16.4S // ......................................................*......................................... + // gap // ................................................................................................ + mls v13.4S, v17.4S, v8.S[0] // ..........................................*..................................................... + // gap // ................................................................................................ + mls v21.4S, v11.4S, v8.S[0] // ...............................................*................................................ + // gap // ................................................................................................ + mul v20.4S, v19.4S, v0.S[0] // .......................................................*........................................ 
+ // gap // ................................................................................................ + sqrdmulh v17.4S, v19.4S, v0.S[1] // ........................................................*....................................... + // gap // ................................................................................................ + sub v11.4S, v4.4S, v13.4S // ..........................................................*..................................... + // gap // ................................................................................................ + add v4.4S, v4.4S, v13.4S // ...........................................................*.................................... + // gap // ................................................................................................ + sub v13.4S, v30.4S, v21.4S // ...............................................................*................................ + // gap // ................................................................................................ + mls v20.4S, v17.4S, v8.S[0] // .........................................................*...................................... + // gap // ................................................................................................ + mul v17.4S, v11.4S, v0.S[0] // ............................................................*................................... + // gap // ................................................................................................ + sqrdmulh v11.4S, v11.4S, v0.S[1] // .............................................................*.................................. + // gap // ................................................................................................ + add v21.4S, v30.4S, v21.4S // ................................................................*............................... 
+ // gap // ................................................................................................ + mul v30.4S, v13.4S, v0.S[0] // .................................................................*.............................. + // gap // ................................................................................................ + sqrdmulh v13.4S, v13.4S, v0.S[1] // ..................................................................*............................. + // gap // ................................................................................................ + mls v17.4S, v11.4S, v8.S[0] // ..............................................................*................................. + // gap // ................................................................................................ + srshr v11.4S, v7.4S, #23 // ....................................................................*........................... + // gap // ................................................................................................ + srshr v19.4S, v20.4S, #23 // ......................................................................*......................... + // gap // ................................................................................................ + mls v10.4S, v23.4S, v8.S[0] // ..................................................................................*............. + // gap // ................................................................................................ + mls v30.4S, v13.4S, v8.S[0] // ...................................................................*............................ + // gap // ................................................................................................ + mls v7.4S, v11.4S, v8.4S // .....................................................................*.......................... 
+ // gap // ................................................................................................ + mls v20.4S, v19.4S, v8.4S // .......................................................................*........................ + // gap // ................................................................................................ + srshr v23.4S, v17.4S, #23 // ........................................................................*....................... + // gap // ................................................................................................ + srshr v11.4S, v30.4S, #23 // ..........................................................................*..................... + // gap // ................................................................................................ + str q7, [x0, #512] // ............................................................................*................... + // gap // ................................................................................................ + mls v17.4S, v23.4S, v8.4S // .........................................................................*...................... + // gap // ................................................................................................ + mls v30.4S, v11.4S, v8.4S // ...........................................................................*.................... + // gap // ................................................................................................ + str q20, [x0, #640] // .............................................................................*.................. + // gap // ................................................................................................ + mul v23.4S, v16.4S, v25.4S // ...................................................................................*............ + // gap // ................................................................................................ 
+ str q17, [x0, #768] // ..............................................................................*................. + // gap // ................................................................................................ + sqrdmulh v16.4S, v16.4S, v26.4S // ....................................................................................*........... + // gap // ................................................................................................ + str q30, [x0, #896] // ...............................................................................*................ + // gap // ................................................................................................ + mul v20.4S, v4.4S, v25.4S // ......................................................................................*......... + // gap // ................................................................................................ + sqrdmulh v17.4S, v4.4S, v26.4S // .......................................................................................*........ + // gap // ................................................................................................ + mls v23.4S, v16.4S, v8.S[0] // .....................................................................................*.......... + // gap // ................................................................................................ + mul v16.4S, v21.4S, v25.4S // .........................................................................................*...... + // gap // ................................................................................................ + sqrdmulh v21.4S, v21.4S, v26.4S // ..........................................................................................*..... + // gap // ................................................................................................ 
+ mls v20.4S, v17.4S, v8.S[0] // ........................................................................................*....... + // gap // ................................................................................................ + str q10, [x0], #(16) // ............................................................................................*... + // gap // ................................................................................................ + ldr q17, [x0, #0] // e............................................................................................... + // gap // ................................................................................................ + // gap // ................................................................................................ + // gap // ................................................................................................ + mls v16.4S, v21.4S, v8.S[0] // ...........................................................................................*.... + // gap // ................................................................................................ + str q23, [x0, #112] // .............................................................................................*.. + // gap // ................................................................................................ + ldr q23, [x0, #128] // .e.............................................................................................. + // gap // ................................................................................................ + // gap // ................................................................................................ + // gap // ................................................................................................ + str q20, [x0, #240] // ..............................................................................................*. 
+ // gap // ................................................................................................ + ldr q21, [x0, #256] // ..e............................................................................................. + // gap // ................................................................................................ + // gap // ................................................................................................ + // gap // ................................................................................................ + ldr q4, [x0, #512] // ....e........................................................................................... + // gap // ................................................................................................ + // gap // ................................................................................................ + // gap // ................................................................................................ + ldr q11, [x0, #640] // .....e.......................................................................................... + // gap // ................................................................................................ + // gap // ................................................................................................ + // gap // ................................................................................................ + ldr q13, [x0, #768] // ......e......................................................................................... + // gap // ................................................................................................ + // gap // ................................................................................................ + // gap // ................................................................................................ 
+ add v19.4S, v4.4S, v11.4S // ...................e............................................................................ + // gap // ................................................................................................ + ldr q7, [x0, #896] // .......e........................................................................................ + // gap // ................................................................................................ + // gap // ................................................................................................ + // gap // ................................................................................................ + ldr q20, [x0, #384] // ...e............................................................................................ + // gap // ................................................................................................ + // gap // ................................................................................................ + // gap // ................................................................................................ + add v6.4S, v13.4S, v7.4S // ........................e....................................................................... + // gap // ................................................................................................ + str q16, [x0, #368] // ...............................................................................................* + // gap // ................................................................................................ + add v30.4S, v21.4S, v20.4S // ..............e................................................................................. + // gap // ................................................................................................ + add v10.4S, v19.4S, v6.4S // .......................................e........................................................ 
+ // gap // ................................................................................................ + + // original source code + // ldr q9, [x0, #0] // e...............|...............................................................................e............. + // ldr q10, [x0, #(1*(1024/8))] // ...e............|..................................................................................e.......... + // ldr q11, [x0, #(2*(1024/8))] // .....e..........|....................................................................................e........ + // ldr q12, [x0, #(3*(1024/8))] // ...........e....|..........................................................................................e.. + // ldr q13, [x0, #(4*(1024/8))] // ......e.........|.....................................................................................e....... + // ldr q14, [x0, #(5*(1024/8))] // .......e........|......................................................................................e...... + // ldr q15, [x0, #(6*(1024/8))] // ........e.......|.......................................................................................e..... + // ldr q16, [x0, #(7*(1024/8))] // ..........e.....|.........................................................................................e... + // sub v24.4s, v9.4s, v10.4s // ................*............................................................................................. + // add v9.4s, v9.4s, v10.4s // ................|*............................................................................................ + // mul v10.4s, v24.4s, v1.s[2] // ................|..*.......................................................................................... + // sqrdmulh v24.4s, v24.4s, v1.s[3] // ................|...*......................................................................................... 
+ // mls v10.4s, v24.4s, v8.s[0] // ................|........*.................................................................................... + // sub v24.4s, v11.4s, v12.4s // ................|.*........................................................................................... + // add v11.4s, v11.4s, v12.4s // ..............e.|............................................................................................. + // mul v12.4s, v24.4s, v2.s[0] // ................|......*...................................................................................... + // sqrdmulh v24.4s, v24.4s, v2.s[1] // ................|.......*..................................................................................... + // mls v12.4s, v24.4s, v8.s[0] // ................|..............*.............................................................................. + // sub v24.4s, v13.4s, v14.4s // ................|.........*................................................................................... + // add v13.4s, v13.4s, v14.4s // .........e......|........................................................................................e.... + // mul v14.4s, v24.4s, v2.s[2] // ................|...............*............................................................................. + // sqrdmulh v24.4s, v24.4s, v2.s[3] // ................|................*............................................................................ + // mls v14.4s, v24.4s, v8.s[0] // ................|....................*........................................................................ + // sub v24.4s, v15.4s, v16.4s // ................|.................*........................................................................... + // add v15.4s, v15.4s, v16.4s // ............e...|...........................................................................................e. 
+ // mul v16.4s, v24.4s, v3.s[0] // ................|.....................*....................................................................... + // sqrdmulh v24.4s, v24.4s, v3.s[1] // ................|.......................*..................................................................... + // mls v16.4s, v24.4s, v8.s[0] // ................|..............................*.............................................................. + // sub v24.4s, v9.4s, v11.4s // ................|....*........................................................................................ + // add v9.4s, v9.4s, v11.4s // ................|.....*....................................................................................... + // mul v11.4s, v24.4s, v0.s[2] // ................|..........*.................................................................................. + // sqrdmulh v24.4s, v24.4s, v0.s[3] // ................|...........*................................................................................. + // mls v11.4s, v24.4s, v8.s[0] // ................|......................*...................................................................... + // sub v24.4s, v10.4s, v12.4s // ................|..................*.......................................................................... + // add v10.4s, v10.4s, v12.4s // ................|...................*......................................................................... + // mul v12.4s, v24.4s, v0.s[2] // ................|........................*.................................................................... + // sqrdmulh v24.4s, v24.4s, v0.s[3] // ................|.........................*................................................................... + // mls v12.4s, v24.4s, v8.s[0] // ................|...............................*............................................................. 
+ // sub v24.4s, v13.4s, v15.4s // ................|................................*............................................................ + // add v13.4s, v13.4s, v15.4s // ...............e|............................................................................................. + // mul v15.4s, v24.4s, v1.s[0] // ................|...................................*......................................................... + // sqrdmulh v24.4s, v24.4s, v1.s[1] // ................|....................................*........................................................ + // mls v15.4s, v24.4s, v8.s[0] // ................|..........................................*.................................................. + // sub v24.4s, v14.4s, v16.4s // ................|..................................*.......................................................... + // add v14.4s, v14.4s, v16.4s // ................|.....................................*....................................................... + // mul v16.4s, v24.4s, v1.s[0] // ................|......................................*...................................................... + // sqrdmulh v24.4s, v24.4s, v1.s[1] // ................|.......................................*..................................................... + // mls v16.4s, v24.4s, v8.s[0] // ................|...........................................*................................................. + // sub v24.4s, v9.4s, v13.4s // ................|............*................................................................................ + // add v9.4s, v9.4s, v13.4s // ................|.............*............................................................................... + // mul v13.4s, v24.4s, v0.s[0] // ................|..........................*.................................................................. 
+ // sqrdmulh v24.4s, v24.4s, v0.s[1] // ................|...........................*................................................................. + // mls v13.4s, v24.4s, v8.s[0] // ................|.................................*........................................................... + // sub v24.4s, v10.4s, v14.4s // ................|........................................*.................................................... + // add v10.4s, v10.4s, v14.4s // ................|.........................................*................................................... + // mul v14.4s, v24.4s, v0.s[0] // ................|............................................*................................................ + // sqrdmulh v24.4s, v24.4s, v0.s[1] // ................|.............................................*............................................... + // mls v14.4s, v24.4s, v8.s[0] // ................|.................................................*........................................... + // sub v24.4s, v11.4s, v15.4s // ................|..............................................*.............................................. + // add v11.4s, v11.4s, v15.4s // ................|...............................................*............................................. + // mul v15.4s, v24.4s, v0.s[0] // ................|..................................................*.......................................... + // sqrdmulh v24.4s, v24.4s, v0.s[1] // ................|...................................................*......................................... + // mls v15.4s, v24.4s, v8.s[0] // ................|.......................................................*..................................... + // sub v24.4s, v12.4s, v16.4s // ................|................................................*............................................ 
+ // add v12.4s, v12.4s, v16.4s // ................|....................................................*........................................ + // mul v16.4s, v24.4s, v0.s[0] // ................|.....................................................*....................................... + // sqrdmulh v24.4s, v24.4s, v0.s[1] // ................|......................................................*...................................... + // mls v16.4s, v24.4s, v8.s[0] // ................|...........................................................*................................. + // srshr v24.4S, v13.4S, #23 // ................|........................................................*.................................... + // mls v13.4s, v24.4s, v8.4s // ................|............................................................*................................ + // srshr v24.4S, v14.4S, #23 // ................|.........................................................*................................... + // mls v14.4s, v24.4s, v8.4s // ................|.............................................................*............................... + // srshr v24.4S, v15.4S, #23 // ................|..............................................................*.............................. + // mls v15.4s, v24.4s, v8.4s // ................|.................................................................*........................... + // srshr v24.4S, v16.4S, #23 // ................|...............................................................*............................. + // mls v16.4s, v24.4s, v8.4s // ................|..................................................................*.......................... + // str q13, [x0, #(4*(1024/8))] // ................|................................................................*............................ 
+ // str q14, [x0, #(5*(1024/8))] // ................|...................................................................*......................... + // str q15, [x0, #(6*(1024/8))] // ................|.....................................................................*....................... + // str q16, [x0, #(7*(1024/8))] // ................|.......................................................................*..................... + // mul v13.4s, v9.4s, v25.4s // ................|............................*................................................................ + // sqrdmulh v9.4s, v9.4s, v26.4s // ................|.............................*............................................................... + // mls v13.4s, v9.4s, v8.s[0] // ................|..........................................................*.................................. + // mul v14.4s, v10.4s, v25.4s // ................|....................................................................*........................ + // sqrdmulh v10.4s, v10.4s, v26.4s // ................|......................................................................*...................... + // mls v14.4s, v10.4s, v8.s[0] // ................|..........................................................................*.................. + // mul v15.4s, v11.4s, v25.4s // ................|........................................................................*.................... + // sqrdmulh v11.4s, v11.4s, v26.4s // ................|.........................................................................*................... + // mls v15.4s, v11.4s, v8.s[0] // ................|.............................................................................*............... + // mul v16.4s, v12.4s, v25.4s // ................|...........................................................................*................. 
+ // sqrdmulh v12.4s, v12.4s, v26.4s // ................|............................................................................*................ + // mls v16.4s, v12.4s, v8.s[0] // .*..............|................................................................................*............ + // str q13, [x0], #(16) // ................|..............................................................................*.............. + // str q14, [x0, #(-16 + 1*(1024/8))] // ..*.............|.................................................................................*........... + // str q15, [x0, #(-16 + 2*(1024/8))] // ....*...........|...................................................................................*......... + // str q16, [x0, #(-16 + 3*(1024/8))] // .............*..|............................................................................................* + + sub count, count, #1 + cbnz count, layer123_start + sub v27.4S, v4.4S, v11.4S // ..........*......................................................................... + // gap // .................................................................................... + sub v22.4S, v17.4S, v23.4S // *................................................................................... + // gap // .................................................................................... + sub v16.4S, v13.4S, v7.4S // ..................*................................................................. + // gap // .................................................................................... + mul v7.4S, v27.4S, v2.S[2] // ................*................................................................... + // gap // .................................................................................... + mul v29.4S, v22.4S, v1.S[2] // ...*................................................................................ 
+ // gap // .................................................................................... + sqrdmulh v13.4S, v22.4S, v1.S[3] // ....*............................................................................... + // gap // .................................................................................... + sqrdmulh v5.4S, v16.4S, v3.S[1] // ........................*........................................................... + // gap // .................................................................................... + sub v18.4S, v21.4S, v20.4S // ..*................................................................................. + // gap // .................................................................................... + sqrdmulh v4.4S, v27.4S, v2.S[3] // .................*.................................................................. + // gap // .................................................................................... + mul v14.4S, v16.4S, v3.S[0] // ......................*............................................................. + // gap // .................................................................................... + mul v11.4S, v18.4S, v2.S[0] // .......*............................................................................ + // gap // .................................................................................... + sqrdmulh v16.4S, v18.4S, v2.S[1] // ........*........................................................................... + // gap // .................................................................................... + mls v7.4S, v4.4S, v8.S[0] // .....................*.............................................................. + // gap // .................................................................................... + mls v14.4S, v5.4S, v8.S[0] // ...............................*.................................................... 
+ // gap // .................................................................................... + mls v29.4S, v13.4S, v8.S[0] // .........*.......................................................................... + // gap // .................................................................................... + mls v11.4S, v16.4S, v8.S[0] // ...............*.................................................................... + // gap // .................................................................................... + sub v16.4S, v19.4S, v6.4S // .................................*.................................................. + // gap // .................................................................................... + sub v9.4S, v7.4S, v14.4S // ...................................*................................................ + // gap // .................................................................................... + add v27.4S, v7.4S, v14.4S // ......................................*............................................. + // gap // .................................................................................... + add v5.4S, v29.4S, v11.4S // ....................*............................................................... + // gap // .................................................................................... + sqrdmulh v20.4S, v9.4S, v1.S[1] // ........................................*........................................... + // gap // .................................................................................... + mul v6.4S, v9.4S, v1.S[0] // .......................................*............................................ + // gap // .................................................................................... + sub v21.4S, v5.4S, v27.4S // .........................................*.......................................... 
+ // gap // .................................................................................... + add v22.4S, v5.4S, v27.4S // ..........................................*......................................... + // gap // .................................................................................... + sub v27.4S, v29.4S, v11.4S // ...................*................................................................ + // gap // .................................................................................... + sqrdmulh v7.4S, v21.4S, v0.S[1] // ..............................................*..................................... + // gap // .................................................................................... + mul v4.4S, v21.4S, v0.S[0] // .............................................*...................................... + // gap // .................................................................................... + mls v6.4S, v20.4S, v8.S[0] // ............................................*....................................... + // gap // .................................................................................... + sqrdmulh v11.4S, v27.4S, v0.S[3] // ..........................*......................................................... + // gap // .................................................................................... + sqrdmulh v28.4S, v22.4S, v26.4S // .......................................................................*............ + // gap // .................................................................................... + mls v4.4S, v7.4S, v8.S[0] // ..................................................*................................. + // gap // .................................................................................... + mul v13.4S, v27.4S, v0.S[2] // .........................*.......................................................... 
+ // gap // .................................................................................... + sqrdmulh v7.4S, v16.4S, v1.S[1] // .....................................*.............................................. + // gap // .................................................................................... + add v27.4S, v17.4S, v23.4S // .*.................................................................................. + // gap // .................................................................................... + srshr v15.4S, v4.4S, #23 // ..........................................................*......................... + // gap // .................................................................................... + mls v13.4S, v11.4S, v8.S[0] // ................................*................................................... + // gap // .................................................................................... + mul v11.4S, v22.4S, v25.4S // .....................................................................*.............. + // gap // .................................................................................... + mls v4.4S, v15.4S, v8.4S // ..............................................................*..................... + // gap // .................................................................................... + sub v29.4S, v27.4S, v30.4S // .....*.............................................................................. + // gap // .................................................................................... + sub v20.4S, v13.4S, v6.4S // .................................................*.................................. + // gap // .................................................................................... + mls v11.4S, v28.4S, v8.S[0] // ...........................................................................*........ 
+ // gap // .................................................................................... + str q4, [x0, #640] // ....................................................................*............... + // gap // .................................................................................... + sqrdmulh v14.4S, v20.4S, v0.S[1] // .......................................................*............................ + // gap // .................................................................................... + mul v21.4S, v20.4S, v0.S[0] // ......................................................*............................. + // gap // .................................................................................... + str q11, [x0, #128] // .................................................................................*.. + // gap // .................................................................................... + sqrdmulh v17.4S, v29.4S, v0.S[3] // ............*....................................................................... + // gap // .................................................................................... + mul v18.4S, v16.4S, v1.S[0] // ....................................*............................................... + // gap // .................................................................................... + mls v21.4S, v14.4S, v8.S[0] // ............................................................*....................... + // gap // .................................................................................... + mul v29.4S, v29.4S, v0.S[2] // ...........*........................................................................ + // gap // .................................................................................... + add v23.4S, v13.4S, v6.4S // .....................................................*.............................. 
+ // gap // .................................................................................... + mls v18.4S, v7.4S, v8.S[0] // ...........................................*........................................ + // gap // .................................................................................... + srshr v28.4S, v21.4S, #23 // ................................................................*................... + // gap // .................................................................................... + mls v29.4S, v17.4S, v8.S[0] // .......................*............................................................ + // gap // .................................................................................... + sqrdmulh v12.4S, v23.4S, v26.4S // .............................................................................*...... + // gap // .................................................................................... + mls v21.4S, v28.4S, v8.4S // ...................................................................*................ + // gap // .................................................................................... + add v15.4S, v27.4S, v30.4S // ......*............................................................................. + // gap // .................................................................................... + add v17.4S, v29.4S, v18.4S // ................................................*................................... + // gap // .................................................................................... + sub v20.4S, v29.4S, v18.4S // ...............................................*.................................... + // gap // .................................................................................... + str q21, [x0, #896] // ........................................................................*........... 
+ // gap // .................................................................................... + sqrdmulh v16.4S, v17.4S, v26.4S // ..........................................................................*......... + // gap // .................................................................................... + mul v17.4S, v17.4S, v25.4S // .........................................................................*.......... + // gap // .................................................................................... + sqrdmulh v6.4S, v20.4S, v0.S[1] // ....................................................*............................... + // gap // .................................................................................... + mul v30.4S, v20.4S, v0.S[0] // ...................................................*................................ + // gap // .................................................................................... + sub v20.4S, v15.4S, v10.4S // .............*...................................................................... + // gap // .................................................................................... + mls v17.4S, v16.4S, v8.S[0] // ..............................................................................*..... + // gap // .................................................................................... + add v18.4S, v15.4S, v10.4S // ..............*..................................................................... + // gap // .................................................................................... + mls v30.4S, v6.4S, v8.S[0] // ........................................................*........................... + // gap // .................................................................................... + mul v21.4S, v20.4S, v0.S[0] // ...........................*........................................................ 
+ // gap // .................................................................................... + str q17, [x0, #256] // ..................................................................................*. + // gap // .................................................................................... + sqrdmulh v24.4S, v20.4S, v0.S[1] // ............................*....................................................... + // gap // .................................................................................... + srshr v19.4S, v30.4S, #23 // ...............................................................*.................... + // gap // .................................................................................... + sqrdmulh v20.4S, v18.4S, v26.4S // ..............................*..................................................... + // gap // .................................................................................... + mul v5.4S, v18.4S, v25.4S // .............................*...................................................... + // gap // .................................................................................... + mls v30.4S, v19.4S, v8.4S // ..................................................................*................. + // gap // .................................................................................... + mls v21.4S, v24.4S, v8.S[0] // ..................................*................................................. + // gap // .................................................................................... + mul v15.4S, v23.4S, v25.4S // ............................................................................*....... + // gap // .................................................................................... + mls v5.4S, v20.4S, v8.S[0] // ...........................................................*........................ 
+ // gap // .................................................................................... + str q30, [x0, #768] // ......................................................................*............. + // gap // .................................................................................... + srshr v16.4S, v21.4S, #23 // .........................................................*.......................... + // gap // .................................................................................... + mls v15.4S, v12.4S, v8.S[0] // ................................................................................*... + // gap // .................................................................................... + str q5, [x0], #(16) // ...............................................................................*.... + // gap // .................................................................................... + mls v21.4S, v16.4S, v8.4S // .............................................................*...................... + // gap // .................................................................................... + // gap // .................................................................................... + // gap // .................................................................................... + str q15, [x0, #368] // ...................................................................................* + // gap // .................................................................................... + // gap // .................................................................................... + // gap // .................................................................................... + str q21, [x0, #496] // .................................................................*.................. + // gap // .................................................................................... 
+ + // original source code + // sub v16.4S, v17.4S, v23.4S // .*.................................................................................. + // add v23.4S, v17.4S, v23.4S // .................................*.................................................. + // sub v21.4S, v21.4S, v20.4S // .......*............................................................................ + // mul v20.4S, v16.4S, v1.S[2] // ....*............................................................................... + // sqrdmulh v16.4S, v16.4S, v1.S[3] // .....*.............................................................................. + // sub v17.4S, v23.4S, v30.4S // ......................................*............................................. + // add v23.4S, v23.4S, v30.4S // .......................................................*............................ + // mul v30.4S, v21.4S, v2.S[0] // ..........*......................................................................... + // sqrdmulh v21.4S, v21.4S, v2.S[1] // ...........*........................................................................ + // mls v20.4S, v16.4S, v8.S[0] // ..............*..................................................................... + // sub v16.4S, v4.4S, v11.4S // *................................................................................... + // mul v4.4S, v17.4S, v0.S[2] // ................................................*................................... + // sqrdmulh v17.4S, v17.4S, v0.S[3] // .............................................*...................................... + // sub v11.4S, v23.4S, v10.4S // ...............................................................*.................... + // add v23.4S, v23.4S, v10.4S // .................................................................*.................. + // mls v30.4S, v21.4S, v8.S[0] // ...............*.................................................................... 
+ // mul v21.4S, v16.4S, v2.S[2] // ...*................................................................................ + // sqrdmulh v16.4S, v16.4S, v2.S[3] // ........*........................................................................... + // sub v13.4S, v13.4S, v7.4S // ..*................................................................................. + // sub v7.4S, v20.4S, v30.4S // ........................*........................................................... + // add v20.4S, v20.4S, v30.4S // ...................*................................................................ + // mls v21.4S, v16.4S, v8.S[0] // ............*....................................................................... + // mul v16.4S, v13.4S, v3.S[0] // .........*.......................................................................... + // mls v4.4S, v17.4S, v8.S[0] // ....................................................*............................... + // sqrdmulh v17.4S, v13.4S, v3.S[1] // ......*............................................................................. + // mul v30.4S, v7.4S, v0.S[2] // ...............................*.................................................... + // sqrdmulh v13.4S, v7.4S, v0.S[3] // ............................*....................................................... + // mul v7.4S, v11.4S, v0.S[0] // ...................................................................*................ + // sqrdmulh v11.4S, v11.4S, v0.S[1] // .....................................................................*.............. + // mul v10.4S, v23.4S, v25.4S // ........................................................................*........... + // sqrdmulh v23.4S, v23.4S, v26.4S // .......................................................................*............ + // mls v16.4S, v17.4S, v8.S[0] // .............*...................................................................... 
+ // mls v30.4S, v13.4S, v8.S[0] // ...................................*................................................ + // sub v17.4S, v19.4S, v6.4S // ................*................................................................... + // mls v7.4S, v11.4S, v8.S[0] // ..........................................................................*......... + // sub v11.4S, v21.4S, v16.4S // .................*.................................................................. + // mul v13.4S, v17.4S, v1.S[0] // ..............................................*..................................... + // sqrdmulh v17.4S, v17.4S, v1.S[1] // ................................*................................................... + // add v16.4S, v21.4S, v16.4S // ..................*................................................................. + // mul v21.4S, v11.4S, v1.S[0] // .....................*.............................................................. + // sqrdmulh v11.4S, v11.4S, v1.S[1] // ....................*............................................................... + // sub v19.4S, v20.4S, v16.4S // ......................*............................................................. + // add v16.4S, v20.4S, v16.4S // .......................*............................................................ + // mls v13.4S, v17.4S, v8.S[0] // ..................................................*................................. + // mls v21.4S, v11.4S, v8.S[0] // ...........................*........................................................ + // mul v20.4S, v19.4S, v0.S[0] // ..........................*......................................................... + // sqrdmulh v17.4S, v19.4S, v0.S[1] // .........................*.......................................................... + // sub v11.4S, v4.4S, v13.4S // .........................................................*.......................... 
+ // add v4.4S, v4.4S, v13.4S // ........................................................*........................... + // sub v13.4S, v30.4S, v21.4S // .......................................*............................................ + // mls v20.4S, v17.4S, v8.S[0] // ..............................*..................................................... + // mul v17.4S, v11.4S, v0.S[0] // ..............................................................*..................... + // sqrdmulh v11.4S, v11.4S, v0.S[1] // .............................................................*...................... + // add v21.4S, v30.4S, v21.4S // .................................................*.................................. + // mul v30.4S, v13.4S, v0.S[0] // ...........................................*........................................ + // sqrdmulh v13.4S, v13.4S, v0.S[1] // ..........................................*......................................... + // mls v17.4S, v11.4S, v8.S[0] // ..................................................................*................. + // srshr v11.4S, v7.4S, #23 // ..............................................................................*..... + // srshr v19.4S, v20.4S, #23 // ..................................*................................................. + // mls v10.4S, v23.4S, v8.S[0] // ............................................................................*....... + // mls v30.4S, v13.4S, v8.S[0] // ...............................................*.................................... + // mls v7.4S, v11.4S, v8.4S // .................................................................................*.. + // mls v20.4S, v19.4S, v8.4S // .....................................*.............................................. + // srshr v23.4S, v17.4S, #23 // ......................................................................*............. 
+ // srshr v11.4S, v30.4S, #23 // ...................................................*................................ + // str q7, [x0, #512] // ...................................................................................* + // mls v17.4S, v23.4S, v8.4S // .........................................................................*.......... + // mls v30.4S, v11.4S, v8.4S // ......................................................*............................. + // str q20, [x0, #640] // .........................................*.......................................... + // mul v23.4S, v16.4S, v25.4S // ....................................*............................................... + // str q17, [x0, #768] // .............................................................................*...... + // sqrdmulh v16.4S, v16.4S, v26.4S // .............................*...................................................... + // str q30, [x0, #896] // ..........................................................*......................... + // mul v20.4S, v4.4S, v25.4S // ............................................................*....................... + // sqrdmulh v17.4S, v4.4S, v26.4S // ...........................................................*........................ + // mls v23.4S, v16.4S, v8.S[0] // ........................................*........................................... + // mul v16.4S, v21.4S, v25.4S // ...........................................................................*........ + // sqrdmulh v21.4S, v21.4S, v26.4S // .....................................................*.............................. + // mls v20.4S, v17.4S, v8.S[0] // ................................................................*................... + // str q10, [x0], #(16) // ................................................................................*... 
+ // mls v16.4S, v21.4S, v8.S[0] // ...............................................................................*.... + // str q23, [x0, #112] // ............................................*....................................... + // str q20, [x0, #240] // ....................................................................*............... + // str q16, [x0, #368] // ..................................................................................*. + + + pop_stack + ret \ No newline at end of file diff --git a/tests/ntt_dilithium/manual/intt_dilithium_123_45678_opt_a72.s b/tests/ntt_dilithium/manual/intt_dilithium_123_45678_opt_a72.s new file mode 100644 index 0000000..702bac0 --- /dev/null +++ b/tests/ntt_dilithium/manual/intt_dilithium_123_45678_opt_a72.s @@ -0,0 +1,2327 @@ + +// Needed to provide ASM_LOAD directive +#include <hal_env.h> + +// NOTE +// We use a lot of trivial macros to simplify the parsing burden for Slothy +// The macros are not unfolded by Slothy and thus interpreted as instructions, +// which are easier to parse due to e.g. the lack of size specifiers and simpler +// syntax for pre and post increment for loads and stores. + +// Eventually, NeLight should include a proper parser for AArch64, +// but for initial investigations, the below is enough. 
+// ---------------------------------------------------------------------------
+// Helper macros, stack helpers, constants and register aliases used by
+// intt_dilithium_123_45678_opt_a72, followed by the routine's entry sequence.
+// ---------------------------------------------------------------------------
+xtmp0 .req x10
+xtmp1 .req x11
+
+// Load the q-register alias qform_<vec> from [base + #offset].
+.macro ldr_vo vec, base, offset
+        ldr qform_\vec, [\base, #\offset]
+.endm
+
+// Load qform_<vec> from [base], then post-increment base by #inc.
+.macro ldr_vi vec, base, inc
+        ldr qform_\vec, [\base], #\inc
+.endm
+
+// Store qform_<vec> to [base + #offset].
+.macro str_vo vec, base, offset
+        str qform_\vec, [\base, #\offset]
+.endm
+
+// Store qform_<vec> to [base], then post-increment base by #inc.
+.macro str_vi vec, base, inc
+        str qform_\vec, [\base], #\inc
+.endm
+
+// d = sqrdmulh(a, b), vector-by-vector on four 32-bit lanes.
+.macro vqrdmulh d,a,b
+        sqrdmulh \d\().4s, \a\().4s, \b\().4s
+.endm
+
+// d -= a * b, vector-by-vector on four 32-bit lanes.
+.macro vmls d,a,b
+        mls \d\().4s, \a\().4s, \b\().4s
+.endm
+
+// d = sqrdmulh(a, b.s[i]), vector-by-element.
+.macro vqrdmulhq d,a,b,i
+        sqrdmulh \d\().4s, \a\().4s, \b\().s[\i]
+.endm
+
+// d = sqdmulh(a, b.s[i]), vector-by-element.
+// The element operand is written as .s[i], matching the other by-element
+// macros in this file.
+.macro vqdmulhq d,a,b,i
+        sqdmulh \d\().4s, \a\().4s, \b\().s[\i]
+.endm
+
+// d = a * b.s[i], vector-by-element.
+.macro vmulq d,a,b,i
+        mul \d\().4s, \a\().4s, \b\().s[\i]
+.endm
+
+// d -= a * b.s[i], vector-by-element.
+.macro vmlsq d,a,b,i
+        mls \d\().4s, \a\().4s, \b\().s[\i]
+.endm
+
+// dst = src * const.s[idx0] - sqrdmulh(src, const.s[idx1]) * consts.s[0].
+// Overwrites src with the sqrdmulh result.
+// NOTE(review): const.s[idx1] is presumably the precomputed "twisted"
+// companion of const.s[idx0] -- confirm against the twiddle table.
+.macro mulmodq dst, src, const, idx0, idx1
+        vmulq       \dst, \src, \const, \idx0
+        vqrdmulhq   \src, \src, \const, \idx1
+        vmlsq       \dst, \src, consts, 0
+.endm
+
+// Same as mulmodq, but with full-vector multiplier (const) and companion
+// (const_twisted) instead of lanes of one register. Overwrites src.
+.macro mulmod dst, src, const, const_twisted
+        mul         \dst\().4s, \src\().4s, \const\().4s
+        vqrdmulh    \src, \src, \const_twisted
+        vmlsq       \dst, \src, consts, 0
+.endm
+
+// a -= (a rounding-shifted right by 23) * consts. Clobbers tmp.
+.macro montg_reduce a
+        srshr tmp.4S, \a\().4S, #23
+        vmls  \a, tmp, consts
+.endm
+
+// tmp1 = mask(neg_modulus_half >= a), tmp2 = mask(a >= modulus_half),
+// then a -= (tmp1 - tmp2) * modulus.
+// NOTE(review): the alias `modulus` is not defined in the visible part of
+// this file -- confirm it exists before expanding this macro.
+.macro canonical_reduce a, modulus_half, neg_modulus_half, tmp1, tmp2
+        cmge \tmp1\().4s, \neg_modulus_half\().4s, \a\().4s
+        cmge \tmp2\().4s, \a\().4s, \modulus_half\().4s
+        sub  \tmp2\().4s, \tmp1\().4s, \tmp2\().4s
+        vmls \a, \tmp2, modulus
+.endm
+
+// Butterfly: tmp = a - b; a = a + b; b = mulmodq(tmp, root.s[idx0], root.s[idx1]).
+// Clobbers tmp.
+.macro gs_butterfly a, b, root, idx0, idx1
+        sub     tmp.4s,    \a\().4s, \b\().4s
+        add     \a\().4s,  \a\().4s, \b\().4s
+        mulmodq \b, tmp, \root, \idx0, \idx1
+.endm
+
+// Full-vector modular multiplication variant.
+// NOTE(review): relies on a `vmul` macro and a `modulus` alias, neither of
+// which is defined in the visible part of this file -- confirm before use.
+.macro mulmod_v dst, src, const, const_twisted
+        vmul        \dst, \src, \const
+        vqrdmulh    \src, \src, \const_twisted
+        vmls        \dst, \src, modulus
+.endm
+
+// Butterfly with full-vector roots: tmp = a - b; a = a + b;
+// b = mulmod(tmp, root, root_twisted). Clobbers tmp.
+.macro gs_butterfly_v a, b, root, root_twisted
+        sub    tmp.4s,    \a\().4s, \b\().4s
+        add    \a\().4s,  \a\().4s, \b\().4s
+        mulmod \b, tmp, \root, \root_twisted
+.endm
+
+// dstN = mulmod(srcN, ninv, ninv_tw) for N = 0..3. Overwrites each srcN.
+// NOTE(review): the aliases `ninv` / `ninv_tw` are not defined in the visible
+// part of this file -- confirm before use.
+.macro mul_ninv dst0, dst1, dst2, dst3, src0, src1, src2, src3
+        mulmod \dst0, \src0, ninv, ninv_tw
+        mulmod \dst1, \src1, ninv, ninv_tw
+        mulmod \dst2, \src2, ninv, ninv_tw
+        mulmod \dst3, \src3, ninv, ninv_tw
+.endm
+
+// Load four consecutive 16-byte vectors starting at addr.
+.macro load_vectors a0, a1, a2, a3, addr
+        ldr_vo \a0, \addr, (16*0)
+        ldr_vo \a1, \addr, (16*1)
+        ldr_vo \a2, \addr, (16*2)
+        ldr_vo \a3, \addr, (16*3)
+.endm
+
+// Load four consecutive 16-byte vectors starting at addr + offset.
+.macro load_vectors_with_offset a0, a1, a2, a3, addr, offset
+        ldr_vo \a0, \addr, (16*0 + (\offset))
+        ldr_vo \a1, \addr, (16*1 + (\offset))
+        ldr_vo \a2, \addr, (16*2 + (\offset))
+        ldr_vo \a3, \addr, (16*3 + (\offset))
+.endm
+
+// Store four consecutive 16-byte vectors at the current addr and advance addr
+// by inc. The first store post-increments addr, so the remaining offsets are
+// expressed relative to the already-advanced pointer.
+.macro store_vectors_with_inc a0, a1, a2, a3, addr, inc
+        str_vi \a0, \addr, \inc
+        str_vo \a1, \addr, (-(\inc) + 16*1)
+        str_vo \a2, \addr, (-(\inc) + 16*2)
+        str_vo \a3, \addr, (-(\inc) + 16*3)
+.endm
+
+// Move both 64-bit lanes of in0..in3 into the GPRs out_00 .. out_31.
+.macro vec_to_scalar_matrix out, in
+        vext \out\()_00, \in\()0, 0
+        vext \out\()_01, \in\()0, 1
+        vext \out\()_10, \in\()1, 0
+        vext \out\()_11, \in\()1, 1
+        vext \out\()_20, \in\()2, 0
+        vext \out\()_21, \in\()2, 1
+        vext \out\()_30, \in\()3, 0
+        vext \out\()_31, \in\()3, 1
+.endm
+
+// Store the eight GPRs <x>t_00 .. <x>t_31 as consecutive 8-byte words at the
+// current addr and advance addr by inc (offsets are relative to the advanced
+// pointer, as in store_vectors_with_inc).
+.macro store_scalar_matrix_with_inc x, addr, inc
+        str \x\()t_00, [\addr], #( \inc)
+        str \x\()t_01, [\addr, #(-\inc + 8*1)]
+        str \x\()t_10, [\addr, #(-\inc + 8*2)]
+        str \x\()t_11, [\addr, #(-\inc + 8*3)]
+        str \x\()t_20, [\addr, #(-\inc + 8*4)]
+        str \x\()t_21, [\addr, #(-\inc + 8*5)]
+        str \x\()t_30, [\addr, #(-\inc + 8*6)]
+        str \x\()t_31, [\addr, #(-\inc + 8*7)]
+.endm
+
+// gpr_out = 64-bit lane `lane` of vec_in.
+.macro vext gpr_out, vec_in, lane
+        umov \gpr_out\(), \vec_in\().d[\lane]
+.endm
+
+// Load root0..root3 from r_ptr0 and advance r_ptr0 by 64 bytes.
+.macro load_roots_123
+        ldr_vi root0, r_ptr0, 64
+        ldr_vo root1, r_ptr0, (-64 + 16)
+        ldr_vo root2, r_ptr0, (-64 + 32)
+        ldr_vo root3, r_ptr0, (-64 + 48)
+.endm
+
+// Load root0..root3 from r_ptr0 and advance r_ptr0 by 64 bytes
+// (same instruction sequence as load_roots_123).
+.macro load_roots_456
+        ldr_vi root0, r_ptr0, 64
+        ldr_vo root1, r_ptr0, (-64 + 16)
+        ldr_vo root2, r_ptr0, (-64 + 32)
+        ldr_vo root3, r_ptr0, (-64 + 48)
+.endm
+
+// Load the first six vectors (root0..2 and their _tw companions) of a
+// 12-vector group from r_ptr1 and advance r_ptr1 past the whole group.
+.macro load_roots_78_part1
+        ldr_vi root0,    r_ptr1, (12*16)
+        ldr_vo root0_tw, r_ptr1, (-12*16 +  1*16)
+        ldr_vo root1,    r_ptr1, (-12*16 +  2*16)
+        ldr_vo root1_tw, r_ptr1, (-12*16 +  3*16)
+        ldr_vo root2,    r_ptr1, (-12*16 +  4*16)
+        ldr_vo root2_tw, r_ptr1, (-12*16 +  5*16)
+.endm
+
+// Load the remaining six vectors of the group; r_ptr1 has already been
+// advanced by load_roots_78_part1, hence the negative offsets.
+.macro load_roots_78_part2
+        ldr_vo root0,    r_ptr1, (-12*16 +  6*16)
+        ldr_vo root0_tw, r_ptr1, (-12*16 +  7*16)
+        ldr_vo root1,    r_ptr1, (-12*16 +  8*16)
+        ldr_vo root1_tw, r_ptr1, (-12*16 +  9*16)
+        ldr_vo root2,    r_ptr1, (-12*16 + 10*16)
+        ldr_vo root2_tw, r_ptr1, (-12*16 + 11*16)
+.endm
+
+// In-place 4x4 transpose of 32-bit elements held in data0..data3.
+// Clobbers t0..t3.
+.macro transpose4 data0, data1, data2, data3
+        trn1 t0.4s, \data0\().4s, \data1\().4s
+        trn2 t1.4s, \data0\().4s, \data1\().4s
+        trn1 t2.4s, \data2\().4s, \data3\().4s
+        trn2 t3.4s, \data2\().4s, \data3\().4s
+
+        trn2 \data2\().2d, t0.2d, t2.2d
+        trn2 \data3\().2d, t1.2d, t3.2d
+        trn1 \data0\().2d, t0.2d, t2.2d
+        trn1 \data1\().2d, t1.2d, t3.2d
+.endm
+
+// Only the 32-bit trn1/trn2 interleave stage, written to separate outputs.
+.macro transpose_single data_out0, data_out1, data_out2, data_out3, data_in0, data_in1, data_in2, data_in3
+        trn1 \data_out0\().4s, \data_in0\().4s, \data_in1\().4s
+        trn2 \data_out1\().4s, \data_in0\().4s, \data_in1\().4s
+        trn1 \data_out2\().4s, \data_in2\().4s, \data_in3\().4s
+        trn2 \data_out3\().4s, \data_in2\().4s, \data_in3\().4s
+.endm
+
+// Reserve 96 bytes of stack and spill x19..x30 (each pair stored once).
+.macro save_gprs // slothy:no-unfold
+        sub sp, sp, #(16*6)
+        stp x19, x20, [sp, #16*0]
+        stp x21, x22, [sp, #16*1]
+        stp x23, x24, [sp, #16*2]
+        stp x25, x26, [sp, #16*3]
+        stp x27, x28, [sp, #16*4]
+        stp x29, x30, [sp, #16*5]
+.endm
+
+// Reload x19..x30 and release the 96 bytes reserved by save_gprs.
+.macro restore_gprs // slothy:no-unfold
+        ldp x19, x20, [sp, #16*0]
+        ldp x21, x22, [sp, #16*1]
+        ldp x23, x24, [sp, #16*2]
+        ldp x25, x26, [sp, #16*3]
+        ldp x27, x28, [sp, #16*4]
+        ldp x29, x30, [sp, #16*5]
+        add sp, sp, #(16*6)
+.endm
+
+// Reserve 64 bytes of stack and spill d8..d15.
+.macro save_vregs // slothy:no-unfold
+        sub sp, sp, #(16*4)
+        stp  d8,  d9, [sp, #16*0]
+        stp d10, d11, [sp, #16*1]
+        stp d12, d13, [sp, #16*2]
+        stp d14, d15, [sp, #16*3]
+.endm
+
+// Reload d8..d15 and release the 64 bytes reserved by save_vregs.
+.macro restore_vregs // slothy:no-unfold
+        ldp  d8,  d9, [sp, #16*0]
+        ldp d10, d11, [sp, #16*1]
+        ldp d12, d13, [sp, #16*2]
+        ldp d14, d15, [sp, #16*3]
+        add sp, sp, #(16*4)
+.endm
+
+// Size of the local scratch area below the saved registers, and the offset
+// of its single slot.
+#define STACK_SIZE 16
+#define STACK0 0
+
+// Load register a from the scratch slot at [sp + #loc].
+.macro restore a, loc // slothy:no-unfold
+        ldr \a, [sp, #\loc\()]
+.endm
+
+// Store register a to the scratch slot at [sp + #loc].
+.macro save loc, a // slothy:no-unfold
+        str \a, [sp, #\loc\()]
+.endm
+
+// Function prologue: spill GPRs and vector registers, then reserve the
+// scratch area.
+.macro push_stack // slothy:no-unfold
+        save_gprs
+        save_vregs
+        sub sp, sp, #STACK_SIZE
+.endm
+
+// Function epilogue: exact mirror of push_stack.
+.macro pop_stack // slothy:no-unfold
+        add sp, sp, #STACK_SIZE
+        restore_vregs
+        restore_gprs
+.endm
+
+// Twiddle-factor table, pulled in from a separate file.
+.data
+.p2align 4
+roots:
+#include "intt_dilithium_123_456_78_twiddles.s"
+.text
+
+        .global intt_dilithium_123_45678_opt_a72
+        .global _intt_dilithium_123_45678_opt_a72
+
+.p2align 4
+// Dilithium modulus q = 8380417; broadcast into `consts` at function entry.
+const_addr:   .word 8380417
+              .word 0
+              .word 0
+              .word 0
+// NOTE(review): presumably the scaled inverse of n and its twisted companion
+// used for the final scaling; neither label is referenced in the visible
+// part of this file -- confirm.
+ninv_addr:    .quad 16382
+ninv_tw_addr: .quad 4197891
+intt_dilithium_123_45678_opt_a72:
+_intt_dilithium_123_45678_opt_a72:
+        push_stack
+
+    // Symbolic names for the general-purpose registers.
+    in      .req x0
+    inp     .req x1
+    inpp    .req x2
+    count   .req x3
+    r_ptr0  .req x4
+    r_ptr1  .req x5
+    xtmp    .req x6
+
+    data0  .req v9
+    data1  .req v10
+    data2  .req v11
+    data3  .req v12
+    data4  .req v13
+    data5  .req v14
+    data6  .req v15
+    data7  .req v16
+
+    // q-register views, required by the ldr_*/str_* macros above.
+    qform_data0  .req q9
+    qform_data1  .req q10
+    qform_data2  .req q11
+    qform_data3  .req q12
+    qform_data4  .req q13
+    qform_data5  .req q14
+    qform_data6  .req q15
+    qform_data7  .req q16
+
+    qform_v0  .req q0
+    qform_v1  .req q1
+    qform_v2  .req q2
+    qform_v3  .req q3
+    qform_v4  .req q4
+    qform_v5  .req q5
+    qform_v6  .req q6
+    qform_v7  .req q7
+    qform_v8  .req q8
+    qform_v9  .req q9
+    qform_v10  .req q10
+    qform_v11  .req q11
+    qform_v12  .req q12
+    qform_v13  .req q13
+    qform_v14  .req q14
+    qform_v15  .req q15
+    qform_v16  .req q16
+    qform_v17  .req q17
+    qform_v18  .req q18
+    qform_v19  .req q19
+    qform_v20  .req q20
+    qform_v21  .req q21
+    qform_v22  .req q22
+    qform_v23  .req q23
+    qform_v24  .req q24
+    qform_v25  .req q25
+    qform_v26  .req q26
+    qform_v27  .req q27
+    qform_v28  .req q28
+    qform_v29  .req q29
+    qform_v30  .req q30
+    qform_v31  .req q31
+
+    // GPR names used by vec_to_scalar_matrix (x_RL = register R, lane L) ...
+    x_00 .req x10
+    x_01 .req x11
+    x_10 .req x12
+    x_11 .req x13
+    x_20 .req x14
+    x_21 .req x15
+    x_30 .req x16
+    x_31 .req x17
+
+    // ... and the permuted view of the same registers used by
+    // store_scalar_matrix_with_inc.
+    xt_00 .req x_00
+    xt_01 .req x_20
+    xt_10 .req x_10
+    xt_11 .req x_30
+    xt_20 .req x_01
+    xt_21 .req x_21
+    xt_30 .req x_11
+    xt_31 .req x_31
+
+    root0    .req v0
+    root1    .req v1
+    root2    .req v2
+    root3    .req v3
+
+    qform_root0    .req q0
+    qform_root1    .req q1
+    qform_root2    .req q2
+    qform_root3    .req q3
+
+    // Scratch registers clobbered by the macros above.
+    tmp .req v24
+    t0  .req v25
+    t1  .req v26
+    t2  .req v27
+    t3  .req v28
+    tp0 .req v17
+    tp1 .req v18
+    tp2 .req v19
+    tp3 .req v20
+
+    consts .req v8
+    qform_consts .req q8
+
+        // NOTE(review): roots_l345 / roots_l67 are not defined in the visible
+        // part of this file; presumably they come from the included twiddle
+        // table -- confirm.
+        ASM_LOAD(r_ptr0, roots_l345)
+        ASM_LOAD(r_ptr1, roots_l67)
+
+        // Broadcast the first word at const_addr into all four lanes of consts.
+        ASM_LOAD(xtmp, const_addr)
+        ld1r {consts.4s}, [xtmp]
+        save STACK0, in
+
+        // inp is reloaded from STACK0 and then immediately set from `in`.
+        restore inp, STACK0
+        mov inp, in
+        add inpp, inp, #64
+        mov count, #8
+
+    root0_tw .req v4
+    root1_tw .req v5
+    root2_tw .req v6
+    root3_tw .req v7
+    qform_root0_tw .req q4
+    qform_root1_tw .req q5
+    qform_root2_tw .req q6
+    qform_root3_tw .req q7
+
+        .p2align 2
+        // Start of the scheduled instruction stream; order is significant.
+        ldr q21, [x5, #96] // ........*............................................................................................................................
+        ld4 {v26.4S, v27.4S, v28.4S, v29.4S}, [x1] // ...*.................................................................................................................................
+        ld4 {v22.4S, v23.4S, v24.4S, v25.4S}, [x2] // ..................................*..................................................................................................
+        ldr q20, [x5, #64] // ......*..............................................................................................................................
+        // gap // .....................................................................................................................................
+        // gap // .....................................................................................................................................
+        ldr q31, [x5, #48] // .....*...............................................................................................................................
+ // gap // ..................................................................................................................................... + // gap // ..................................................................................................................................... + ldr q5, [x5, #80] // *.................................................................................................................................... + // gap // ..................................................................................................................................... + // gap // ..................................................................................................................................... + add v14.4S, v24.4S, v25.4S // ............................................*........................................................................................ + sub v13.4S, v28.4S, v29.4S // .........*........................................................................................................................... + ldr q10, [x4, #32] // ......................*.............................................................................................................. + add v30.4S, v26.4S, v27.4S // ................*.................................................................................................................... + ldr q0, [x5, #112] // ..........*.......................................................................................................................... + // gap // ..................................................................................................................................... + add v3.4S, v28.4S, v29.4S // ...........*......................................................................................................................... 
+ ldr q4, [x4, #16] // ....................*................................................................................................................ + // gap // ..................................................................................................................................... + sub v6.4S, v22.4S, v23.4S // .......................................*............................................................................................. + ldr q18, [x4], #64 // ..................*.................................................................................................................. + sqrdmulh v7.4S, v13.4S, v5.4S // ...............*..................................................................................................................... + sub v1.4S, v26.4S, v27.4S // .............*....................................................................................................................... + ldr q11, [x5, #160] // ..............*...................................................................................................................... + // gap // ..................................................................................................................................... + ldr q15, [x5, #16] // .*................................................................................................................................... + sub v9.4S, v30.4S, v3.4S // .....................*............................................................................................................... + mul v26.4S, v13.4S, v20.4S // ..........................*.......................................................................................................... + add v27.4S, v30.4S, v3.4S // ........................*............................................................................................................ 
+ ldr q16, [x5, #128] // .......*............................................................................................................................. + // gap // ..................................................................................................................................... + add v3.4S, v22.4S, v23.4S // .........................................*........................................................................................... + sqrdmulh v30.4S, v1.4S, v31.4S // .......................*............................................................................................................. + ldr q28, [x5, #32] // ....*................................................................................................................................ + sub v19.4S, v24.4S, v25.4S // ..........................................*.......................................................................................... + ldr q25, [x5], #(12*16) // ..*.................................................................................................................................. + // gap // ..................................................................................................................................... + mls v26.4S, v7.4S, v8.S[0] // ...........................*......................................................................................................... + ldr q23, [x5, #-48] // ............*........................................................................................................................ + // gap // ..................................................................................................................................... + ldr q17, [x5, #-16] // .................*................................................................................................................... 
+ sub v2.4S, v3.4S, v14.4S // ..............................................*...................................................................................... + // gap // ..................................................................................................................................... + add v22.4S, v3.4S, v14.4S // ...................................................*................................................................................. + mul v28.4S, v1.4S, v28.4S // ...................*................................................................................................................. + // gap // ..................................................................................................................................... + // gap // ..................................................................................................................................... + // gap // ..................................................................................................................................... + // gap // ..................................................................................................................................... + mls v28.4S, v30.4S, v8.S[0] // ............................*........................................................................................................ + // gap // ..................................................................................................................................... + // gap // ..................................................................................................................................... + // gap // ..................................................................................................................................... + // gap // ..................................................................................................................................... 
+ // gap // ..................................................................................................................................... + // gap // ..................................................................................................................................... + // gap // ..................................................................................................................................... + sqrdmulh v29.4S, v9.4S, v15.4S // ...............................*..................................................................................................... + // gap // ..................................................................................................................................... + // gap // ..................................................................................................................................... + // gap // ..................................................................................................................................... + // gap // ..................................................................................................................................... + // gap // ..................................................................................................................................... + mul v14.4S, v9.4S, v25.4S // .............................*....................................................................................................... + // gap // ..................................................................................................................................... + // gap // ..................................................................................................................................... + sub v12.4S, v28.4S, v26.4S // ..............................*...................................................................................................... 
+ add v5.4S, v28.4S, v26.4S // ................................*.................................................................................................... + // gap // ..................................................................................................................................... + mul v24.4S, v19.4S, v11.4S // ..................................................*.................................................................................. + // gap // ..................................................................................................................................... + // gap // ..................................................................................................................................... + // gap // ..................................................................................................................................... + // gap // ..................................................................................................................................... + // gap // ..................................................................................................................................... + sqrdmulh v9.4S, v12.4S, v15.4S // .................................*................................................................................................... + trn1 v13.4S, v27.4S, v5.4S // ...................................*................................................................................................. + // gap // ..................................................................................................................................... + // gap // ..................................................................................................................................... + trn2 v7.4S, v27.4S, v5.4S // .....................................*............................................................................................... 
+ mls v14.4S, v29.4S, v8.S[0] // ......................................*.............................................................................................. + // gap // ..................................................................................................................................... + // gap // ..................................................................................................................................... + // gap // ..................................................................................................................................... + // gap // ..................................................................................................................................... + mul v1.4S, v12.4S, v25.4S // ....................................*................................................................................................ + // gap // ..................................................................................................................................... + // gap // ..................................................................................................................................... + // gap // ..................................................................................................................................... + // gap // ..................................................................................................................................... + // gap // ..................................................................................................................................... + mls v1.4S, v9.4S, v8.S[0] // ........................................*............................................................................................ + // gap // ..................................................................................................................................... 
+ // gap // ..................................................................................................................................... + // gap // ..................................................................................................................................... + // gap // ..................................................................................................................................... + // gap // ..................................................................................................................................... + sqrdmulh v31.4S, v6.4S, v23.4S // .............................................*....................................................................................... + // gap // ..................................................................................................................................... + // gap // ..................................................................................................................................... + // gap // ..................................................................................................................................... + // gap // ..................................................................................................................................... + // gap // ..................................................................................................................................... + sqrdmulh v19.4S, v19.4S, v17.4S // ...............................................*..................................................................................... + // gap // ..................................................................................................................................... + // gap // ..................................................................................................................................... 
+ // gap // ..................................................................................................................................... + // gap // ..................................................................................................................................... + trn1 v9.4S, v14.4S, v1.4S // ................................................*.................................................................................... + mul v30.4S, v6.4S, v16.4S // ...........................................*......................................................................................... + trn2 v6.4S, v14.4S, v1.4S // .................................................*................................................................................... + // gap // ..................................................................................................................................... + // gap // ..................................................................................................................................... + // gap // ..................................................................................................................................... + // gap // ..................................................................................................................................... + mls v30.4S, v31.4S, v8.S[0] // .....................................................*............................................................................... + // gap // ..................................................................................................................................... + trn2 v1.2D, v13.2D, v9.2D // .......................................................*............................................................................. + // gap // ..................................................................................................................................... 
+ // gap // ..................................................................................................................................... + trn2 v3.2D, v7.2D, v6.2D // ........................................................*............................................................................ + mls v24.4S, v19.4S, v8.S[0] // .........................................................*........................................................................... + trn1 v29.2D, v7.2D, v6.2D // ......................................................*.............................................................................. + // gap // ..................................................................................................................................... + trn1 v13.2D, v13.2D, v9.2D // ....................................................*................................................................................ + // gap // ..................................................................................................................................... + // gap // ..................................................................................................................................... + add v20.4S, v1.4S, v3.4S // ..............................................................*...................................................................... + // gap // ..................................................................................................................................... + mul v28.4S, v2.4S, v21.4S // ............................................................*........................................................................ + sub v14.4S, v1.4S, v3.4S // .............................................................*....................................................................... 
+ // gap // ..................................................................................................................................... + // gap // ..................................................................................................................................... + // gap // ..................................................................................................................................... + add v31.4S, v13.4S, v29.4S // ...........................................................*......................................................................... + sqrdmulh v3.4S, v2.4S, v0.4S // .................................................................*................................................................... + // gap // ..................................................................................................................................... + // gap // ..................................................................................................................................... + sub v12.4S, v30.4S, v24.4S // ................................................................*.................................................................... + sub v15.4S, v13.4S, v29.4S // ..........................................................*.......................................................................... + // gap // ..................................................................................................................................... + mul v7.4S, v14.4S, v10.S[0] // ............................................................................*........................................................ + add v27.4S, v30.4S, v24.4S // ..................................................................*.................................................................. 
+ // gap // ..................................................................................................................................... + // gap // ..................................................................................................................................... + // gap // ..................................................................................................................................... + // gap // ..................................................................................................................................... + sqrdmulh v16.4S, v12.4S, v0.4S // .......................................................................*............................................................. + // gap // ..................................................................................................................................... + // gap // ..................................................................................................................................... + // gap // ..................................................................................................................................... + mul v9.4S, v12.4S, v21.4S // ....................................................................*................................................................ + // gap // ..................................................................................................................................... + // gap // ..................................................................................................................................... + // gap // ..................................................................................................................................... + // gap // ..................................................................................................................................... 
+ trn2 v19.4S, v22.4S, v27.4S // ........................................................................*............................................................ + mls v28.4S, v3.4S, v8.S[0] // .........................................................................*........................................................... + trn1 v23.4S, v22.4S, v27.4S // ......................................................................*.............................................................. + // gap // ..................................................................................................................................... + // gap // ..................................................................................................................................... + // gap // ..................................................................................................................................... + add v29.4S, v31.4S, v20.4S // .....................................................................*............................................................... + sqrdmulh v3.4S, v14.4S, v10.S[1] // ...............................................................................*..................................................... + sub v14.4S, v31.4S, v20.4S // ...................................................................*................................................................. + // gap // ..................................................................................................................................... + // gap // ..................................................................................................................................... + // gap // ..................................................................................................................................... 
+ // gap // ..................................................................................................................................... + mls v9.4S, v16.4S, v8.S[0] // ...........................................................................*......................................................... + // gap // ..................................................................................................................................... + // gap // ..................................................................................................................................... + // gap // ..................................................................................................................................... + // gap // ..................................................................................................................................... + // gap // ..................................................................................................................................... + // gap // ..................................................................................................................................... + // gap // ..................................................................................................................................... + mul v31.4S, v15.4S, v4.S[2] // ...............................................................*..................................................................... + // gap // ..................................................................................................................................... + // gap // ..................................................................................................................................... + // gap // ..................................................................................................................................... 
+ sqrdmulh v24.4S, v15.4S, v4.S[3] // ..........................................................................*.......................................................... + // gap // ..................................................................................................................................... + // gap // ..................................................................................................................................... + // gap // ..................................................................................................................................... + // gap // ..................................................................................................................................... + trn1 v21.4S, v28.4S, v9.4S // ..............................................................................*...................................................... + mls v7.4S, v3.4S, v8.S[0] // .......................................................................................*............................................. + trn2 v26.4S, v28.4S, v9.4S // ................................................................................*.................................................... + ldr q9, [x4, #-16] // .........................*........................................................................................................... + // gap // ..................................................................................................................................... + // gap // ..................................................................................................................................... + // gap // ..................................................................................................................................... + // gap // ..................................................................................................................................... 
+ sqrdmulh v20.4S, v14.4S, v18.S[3] // .................................................................................*................................................... + trn1 v13.2D, v23.2D, v21.2D // ...................................................................................*................................................. + trn1 v3.2D, v19.2D, v26.2D // .....................................................................................*............................................... + // gap // ..................................................................................................................................... + // gap // ..................................................................................................................................... + trn2 v5.2D, v23.2D, v21.2D // ..................................................................................*.................................................. + mls v31.4S, v24.4S, v8.S[0] // .............................................................................*....................................................... + // gap // ..................................................................................................................................... + trn2 v21.2D, v19.2D, v26.2D // ......................................................................................*.............................................. + // gap // ..................................................................................................................................... + // gap // ..................................................................................................................................... + mul v24.4S, v14.4S, v18.S[2] // ....................................................................................*................................................ 
+ // gap // ..................................................................................................................................... + add v26.4S, v13.4S, v3.4S // ..........................................................................................*.......................................... + sub v0.4S, v13.4S, v3.4S // ........................................................................................*............................................ + // gap // ..................................................................................................................................... + // gap // ..................................................................................................................................... + mls v24.4S, v20.4S, v8.S[0] // .........................................................................................*........................................... + sub v3.4S, v5.4S, v21.4S // ...........................................................................................*......................................... + // gap // ..................................................................................................................................... + add v11.4S, v5.4S, v21.4S // ............................................................................................*........................................ + // gap // ..................................................................................................................................... + // gap // ..................................................................................................................................... + // gap // ..................................................................................................................................... + // gap // ..................................................................................................................................... 
+ mul v20.4S, v0.4S, v10.S[2] // .............................................................................................*....................................... + // gap // ..................................................................................................................................... + // gap // ..................................................................................................................................... + // gap // ..................................................................................................................................... + sqrdmulh v19.4S, v0.4S, v10.S[3] // ...............................................................................................*..................................... + sub v22.4S, v26.4S, v11.4S // .................................................................................................*................................... + // gap // ..................................................................................................................................... + add v16.4S, v26.4S, v11.4S // ..................................................................................................*.................................. + // gap // ..................................................................................................................................... + // gap // ..................................................................................................................................... + sqrdmulh v28.4S, v3.4S, v9.S[1] // ...................................................................................................*................................. + // gap // ..................................................................................................................................... 
+ // gap // ..................................................................................................................................... + // gap // ..................................................................................................................................... + // gap // ..................................................................................................................................... + // gap // ..................................................................................................................................... + mul v9.4S, v3.4S, v9.S[0] // ....................................................................................................*................................ + add v3.4S, v31.4S, v7.4S // ................................................................................................*.................................... + // gap // ..................................................................................................................................... + add v12.4S, v29.4S, v16.4S // ......................................................................................................*.............................. + // gap // ..................................................................................................................................... + // gap // ..................................................................................................................................... + sub v10.4S, v29.4S, v16.4S // .....................................................................................................*............................... + mls v20.4S, v19.4S, v8.S[0] // .......................................................................................................*............................. + // gap // ..................................................................................................................................... 
+ // gap // ..................................................................................................................................... + // gap // ..................................................................................................................................... + // gap // ..................................................................................................................................... + mls v9.4S, v28.4S, v8.S[0] // ........................................................................................................*............................ + str q12, [x1], #(16*4) // .........................................................................................................*........................... + // gap // ..................................................................................................................................... + // gap // ..................................................................................................................................... + // gap // ..................................................................................................................................... + // gap // ..................................................................................................................................... + // gap // ..................................................................................................................................... + // gap // ..................................................................................................................................... + sqrdmulh v29.4S, v10.4S, v18.S[1] // .........................................................................................................................*........... + // gap // ..................................................................................................................................... 
+ // gap // ..................................................................................................................................... + // gap // ..................................................................................................................................... + // gap // ..................................................................................................................................... + // gap // ..................................................................................................................................... + sqrdmulh v26.4S, v22.4S, v4.S[1] // .............................................................................................................*....................... + // gap // ..................................................................................................................................... + // gap // ..................................................................................................................................... + sub v17.4S, v20.4S, v9.4S // ............................................................................................................*........................ + add v28.4S, v20.4S, v9.4S // ..............................................................................................................*...................... + mul v20.4S, v22.4S, v4.S[0] // ...........................................................................................................*......................... + // gap // ..................................................................................................................................... + // gap // ..................................................................................................................................... + // gap // ..................................................................................................................................... 
+ // gap // ..................................................................................................................................... + // gap // ..................................................................................................................................... + // gap // ..................................................................................................................................... + mul v16.4S, v17.4S, v4.S[0] // ...............................................................................................................*..................... + // gap // ..................................................................................................................................... + // gap // ..................................................................................................................................... + // gap // ..................................................................................................................................... + sub v1.4S, v31.4S, v7.4S // ..............................................................................................*...................................... + sqrdmulh v21.4S, v17.4S, v4.S[1] // .................................................................................................................*................... + // gap // ..................................................................................................................................... + // gap // ..................................................................................................................................... + // gap // ..................................................................................................................................... + // gap // ..................................................................................................................................... 
+ mls v20.4S, v26.4S, v8.S[0] // ....................................................................................................................*................ + // gap // ..................................................................................................................................... + // gap // ..................................................................................................................................... + // gap // ..................................................................................................................................... + // gap // ..................................................................................................................................... + // gap // ..................................................................................................................................... + // gap // ..................................................................................................................................... + // gap // ..................................................................................................................................... + mul v15.4S, v1.4S, v18.S[2] // ..........................................................................................................*.......................... + // gap // ..................................................................................................................................... + // gap // ..................................................................................................................................... + // gap // ..................................................................................................................................... + sqrdmulh v25.4S, v1.4S, v18.S[3] // ..................................................................................................................*.................. 
+ // gap // ..................................................................................................................................... + // gap // ..................................................................................................................................... + // gap // ..................................................................................................................................... + // gap // ..................................................................................................................................... + sub v5.4S, v24.4S, v20.4S // .......................................................................................................................*............. + add v20.4S, v24.4S, v20.4S // ........................................................................................................................*............ + mul v6.4S, v10.4S, v18.S[0] // ..........................................................................................................................*.......... + // gap // ..................................................................................................................................... + // gap // ..................................................................................................................................... + // gap // ..................................................................................................................................... + // gap // ..................................................................................................................................... + mls v16.4S, v21.4S, v8.S[0] // .....................................................................................................................*............... + // gap // ..................................................................................................................................... 
+ // gap // ..................................................................................................................................... + str q20, [x1, #-32] // ...........................................................................................................................*......... + // gap // ..................................................................................................................................... + // gap // ..................................................................................................................................... + mls v15.4S, v25.4S, v8.S[0] // ......................................................................................................................*.............. + // gap // ..................................................................................................................................... + // gap // ..................................................................................................................................... + // gap // ..................................................................................................................................... + // gap // ..................................................................................................................................... + // gap // ..................................................................................................................................... + mls v6.4S, v29.4S, v8.S[0] // ...............................................................................................................................*..... + // gap // ..................................................................................................................................... + // gap // ..................................................................................................................................... 
+ // gap // ..................................................................................................................................... + // gap // ..................................................................................................................................... + // gap // ..................................................................................................................................... + // gap // ..................................................................................................................................... + // gap // ..................................................................................................................................... + sqrdmulh v20.4S, v5.4S, v18.S[1] // ..............................................................................................................................*...... + // gap // ..................................................................................................................................... + // gap // ..................................................................................................................................... + sub v21.4S, v15.4S, v16.4S // ............................................................................................................................*........ + mul v31.4S, v5.4S, v18.S[0] // .................................................................................................................................*... + // gap // ..................................................................................................................................... + add v16.4S, v15.4S, v16.4S // .............................................................................................................................*....... + str q6, [x2], #(16*4) // ...................................................................................................................................*. 
+ // gap // ..................................................................................................................................... + add v5.4S, v3.4S, v28.4S // ................................................................................................................*.................... + // gap // ..................................................................................................................................... + // gap // ..................................................................................................................................... + sqrdmulh v9.4S, v21.4S, v18.S[1] // ....................................................................................................................................* + str q16, [x1, #-16] // ................................................................................................................................*.... + // gap // ..................................................................................................................................... + // gap // ..................................................................................................................................... + mls v31.4S, v20.4S, v8.S[0] // ..................................................................................................................................*.. + str q5, [x1, #-48] // ...................................................................................................................*................. + // gap // ..................................................................................................................................... + + // original source code + // ldr q14, [x5, #80] // .....*............................................................................................................................... 
+ // ldr q2, [x5, #16] // ..................*.................................................................................................................. + // ldr q3, [x5], #(12*16) // ...........................*......................................................................................................... + // ld4 {v24.4S, v25.4S, v26.4S, v27.4S}, [x1] // .*................................................................................................................................... + // ldr q28, [x5, #-160] // .........................*........................................................................................................... + // ldr q6, [x5, #-144] // ....*................................................................................................................................ + // ldr q15, [x5, #-128] // ...*................................................................................................................................. + // ldr q29, [x5, #-64] // ......................*.............................................................................................................. + // ldr q1, [x5, #-96] // *.................................................................................................................................... + // sub v13.4S, v26.4S, v27.4S // .......*............................................................................................................................. + // ldr q16, [x5, #-80] // ..........*.......................................................................................................................... + // add v26.4S, v26.4S, v27.4S // ...........*......................................................................................................................... + // ldr q9, [x5, #-48] // .............................*....................................................................................................... 
+ // sub v0.4S, v24.4S, v25.4S // ................*.................................................................................................................... + // ldr q30, [x5, #-32] // .................*................................................................................................................... + // sqrdmulh v14.4S, v13.4S, v14.4S // ...............*..................................................................................................................... + // add v24.4S, v24.4S, v25.4S // .........*........................................................................................................................... + // ldr q7, [x5, #-16] // ..............................*...................................................................................................... + // ldr q18, [x4], #64 // ..............*...................................................................................................................... + // mul v28.4S, v0.4S, v28.4S // .................................*................................................................................................... + // ldr q25, [x4, #-48] // ............*........................................................................................................................ + // sub v21.4S, v24.4S, v26.4S // ...................*................................................................................................................. + // ldr q12, [x4, #-32] // ........*............................................................................................................................ + // sqrdmulh v6.4S, v0.4S, v6.4S // ........................*............................................................................................................ + // add v24.4S, v24.4S, v26.4S // .....................*............................................................................................................... 
+ // ldr q26, [x4, #-16] // ................................................................................*.................................................... + // mul v15.4S, v13.4S, v15.4S // ....................*................................................................................................................ + // mls v15.4S, v14.4S, v8.S[0] // ............................*........................................................................................................ + // mls v28.4S, v6.4S, v8.S[0] // ..................................*.................................................................................................. + // mul v17.4S, v21.4S, v3.4S // ....................................*................................................................................................ + // sub v14.4S, v28.4S, v15.4S // .....................................*............................................................................................... + // sqrdmulh v6.4S, v21.4S, v2.4S // ...................................*................................................................................................. + // add v28.4S, v28.4S, v15.4S // ......................................*.............................................................................................. + // sqrdmulh v2.4S, v14.4S, v2.4S // ........................................*............................................................................................ + // ld4 {v19.4S, v20.4S, v21.4S, v22.4S}, [x2] // ..*.................................................................................................................................. + // trn1 v31.4S, v24.4S, v28.4S // .........................................*........................................................................................... 
+ // mul v14.4S, v14.4S, v3.4S // ............................................*........................................................................................ + // trn2 v3.4S, v24.4S, v28.4S // ..........................................*.......................................................................................... + // mls v17.4S, v6.4S, v8.S[0] // ...........................................*......................................................................................... + // sub v24.4S, v19.4S, v20.4S // .............*....................................................................................................................... + // mls v14.4S, v2.4S, v8.S[0] // .............................................*....................................................................................... + // add v2.4S, v19.4S, v20.4S // .......................*............................................................................................................. + // sub v28.4S, v21.4S, v22.4S // ..........................*.......................................................................................................... + // mul v6.4S, v24.4S, v29.4S // .................................................*................................................................................... + // add v21.4S, v21.4S, v22.4S // ......*.............................................................................................................................. + // sqrdmulh v24.4S, v24.4S, v9.4S // ..............................................*...................................................................................... + // sub v15.4S, v2.4S, v21.4S // ...............................*..................................................................................................... 
+ // sqrdmulh v29.4S, v28.4S, v7.4S // ...............................................*..................................................................................... + // trn1 v13.4S, v17.4S, v14.4S // ................................................*.................................................................................... + // trn2 v17.4S, v17.4S, v14.4S // ..................................................*.................................................................................. + // mul v14.4S, v28.4S, v30.4S // .......................................*............................................................................................. + // add v21.4S, v2.4S, v21.4S // ................................*.................................................................................................... + // trn1 v2.2D, v31.2D, v13.2D // ........................................................*............................................................................ + // mls v6.4S, v24.4S, v8.S[0] // ...................................................*................................................................................. + // trn1 v24.2D, v3.2D, v17.2D // .......................................................*............................................................................. + // trn2 v28.2D, v31.2D, v13.2D // ....................................................*................................................................................ + // trn2 v17.2D, v3.2D, v17.2D // .....................................................*............................................................................... + // mls v14.4S, v29.4S, v8.S[0] // ......................................................*.............................................................................. 
+ // sub v3.4S, v2.4S, v24.4S // ...............................................................*..................................................................... + // add v2.4S, v2.4S, v24.4S // ............................................................*........................................................................ + // mul v24.4S, v15.4S, v1.4S // ..........................................................*.......................................................................... + // sub v31.4S, v28.4S, v17.4S // ...........................................................*......................................................................... + // add v17.4S, v28.4S, v17.4S // .........................................................*........................................................................... + // mul v28.4S, v3.4S, v25.S[2] // ...........................................................................*......................................................... + // sub v29.4S, v6.4S, v14.4S // ..............................................................*...................................................................... + // sqrdmulh v15.4S, v15.4S, v16.4S // .............................................................*....................................................................... + // add v14.4S, v6.4S, v14.4S // .................................................................*................................................................... + // sub v6.4S, v2.4S, v17.4S // .........................................................................*........................................................... + // mul v1.4S, v29.4S, v1.4S // ...................................................................*................................................................. 
+ // add v17.4S, v2.4S, v17.4S // .......................................................................*............................................................. + // trn1 v2.4S, v21.4S, v14.4S // ......................................................................*.............................................................. + // sqrdmulh v29.4S, v29.4S, v16.4S // ..................................................................*.................................................................. + // trn2 v21.4S, v21.4S, v14.4S // ....................................................................*................................................................ + // mls v24.4S, v15.4S, v8.S[0] // .....................................................................*............................................................... + // sqrdmulh v14.4S, v3.4S, v25.S[3] // ............................................................................*........................................................ + // mls v1.4S, v29.4S, v8.S[0] // ..........................................................................*.......................................................... + // mul v3.4S, v31.4S, v12.S[0] // ................................................................*.................................................................... + // mls v28.4S, v14.4S, v8.S[0] // .....................................................................................*............................................... + // trn1 v14.4S, v24.4S, v1.4S // .............................................................................*....................................................... + // sqrdmulh v31.4S, v31.4S, v12.S[1] // ........................................................................*............................................................ 
+ // trn2 v24.4S, v24.4S, v1.4S // ...............................................................................*..................................................... + // sqrdmulh v1.4S, v6.4S, v18.S[3] // .................................................................................*................................................... + // trn2 v15.2D, v2.2D, v14.2D // ....................................................................................*................................................ + // trn1 v14.2D, v2.2D, v14.2D // ..................................................................................*.................................................. + // mul v2.4S, v6.4S, v18.S[2] // .......................................................................................*............................................. + // trn1 v6.2D, v21.2D, v24.2D // ...................................................................................*................................................. + // trn2 v21.2D, v21.2D, v24.2D // ......................................................................................*.............................................. + // mls v3.4S, v31.4S, v8.S[0] // ..............................................................................*...................................................... + // sub v24.4S, v14.4S, v6.4S // .........................................................................................*........................................... + // mls v2.4S, v1.4S, v8.S[0] // ..........................................................................................*.......................................... + // add v14.4S, v14.4S, v6.4S // ........................................................................................*............................................ 
+ // sub v6.4S, v15.4S, v21.4S // ...........................................................................................*......................................... + // add v21.4S, v15.4S, v21.4S // ............................................................................................*........................................ + // mul v31.4S, v24.4S, v12.S[2] // .............................................................................................*....................................... + // sub v1.4S, v28.4S, v3.4S // ...............................................................................................................*..................... + // sqrdmulh v24.4S, v24.4S, v12.S[3] // ..............................................................................................*...................................... + // add v3.4S, v28.4S, v3.4S // ...................................................................................................*................................. + // sub v28.4S, v14.4S, v21.4S // ...............................................................................................*..................................... + // add v21.4S, v14.4S, v21.4S // ................................................................................................*.................................... + // sqrdmulh v14.4S, v6.4S, v26.S[1] // .................................................................................................*................................... + // mul v6.4S, v6.4S, v26.S[0] // ..................................................................................................*.................................. + // sub v26.4S, v17.4S, v21.4S // .....................................................................................................*............................... 
+ // add v17.4S, v17.4S, v21.4S // ....................................................................................................*................................ + // mls v31.4S, v24.4S, v8.S[0] // ......................................................................................................*.............................. + // mls v6.4S, v14.4S, v8.S[0] // .......................................................................................................*............................. + // str q17, [x1], #(16*4) // ........................................................................................................*............................ + // mul v17.4S, v1.4S, v18.S[2] // ..................................................................................................................*.................. + // mul v21.4S, v28.4S, v25.S[0] // .............................................................................................................*....................... + // sub v14.4S, v31.4S, v6.4S // ...........................................................................................................*......................... + // sqrdmulh v24.4S, v28.4S, v25.S[1] // ..........................................................................................................*.......................... + // add v28.4S, v31.4S, v6.4S // ............................................................................................................*........................ + // mul v6.4S, v14.4S, v25.S[0] // ..............................................................................................................*...................... + // add v31.4S, v3.4S, v28.4S // ................................................................................................................................*.... 
+ // sqrdmulh v14.4S, v14.4S, v25.S[1] // ................................................................................................................*.................... + // sqrdmulh v1.4S, v1.4S, v18.S[3] // ...................................................................................................................*................. + // str q31, [x1, #-48] // ....................................................................................................................................* + // mls v21.4S, v24.4S, v8.S[0] // .................................................................................................................*................... + // mls v6.4S, v14.4S, v8.S[0] // .......................................................................................................................*............. + // mls v17.4S, v1.4S, v8.S[0] // .........................................................................................................................*........... + // sub v14.4S, v2.4S, v21.4S // ....................................................................................................................*................ + // add v21.4S, v2.4S, v21.4S // .....................................................................................................................*............... + // sqrdmulh v2.4S, v26.4S, v18.S[1] // .........................................................................................................*........................... + // mul v24.4S, v26.4S, v18.S[0] // ......................................................................................................................*.............. + // str q21, [x1, #-32] // ........................................................................................................................*............ 
+ // sub v21.4S, v17.4S, v6.4S // ............................................................................................................................*........ + // add v17.4S, v17.4S, v6.4S // ..............................................................................................................................*...... + // sqrdmulh v6.4S, v14.4S, v18.S[1] // ...........................................................................................................................*......... + // mls v24.4S, v2.4S, v8.S[0] // ..........................................................................................................................*.......... + // str q17, [x1, #-16] // ..................................................................................................................................*.. + // mul v31.4S, v14.4S, v18.S[0] // .............................................................................................................................*....... + // mls v31.4S, v6.4S, v8.S[0] // ...................................................................................................................................*. + // str q24, [x2], #(16*4) // ...............................................................................................................................*..... + // sqrdmulh v9.4S, v21.4S, v18.S[1] // .................................................................................................................................*... + + sub count, count, #1 +layer45678_start: + ldr q14, [x5, #80] // .......e........................................................................................................................................ + sub v17.4S, v3.4S, v28.4S // .......................................................................................................................*........................ 
+ add x1, x1, #64 // ..............................................................................................................................................*. + mul v21.4S, v21.4S, v18.S[0] // ...................................................................................................................................*............ + ldr q2, [x5, #16] // ...e............................................................................................................................................ + ldr q3, [x5], #(12*16) // ..e............................................................................................................................................. + ld4 {v24.4S, v25.4S, v26.4S, v27.4S}, [x1] // e............................................................................................................................................... + ldr q28, [x5, #-160] // ....e........................................................................................................................................... + str q31, [x2, #-32] // ............................................................................................................................................*... + ldr q6, [x5, #-144] // .....e.......................................................................................................................................... + mul v31.4S, v17.4S, v18.S[0] // .........................................................................................................................*...................... + // gap // ................................................................................................................................................ + ldr q15, [x5, #-128] // ......e......................................................................................................................................... 
+ // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + sqrdmulh v17.4S, v17.4S, v18.S[1] // ..........................................................................................................................*..................... + ldr q29, [x5, #-64] // ..............................e................................................................................................................. + // gap // ................................................................................................................................................ + ldr q1, [x5, #-96] // ............................e................................................................................................................... + sub v13.4S, v26.4S, v27.4S // .............e.................................................................................................................................. + ldr q16, [x5, #-80] // .............................e.................................................................................................................. + mls v21.4S, v9.4S, v8.S[0] // .....................................................................................................................................*.......... + add v26.4S, v26.4S, v27.4S // ..............e................................................................................................................................. + ldr q9, [x5, #-48] // ...............................e................................................................................................................ 
+ sub v0.4S, v24.4S, v25.4S // ........e....................................................................................................................................... + ldr q30, [x5, #-32] // ................................e............................................................................................................... + // gap // ................................................................................................................................................ + sqrdmulh v14.4S, v13.4S, v14.4S // ................e............................................................................................................................... + add v24.4S, v24.4S, v25.4S // .........e...................................................................................................................................... + ldr q7, [x5, #-16] // .................................e.............................................................................................................. + ldr q18, [x4], #64 // ......................................................................e......................................................................... + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + mul v28.4S, v0.4S, v28.4S // ..........e..................................................................................................................................... + ldr q25, [x4, #-48] // .......................................................................e........................................................................ 
+ // gap // ................................................................................................................................................ + str q21, [x2, #-16] // .............................................................................................................................................*.. + sub v21.4S, v24.4S, v26.4S // ..................e............................................................................................................................. + ldr q12, [x4, #-32] // ........................................................................e....................................................................... + sqrdmulh v6.4S, v0.4S, v6.4S // ...........e.................................................................................................................................... + add v24.4S, v24.4S, v26.4S // ...................e............................................................................................................................ + ldr q26, [x4, #-16] // .........................................................................e...................................................................... + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + mul v15.4S, v13.4S, v15.4S // ...............e................................................................................................................................ 
+ // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + mls v15.4S, v14.4S, v8.S[0] // .................e.............................................................................................................................. + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + mls v28.4S, v6.4S, v8.S[0] // ............e................................................................................................................................... 
+ // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + mls v31.4S, v17.4S, v8.S[0] // ...........................................................................................................................*.................... + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + mul v17.4S, v21.4S, v3.4S // ....................e........................................................................................................................... 
+ // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + sub v14.4S, v28.4S, v15.4S // .......................e........................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + sqrdmulh v6.4S, v21.4S, v2.4S // .....................e.......................................................................................................................... + add v28.4S, v28.4S, v15.4S // ........................e....................................................................................................................... + // gap // ................................................................................................................................................ + str q31, [x2, #-48] // ...........................................................................................................................................*.... + add x2, x2, #64 // ...............................................................................................................................................* + // gap // ................................................................................................................................................ + sqrdmulh v2.4S, v14.4S, v2.4S // ..........................e..................................................................................................................... 
+ // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + ld4 {v19.4S, v20.4S, v21.4S, v22.4S}, [x2] // .e.............................................................................................................................................. + trn1 v31.4S, v24.4S, v28.4S // ......................................................e......................................................................................... + // gap // ................................................................................................................................................ + mul v14.4S, v14.4S, v3.4S // .........................e...................................................................................................................... + trn2 v3.4S, v24.4S, v28.4S // .......................................................e........................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + mls v17.4S, v6.4S, v8.S[0] // ......................e......................................................................................................................... 
+ // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + sub v24.4S, v19.4S, v20.4S // ..................................e............................................................................................................. + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + mls v14.4S, v2.4S, v8.S[0] // ...........................e.................................................................................................................... + add v2.4S, v19.4S, v20.4S // ...................................e............................................................................................................ + // gap // ................................................................................................................................................ + sub v28.4S, v21.4S, v22.4S // .......................................e........................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + mul v6.4S, v24.4S, v29.4S // ....................................e........................................................................................................... 
+ add v21.4S, v21.4S, v22.4S // ........................................e....................................................................................................... + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + sqrdmulh v24.4S, v24.4S, v9.4S // .....................................e.......................................................................................................... + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + sub v15.4S, v2.4S, v21.4S // ............................................e................................................................................................... + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + sqrdmulh v29.4S, v28.4S, v7.4S // ..........................................e..................................................................................................... 
+ trn1 v13.4S, v17.4S, v14.4S // ........................................................e....................................................................................... + // gap // ................................................................................................................................................ + trn2 v17.4S, v17.4S, v14.4S // .........................................................e...................................................................................... + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + mul v14.4S, v28.4S, v30.4S // .........................................e...................................................................................................... + add v21.4S, v2.4S, v21.4S // .............................................e.................................................................................................. + // gap // ................................................................................................................................................ + trn1 v2.2D, v31.2D, v13.2D // ............................................................e................................................................................... + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ 
+ mls v6.4S, v24.4S, v8.S[0] // ......................................e......................................................................................................... + trn1 v24.2D, v3.2D, v17.2D // .............................................................e.................................................................................. + // gap // ................................................................................................................................................ + trn2 v28.2D, v31.2D, v13.2D // ..........................................................e..................................................................................... + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + trn2 v17.2D, v3.2D, v17.2D // ...........................................................e.................................................................................... + mls v14.4S, v29.4S, v8.S[0] // ...........................................e.................................................................................................... + // gap // ................................................................................................................................................ + sub v3.4S, v2.4S, v24.4S // ..........................................................................e..................................................................... + // gap // ................................................................................................................................................ 
+ // gap // ................................................................................................................................................ + add v2.4S, v2.4S, v24.4S // ...........................................................................e.................................................................... + mul v24.4S, v15.4S, v1.4S // ..............................................e................................................................................................. + // gap // ................................................................................................................................................ + sub v31.4S, v28.4S, v17.4S // ...............................................................................e................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + add v17.4S, v28.4S, v17.4S // ................................................................................e............................................................... + mul v28.4S, v3.4S, v25.S[2] // ............................................................................e................................................................... + // gap // ................................................................................................................................................ + sub v29.4S, v6.4S, v14.4S // .................................................e.............................................................................................. 
+ // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + sqrdmulh v15.4S, v15.4S, v16.4S // ...............................................e................................................................................................ + add v14.4S, v6.4S, v14.4S // ..................................................e............................................................................................. + // gap // ................................................................................................................................................ + sub v6.4S, v2.4S, v17.4S // ..............................................................................................e................................................. + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + mul v1.4S, v29.4S, v1.4S // ...................................................e............................................................................................ + add v17.4S, v2.4S, v17.4S // ...............................................................................................e................................................ + // gap // ................................................................................................................................................ 
+ trn1 v2.4S, v21.4S, v14.4S // ..............................................................e................................................................................. + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + sqrdmulh v29.4S, v29.4S, v16.4S // ....................................................e........................................................................................... + trn2 v21.4S, v21.4S, v14.4S // ...............................................................e................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + mls v24.4S, v15.4S, v8.S[0] // ................................................e............................................................................................... + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ 
+ // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + sqrdmulh v14.4S, v3.4S, v25.S[3] // .............................................................................e.................................................................. + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + mls v1.4S, v29.4S, v8.S[0] // .....................................................e.......................................................................................... + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ 
+ // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + mul v3.4S, v31.4S, v12.S[0] // .................................................................................e.............................................................. + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + mls v28.4S, v14.4S, v8.S[0] // ..............................................................................e................................................................. + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ 
+ trn1 v14.4S, v24.4S, v1.4S // ................................................................e............................................................................... + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + sqrdmulh v31.4S, v31.4S, v12.S[1] // ..................................................................................e............................................................. + trn2 v24.4S, v24.4S, v1.4S // .................................................................e.............................................................................. + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + sqrdmulh v1.4S, v6.4S, v18.S[3] // .................................................................................................e.............................................. + trn2 v15.2D, v2.2D, v14.2D // ..................................................................e............................................................................. + // gap // ................................................................................................................................................ 
+ trn1 v14.2D, v2.2D, v14.2D // ....................................................................e........................................................................... + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + mul v2.4S, v6.4S, v18.S[2] // ................................................................................................e............................................... + trn1 v6.2D, v21.2D, v24.2D // .....................................................................e.......................................................................... + // gap // ................................................................................................................................................ + trn2 v21.2D, v21.2D, v24.2D // ...................................................................e............................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + mls v3.4S, v31.4S, v8.S[0] // ...................................................................................e............................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ 
+ sub v24.4S, v14.4S, v6.4S // ....................................................................................e........................................................... + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + mls v2.4S, v1.4S, v8.S[0] // ..................................................................................................e............................................. + add v14.4S, v14.4S, v6.4S // .....................................................................................e.......................................................... + // gap // ................................................................................................................................................ + sub v6.4S, v15.4S, v21.4S // .........................................................................................e...................................................... + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + add v21.4S, v15.4S, v21.4S // ..........................................................................................e..................................................... + mul v31.4S, v24.4S, v12.S[2] // ......................................................................................e......................................................... 
+ // gap // ................................................................................................................................................ + sub v1.4S, v28.4S, v3.4S // ...................................................................................................e............................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + sqrdmulh v24.4S, v24.4S, v12.S[3] // .......................................................................................e........................................................ + add v3.4S, v28.4S, v3.4S // ....................................................................................................e........................................... + // gap // ................................................................................................................................................ + sub v28.4S, v14.4S, v21.4S // ........................................................................................................e....................................... + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + add v21.4S, v14.4S, v21.4S // .........................................................................................................e...................................... 
+ sqrdmulh v14.4S, v6.4S, v26.S[1] // ............................................................................................e................................................... + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + mul v6.4S, v6.4S, v26.S[0] // ...........................................................................................e.................................................... + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + sub v26.4S, v17.4S, v21.4S // ..................................................................................................................e............................. + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + add v17.4S, v17.4S, v21.4S // ...................................................................................................................e............................ 
+ mls v31.4S, v24.4S, v8.S[0] // ........................................................................................e....................................................... + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + mls v6.4S, v14.4S, v8.S[0] // .............................................................................................e.................................................. + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + str q17, [x1], #(16*4) // ......................................................................................................................................e......... + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + mul v17.4S, v1.4S, v18.S[2] // .....................................................................................................e.......................................... 
+ // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + mul v21.4S, v28.4S, v25.S[0] // ..........................................................................................................e..................................... + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + sub v14.4S, v31.4S, v6.4S // .............................................................................................................e.................................. + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + sqrdmulh v24.4S, v28.4S, v25.S[1] // ...........................................................................................................e.................................... 
+ add v28.4S, v31.4S, v6.4S // ..............................................................................................................e................................. + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + mul v6.4S, v14.4S, v25.S[0] // ...............................................................................................................e................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + add v31.4S, v3.4S, v28.4S // ........................................................................................................................e....................... + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + sqrdmulh v14.4S, v14.4S, v25.S[1] // ................................................................................................................e............................... 
+ // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + sqrdmulh v1.4S, v1.4S, v18.S[3] // ......................................................................................................e......................................... + str q31, [x1, #-48] // .......................................................................................................................................e........ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + mls v21.4S, v24.4S, v8.S[0] // ............................................................................................................e................................... 
+ // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + mls v6.4S, v14.4S, v8.S[0] // .................................................................................................................e.............................. + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + mls v17.4S, v1.4S, v8.S[0] // .......................................................................................................e........................................ 
+ // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + sub v14.4S, v2.4S, v21.4S // ............................................................................................................................e................... + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + add v21.4S, v2.4S, v21.4S // .............................................................................................................................e.................. + sqrdmulh v2.4S, v26.4S, v18.S[1] // .....................................................................................................................e.......................... + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + mul v24.4S, v26.4S, v18.S[0] // ....................................................................................................................e........................... 
+ // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + str q21, [x1, #-32] // ........................................................................................................................................e....... + sub v21.4S, v17.4S, v6.4S // .................................................................................................................................e.............. + // gap // ................................................................................................................................................ + add v17.4S, v17.4S, v6.4S // ..................................................................................................................................e............. + sqrdmulh v6.4S, v14.4S, v18.S[1] // ...............................................................................................................................e................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + mls v24.4S, v2.4S, v8.S[0] // ......................................................................................................................e......................... 
+ // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + str q17, [x1, #-16] // .........................................................................................................................................e...... + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + mul v31.4S, v14.4S, v18.S[0] // ..............................................................................................................................e................. + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + mls v31.4S, v6.4S, v8.S[0] // ................................................................................................................................e............... 
+ // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + str q24, [x2], #(16*4) // ..........................................................................................................................................e..... + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + sqrdmulh v9.4S, v21.4S, v18.S[1] // ....................................................................................................................................e........... + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + + // original source code + // ld4 {v9.4S, v10.4S, v11.4S, v12.4S}, [x1] // ......e.........................................................................................................................................|.....e..................................... + // ld4 {v13.4S, v14.4S, v15.4S, v16.4S}, [x2] // .............................................e..................................................................................................|........................................... 
+ // ldr q0, [x5], #(12*16) // .....e..........................................................................................................................................|....e...................................... + // ldr q4, [x5, #(-12*16 + 1*16)] // ....e...........................................................................................................................................|...e....................................... + // ldr q1, [x5, #(-12*16 + 2*16)] // .......e........................................................................................................................................|......e.................................... + // ldr q5, [x5, #(-12*16 + 3*16)] // .........e......................................................................................................................................|........e.................................. + // ldr q2, [x5, #(-12*16 + 4*16)] // ...........e....................................................................................................................................|..........e................................ + // ldr q6, [x5, #(-12*16 + 5*16)] // e...............................................................................................................................................e........................................... + // sub v24.4s, v9.4s, v10.4s // ....................e...........................................................................................................................|...................e....................... + // add v9.4s, v9.4s, v10.4s // .......................e........................................................................................................................|......................e.................... 
+ // mul v10.4s, v24.4s, v1.4s // ..........................e.....................................................................................................................|.........................e................. + // sqrdmulh v24.4s, v24.4s, v5.4s // ...............................e................................................................................................................|..............................e............ + // mls v10.4s, v24.4s, v8.s[0] // ....................................e...........................................................................................................|...................................e....... + // sub v24.4s, v11.4s, v12.4s // ...............e................................................................................................................................|..............e............................ + // add v11.4s, v11.4s, v12.4s // ..................e.............................................................................................................................|.................e......................... + // mul v12.4s, v24.4s, v2.4s // ..................................e.............................................................................................................|.................................e......... + // sqrdmulh v24.4s, v24.4s, v6.4s // ......................e.........................................................................................................................|.....................e..................... + // mls v12.4s, v24.4s, v8.s[0] // ...................................e............................................................................................................|..................................e........ 
+ // sub v24.4s, v9.4s, v11.4s // .............................e..................................................................................................................|............................e.............. + // add v9.4s, v9.4s, v11.4s // ................................e...............................................................................................................|...............................e........... + // mul v11.4s, v24.4s, v0.4s // ......................................e.........................................................................................................|.....................................e..... + // sqrdmulh v24.4s, v24.4s, v4.4s // ........................................e.......................................................................................................|.......................................e... + // mls v11.4s, v24.4s, v8.s[0] // .................................................e..............................................................................................|........................................... + // sub v24.4s, v10.4s, v12.4s // .......................................e........................................................................................................|......................................e.... + // add v10.4s, v10.4s, v12.4s // .........................................e......................................................................................................|........................................e.. + // mul v12.4s, v24.4s, v0.4s // ...............................................e................................................................................................|........................................... 
+ // sqrdmulh v24.4s, v24.4s, v4.4s // ............................................e...................................................................................................|........................................... + // mls v12.4s, v24.4s, v8.s[0] // ...................................................e............................................................................................|........................................... + // ldr q0, [x5, #(-12*16 + 6*16)] // ..............e.................................................................................................................................|.............e............................. + // ldr q4, [x5, #(-12*16 + 7*16)] // ................e...............................................................................................................................|...............e........................... + // ldr q1, [x5, #(-12*16 + 8*16)] // .............e..................................................................................................................................|............e.............................. + // ldr q5, [x5, #(-12*16 + 9*16)] // ...................e............................................................................................................................|..................e........................ + // ldr q2, [x5, #(-12*16 + 10*16)] // .....................e..........................................................................................................................|....................e...................... + // ldr q6, [x5, #(-12*16 + 11*16)] // ........................e.......................................................................................................................|.......................e................... 
+ // sub v24.4s, v13.4s, v14.4s // ..................................................e.............................................................................................|........................................... + // add v13.4s, v13.4s, v14.4s // ....................................................e...........................................................................................|........................................... + // mul v14.4s, v24.4s, v1.4s // ......................................................e.........................................................................................|........................................... + // sqrdmulh v24.4s, v24.4s, v5.4s // ........................................................e.......................................................................................|........................................... + // mls v14.4s, v24.4s, v8.s[0] // ................................................................e...............................................................................|........................................... + // sub v24.4s, v15.4s, v16.4s // .....................................................e..........................................................................................|........................................... + // add v15.4s, v15.4s, v16.4s // .......................................................e........................................................................................|........................................... + // mul v16.4s, v24.4s, v2.4s // .............................................................e..................................................................................|........................................... 
+ // sqrdmulh v24.4s, v24.4s, v6.4s // ..........................................................e.....................................................................................|........................................... + // mls v16.4s, v24.4s, v8.s[0] // ....................................................................e...........................................................................|........................................... + // sub v24.4s, v13.4s, v15.4s // .........................................................e......................................................................................|........................................... + // add v13.4s, v13.4s, v15.4s // ..............................................................e.................................................................................|........................................... + // mul v15.4s, v24.4s, v0.4s // .......................................................................e........................................................................|........................................... + // sqrdmulh v24.4s, v24.4s, v4.4s // ............................................................................e...................................................................|........................................... + // mls v15.4s, v24.4s, v8.s[0] // ....................................................................................e...........................................................|........................................... + // sub v24.4s, v14.4s, v16.4s // ...........................................................................e....................................................................|........................................... 
+ // add v14.4s, v14.4s, v16.4s // .............................................................................e..................................................................|........................................... + // mul v16.4s, v24.4s, v0.4s // ...............................................................................e................................................................|........................................... + // sqrdmulh v24.4s, v24.4s, v4.4s // ..................................................................................e.............................................................|........................................... + // mls v16.4s, v24.4s, v8.s[0] // ......................................................................................e.........................................................|........................................... + // trn1 v25.4s, v9.4s, v10.4s // ..............................................e.................................................................................................|........................................... + // trn2 v26.4s, v9.4s, v10.4s // ................................................e...............................................................................................|........................................... + // trn1 v27.4s, v11.4s, v12.4s // ...........................................................e....................................................................................|........................................... + // trn2 v28.4s, v11.4s, v12.4s // ............................................................e...................................................................................|........................................... 
+ // trn2 v11.2d, v25.2d, v27.2d // ..................................................................e.............................................................................|........................................... + // trn2 v12.2d, v26.2d, v28.2d // ...................................................................e............................................................................|........................................... + // trn1 v9.2d, v25.2d, v27.2d // ...............................................................e................................................................................|........................................... + // trn1 v10.2d, v26.2d, v28.2d // .................................................................e..............................................................................|........................................... + // trn1 v25.4s, v13.4s, v14.4s // .................................................................................e..............................................................|........................................... + // trn2 v26.4s, v13.4s, v14.4s // ...................................................................................e............................................................|........................................... + // trn1 v27.4s, v15.4s, v16.4s // .........................................................................................e......................................................|........................................... + // trn2 v28.4s, v15.4s, v16.4s // ...........................................................................................e....................................................|........................................... 
+ // trn2 v15.2d, v25.2d, v27.2d // .............................................................................................e..................................................|........................................... + // trn2 v16.2d, v26.2d, v28.2d // .................................................................................................e..............................................|........................................... + // trn1 v13.2d, v25.2d, v27.2d // ..............................................................................................e.................................................|........................................... + // trn1 v14.2d, v26.2d, v28.2d // ................................................................................................e...............................................|........................................... + // ldr q0, [x4], #64 // .........................e......................................................................................................................|........................e.................. + // ldr q1, [x4, #(-64 + 16)] // ...........................e....................................................................................................................|..........................e................ + // ldr q2, [x4, #(-64 + 32)] // ..............................e.................................................................................................................|.............................e............. + // ldr q3, [x4, #(-64 + 48)] // .................................e..............................................................................................................|................................e.......... 
+ // sub v24.4s, v9.4s, v10.4s // .....................................................................e..........................................................................|........................................... + // add v9.4s, v9.4s, v10.4s // ......................................................................e.........................................................................|........................................... + // mul v10.4s, v24.4s, v1.s[2] // ..........................................................................e.....................................................................|........................................... + // sqrdmulh v24.4s, v24.4s, v1.s[3] // .....................................................................................e..........................................................|........................................... + // mls v10.4s, v24.4s, v8.s[0] // ........................................................................................e.......................................................|........................................... + // sub v24.4s, v11.4s, v12.4s // ........................................................................e.......................................................................|........................................... + // add v11.4s, v11.4s, v12.4s // .........................................................................e......................................................................|........................................... + // mul v12.4s, v24.4s, v2.s[0] // .......................................................................................e........................................................|........................................... 
+ // sqrdmulh v24.4s, v24.4s, v2.s[1] // ..........................................................................................e.....................................................|........................................... + // mls v12.4s, v24.4s, v8.s[0] // ..................................................................................................e.............................................|........................................... + // sub v24.4s, v13.4s, v14.4s // ...................................................................................................e............................................|........................................... + // add v13.4s, v13.4s, v14.4s // .....................................................................................................e..........................................|........................................... + // mul v14.4s, v24.4s, v2.s[2] // ........................................................................................................e.......................................|........................................... + // sqrdmulh v24.4s, v24.4s, v2.s[3] // ..........................................................................................................e.....................................|........................................... + // mls v14.4s, v24.4s, v8.s[0] // ..................................................................................................................e.............................|........................................... + // sub v24.4s, v15.4s, v16.4s // ......................................................................................................e.........................................|........................................... 
+ // add v15.4s, v15.4s, v16.4s // .......................................................................................................e........................................|........................................... + // mul v16.4s, v24.4s, v3.s[0] // ...............................................................................................................e................................|........................................... + // sqrdmulh v24.4s, v24.4s, v3.s[1] // ..............................................................................................................e.................................|........................................... + // mls v16.4s, v24.4s, v8.s[0] // ...................................................................................................................e............................|........................................... + // sub v24.4s, v9.4s, v11.4s // ..............................................................................e.................................................................|........................................... + // add v9.4s, v9.4s, v11.4s // ................................................................................e...............................................................|........................................... + // mul v11.4s, v24.4s, v0.s[2] // ...............................................................................................e................................................|........................................... + // sqrdmulh v24.4s, v24.4s, v0.s[3] // ............................................................................................e...................................................|........................................... 
+ // mls v11.4s, v24.4s, v8.s[0] // ....................................................................................................e...........................................|........................................... + // sub v24.4s, v10.4s, v12.4s // .........................................................................................................e......................................|........................................... + // add v10.4s, v10.4s, v12.4s // ...........................................................................................................e....................................|........................................... + // mul v12.4s, v24.4s, v0.s[2] // .....................................................................................................................e..........................|........................................... + // sqrdmulh v24.4s, v24.4s, v0.s[3] // .............................................................................................................................e..................|........................................... + // mls v12.4s, v24.4s, v8.s[0] // .................................................................................................................................e..............|........................................... + // sub v24.4s, v13.4s, v15.4s // ............................................................................................................e...................................|........................................... + // add v13.4s, v13.4s, v15.4s // .............................................................................................................e..................................|........................................... 
+ // mul v15.4s, v24.4s, v1.s[0] // ......................................................................................................................e.........................|........................................... + // sqrdmulh v24.4s, v24.4s, v1.s[1] // ........................................................................................................................e.......................|........................................... + // mls v15.4s, v24.4s, v8.s[0] // ...............................................................................................................................e................|........................................... + // sub v24.4s, v14.4s, v16.4s // .......................................................................................................................e........................|........................................... + // add v14.4s, v14.4s, v16.4s // .........................................................................................................................e......................|........................................... + // mul v16.4s, v24.4s, v1.s[0] // ..........................................................................................................................e.....................|........................................... + // sqrdmulh v24.4s, v24.4s, v1.s[1] // ............................................................................................................................e...................|........................................... + // mls v16.4s, v24.4s, v8.s[0] // ................................................................................................................................e...............|........................................... 
+ // sub v24.4s, v9.4s, v13.4s // ................................................................................................................e...............................|........................................... + // add v9.4s, v9.4s, v13.4s // .................................................................................................................e..............................|........................................... + // mul v13.4s, v24.4s, v0.s[0] // .....................................................................................................................................e..........|........................................... + // sqrdmulh v24.4s, v24.4s, v0.s[1] // ....................................................................................................................................e...........|........................................... + // mls v13.4s, v24.4s, v8.s[0] // ..........................................................................................................................................e.....|........................................... + // sub v24.4s, v10.4s, v14.4s // .*..............................................................................................................................................|*.......................................... + // add v10.4s, v10.4s, v14.4s // ...........................................................................................................................e....................|........................................... + // mul v14.4s, v24.4s, v0.s[0] // ..........*.....................................................................................................................................|.........*................................. 
+ // sqrdmulh v24.4s, v24.4s, v0.s[1] // ............*...................................................................................................................................|...........*............................... + // mls v14.4s, v24.4s, v8.s[0] // .....................................*..........................................................................................................|....................................*...... + // sub v24.4s, v11.4s, v15.4s // ..................................................................................................................................e.............|........................................... + // add v11.4s, v11.4s, v15.4s // ...................................................................................................................................e............|........................................... + // mul v15.4s, v24.4s, v0.s[0] // ............................................................................................................................................e...|........................................... + // sqrdmulh v24.4s, v24.4s, v0.s[1] // .........................................................................................................................................e......|........................................... + // mls v15.4s, v24.4s, v8.s[0] // .............................................................................................................................................e..|........................................... + // sub v24.4s, v12.4s, v16.4s // .......................................................................................................................................e........|........................................... 
+ // add v12.4s, v12.4s, v16.4s // ........................................................................................................................................e.......|........................................... + // mul v16.4s, v24.4s, v0.s[0] // ...*............................................................................................................................................|..*........................................ + // sqrdmulh v24.4s, v24.4s, v0.s[1] // ...............................................................................................................................................e|........................................... + // mls v16.4s, v24.4s, v8.s[0] // .................*..............................................................................................................................|................*.......................... + // str q9, [x1], #(16*4) // ....................................................................................................................e...........................|........................................... + // str q10, [x1, #(-16*4 + 1*16)] // ..............................................................................................................................e.................|........................................... + // str q11, [x1, #(-16*4 + 2*16)] // ......................................................................................................................................e.........|........................................... + // str q12, [x1, #(-16*4 + 3*16)] // ...........................................................................................................................................e....|........................................... 
+ // str q13, [x2], #(16*4) // ..............................................................................................................................................e.|........................................... + // str q14, [x2, #(-16*4 + 1*16)] // ..........................................*.....................................................................................................|.........................................*. + // str q15, [x2, #(-16*4 + 2*16)] // ........*.......................................................................................................................................|.......*................................... + // str q16, [x2, #(-16*4 + 3*16)] // ............................*...................................................................................................................|...........................*............... + // add x1, x1, #64 // ..*.............................................................................................................................................|.*......................................... + // add x2, x2, #64 // ...........................................*....................................................................................................|..........................................* + + sub count, count, #1 + cbnz count, layer45678_start + sub v30.4S, v3.4S, v28.4S // *.......... + add x1, x1, #64 // .*......... + mul v2.4S, v21.4S, v18.S[0] // ..*........ + str q31, [x2, #-32] // ...*....... + // gap // ........... + // gap // ........... + // gap // ........... + // gap // ........... + // gap // ........... + sqrdmulh v29.4S, v30.4S, v18.S[1] // .....*..... + // gap // ........... + // gap // ........... + // gap // ........... + // gap // ........... + // gap // ........... + mul v30.4S, v30.4S, v18.S[0] // ....*...... + // gap // ........... + // gap // ........... + // gap // ........... + // gap // ........... 
+ // gap // ........... + mls v2.4S, v9.4S, v8.S[0] // ......*.... + // gap // ........... + // gap // ........... + // gap // ........... + // gap // ........... + // gap // ........... + mls v30.4S, v29.4S, v8.S[0] // ........*.. + // gap // ........... + // gap // ........... + // gap // ........... + // gap // ........... + // gap // ........... + // gap // ........... + // gap // ........... + // gap // ........... + str q2, [x2, #-16] // .......*... + // gap // ........... + // gap // ........... + // gap // ........... + // gap // ........... + // gap // ........... + str q30, [x2, #-48] // .........*. + add x2, x2, #64 // ..........* + // gap // ........... + + // original source code + // sub v17.4S, v3.4S, v28.4S // *.......... + // add x1, x1, #64 // .*......... + // mul v21.4S, v21.4S, v18.S[0] // ..*........ + // str q31, [x2, #-32] // ...*....... + // mul v31.4S, v17.4S, v18.S[0] // .....*..... + // sqrdmulh v17.4S, v17.4S, v18.S[1] // ....*...... + // mls v21.4S, v9.4S, v8.S[0] // ......*.... + // str q21, [x2, #-16] // ........*.. + // mls v31.4S, v17.4S, v8.S[0] // .......*... + // str q31, [x2, #-48] // .........*. + // add x2, x2, #64 // ..........* + + +// ----------------------------------------------------------------------------- + + ninv .req v25 + ninv_tw .req v26 + + ASM_LOAD(xtmp, ninv_addr) + ld1r {ninv.4s}, [xtmp] + ASM_LOAD(xtmp, ninv_tw_addr) + ld1r {ninv_tw.4s}, [xtmp] + + mov count, #8 + ASM_LOAD(r_ptr0, roots_l012) + load_roots_123 + + .p2align 2 + // gap // ........ + ldr q13, [x0, #768] // ..*..... + ldr q30, [x0, #896] // ...*.... + ldr q15, [x0, #512] // *....... + ldr q29, [x0, #640] // .*...... + // gap // ........ + // gap // ........ + // gap // ........ + // gap // ........ + // gap // ........ + // gap // ........ + // gap // ........ + // gap // ........ + // gap // ........ + sub v12.4S, v13.4S, v30.4S // .....*.. + sub v24.4S, v15.4S, v29.4S // ....*... + // gap // ........ + // gap // ........ + // gap // ........ 
+ // gap // ........ + // gap // ........ + sqrdmulh v7.4S, v12.4S, v3.S[1] // .......* + // gap // ........ + // gap // ........ + // gap // ........ + // gap // ........ + // gap // ........ + sqrdmulh v31.4S, v24.4S, v2.S[3] // ......*. + // gap // ........ + // gap // ........ + + // original source code + // ldr q15, [x0, #512] // ..*..... + // ldr q29, [x0, #640] // ...*.... + // ldr q13, [x0, #768] // *....... + // ldr q30, [x0, #896] // .*...... + // sub v24.4S, v15.4S, v29.4S // .....*.. + // sub v12.4S, v13.4S, v30.4S // ....*... + // sqrdmulh v31.4S, v24.4S, v2.S[3] // .......* + // sqrdmulh v7.4S, v12.4S, v3.S[1] // ......*. + + sub count, count, #1 +layer123_start: + mul v17.4S, v24.4S, v2.S[2] // ....................*........................................................................... + ldr q21, [x0, #0] // *............................................................................................... + ldr q14, [x0, #128] // .*.............................................................................................. + ldr q24, [x0, #256] // ..*............................................................................................. + ldr q28, [x0, #384] // ...*............................................................................................ + add v6.4S, v15.4S, v29.4S // ...................*............................................................................ + mls v17.4S, v31.4S, v8.S[0] // ......................*......................................................................... + add v31.4S, v13.4S, v30.4S // ........................*....................................................................... + ldr q15, [x0, #528] // ....e........................................................................................... + ldr q29, [x0, #656] // .....e.......................................................................................... 
+ ldr q13, [x0, #784] // ......e......................................................................................... + // gap // ................................................................................................ + mul v16.4S, v12.4S, v3.S[0] // .........................*...................................................................... + sub v9.4S, v21.4S, v14.4S // ........*....................................................................................... + ldr q30, [x0, #912] // .......e........................................................................................ + add v21.4S, v21.4S, v14.4S // .........*...................................................................................... + // gap // ................................................................................................ + // gap // ................................................................................................ + mls v16.4S, v7.4S, v8.S[0] // ...........................*.................................................................... + sub v14.4S, v24.4S, v28.4S // .............*.................................................................................. + // gap // ................................................................................................ + add v24.4S, v24.4S, v28.4S // ..............*................................................................................. + // gap // ................................................................................................ + // gap // ................................................................................................ + mul v28.4S, v9.4S, v1.S[2] // ..........*..................................................................................... + sub v7.4S, v6.4S, v31.4S // ......................................*......................................................... 
+ // gap // ................................................................................................ + add v6.4S, v6.4S, v31.4S // .......................................*........................................................ + // gap // ................................................................................................ + // gap // ................................................................................................ + sqrdmulh v31.4S, v9.4S, v1.S[3] // ...........*.................................................................................... + sub v9.4S, v21.4S, v24.4S // ............................*................................................................... + // gap // ................................................................................................ + sub v18.4S, v17.4S, v16.4S // ...........................................*.................................................... + // gap // ................................................................................................ + // gap // ................................................................................................ + add v17.4S, v17.4S, v16.4S // ............................................*................................................... + mul v16.4S, v14.4S, v2.S[0] // ...............*................................................................................ + // gap // ................................................................................................ + add v21.4S, v21.4S, v24.4S // .............................*.................................................................. + // gap // ................................................................................................ + // gap // ................................................................................................ 
+ sqrdmulh v14.4S, v14.4S, v2.S[1] // ................*............................................................................... + sub v24.4S, v15.4S, v29.4S // ..................e............................................................................. + // gap // ................................................................................................ + sub v12.4S, v13.4S, v30.4S // .......................e........................................................................ + // gap // ................................................................................................ + // gap // ................................................................................................ + mls v28.4S, v31.4S, v8.S[0] // ............*................................................................................... + sub v31.4S, v21.4S, v6.4S // ................................................*............................................... + // gap // ................................................................................................ + add v21.4S, v21.4S, v6.4S // .................................................*.............................................. + // gap // ................................................................................................ + // gap // ................................................................................................ + mul v6.4S, v9.4S, v0.S[2] // ..............................*................................................................. + // gap // ................................................................................................ + // gap // ................................................................................................ + // gap // ................................................................................................ + // gap // ................................................................................................ 
+ // gap // ................................................................................................ + mls v16.4S, v14.4S, v8.S[0] // .................*.............................................................................. + // gap // ................................................................................................ + // gap // ................................................................................................ + // gap // ................................................................................................ + // gap // ................................................................................................ + // gap // ................................................................................................ + sqrdmulh v14.4S, v9.4S, v0.S[3] // ...............................*................................................................ + // gap // ................................................................................................ + // gap // ................................................................................................ + // gap // ................................................................................................ + // gap // ................................................................................................ + // gap // ................................................................................................ + mul v9.4S, v7.4S, v1.S[0] // ........................................*....................................................... + // gap // ................................................................................................ + // gap // ................................................................................................ + sub v23.4S, v28.4S, v16.4S // .................................*.............................................................. 
+ // gap // ................................................................................................ + // gap // ................................................................................................ + sqrdmulh v7.4S, v7.4S, v1.S[1] // .........................................*...................................................... + add v28.4S, v28.4S, v16.4S // ..................................*............................................................. + // gap // ................................................................................................ + // gap // ................................................................................................ + // gap // ................................................................................................ + // gap // ................................................................................................ + mls v6.4S, v14.4S, v8.S[0] // ................................*............................................................... + // gap // ................................................................................................ + // gap // ................................................................................................ + sub v14.4S, v28.4S, v17.4S // .....................................................*.......................................... + // gap // ................................................................................................ + // gap // ................................................................................................ + add v17.4S, v28.4S, v17.4S // ......................................................*......................................... + mul v28.4S, v23.4S, v0.S[2] // ...................................*............................................................ + // gap // ................................................................................................ 
+ // gap // ................................................................................................ + // gap // ................................................................................................ + // gap // ................................................................................................ + sqrdmulh v16.4S, v23.4S, v0.S[3] // ....................................*........................................................... + // gap // ................................................................................................ + // gap // ................................................................................................ + // gap // ................................................................................................ + // gap // ................................................................................................ + // gap // ................................................................................................ + mul v23.4S, v18.4S, v1.S[0] // .............................................*.................................................. + // gap // ................................................................................................ + // gap // ................................................................................................ + // gap // ................................................................................................ + // gap // ................................................................................................ + // gap // ................................................................................................ + sqrdmulh v18.4S, v18.4S, v1.S[1] // ..............................................*................................................. + // gap // ................................................................................................ 
+ // gap // ................................................................................................ + // gap // ................................................................................................ + // gap // ................................................................................................ + // gap // ................................................................................................ + mls v28.4S, v16.4S, v8.S[0] // .....................................*.......................................................... + // gap // ................................................................................................ + // gap // ................................................................................................ + // gap // ................................................................................................ + // gap // ................................................................................................ + // gap // ................................................................................................ + mls v9.4S, v7.4S, v8.S[0] // ..........................................*..................................................... + // gap // ................................................................................................ + // gap // ................................................................................................ + // gap // ................................................................................................ + // gap // ................................................................................................ + // gap // ................................................................................................ + mls v23.4S, v18.4S, v8.S[0] // ...............................................*................................................ 
+ // gap // ................................................................................................ + // gap // ................................................................................................ + // gap // ................................................................................................ + // gap // ................................................................................................ + // gap // ................................................................................................ + mul v16.4S, v31.4S, v0.S[0] // ..................................................*............................................. + // gap // ................................................................................................ + // gap // ................................................................................................ + sub v7.4S, v6.4S, v9.4S // ..........................................................*..................................... + // gap // ................................................................................................ + // gap // ................................................................................................ + sqrdmulh v31.4S, v31.4S, v0.S[1] // ...................................................*............................................ + add v6.4S, v6.4S, v9.4S // ...........................................................*.................................... + // gap // ................................................................................................ + sub v9.4S, v28.4S, v23.4S // ...............................................................*................................ + // gap // ................................................................................................ + // gap // ................................................................................................ 
+ add v28.4S, v28.4S, v23.4S // ................................................................*............................... + mul v18.4S, v14.4S, v0.S[0] // .......................................................*........................................ + // gap // ................................................................................................ + // gap // ................................................................................................ + // gap // ................................................................................................ + // gap // ................................................................................................ + sqrdmulh v14.4S, v14.4S, v0.S[1] // ........................................................*....................................... + // gap // ................................................................................................ + // gap // ................................................................................................ + // gap // ................................................................................................ + // gap // ................................................................................................ + // gap // ................................................................................................ + mul v23.4S, v21.4S, v25.4S // ................................................................................*............... + // gap // ................................................................................................ + // gap // ................................................................................................ + // gap // ................................................................................................ + // gap // ................................................................................................ 
+ // gap // ................................................................................................ + sqrdmulh v21.4S, v21.4S, v26.4S // .................................................................................*.............. + // gap // ................................................................................................ + // gap // ................................................................................................ + // gap // ................................................................................................ + // gap // ................................................................................................ + // gap // ................................................................................................ + mls v16.4S, v31.4S, v8.S[0] // ....................................................*........................................... + // gap // ................................................................................................ + // gap // ................................................................................................ + // gap // ................................................................................................ + // gap // ................................................................................................ + // gap // ................................................................................................ + mls v18.4S, v14.4S, v8.S[0] // .........................................................*...................................... + // gap // ................................................................................................ + // gap // ................................................................................................ + // gap // ................................................................................................ 
+ // gap // ................................................................................................ + // gap // ................................................................................................ + mul v14.4S, v7.4S, v0.S[0] // ............................................................*................................... + // gap // ................................................................................................ + // gap // ................................................................................................ + srshr v31.4S, v16.4S, #23 // ....................................................................*........................... + // gap // ................................................................................................ + // gap // ................................................................................................ + sqrdmulh v7.4S, v7.4S, v0.S[1] // .............................................................*.................................. + // gap // ................................................................................................ + // gap // ................................................................................................ + srshr v10.4S, v18.4S, #23 // ......................................................................*......................... + // gap // ................................................................................................ + // gap // ................................................................................................ + mul v4.4S, v9.4S, v0.S[0] // .................................................................*.............................. + // gap // ................................................................................................ + // gap // ................................................................................................ 
+ // gap // ................................................................................................ + // gap // ................................................................................................ + // gap // ................................................................................................ + sqrdmulh v9.4S, v9.4S, v0.S[1] // ..................................................................*............................. + // gap // ................................................................................................ + // gap // ................................................................................................ + // gap // ................................................................................................ + // gap // ................................................................................................ + // gap // ................................................................................................ + mls v14.4S, v7.4S, v8.S[0] // ..............................................................*................................. + // gap // ................................................................................................ + // gap // ................................................................................................ + // gap // ................................................................................................ + // gap // ................................................................................................ + // gap // ................................................................................................ + mls v16.4S, v31.4S, v8.4S // .....................................................................*.......................... + // gap // ................................................................................................ 
+ // gap // ................................................................................................ + // gap // ................................................................................................ + // gap // ................................................................................................ + // gap // ................................................................................................ + mls v4.4S, v9.4S, v8.S[0] // ...................................................................*............................ + // gap // ................................................................................................ + // gap // ................................................................................................ + srshr v31.4S, v14.4S, #23 // ........................................................................*....................... + // gap // ................................................................................................ + // gap // ................................................................................................ + mls v18.4S, v10.4S, v8.4S // .......................................................................*........................ + // gap // ................................................................................................ + // gap // ................................................................................................ + str q16, [x0, #512] // ............................................................................*................... + // gap // ................................................................................................ + // gap // ................................................................................................ + mls v14.4S, v31.4S, v8.4S // .........................................................................*...................... 
+ // gap // ................................................................................................ + // gap // ................................................................................................ + srshr v31.4S, v4.4S, #23 // ..........................................................................*..................... + // gap // ................................................................................................ + // gap // ................................................................................................ + mls v23.4S, v21.4S, v8.S[0] // ..................................................................................*............. + // gap // ................................................................................................ + // gap // ................................................................................................ + str q18, [x0, #640] // .............................................................................*.................. + // gap // ................................................................................................ + // gap // ................................................................................................ + mls v4.4S, v31.4S, v8.4S // ...........................................................................*.................... + // gap // ................................................................................................ + // gap // ................................................................................................ + str q14, [x0, #768] // ..............................................................................*................. + // gap // ................................................................................................ + // gap // ................................................................................................ 
+ mul v21.4S, v17.4S, v25.4S // ...................................................................................*............ + // gap // ................................................................................................ + // gap // ................................................................................................ + str q23, [x0], #(16) // ............................................................................................*... + // gap // ................................................................................................ + // gap // ................................................................................................ + sqrdmulh v17.4S, v17.4S, v26.4S // ....................................................................................*........... + // gap // ................................................................................................ + // gap // ................................................................................................ + str q4, [x0, #880] // ...............................................................................*................ + // gap // ................................................................................................ + // gap // ................................................................................................ + mul v14.4S, v6.4S, v25.4S // ......................................................................................*......... + // gap // ................................................................................................ + // gap // ................................................................................................ + // gap // ................................................................................................ + // gap // ................................................................................................ 
+ // gap // ................................................................................................ + sqrdmulh v6.4S, v6.4S, v26.4S // .......................................................................................*........ + // gap // ................................................................................................ + // gap // ................................................................................................ + // gap // ................................................................................................ + // gap // ................................................................................................ + // gap // ................................................................................................ + mls v21.4S, v17.4S, v8.S[0] // .....................................................................................*.......... + // gap // ................................................................................................ + // gap // ................................................................................................ + // gap // ................................................................................................ + // gap // ................................................................................................ + // gap // ................................................................................................ + sqrdmulh v17.4S, v28.4S, v26.4S // ..........................................................................................*..... + // gap // ................................................................................................ + // gap // ................................................................................................ + // gap // ................................................................................................ 
+ // gap // ................................................................................................ + // gap // ................................................................................................ + mls v14.4S, v6.4S, v8.S[0] // ........................................................................................*....... + // gap // ................................................................................................ + // gap // ................................................................................................ + str q21, [x0, #112] // .............................................................................................*.. + // gap // ................................................................................................ + // gap // ................................................................................................ + mul v21.4S, v28.4S, v25.4S // .........................................................................................*...... + // gap // ................................................................................................ + // gap // ................................................................................................ + // gap // ................................................................................................ + // gap // ................................................................................................ + // gap // ................................................................................................ + mls v21.4S, v17.4S, v8.S[0] // ...........................................................................................*.... + // gap // ................................................................................................ + // gap // ................................................................................................ 
+ str q14, [x0, #240] // ..............................................................................................*. + // gap // ................................................................................................ + // gap // ................................................................................................ + sqrdmulh v31.4S, v24.4S, v2.S[3] // .....................e.......................................................................... + // gap // ................................................................................................ + // gap // ................................................................................................ + // gap // ................................................................................................ + // gap // ................................................................................................ + // gap // ................................................................................................ + sqrdmulh v7.4S, v12.4S, v3.S[1] // ..........................e..................................................................... + // gap // ................................................................................................ + // gap // ................................................................................................ + str q21, [x0, #368] // ...............................................................................................* + // gap // ................................................................................................ + // gap // ................................................................................................ + + // original source code + // ldr q9, [x0, #0] // ........................................................................................|*.............................................................................................. 
+ // ldr q10, [x0, #(1*(1024/8))] // ........................................................................................|.*............................................................................................. + // ldr q11, [x0, #(2*(1024/8))] // ........................................................................................|..*............................................................................................ + // ldr q12, [x0, #(3*(1024/8))] // ........................................................................................|...*........................................................................................... + // ldr q13, [x0, #(4*(1024/8))] // e.......................................................................................|.......e....................................................................................... + // ldr q14, [x0, #(5*(1024/8))] // .e......................................................................................|........e...................................................................................... + // ldr q15, [x0, #(6*(1024/8))] // ..e.....................................................................................|.........e..................................................................................... + // ldr q16, [x0, #(7*(1024/8))] // .....e..................................................................................|............e.................................................................................. + // sub v24.4s, v9.4s, v10.4s // ....*...................................................................................|...........*................................................................................... + // add v9.4s, v9.4s, v10.4s // ......*.................................................................................|.............*................................................................................. 
+ // mul v10.4s, v24.4s, v1.s[2] // ..........*.............................................................................|.................*............................................................................. + // sqrdmulh v24.4s, v24.4s, v1.s[3] // .............*..........................................................................|....................*.......................................................................... + // mls v10.4s, v24.4s, v8.s[0] // ......................*.................................................................|.............................*................................................................. + // sub v24.4s, v11.4s, v12.4s // ........*...............................................................................|...............*............................................................................... + // add v11.4s, v11.4s, v12.4s // .........*..............................................................................|................*.............................................................................. + // mul v12.4s, v24.4s, v2.s[0] // .................*......................................................................|........................*...................................................................... + // sqrdmulh v24.4s, v24.4s, v2.s[1] // ...................*....................................................................|..........................*.................................................................... + // mls v12.4s, v24.4s, v8.s[0] // ..........................*.............................................................|.................................*............................................................. + // sub v24.4s, v13.4s, v14.4s // ....................e...................................................................|...........................e................................................................... 
+ // add v13.4s, v13.4s, v14.4s // ........................................................................................|....*.......................................................................................... + // mul v14.4s, v24.4s, v2.s[2] // ........................................................................................*............................................................................................... + // sqrdmulh v24.4s, v24.4s, v2.s[3] // .....................................................................................e..|............................................................................................e.. + // mls v14.4s, v24.4s, v8.s[0] // ........................................................................................|.....*......................................................................................... + // sub v24.4s, v15.4s, v16.4s // .....................e..................................................................|............................e.................................................................. + // add v15.4s, v15.4s, v16.4s // ........................................................................................|......*........................................................................................ + // mul v16.4s, v24.4s, v3.s[0] // ...*....................................................................................|..........*.................................................................................... + // sqrdmulh v24.4s, v24.4s, v3.s[1] // ......................................................................................e.|.............................................................................................e. + // mls v16.4s, v24.4s, v8.s[0] // .......*................................................................................|..............*................................................................................ 
+ // sub v24.4s, v9.4s, v11.4s // ..............*.........................................................................|.....................*......................................................................... + // add v9.4s, v9.4s, v11.4s // ..................*.....................................................................|.........................*..................................................................... + // mul v11.4s, v24.4s, v0.s[2] // .........................*..............................................................|................................*.............................................................. + // sqrdmulh v24.4s, v24.4s, v0.s[3] // ...........................*............................................................|..................................*............................................................ + // mls v11.4s, v24.4s, v8.s[0] // ................................*.......................................................|.......................................*....................................................... + // sub v24.4s, v10.4s, v12.4s // .............................*..........................................................|....................................*.......................................................... + // add v10.4s, v10.4s, v12.4s // ...............................*........................................................|......................................*........................................................ + // mul v12.4s, v24.4s, v0.s[2] // ...................................*....................................................|..........................................*.................................................... + // sqrdmulh v24.4s, v24.4s, v0.s[3] // ....................................*...................................................|...........................................*................................................... 
+ // mls v12.4s, v24.4s, v8.s[0] // .......................................*................................................|..............................................*................................................ + // sub v24.4s, v13.4s, v15.4s // ...........*............................................................................|..................*............................................................................ + // add v13.4s, v13.4s, v15.4s // ............*...........................................................................|...................*........................................................................... + // mul v15.4s, v24.4s, v1.s[0] // ............................*...........................................................|...................................*........................................................... + // sqrdmulh v24.4s, v24.4s, v1.s[1] // ..............................*.........................................................|.....................................*......................................................... + // mls v15.4s, v24.4s, v8.s[0] // ........................................*...............................................|...............................................*............................................... + // sub v24.4s, v14.4s, v16.4s // ...............*........................................................................|......................*........................................................................ + // add v14.4s, v14.4s, v16.4s // ................*.......................................................................|.......................*....................................................................... + // mul v16.4s, v24.4s, v1.s[0] // .....................................*..................................................|............................................*.................................................. 
+ // sqrdmulh v24.4s, v24.4s, v1.s[1] // ......................................*.................................................|.............................................*................................................. + // mls v16.4s, v24.4s, v8.s[0] // .........................................*..............................................|................................................*.............................................. + // sub v24.4s, v9.4s, v13.4s // .......................*................................................................|..............................*................................................................ + // add v9.4s, v9.4s, v13.4s // ........................*...............................................................|...............................*............................................................... + // mul v13.4s, v24.4s, v0.s[0] // ..........................................*.............................................|.................................................*............................................. + // sqrdmulh v24.4s, v24.4s, v0.s[1] // ............................................*...........................................|...................................................*........................................... + // mls v13.4s, v24.4s, v8.s[0] // ....................................................*...................................|...........................................................*................................... + // sub v24.4s, v10.4s, v14.4s // .................................*......................................................|........................................*...................................................... + // add v10.4s, v10.4s, v14.4s // ..................................*.....................................................|.........................................*..................................................... 
+ // mul v14.4s, v24.4s, v0.s[0] // ................................................*.......................................|.......................................................*....................................... + // sqrdmulh v24.4s, v24.4s, v0.s[1] // .................................................*......................................|........................................................*...................................... + // mls v14.4s, v24.4s, v8.s[0] // .....................................................*..................................|............................................................*.................................. + // sub v24.4s, v11.4s, v15.4s // ...........................................*............................................|..................................................*............................................ + // add v11.4s, v11.4s, v15.4s // .............................................*..........................................|....................................................*.......................................... + // mul v15.4s, v24.4s, v0.s[0] // ......................................................*.................................|.............................................................*................................. + // sqrdmulh v24.4s, v24.4s, v0.s[1] // ........................................................*...............................|...............................................................*............................... + // mls v15.4s, v24.4s, v8.s[0] // ............................................................*...........................|...................................................................*........................... + // sub v24.4s, v12.4s, v16.4s // ..............................................*.........................................|.....................................................*......................................... 
+ // add v12.4s, v12.4s, v16.4s // ...............................................*........................................|......................................................*........................................ + // mul v16.4s, v24.4s, v0.s[0] // ..........................................................*.............................|.................................................................*............................. + // sqrdmulh v24.4s, v24.4s, v0.s[1] // ...........................................................*............................|..................................................................*............................ + // mls v16.4s, v24.4s, v8.s[0] // ..............................................................*.........................|.....................................................................*......................... + // srshr v24.4S, v13.4S, #23 // .......................................................*................................|..............................................................*................................ + // mls v13.4s, v24.4s, v8.4s // .............................................................*..........................|....................................................................*.......................... + // srshr v24.4S, v14.4S, #23 // .........................................................*..............................|................................................................*.............................. + // mls v14.4s, v24.4s, v8.4s // ................................................................*.......................|.......................................................................*....................... + // srshr v24.4S, v15.4S, #23 // ...............................................................*........................|......................................................................*........................ 
+ // mls v15.4s, v24.4s, v8.4s // ..................................................................*.....................|.........................................................................*..................... + // srshr v24.4S, v16.4S, #23 // ...................................................................*....................|..........................................................................*.................... + // mls v16.4s, v24.4s, v8.4s // ......................................................................*.................|.............................................................................*................. + // str q13, [x0, #(4*(1024/8))] // .................................................................*......................|........................................................................*...................... + // str q14, [x0, #(5*(1024/8))] // .....................................................................*..................|............................................................................*.................. + // str q15, [x0, #(6*(1024/8))] // .......................................................................*................|..............................................................................*................ + // str q16, [x0, #(7*(1024/8))] // ...........................................................................*............|..................................................................................*............ + // mul v13.4s, v9.4s, v25.4s // ..................................................*.....................................|.........................................................*..................................... + // sqrdmulh v9.4s, v9.4s, v26.4s // ...................................................*....................................|..........................................................*.................................... 
+ // mls v13.4s, v9.4s, v8.s[0] // ....................................................................*...................|...........................................................................*................... + // mul v14.4s, v10.4s, v25.4s // ........................................................................*...............|...............................................................................*............... + // sqrdmulh v10.4s, v10.4s, v26.4s // ..........................................................................*.............|.................................................................................*............. + // mls v14.4s, v10.4s, v8.s[0] // ..............................................................................*.........|.....................................................................................*......... + // mul v15.4s, v11.4s, v25.4s // ............................................................................*...........|...................................................................................*........... + // sqrdmulh v11.4s, v11.4s, v26.4s // .............................................................................*..........|....................................................................................*.......... + // mls v15.4s, v11.4s, v8.s[0] // ................................................................................*.......|.......................................................................................*....... + // mul v16.4s, v12.4s, v25.4s // ..................................................................................*.....|.........................................................................................*..... + // sqrdmulh v12.4s, v12.4s, v26.4s // ...............................................................................*........|......................................................................................*........ 
+ // mls v16.4s, v12.4s, v8.s[0] // ...................................................................................*....|..........................................................................................*.... + // str q13, [x0], #(16) // .........................................................................*..............|................................................................................*.............. + // str q14, [x0, #(-16 + 1*(1024/8))] // .................................................................................*......|........................................................................................*...... + // str q15, [x0, #(-16 + 2*(1024/8))] // ....................................................................................*...|...........................................................................................*... + // str q16, [x0, #(-16 + 3*(1024/8))] // .......................................................................................*|..............................................................................................* + + sub count, count, #1 + cbnz count, layer123_start + mul v17.4S, v24.4S, v2.S[2] // *....................................................................................... + add v21.4S, v15.4S, v29.4S // .....*.................................................................................. + ldr q14, [x0, #0] // .*...................................................................................... + add v24.4S, v13.4S, v30.4S // .......*................................................................................ + ldr q28, [x0, #128] // ..*..................................................................................... + ldr q6, [x0, #256] // ...*.................................................................................... + mls v17.4S, v31.4S, v8.S[0] // ......*................................................................................. 
+ ldr q31, [x0, #384] // ....*................................................................................... + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + mul v15.4S, v12.4S, v3.S[0] // ........*............................................................................... + sub v29.4S, v21.4S, v24.4S // ...............*........................................................................ + // gap // ........................................................................................ + add v21.4S, v21.4S, v24.4S // ................*....................................................................... + // gap // ........................................................................................ + // gap // ........................................................................................ + mls v15.4S, v7.4S, v8.S[0] // ...........*............................................................................ + sub v24.4S, v14.4S, v28.4S // .........*.............................................................................. + // gap // ........................................................................................ + add v14.4S, v14.4S, v28.4S // ..........*............................................................................. + // gap // ........................................................................................ + // gap // ........................................................................................ + sub v28.4S, v6.4S, v31.4S // ............*........................................................................... 
+ mul v13.4S, v29.4S, v1.S[0] // ..............................*......................................................... + // gap // ........................................................................................ + add v6.4S, v6.4S, v31.4S // .............*.......................................................................... + // gap // ........................................................................................ + // gap // ........................................................................................ + mul v31.4S, v24.4S, v1.S[2] // ..............*......................................................................... + // gap // ........................................................................................ + // gap // ........................................................................................ + sub v16.4S, v17.4S, v15.4S // ...................*.................................................................... + // gap // ........................................................................................ + // gap // ........................................................................................ + add v17.4S, v17.4S, v15.4S // ....................*................................................................... + sqrdmulh v24.4S, v24.4S, v1.S[3] // .................*...................................................................... + // gap // ........................................................................................ + sub v15.4S, v14.4S, v6.4S // ..................*..................................................................... + // gap // ........................................................................................ + // gap // ........................................................................................ + sqrdmulh v29.4S, v29.4S, v1.S[1] // ................................*....................................................... 
+ add v14.4S, v14.4S, v6.4S // ......................*................................................................. + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + mul v6.4S, v28.4S, v2.S[0] // .....................*.................................................................. + // gap // ........................................................................................ + // gap // ........................................................................................ + sub v9.4S, v14.4S, v21.4S // .........................*.............................................................. + // gap // ........................................................................................ + // gap // ........................................................................................ + add v21.4S, v14.4S, v21.4S // ..........................*............................................................. + sqrdmulh v14.4S, v28.4S, v2.S[1] // .......................*................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + mls v31.4S, v24.4S, v8.S[0] // ........................*............................................................... 
+ // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + mul v24.4S, v15.4S, v0.S[2] // ...........................*............................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + mls v6.4S, v14.4S, v8.S[0] // ............................*........................................................... + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + mls v13.4S, v29.4S, v8.S[0] // ..........................................*............................................. + // gap // ........................................................................................ 
+ // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + sqrdmulh v14.4S, v15.4S, v0.S[3] // .............................*.......................................................... + // gap // ........................................................................................ + // gap // ........................................................................................ + sub v28.4S, v31.4S, v6.4S // ...............................*........................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + add v6.4S, v31.4S, v6.4S // .................................*...................................................... + mul v31.4S, v16.4S, v1.S[0] // .......................................*................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + mul v15.4S, v28.4S, v0.S[2] // .....................................*.................................................. + // gap // ........................................................................................ 
+ // gap // ........................................................................................ + sub v29.4S, v6.4S, v17.4S // ...................................*.................................................... + // gap // ........................................................................................ + // gap // ........................................................................................ + sqrdmulh v16.4S, v16.4S, v1.S[1] // ........................................*............................................... + add v17.4S, v6.4S, v17.4S // ....................................*................................................... + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + mls v24.4S, v14.4S, v8.S[0] // ..................................*..................................................... + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + sqrdmulh v14.4S, v28.4S, v0.S[3] // ......................................*................................................. + // gap // ........................................................................................ 
+ // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + mls v31.4S, v16.4S, v8.S[0] // ...........................................*............................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + sub v28.4S, v24.4S, v13.4S // .............................................*.......................................... + // gap // ........................................................................................ + // gap // ........................................................................................ + add v24.4S, v24.4S, v13.4S // ...............................................*........................................ + mul v6.4S, v9.4S, v0.S[0] // ............................................*........................................... + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + mls v15.4S, v14.4S, v8.S[0] // .........................................*.............................................. + // gap // ........................................................................................ 
+ // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + sqrdmulh v14.4S, v9.4S, v0.S[1] // ..............................................*......................................... + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + mul v13.4S, v29.4S, v0.S[0] // ..................................................*..................................... + // gap // ........................................................................................ + // gap // ........................................................................................ + sub v16.4S, v15.4S, v31.4S // ................................................*....................................... + // gap // ........................................................................................ + // gap // ........................................................................................ + sqrdmulh v29.4S, v29.4S, v0.S[1] // ...................................................*.................................... + add v31.4S, v15.4S, v31.4S // .................................................*...................................... 
+ // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + mul v15.4S, v21.4S, v25.4S // ....................................................*................................... + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + sqrdmulh v21.4S, v21.4S, v26.4S // .....................................................*.................................. + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + mls v6.4S, v14.4S, v8.S[0] // ......................................................*................................. + // gap // ........................................................................................ + // gap // ........................................................................................ 
+ // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + mls v13.4S, v29.4S, v8.S[0] // .......................................................*................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + mul v14.4S, v28.4S, v0.S[0] // ........................................................*............................... + // gap // ........................................................................................ + // gap // ........................................................................................ + srshr v29.4S, v6.4S, #23 // .........................................................*.............................. + // gap // ........................................................................................ + // gap // ........................................................................................ + sqrdmulh v28.4S, v28.4S, v0.S[1] // ..........................................................*............................. + // gap // ........................................................................................ + // gap // ........................................................................................ 
+ srshr v9.4S, v13.4S, #23 // ...........................................................*............................ + // gap // ........................................................................................ + // gap // ........................................................................................ + mul v30.4S, v16.4S, v0.S[0] // ............................................................*........................... + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + sqrdmulh v16.4S, v16.4S, v0.S[1] // .............................................................*.......................... + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + mls v14.4S, v28.4S, v8.S[0] // ..............................................................*......................... + // gap // ........................................................................................ + // gap // ........................................................................................ 
+ // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + mls v6.4S, v29.4S, v8.4S // ...............................................................*........................ + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + mls v30.4S, v16.4S, v8.S[0] // ................................................................*....................... + // gap // ........................................................................................ + // gap // ........................................................................................ + srshr v28.4S, v14.4S, #23 // .................................................................*...................... + // gap // ........................................................................................ + // gap // ........................................................................................ + mls v13.4S, v9.4S, v8.4S // ..................................................................*..................... + // gap // ........................................................................................ + // gap // ........................................................................................ 
+ str q6, [x0, #512] // ...................................................................*.................... + // gap // ........................................................................................ + // gap // ........................................................................................ + mls v14.4S, v28.4S, v8.4S // ....................................................................*................... + // gap // ........................................................................................ + // gap // ........................................................................................ + srshr v28.4S, v30.4S, #23 // .....................................................................*.................. + // gap // ........................................................................................ + // gap // ........................................................................................ + mls v15.4S, v21.4S, v8.S[0] // ......................................................................*................. + // gap // ........................................................................................ + // gap // ........................................................................................ + str q13, [x0, #640] // .......................................................................*................ + // gap // ........................................................................................ + // gap // ........................................................................................ + mls v30.4S, v28.4S, v8.4S // ........................................................................*............... + // gap // ........................................................................................ + // gap // ........................................................................................ 
+ str q14, [x0, #768] // .........................................................................*.............. + // gap // ........................................................................................ + // gap // ........................................................................................ + mul v21.4S, v17.4S, v25.4S // ..........................................................................*............. + // gap // ........................................................................................ + // gap // ........................................................................................ + str q15, [x0], #(16) // ...........................................................................*............ + // gap // ........................................................................................ + // gap // ........................................................................................ + sqrdmulh v17.4S, v17.4S, v26.4S // ............................................................................*........... + // gap // ........................................................................................ + // gap // ........................................................................................ + str q30, [x0, #880] // .............................................................................*.......... + // gap // ........................................................................................ + // gap // ........................................................................................ + mul v14.4S, v24.4S, v25.4S // ..............................................................................*......... + // gap // ........................................................................................ + // gap // ........................................................................................ 
+ // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + sqrdmulh v24.4S, v24.4S, v26.4S // ...............................................................................*........ + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + mls v21.4S, v17.4S, v8.S[0] // ................................................................................*....... + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + sqrdmulh v17.4S, v31.4S, v26.4S // .................................................................................*...... + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ 
+ // gap // ........................................................................................ + // gap // ........................................................................................ + mls v14.4S, v24.4S, v8.S[0] // ..................................................................................*..... + // gap // ........................................................................................ + // gap // ........................................................................................ + str q21, [x0, #112] // ...................................................................................*.... + // gap // ........................................................................................ + // gap // ........................................................................................ + mul v21.4S, v31.4S, v25.4S // ....................................................................................*... + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + mls v21.4S, v17.4S, v8.S[0] // .....................................................................................*.. + // gap // ........................................................................................ + // gap // ........................................................................................ + str q14, [x0, #240] // ......................................................................................*. 
+ // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + str q21, [x0, #368] // .......................................................................................* + // gap // ........................................................................................ + // gap // ........................................................................................ + + // original source code + // mul v17.4S, v24.4S, v2.S[2] // *....................................................................................... + // ldr q21, [x0, #0] // ..*..................................................................................... + // ldr q14, [x0, #128] // ....*................................................................................... + // ldr q24, [x0, #256] // .....*.................................................................................. 
+ // ldr q28, [x0, #384] // .......*................................................................................ + // add v6.4S, v15.4S, v29.4S // .*...................................................................................... + // mls v17.4S, v31.4S, v8.S[0] // ......*................................................................................. + // add v31.4S, v13.4S, v30.4S // ...*.................................................................................... + // mul v16.4S, v12.4S, v3.S[0] // ........*............................................................................... + // sub v9.4S, v21.4S, v14.4S // ............*........................................................................... + // add v21.4S, v21.4S, v14.4S // .............*.......................................................................... + // mls v16.4S, v7.4S, v8.S[0] // ...........*............................................................................ + // sub v14.4S, v24.4S, v28.4S // ..............*......................................................................... + // add v24.4S, v24.4S, v28.4S // ................*....................................................................... + // mul v28.4S, v9.4S, v1.S[2] // .................*...................................................................... + // sub v7.4S, v6.4S, v31.4S // .........*.............................................................................. + // add v6.4S, v6.4S, v31.4S // ..........*............................................................................. + // sqrdmulh v31.4S, v9.4S, v1.S[3] // ....................*................................................................... + // sub v9.4S, v21.4S, v24.4S // .....................*.................................................................. + // sub v18.4S, v17.4S, v16.4S // ..................*..................................................................... 
+ // add v17.4S, v17.4S, v16.4S // ...................*.................................................................... + // mul v16.4S, v14.4S, v2.S[0] // ........................*............................................................... + // add v21.4S, v21.4S, v24.4S // .......................*................................................................ + // sqrdmulh v14.4S, v14.4S, v2.S[1] // ...........................*............................................................ + // mls v28.4S, v31.4S, v8.S[0] // ............................*........................................................... + // sub v31.4S, v21.4S, v6.4S // .........................*.............................................................. + // add v21.4S, v21.4S, v6.4S // ..........................*............................................................. + // mul v6.4S, v9.4S, v0.S[2] // .............................*.......................................................... + // mls v16.4S, v14.4S, v8.S[0] // ..............................*......................................................... + // sqrdmulh v14.4S, v9.4S, v0.S[3] // ................................*....................................................... + // mul v9.4S, v7.4S, v1.S[0] // ...............*........................................................................ + // sub v23.4S, v28.4S, v16.4S // .................................*...................................................... + // sqrdmulh v7.4S, v7.4S, v1.S[1] // ......................*................................................................. + // add v28.4S, v28.4S, v16.4S // ..................................*..................................................... + // mls v6.4S, v14.4S, v8.S[0] // ........................................*............................................... + // sub v14.4S, v28.4S, v17.4S // .....................................*.................................................. 
+ // add v17.4S, v28.4S, v17.4S // .......................................*................................................ + // mul v28.4S, v23.4S, v0.S[2] // ....................................*................................................... + // sqrdmulh v16.4S, v23.4S, v0.S[3] // .........................................*.............................................. + // mul v23.4S, v18.4S, v1.S[0] // ...................................*.................................................... + // sqrdmulh v18.4S, v18.4S, v1.S[1] // ......................................*................................................. + // mls v28.4S, v16.4S, v8.S[0] // ..............................................*......................................... + // mls v9.4S, v7.4S, v8.S[0] // ...............................*........................................................ + // mls v23.4S, v18.4S, v8.S[0] // ..........................................*............................................. + // mul v16.4S, v31.4S, v0.S[0] // .............................................*.......................................... + // sub v7.4S, v6.4S, v9.4S // ...........................................*............................................ + // sqrdmulh v31.4S, v31.4S, v0.S[1] // ...............................................*........................................ + // add v6.4S, v6.4S, v9.4S // ............................................*........................................... + // sub v9.4S, v28.4S, v23.4S // .................................................*...................................... + // add v28.4S, v28.4S, v23.4S // ...................................................*.................................... + // mul v18.4S, v14.4S, v0.S[0] // ................................................*....................................... 
+ // sqrdmulh v14.4S, v14.4S, v0.S[1] // ..................................................*..................................... + // mul v23.4S, v21.4S, v25.4S // ....................................................*................................... + // sqrdmulh v21.4S, v21.4S, v26.4S // .....................................................*.................................. + // mls v16.4S, v31.4S, v8.S[0] // ......................................................*................................. + // mls v18.4S, v14.4S, v8.S[0] // .......................................................*................................ + // mul v14.4S, v7.4S, v0.S[0] // ........................................................*............................... + // srshr v31.4S, v16.4S, #23 // .........................................................*.............................. + // sqrdmulh v7.4S, v7.4S, v0.S[1] // ..........................................................*............................. + // srshr v10.4S, v18.4S, #23 // ...........................................................*............................ + // mul v4.4S, v9.4S, v0.S[0] // ............................................................*........................... + // sqrdmulh v9.4S, v9.4S, v0.S[1] // .............................................................*.......................... + // mls v14.4S, v7.4S, v8.S[0] // ..............................................................*......................... + // mls v16.4S, v31.4S, v8.4S // ...............................................................*........................ + // mls v4.4S, v9.4S, v8.S[0] // ................................................................*....................... + // srshr v31.4S, v14.4S, #23 // .................................................................*...................... + // mls v18.4S, v10.4S, v8.4S // ..................................................................*..................... 
+ // str q16, [x0, #512] // ...................................................................*.................... + // mls v14.4S, v31.4S, v8.4S // ....................................................................*................... + // srshr v31.4S, v4.4S, #23 // .....................................................................*.................. + // mls v23.4S, v21.4S, v8.S[0] // ......................................................................*................. + // str q18, [x0, #640] // .......................................................................*................ + // mls v4.4S, v31.4S, v8.4S // ........................................................................*............... + // str q14, [x0, #768] // .........................................................................*.............. + // mul v21.4S, v17.4S, v25.4S // ..........................................................................*............. + // str q23, [x0], #(16) // ...........................................................................*............ + // sqrdmulh v17.4S, v17.4S, v26.4S // ............................................................................*........... + // str q4, [x0, #880] // .............................................................................*.......... + // mul v14.4S, v6.4S, v25.4S // ..............................................................................*......... + // sqrdmulh v6.4S, v6.4S, v26.4S // ...............................................................................*........ + // mls v21.4S, v17.4S, v8.S[0] // ................................................................................*....... + // sqrdmulh v17.4S, v28.4S, v26.4S // .................................................................................*...... + // mls v14.4S, v6.4S, v8.S[0] // ..................................................................................*..... 
+ // str q21, [x0, #112] // ...................................................................................*.... + // mul v21.4S, v28.4S, v25.4S // ....................................................................................*... + // mls v21.4S, v17.4S, v8.S[0] // .....................................................................................*.. + // str q14, [x0, #240] // ......................................................................................*. + // str q21, [x0, #368] // .......................................................................................* + + + pop_stack + ret \ No newline at end of file diff --git a/tests/ntt_dilithium/manual/intt_dilithium_123_45678_opt_m1_firestorm.s b/tests/ntt_dilithium/manual/intt_dilithium_123_45678_opt_m1_firestorm.s new file mode 100644 index 0000000..43d7438 --- /dev/null +++ b/tests/ntt_dilithium/manual/intt_dilithium_123_45678_opt_m1_firestorm.s @@ -0,0 +1,2216 @@ + +// Needed to provide ASM_LOAD directive +#include + +// NOTE +// We use a lot of trivial macros to simplify the parsing burden for Slothy +// The macros are not unfolded by Slothy and thus interpreted as instructions, +// which are easier to parse due to e.g. the lack of size specifiers and simpler +// syntax for pre and post increment for loads and stores. + +// Eventually, NeLight should include a proper parser for AArch64, +// but for initial investigations, the below is enough. 
+xtmp0 .req x10
+xtmp1 .req x11
+
+// ---------------------------------------------------------------------------
+// Thin load/store wrappers. Each takes a vector alias NAME and accesses the
+// register aliased as qform_NAME, so a matching qform_ alias must exist.
+// ---------------------------------------------------------------------------
+
+// Load qform_vec from [base + offset].
+.macro ldr_vo vec, base, offset
+        ldr qform_\vec, [\base, #\offset]
+.endm
+
+// Load qform_vec from [base], then post-increment base by inc.
+.macro ldr_vi vec, base, inc
+        ldr qform_\vec, [\base], #\inc
+.endm
+
+// Store qform_vec to [base + offset].
+.macro str_vo vec, base, offset
+        str qform_\vec, [\base, #\offset]
+.endm
+
+// Store qform_vec to [base], then post-increment base by inc.
+.macro str_vi vec, base, inc
+        str qform_\vec, [\base], #\inc
+.endm
+
+// ---------------------------------------------------------------------------
+// Arithmetic wrappers on 4 x 32-bit lanes. The q-suffixed variants take the
+// multiplier from a single lane (index i) of register b.
+// ---------------------------------------------------------------------------
+
+// d = sqrdmulh(a, b), vector by vector.
+.macro vqrdmulh d,a,b
+        sqrdmulh \d\().4s, \a\().4s, \b\().4s
+.endm
+
+// d -= a * b, vector by vector.
+.macro vmls d,a,b
+        mls \d\().4s, \a\().4s, \b\().4s
+.endm
+
+// d = sqrdmulh(a, b[i]), vector by element.
+.macro vqrdmulhq d,a,b,i
+        sqrdmulh \d\().4s, \a\().4s, \b\().s[\i]
+.endm
+
+// d = sqdmulh(a, b[i]), vector by element.
+// The element operand uses the same .s[i] form as the sibling by-element macros.
+.macro vqdmulhq d,a,b,i
+        sqdmulh \d\().4s, \a\().4s, \b\().s[\i]
+.endm
+
+// d = a * b[i], vector by element.
+.macro vmulq d,a,b,i
+        mul \d\().4s, \a\().4s, \b\().s[\i]
+.endm
+
+// d -= a * b[i], vector by element.
+.macro vmlsq d,a,b,i
+        mls \d\().4s, \a\().4s, \b\().s[\i]
+.endm
+
+// Modular multiplication by a constant held in two lanes of one register:
+//   dst  = src * const[idx0]
+//   src  = sqrdmulh(src, const[idx1])
+//   dst -= src * consts[0]
+// Clobbers src. Relies on the consts alias set up in the function body.
+.macro mulmodq dst, src, const, idx0, idx1
+        vmulq \dst, \src, \const, \idx0
+        vqrdmulhq \src, \src, \const, \idx1
+        vmlsq \dst, \src, consts, 0
+.endm
+
+// Same computation as mulmodq, but with the constant and its twisted
+// companion supplied as two full vectors. Clobbers src.
+.macro mulmod dst, src, const, const_twisted
+        mul \dst\().4s, \src\().4s, \const\().4s
+        vqrdmulh \src, \src, \const_twisted
+        vmlsq \dst, \src, consts, 0
+.endm
+
+// tmp = rounding shift right of a by 23 bits; a -= tmp * consts.
+// Clobbers the tmp alias.
+.macro montg_reduce a
+        srshr tmp.4S, \a\().4S, #23
+        vmls \a, tmp, consts
+.endm
+
+// tmp1 = (neg_modulus_half >= a), tmp2 = (a >= modulus_half) as lane masks,
+// tmp2 = tmp1 - tmp2, then a -= tmp2 * modulus.
+// NOTE(review): no alias named modulus is defined in the part of the file
+// visible here; confirm it exists before invoking this macro.
+.macro canonical_reduce a, modulus_half, neg_modulus_half, tmp1, tmp2
+        cmge \tmp1\().4s, \neg_modulus_half\().4s, \a\().4s
+        cmge \tmp2\().4s, \a\().4s, \modulus_half\().4s
+        sub \tmp2\().4s, \tmp1\().4s, \tmp2\().4s
+        vmls \a, \tmp2, modulus
+.endm
+
+// Gentleman-Sande butterfly with the root taken from lanes idx0/idx1 of root:
+//   tmp = a - b;  a = a + b;  b = mulmodq(tmp, root)
+// Clobbers the tmp alias.
+.macro gs_butterfly a, b, root, idx0, idx1
+        sub tmp.4s, \a\().4s, \b\().4s
+        add \a\().4s, \a\().4s, \b\().4s
+        mulmodq \b, tmp, \root, \idx0, \idx1
+.endm
+
+// Vector-by-vector modular multiplication.
+// NOTE(review): this calls a vmul macro and a modulus alias, neither of which
+// is defined in the part of the file visible here; gs_butterfly_v below uses
+// mulmod instead. Confirm before invoking this macro.
+.macro mulmod_v dst, src, const, const_twisted
+        vmul \dst, \src, \const
+        vqrdmulh \src, \src, \const_twisted
+        vmls \dst, \src, modulus
+.endm
+
+// Gentleman-Sande butterfly with the root and its twisted companion given as
+// full vectors. Clobbers the tmp alias.
+.macro gs_butterfly_v a, b, root, root_twisted
+        sub tmp.4s, \a\().4s, \b\().4s
+        add \a\().4s, \a\().4s, \b\().4s
+        mulmod \b, tmp, \root, \root_twisted
+.endm
+
+// Multiply four vectors by the ninv / ninv_tw pair via mulmod.
+// Each src register is clobbered by mulmod.
+.macro mul_ninv dst0, dst1, dst2, dst3, src0, src1, src2, src3
+        mulmod \dst0, \src0, ninv, ninv_tw
+        mulmod \dst1, \src1, ninv, ninv_tw
+        mulmod \dst2, \src2, ninv, ninv_tw
+        mulmod \dst3, \src3, ninv, ninv_tw
+.endm
+
+// Load four consecutive 16-byte vectors starting at addr.
+.macro load_vectors a0, a1, a2, a3, addr
+        ldr_vo \a0, \addr, (16*0)
+        ldr_vo \a1, \addr, (16*1)
+        ldr_vo \a2, \addr, (16*2)
+        ldr_vo \a3, \addr, (16*3)
+.endm
+
+// Load four consecutive 16-byte vectors starting at addr + offset.
+.macro load_vectors_with_offset a0, a1, a2, a3, addr, offset
+        ldr_vo \a0, \addr, (16*0 + (\offset))
+        ldr_vo \a1, \addr, (16*1 + (\offset))
+        ldr_vo \a2, \addr, (16*2 + (\offset))
+        ldr_vo \a3, \addr, (16*3 + (\offset))
+.endm
+
+// Store four consecutive 16-byte vectors at addr and advance addr by inc.
+// The first store post-increments addr, so the remaining offsets are taken
+// relative to the already advanced pointer (hence the -inc term).
+.macro store_vectors_with_inc a0, a1, a2, a3, addr, inc
+        str_vi \a0, \addr, \inc
+        str_vo \a1, \addr, (-(\inc) + 16*1)
+        str_vo \a2, \addr, (-(\inc) + 16*2)
+        str_vo \a3, \addr, (-(\inc) + 16*3)
+.endm
+
+// Move both 64-bit halves of vectors in0..in3 into the general-purpose
+// registers out_00 .. out_31 (row index first, half index second).
+.macro vec_to_scalar_matrix out, in
+        vext \out\()_00, \in\()0, 0
+        vext \out\()_01, \in\()0, 1
+        vext \out\()_10, \in\()1, 0
+        vext \out\()_11, \in\()1, 1
+        vext \out\()_20, \in\()2, 0
+        vext \out\()_21, \in\()2, 1
+        vext \out\()_30, \in\()3, 0
+        vext \out\()_31, \in\()3, 1
+.endm
+
+// Store the eight 64-bit registers xt_00 .. xt_31 (x is the alias prefix) to
+// consecutive 8-byte slots at addr and advance addr by inc. As above, the
+// first store post-increments addr and the rest compensate with -inc.
+.macro store_scalar_matrix_with_inc x, addr, inc
+        str \x\()t_00, [\addr], #( \inc)
+        str \x\()t_01, [\addr, #(-\inc + 8*1)]
+        str \x\()t_10, [\addr, #(-\inc + 8*2)]
+        str \x\()t_11, [\addr, #(-\inc + 8*3)]
+        str \x\()t_20, [\addr, #(-\inc + 8*4)]
+        str \x\()t_21, [\addr, #(-\inc + 8*5)]
+        str \x\()t_30, [\addr, #(-\inc + 8*6)]
+        str \x\()t_31, [\addr, #(-\inc + 8*7)]
+.endm
+
+// Copy 64-bit lane number lane of vec_in into general-purpose register gpr_out.
+.macro vext gpr_out, vec_in, lane
+        umov \gpr_out\(), \vec_in\().d[\lane]
+.endm
+
+// Load root0..root3 (64 bytes) from r_ptr0 and advance r_ptr0 by 64.
+.macro load_roots_123
+        ldr_vi root0, r_ptr0, 64
+        ldr_vo root1, r_ptr0, (-64 + 16)
+        ldr_vo root2, r_ptr0, (-64 + 32)
+        ldr_vo root3, r_ptr0, (-64 + 48)
+.endm
+
+// Same access pattern as load_roots_123, kept as a separate name.
+.macro load_roots_456
+        ldr_vi root0, r_ptr0, 64
+        ldr_vo root1, r_ptr0, (-64 + 16)
+        ldr_vo root2, r_ptr0, (-64 + 32)
+        ldr_vo root3, r_ptr0, (-64 + 48)
+.endm
+
+// Load the first six vectors (root0..root2 with their _tw companions) of a
+// 12-vector group from r_ptr1 and advance r_ptr1 past the whole group.
+.macro load_roots_78_part1
+        ldr_vi root0, r_ptr1, (12*16)
+        ldr_vo root0_tw, r_ptr1, (-12*16 + 1*16)
+        ldr_vo root1, r_ptr1, (-12*16 + 2*16)
+        ldr_vo root1_tw, r_ptr1, (-12*16 + 3*16)
+        ldr_vo root2, r_ptr1, (-12*16 + 4*16)
+        ldr_vo root2_tw, r_ptr1, (-12*16 + 5*16)
+.endm
+
+// Load the remaining six vectors of the group. r_ptr1 is not modified; the
+// negative offsets assume load_roots_78_part1 has already advanced it.
+.macro load_roots_78_part2
+        ldr_vo root0, r_ptr1, (-12*16 + 6*16)
+        ldr_vo root0_tw, r_ptr1, (-12*16 + 7*16)
+        ldr_vo root1, r_ptr1, (-12*16 + 8*16)
+        ldr_vo root1_tw, r_ptr1, (-12*16 + 9*16)
+        ldr_vo root2, r_ptr1, (-12*16 + 10*16)
+        ldr_vo root2_tw, r_ptr1, (-12*16 + 11*16)
+.endm
+
+// In-place 4x4 transpose of 32-bit lanes across data0..data3.
+// Clobbers the t0..t3 aliases.
+.macro transpose4 data0, data1, data2, data3
+        trn1 t0.4s, \data0\().4s, \data1\().4s
+        trn2 t1.4s, \data0\().4s, \data1\().4s
+        trn1 t2.4s, \data2\().4s, \data3\().4s
+        trn2 t3.4s, \data2\().4s, \data3\().4s
+
+        trn2 \data2\().2d, t0.2d, t2.2d
+        trn2 \data3\().2d, t1.2d, t3.2d
+        trn1 \data0\().2d, t0.2d, t2.2d
+        trn1 \data1\().2d, t1.2d, t3.2d
+.endm
+
+// Only the 32-bit interleave stage of the transpose, written to separate
+// output registers; the inputs are left untouched.
+.macro transpose_single data_out0, data_out1, data_out2, data_out3, data_in0, data_in1, data_in2, data_in3
+        trn1 \data_out0\().4s, \data_in0\().4s, \data_in1\().4s
+        trn2 \data_out1\().4s, \data_in0\().4s, \data_in1\().4s
+        trn1 \data_out2\().4s, \data_in2\().4s, \data_in3\().4s
+        trn2 \data_out3\().4s, \data_in2\().4s, \data_in3\().4s
+.endm
+
+// Reserve 6*16 bytes of stack and spill x19..x30 into it.
+.macro save_gprs // slothy:no-unfold
+        sub sp, sp, #(16*6)
+        stp x19, x20, [sp, #16*0]
+        stp x21, x22, [sp, #16*1]
+        stp x23, x24, [sp, #16*2]
+        stp x25, x26, [sp, #16*3]
+        stp x27, x28, [sp, #16*4]
+        stp x29, x30, [sp, #16*5]
+.endm
+
+// Reload x19..x30 and release the 6*16 bytes reserved by save_gprs.
+.macro restore_gprs // slothy:no-unfold
+        ldp x19, x20, [sp, #16*0]
+        ldp x21, x22, [sp, #16*1]
+        ldp x23, x24, [sp, #16*2]
+        ldp x25, x26, [sp, #16*3]
+        ldp x27, x28, [sp, #16*4]
+        ldp x29, x30, [sp, #16*5]
+        add sp, sp, #(16*6)
+.endm
+
+// Reserve 4*16 bytes of stack and spill d8..d15 (the low 64 bits of v8..v15).
+.macro save_vregs // slothy:no-unfold
+        sub sp, sp, #(16*4)
+        stp d8, d9, [sp, #16*0]
+        stp d10, d11, [sp, #16*1]
+        stp d12, d13, [sp, #16*2]
+        stp d14, d15, [sp, #16*3]
+.endm
+
+// Reload d8..d15 and release the 4*16 bytes reserved by save_vregs.
+.macro restore_vregs // slothy:no-unfold
+        ldp d8, d9, [sp, #16*0]
+        ldp d10, d11, [sp, #16*1]
+        ldp d12, d13, [sp, #16*2]
+        ldp d14, d15, [sp, #16*3]
+        add sp, sp, #(16*4)
+.endm
+
+// Size of the scratch area reserved by push_stack, and the offset of its
+// only slot.
+#define STACK_SIZE 16
+#define STACK0 0
+
+// Reload register a from the scratch slot at sp + loc.
+.macro restore a, loc // slothy:no-unfold
+        ldr \a, [sp, #\loc\()]
+.endm
+// Spill register a to the scratch slot at sp + loc.
+.macro save loc, a // slothy:no-unfold
+        str \a, [sp, #\loc\()]
+.endm
+// Function prologue: spill x19..x30 and d8..d15, then reserve the scratch area.
+.macro push_stack // slothy:no-unfold
+        save_gprs
+        save_vregs
+        sub sp, sp, #STACK_SIZE
+.endm
+
+// Function epilogue: undo push_stack in reverse order.
+.macro pop_stack // slothy:no-unfold
+        add sp, sp, #STACK_SIZE
+        restore_vregs
+        restore_gprs
+.endm
+
+// Twiddle-factor table, pulled in from a separate file into .data.
+.data
+.p2align 4
+roots:
+#include "intt_dilithium_123_456_78_twiddles.s"
+.text
+
+        .global intt_dilithium_123_45678_opt_m1_firestorm
+        .global _intt_dilithium_123_45678_opt_m1_firestorm
+
+// Constants placed in .text next to the code. The first word at const_addr is
+// 8380417 (the Dilithium modulus); the function body broadcasts it into the
+// consts register with ld1r.
+.p2align 4
+const_addr: .word 8380417
+        .word 0
+        .word 0
+        .word 0
+ninv_addr: .quad 16382
+ninv_tw_addr: .quad 4197891
+intt_dilithium_123_45678_opt_m1_firestorm:
+_intt_dilithium_123_45678_opt_m1_firestorm:
+        push_stack
+
+    // General-purpose register aliases.
+    in .req x0
+    inp .req x1
+    inpp .req x2
+    count .req x3
+    r_ptr0 .req x4
+    r_ptr1 .req x5
+    xtmp .req x6
+
+    // Coefficient vectors and their Q-register views used by ldr_vo/str_vo.
+    data0 .req v9
+    data1 .req v10
+    data2 .req v11
+    data3 .req v12
+    data4 .req v13
+    data5 .req v14
+    data6 .req v15
+    data7 .req v16
+
+    qform_data0 .req q9
+    qform_data1 .req q10
+    qform_data2 .req q11
+    qform_data3 .req q12
+    qform_data4 .req q13
+    qform_data5 .req q14
+    qform_data6 .req q15
+    qform_data7 .req q16
+
+    // Q-register views of the raw vector names v0..v31.
+    qform_v0 .req q0
+    qform_v1 .req q1
+    qform_v2 .req q2
+    qform_v3 .req q3
+    qform_v4 .req q4
+    qform_v5 .req q5
+    qform_v6 .req q6
+    qform_v7 .req q7
+    qform_v8 .req q8
+    qform_v9 .req q9
+    qform_v10 .req q10
+    qform_v11 .req q11
+    qform_v12 .req q12
+    qform_v13 .req q13
+    qform_v14 .req q14
+    qform_v15 .req q15
+    qform_v16 .req q16
+    qform_v17 .req q17
+    qform_v18 .req q18
+    qform_v19 .req q19
+    qform_v20 .req q20
+    qform_v21 .req q21
+    qform_v22 .req q22
+    qform_v23 .req q23
+    qform_v24 .req q24
+    qform_v25 .req q25
+    qform_v26 .req q26
+    qform_v27 .req q27
+    qform_v28 .req q28
+    qform_v29 .req q29
+    qform_v30 .req q30
+    qform_v31 .req q31
+
+    // Scalar matrix registers used by vec_to_scalar_matrix.
+    x_00 .req x10
+    x_01 .req x11
+    x_10 .req x12
+    x_11 .req x13
+    x_20 .req x14
+    x_21 .req x15
+    x_30 .req x16
+    x_31 .req x17
+
+    // Permuted view of the same registers used by store_scalar_matrix_with_inc.
+    xt_00 .req x_00
+    xt_01 .req x_20
+
xt_10 .req x_10 + xt_11 .req x_30 + xt_20 .req x_01 + xt_21 .req x_21 + xt_30 .req x_11 + xt_31 .req x_31 + + root0 .req v0 + root1 .req v1 + root2 .req v2 + root3 .req v3 + + qform_root0 .req q0 + qform_root1 .req q1 + qform_root2 .req q2 + qform_root3 .req q3 + + tmp .req v24 + t0 .req v25 + t1 .req v26 + t2 .req v27 + t3 .req v28 + tp0 .req v17 + tp1 .req v18 + tp2 .req v19 + tp3 .req v20 + + consts .req v8 + qform_consts .req q8 + + ASM_LOAD(r_ptr0, roots_l345) + ASM_LOAD(r_ptr1, roots_l67) + + ASM_LOAD(xtmp, const_addr) + ld1r {consts.4s}, [xtmp] + save STACK0, in + + restore inp, STACK0 + mov inp, in + add inpp, inp, #64 + mov count, #8 + + root0_tw .req v4 + root1_tw .req v5 + root2_tw .req v6 + root3_tw .req v7 + qform_root0_tw .req q4 + qform_root1_tw .req q5 + qform_root2_tw .req q6 + qform_root3_tw .req q7 + + .p2align 2 + ldr q7, [x5, #80] // ..............*............................................................................................................... + ldr q26, [x5, #48] // .........*.................................................................................................................... + ld4 {v18.4S, v19.4S, v20.4S, v21.4S}, [x1] // .......*...................................................................................................................... + ldr q25, [x5, #16] // ..........*................................................................................................................... + ld4 {v10.4S, v11.4S, v12.4S, v13.4S}, [x2] // ................*............................................................................................................. + // gap // .............................................................................................................................. + // gap // .............................................................................................................................. 
+ // gap // .............................................................................................................................. + ldr q9, [x5, #32] // ......*....................................................................................................................... + ldr q4, [x5, #144] // ....*......................................................................................................................... + ldr q28, [x5, #96] // ..*........................................................................................................................... + // gap // .............................................................................................................................. + // gap // .............................................................................................................................. + // gap // .............................................................................................................................. + // gap // .............................................................................................................................. + // gap // .............................................................................................................................. + ldr q5, [x5, #64] // ........*..................................................................................................................... + ldr q6, [x5, #160] // ............*................................................................................................................. + // gap // .............................................................................................................................. + // gap // .............................................................................................................................. 
+ // gap // .............................................................................................................................. + // gap // .............................................................................................................................. + // gap // .............................................................................................................................. + // gap // .............................................................................................................................. + ldr q3, [x5, #176] // .....*........................................................................................................................ + ldr q29, [x5, #112] // ...*.......................................................................................................................... + // gap // .............................................................................................................................. + // gap // .............................................................................................................................. + // gap // .............................................................................................................................. + // gap // .............................................................................................................................. + // gap // .............................................................................................................................. + // gap // .............................................................................................................................. + ldr q22, [x5, #128] // .....................*........................................................................................................ 
+ ldr q31, [x5], #(12*16) // .............*................................................................................................................ + // gap // .............................................................................................................................. + // gap // .............................................................................................................................. + // gap // .............................................................................................................................. + // gap // .............................................................................................................................. + // gap // .............................................................................................................................. + // gap // .............................................................................................................................. + // gap // .............................................................................................................................. + // gap // .............................................................................................................................. + // gap // .............................................................................................................................. + // gap // .............................................................................................................................. + // gap // .............................................................................................................................. + // gap // .............................................................................................................................. + // gap // .............................................................................................................................. 
+ // gap // .............................................................................................................................. + // gap // .............................................................................................................................. + sub v2.4S, v12.4S, v13.4S // ...............................*.............................................................................................. + sub v1.4S, v20.4S, v21.4S // ...................*.......................................................................................................... + add v14.4S, v18.4S, v19.4S // ..................*........................................................................................................... + // gap // .............................................................................................................................. + sub v17.4S, v10.4S, v11.4S // ................................*............................................................................................. + // gap // .............................................................................................................................. + // gap // .............................................................................................................................. + add v23.4S, v12.4S, v13.4S // ..................................*........................................................................................... + sub v27.4S, v18.4S, v19.4S // .................*............................................................................................................ + add v0.4S, v10.4S, v11.4S // ....................................*......................................................................................... + add v11.4S, v20.4S, v21.4S // ....................*......................................................................................................... 
+ // gap // .............................................................................................................................. + // gap // .............................................................................................................................. + // gap // .............................................................................................................................. + // gap // .............................................................................................................................. + mul v24.4S, v2.4S, v6.4S // .....................................*........................................................................................ + sqrdmulh v30.4S, v2.4S, v3.4S // ......................................*....................................................................................... + sqrdmulh v19.4S, v17.4S, v4.4S // .......................................*...................................................................................... + mul v16.4S, v17.4S, v22.4S // ........................................*..................................................................................... + ldr q3, [x4, #16] // .*............................................................................................................................ + // gap // .............................................................................................................................. + // gap // .............................................................................................................................. + // gap // .............................................................................................................................. + mul v15.4S, v27.4S, v9.4S // ..........................*................................................................................................... 
+ sqrdmulh v27.4S, v27.4S, v26.4S // ...........................*.................................................................................................. + mul v9.4S, v1.4S, v5.4S // ........................*..................................................................................................... + sqrdmulh v1.4S, v1.4S, v7.4S // .........................*.................................................................................................... + // gap // .............................................................................................................................. + // gap // .............................................................................................................................. + // gap // .............................................................................................................................. + // gap // .............................................................................................................................. + // gap // .............................................................................................................................. + sub v10.4S, v0.4S, v23.4S // .........................................*.................................................................................... + // gap // .............................................................................................................................. + // gap // .............................................................................................................................. + // gap // .............................................................................................................................. + // gap // .............................................................................................................................. 
+ // gap // .............................................................................................................................. + // gap // .............................................................................................................................. + sub v6.4S, v14.4S, v11.4S // ......................*....................................................................................................... + // gap // .............................................................................................................................. + mls v24.4S, v30.4S, v8.S[0] // ................................................*............................................................................. + mls v16.4S, v19.4S, v8.S[0] // ...............................................*.............................................................................. + // gap // .............................................................................................................................. + // gap // .............................................................................................................................. + // gap // .............................................................................................................................. + // gap // .............................................................................................................................. + mls v15.4S, v27.4S, v8.S[0] // .................................*............................................................................................ + mls v9.4S, v1.4S, v8.S[0] // ..............................*............................................................................................... + mul v19.4S, v10.4S, v28.4S // ..................................................*........................................................................... 
+ sqrdmulh v10.4S, v10.4S, v29.4S // ......................................................*....................................................................... + // gap // .............................................................................................................................. + // gap // .............................................................................................................................. + // gap // .............................................................................................................................. + // gap // .............................................................................................................................. + sqrdmulh v27.4S, v6.4S, v25.4S // .............................*................................................................................................ + // gap // .............................................................................................................................. + mul v21.4S, v6.4S, v31.4S // ............................*................................................................................................. + // gap // .............................................................................................................................. + // gap // .............................................................................................................................. + // gap // .............................................................................................................................. + // gap // .............................................................................................................................. + // gap // .............................................................................................................................. 
+ add v26.4S, v0.4S, v23.4S // ..........................................*................................................................................... + // gap // .............................................................................................................................. + sub v2.4S, v16.4S, v24.4S // ....................................................*......................................................................... + // gap // .............................................................................................................................. + // gap // .............................................................................................................................. + // gap // .............................................................................................................................. + // gap // .............................................................................................................................. + // gap // .............................................................................................................................. + mls v19.4S, v10.4S, v8.S[0] // ..............................................................*............................................................... + sub v7.4S, v15.4S, v9.4S // ...........................................*.................................................................................. + add v13.4S, v15.4S, v9.4S // ............................................*................................................................................. + add v10.4S, v14.4S, v11.4S // .......................*...................................................................................................... + // gap // .............................................................................................................................. 
+ // gap // .............................................................................................................................. + // gap // .............................................................................................................................. + // gap // .............................................................................................................................. + mls v21.4S, v27.4S, v8.S[0] // ...................................*.......................................................................................... + sqrdmulh v9.4S, v2.4S, v29.4S // ...........................................................*.................................................................. + mul v27.4S, v2.4S, v28.4S // ..........................................................*................................................................... + // gap // .............................................................................................................................. + // gap // .............................................................................................................................. + // gap // .............................................................................................................................. + // gap // .............................................................................................................................. + // gap // .............................................................................................................................. + trn2 v12.4S, v10.4S, v13.4S // .......................................................*...................................................................... + trn1 v22.4S, v10.4S, v13.4S // .................................................*............................................................................ 
+ sqrdmulh v4.4S, v7.4S, v25.4S // ..............................................*............................................................................... + ldr q29, [x4, #32] // ...............*.............................................................................................................. + mul v20.4S, v7.4S, v31.4S // .............................................*................................................................................ + // gap // .............................................................................................................................. + // gap // .............................................................................................................................. + // gap // .............................................................................................................................. + add v1.4S, v16.4S, v24.4S // ...................................................*.......................................................................... + ldr q14, [x4, #48] // *............................................................................................................................. + // gap // .............................................................................................................................. + // gap // .............................................................................................................................. + // gap // .............................................................................................................................. + // gap // .............................................................................................................................. + // gap // .............................................................................................................................. 
+ // gap // .............................................................................................................................. + mls v27.4S, v9.4S, v8.S[0] // ...............................................................*.............................................................. + ldr q9, [x4], #64 // ...........*.................................................................................................................. + // gap // .............................................................................................................................. + // gap // .............................................................................................................................. + // gap // .............................................................................................................................. + // gap // .............................................................................................................................. + // gap // .............................................................................................................................. + // gap // .............................................................................................................................. + mls v20.4S, v4.4S, v8.S[0] // .....................................................*........................................................................ + // gap // .............................................................................................................................. + // gap // .............................................................................................................................. + // gap // .............................................................................................................................. + // gap // .............................................................................................................................. 
+ // gap // .............................................................................................................................. + // gap // .............................................................................................................................. + // gap // .............................................................................................................................. + trn2 v7.4S, v26.4S, v1.4S // .........................................................*.................................................................... + // gap // .............................................................................................................................. + // gap // .............................................................................................................................. + // gap // .............................................................................................................................. + // gap // .............................................................................................................................. + // gap // .............................................................................................................................. + // gap // .............................................................................................................................. + // gap // .............................................................................................................................. + trn2 v18.4S, v19.4S, v27.4S // ......................................................................*....................................................... + trn1 v16.4S, v26.4S, v1.4S // ........................................................*..................................................................... 
+ trn1 v10.4S, v19.4S, v27.4S // .........................................................................*.................................................... + // gap // .............................................................................................................................. + // gap // .............................................................................................................................. + // gap // .............................................................................................................................. + // gap // .............................................................................................................................. + // gap // .............................................................................................................................. + trn2 v27.4S, v21.4S, v20.4S // ............................................................*................................................................. + trn1 v11.4S, v21.4S, v20.4S // .............................................................*................................................................ + // gap // .............................................................................................................................. + // gap // .............................................................................................................................. + // gap // .............................................................................................................................. + // gap // .............................................................................................................................. + // gap // .............................................................................................................................. 
+ // gap // .............................................................................................................................. + trn2 v20.2D, v7.2D, v18.2D // ............................................................................*................................................. + trn1 v30.2D, v7.2D, v18.2D // .............................................................................*................................................ + trn2 v7.2D, v16.2D, v10.2D // ..............................................................................*............................................... + trn1 v2.2D, v16.2D, v10.2D // ...............................................................................*.............................................. + // gap // .............................................................................................................................. + // gap // .............................................................................................................................. + // gap // .............................................................................................................................. + // gap // .............................................................................................................................. + trn2 v13.2D, v12.2D, v27.2D // ..................................................................*........................................................... + trn1 v31.2D, v12.2D, v27.2D // ...................................................................*.......................................................... + trn2 v19.2D, v22.2D, v11.2D // ................................................................*............................................................. + trn1 v0.2D, v22.2D, v11.2D // .................................................................*............................................................ 
+ // gap // .............................................................................................................................. + // gap // .............................................................................................................................. + // gap // .............................................................................................................................. + // gap // .............................................................................................................................. + sub v22.4S, v2.4S, v30.4S // ..................................................................................*........................................... + sub v18.4S, v7.4S, v20.4S // .....................................................................................*........................................ + // gap // .............................................................................................................................. + // gap // .............................................................................................................................. + // gap // .............................................................................................................................. + // gap // .............................................................................................................................. + // gap // .............................................................................................................................. + // gap // .............................................................................................................................. + sub v11.4S, v19.4S, v13.4S // ....................................................................*......................................................... 
+ sub v6.4S, v0.4S, v31.4S // ........................................................................*..................................................... + add v10.4S, v7.4S, v20.4S // ......................................................................................*....................................... + add v1.4S, v2.4S, v30.4S // ....................................................................................*......................................... + // gap // .............................................................................................................................. + // gap // .............................................................................................................................. + // gap // .............................................................................................................................. + // gap // .............................................................................................................................. + mul v24.4S, v22.4S, v29.S[2] // ........................................................................................*..................................... + sqrdmulh v26.4S, v22.4S, v29.S[3] // .........................................................................................*.................................... + sqrdmulh v22.4S, v18.4S, v14.S[1] // ..........................................................................................*................................... + mul v15.4S, v18.4S, v14.S[0] // ...........................................................................................*.................................. + // gap // .............................................................................................................................. + // gap // .............................................................................................................................. 
+ // gap // .............................................................................................................................. + // gap // .............................................................................................................................. + mul v12.4S, v11.4S, v29.S[0] // ..........................................................................*................................................... + sqrdmulh v23.4S, v11.4S, v29.S[1] // ...........................................................................*.................................................. + sqrdmulh v11.4S, v6.4S, v3.S[3] // ................................................................................*............................................. + mul v6.4S, v6.4S, v3.S[2] // .................................................................................*............................................ + // gap // .............................................................................................................................. + // gap // .............................................................................................................................. + // gap // .............................................................................................................................. + // gap // .............................................................................................................................. + add v5.4S, v19.4S, v13.4S // .....................................................................*........................................................ + add v31.4S, v0.4S, v31.4S // .......................................................................*...................................................... + add v13.4S, v1.4S, v10.4S // ..............................................................................................*............................... 
+ // gap // .............................................................................................................................. + // gap // .............................................................................................................................. + // gap // .............................................................................................................................. + // gap // .............................................................................................................................. + // gap // .............................................................................................................................. + mls v24.4S, v26.4S, v8.S[0] // .................................................................................................*............................ + // gap // .............................................................................................................................. + // gap // .............................................................................................................................. + // gap // .............................................................................................................................. + // gap // .............................................................................................................................. + // gap // .............................................................................................................................. + // gap // .............................................................................................................................. + // gap // .............................................................................................................................. + mls v6.4S, v11.4S, v8.S[0] // ............................................................................................*................................. 
+ mls v12.4S, v23.4S, v8.S[0] // .......................................................................................*...................................... + mls v15.4S, v22.4S, v8.S[0] // ....................................................................................................*......................... + add v27.4S, v31.4S, v5.4S // ...............................................................................................*.............................. + // gap // .............................................................................................................................. + // gap // .............................................................................................................................. + // gap // .............................................................................................................................. + // gap // .............................................................................................................................. + sub v2.4S, v31.4S, v5.4S // ...................................................................................*.......................................... + sub v10.4S, v1.4S, v10.4S // ........................................................................................................*..................... + // gap // .............................................................................................................................. + // gap // .............................................................................................................................. + // gap // .............................................................................................................................. + // gap // .............................................................................................................................. 
+ // gap // .............................................................................................................................. + // gap // .............................................................................................................................. + sub v18.4S, v27.4S, v13.4S // ..................................................................................................*........................... + // gap // .............................................................................................................................. + // gap // .............................................................................................................................. + // gap // .............................................................................................................................. + // gap // .............................................................................................................................. + // gap // .............................................................................................................................. + // gap // .............................................................................................................................. + // gap // .............................................................................................................................. + add v7.4S, v6.4S, v12.4S // ......................................................................................................*....................... + sub v30.4S, v6.4S, v12.4S // .....................................................................................................*........................ + add v28.4S, v24.4S, v15.4S // .............................................................................................................*................ 
+ sqrdmulh v16.4S, v10.4S, v3.S[1] // .................................................................................................................*............ + // gap // .............................................................................................................................. + // gap // .............................................................................................................................. + // gap // .............................................................................................................................. + // gap // .............................................................................................................................. + sqrdmulh v26.4S, v18.4S, v9.S[1] // .........................................................................................................*.................... + mul v31.4S, v18.4S, v9.S[0] // ..........................................................................................................*................... + sub v18.4S, v24.4S, v15.4S // ............................................................................................................*................. + mul v4.4S, v10.4S, v3.S[0] // ................................................................................................................*............. + // gap // .............................................................................................................................. + // gap // .............................................................................................................................. + // gap // .............................................................................................................................. + // gap // .............................................................................................................................. 
+ sub v10.4S, v7.4S, v28.4S // ....................................................................................................................*......... + mul v22.4S, v30.4S, v9.S[2] // ..............................................................................................................*............... + mul v19.4S, v2.4S, v9.S[2] // ................................................................................................*............................. + // gap // .............................................................................................................................. + // gap // .............................................................................................................................. + // gap // .............................................................................................................................. + // gap // .............................................................................................................................. + // gap // .............................................................................................................................. + sqrdmulh v0.4S, v18.4S, v3.S[1] // ..................................................................................................................*........... + mul v3.4S, v18.4S, v3.S[0] // ...................................................................................................................*.......... + sqrdmulh v18.4S, v30.4S, v9.S[3] // ...............................................................................................................*.............. + // gap // .............................................................................................................................. + // gap // .............................................................................................................................. 
+ // gap // .............................................................................................................................. + // gap // .............................................................................................................................. + // gap // .............................................................................................................................. + mls v31.4S, v26.4S, v8.S[0] // .......................................................................................................................*...... + sqrdmulh v26.4S, v2.4S, v9.S[3] // .............................................................................................*................................ + mul v24.4S, v10.4S, v9.S[0] // ........................................................................................................................*..... + sqrdmulh v10.4S, v10.4S, v9.S[1] // .........................................................................................................................*.... + // gap // .............................................................................................................................. + // gap // .............................................................................................................................. + // gap // .............................................................................................................................. + // gap // .............................................................................................................................. + add v2.4S, v27.4S, v13.4S // ...................................................................................................*.......................... + mls v4.4S, v16.4S, v8.S[0] // ...........................................................................................................................*.. 
+ // gap // .............................................................................................................................. + // gap // .............................................................................................................................. + // gap // .............................................................................................................................. + // gap // .............................................................................................................................. + // gap // .............................................................................................................................. + // gap // .............................................................................................................................. + mls v22.4S, v18.4S, v8.S[0] // ......................................................................................................................*....... + mls v3.4S, v0.4S, v8.S[0] // ..........................................................................................................................*... + // gap // .............................................................................................................................. + // gap // .............................................................................................................................. + // gap // .............................................................................................................................. + // gap // .............................................................................................................................. + // gap // .............................................................................................................................. + // gap // .............................................................................................................................. 
+ mls v19.4S, v26.4S, v8.S[0] // .......................................................................................................*...................... + add v7.4S, v7.4S, v28.4S // .....................................................................................................................*........ + str q31, [x2], #(16*4) // ............................................................................................................................*. + mls v24.4S, v10.4S, v8.S[0] // .............................................................................................................................* + str q2, [x1], #(16*4) // ...........................................................................................................*.................. + // gap // .............................................................................................................................. + // gap // .............................................................................................................................. + // gap // .............................................................................................................................. + + // original source code + // ldr q14, [x4, #48] // ........................................................*..................................................................... + // ldr q31, [x4, #16] // ..........................*................................................................................................... + // ldr q13, [x5, #96] // .......*...................................................................................................................... + // ldr q11, [x5, #112] // ...........*.................................................................................................................. 
+ // ldr q22, [x5, #144] // ......*....................................................................................................................... + // ldr q25, [x5, #176] // ..........*................................................................................................................... + // ldr q29, [x5, #32] // .....*........................................................................................................................ + // ld4 {v17.4S, v18.4S, v19.4S, v20.4S}, [x1] // ..*........................................................................................................................... + // ldr q5, [x5, #64] // ........*..................................................................................................................... + // ldr q0, [x5, #48] // .*............................................................................................................................ + // ldr q28, [x5, #16] // ...*.......................................................................................................................... + // ldr q9, [x4], #64 // ..........................................................*................................................................... + // ldr q26, [x5, #160] // .........*.................................................................................................................... + // ldr q6, [x5], #(12*16) // .............*................................................................................................................ + // ldr q16, [x5, #-112] // *............................................................................................................................. + // ldr q21, [x4, #-32] // .....................................................*........................................................................ 
+ // ld4 {v1.4S, v2.4S, v3.4S, v4.4S}, [x2] // ....*......................................................................................................................... + // sub v24.4S, v17.4S, v18.4S // ...................*.......................................................................................................... + // add v15.4S, v17.4S, v18.4S // ................*............................................................................................................. + // sub v10.4S, v19.4S, v20.4S // ...............*.............................................................................................................. + // add v19.4S, v19.4S, v20.4S // .....................*........................................................................................................ + // ldr q27, [x5, #-64] // ............*................................................................................................................. + // sub v17.4S, v15.4S, v19.4S // ................................*............................................................................................. + // add v30.4S, v15.4S, v19.4S // ..............................................*............................................................................... + // mul v5.4S, v10.4S, v5.4S // .............................*................................................................................................ + // sqrdmulh v7.4S, v10.4S, v16.4S // ..............................*............................................................................................... + // mul v16.4S, v24.4S, v29.4S // ...........................*.................................................................................................. + // sqrdmulh v29.4S, v24.4S, v0.4S // ............................*................................................................................................. 
+ // mul v23.4S, v17.4S, v6.4S // ........................................*..................................................................................... + // sqrdmulh v19.4S, v17.4S, v28.4S // .......................................*...................................................................................... + // mls v5.4S, v7.4S, v8.S[0] // ....................................*......................................................................................... + // sub v0.4S, v3.4S, v4.4S // ..............*............................................................................................................... + // sub v10.4S, v1.4S, v2.4S // .................*............................................................................................................ + // mls v16.4S, v29.4S, v8.S[0] // ...................................*.......................................................................................... + // add v3.4S, v3.4S, v4.4S // ..................*........................................................................................................... + // mls v23.4S, v19.4S, v8.S[0] // ...............................................*.............................................................................. + // add v20.4S, v1.4S, v2.4S // ....................*......................................................................................................... + // mul v1.4S, v0.4S, v26.4S // ......................*....................................................................................................... + // sqrdmulh v24.4S, v0.4S, v25.4S // .......................*...................................................................................................... + // sqrdmulh v0.4S, v10.4S, v22.4S // ........................*..................................................................................................... 
+ // mul v4.4S, v10.4S, v27.4S // .........................*.................................................................................................... + // sub v18.4S, v20.4S, v3.4S // ...............................*.............................................................................................. + // add v12.4S, v20.4S, v3.4S // .........................................*.................................................................................... + // sub v17.4S, v16.4S, v5.4S // ............................................*................................................................................. + // add v7.4S, v16.4S, v5.4S // .............................................*................................................................................ + // mul v6.4S, v17.4S, v6.4S // ......................................................*....................................................................... + // sqrdmulh v27.4S, v17.4S, v28.4S // ....................................................*......................................................................... + // mls v4.4S, v0.4S, v8.S[0] // ..................................*........................................................................................... + // mls v1.4S, v24.4S, v8.S[0] // .................................*............................................................................................ + // trn1 v25.4S, v30.4S, v7.4S // ...................................................*.......................................................................... + // mul v5.4S, v18.4S, v13.4S // .....................................*........................................................................................ + // add v10.4S, v4.4S, v1.4S // .......................................................*...................................................................... 
+ // sub v29.4S, v4.4S, v1.4S // ..........................................*................................................................................... + // mls v6.4S, v27.4S, v8.S[0] // ...........................................................*.................................................................. + // sqrdmulh v16.4S, v18.4S, v11.4S // ......................................*....................................................................................... + // trn2 v19.4S, v30.4S, v7.4S // ..................................................*........................................................................... + // trn1 v15.4S, v12.4S, v10.4S // ..............................................................*............................................................... + // trn2 v4.4S, v12.4S, v10.4S // ............................................................*................................................................. + // mul v27.4S, v29.4S, v13.4S // .................................................*............................................................................ + // sqrdmulh v11.4S, v29.4S, v11.4S // ................................................*............................................................................. + // trn2 v2.4S, v23.4S, v6.4S // ................................................................*............................................................. + // trn1 v24.4S, v23.4S, v6.4S // .................................................................*............................................................ + // mls v5.4S, v16.4S, v8.S[0] // ...........................................*.................................................................................. + // mls v27.4S, v11.4S, v8.S[0] // .........................................................*.................................................................... 
+ // trn2 v20.2D, v25.2D, v24.2D // ........................................................................*..................................................... + // trn1 v18.2D, v25.2D, v24.2D // .........................................................................*.................................................... + // trn2 v0.2D, v19.2D, v2.2D // ......................................................................*....................................................... + // trn1 v13.2D, v19.2D, v2.2D // .......................................................................*...................................................... + // sub v24.4S, v20.4S, v0.4S // ............................................................................*................................................. + // add v29.4S, v20.4S, v0.4S // ........................................................................................*..................................... + // trn2 v2.4S, v5.4S, v27.4S // .............................................................*................................................................ + // add v1.4S, v18.4S, v13.4S // .........................................................................................*.................................... + // sub v18.4S, v18.4S, v13.4S // .............................................................................*................................................ + // trn1 v3.4S, v5.4S, v27.4S // ...............................................................*.............................................................. + // mul v17.4S, v24.4S, v21.S[0] // ....................................................................................*......................................... + // sqrdmulh v22.4S, v24.4S, v21.S[1] // .....................................................................................*........................................ 
+ // trn2 v26.2D, v4.2D, v2.2D // ..................................................................*........................................................... + // trn1 v19.2D, v4.2D, v2.2D // ...................................................................*.......................................................... + // trn2 v25.2D, v15.2D, v3.2D // ....................................................................*......................................................... + // trn1 v30.2D, v15.2D, v3.2D // .....................................................................*........................................................ + // sqrdmulh v24.4S, v18.4S, v31.S[3] // ......................................................................................*....................................... + // mul v16.4S, v18.4S, v31.S[2] // .......................................................................................*...................................... + // sub v10.4S, v30.4S, v19.4S // ..........................................................................*................................................... + // sub v3.4S, v1.4S, v29.4S // ................................................................................................*............................. + // add v27.4S, v30.4S, v19.4S // ...............................................................................*.............................................. + // sub v2.4S, v25.4S, v26.4S // ...........................................................................*.................................................. + // add v0.4S, v25.4S, v26.4S // ..............................................................................*............................................... + // mls v17.4S, v22.4S, v8.S[0] // .............................................................................................*................................ 
+ // mul v20.4S, v10.4S, v21.S[2] // ................................................................................*............................................. + // sqrdmulh v28.4S, v10.4S, v21.S[3] // .................................................................................*............................................ + // sqrdmulh v11.4S, v2.4S, v14.S[1] // ..................................................................................*........................................... + // mul v7.4S, v2.4S, v14.S[0] // ...................................................................................*.......................................... + // mls v16.4S, v24.4S, v8.S[0] // ............................................................................................*................................. + // sqrdmulh v10.4S, v3.4S, v9.S[3] // ..................................................................................................................*........... + // add v2.4S, v27.4S, v0.4S // ..........................................................................................*................................... + // add v14.4S, v1.4S, v29.4S // ...............................................................................................*.............................. + // mul v19.4S, v3.4S, v9.S[2] // .............................................................................................................*................ + // mls v20.4S, v28.4S, v8.S[0] // ...........................................................................................*.................................. + // sub v29.4S, v14.4S, v2.4S // ..................................................................................................*........................... + // add v3.4S, v14.4S, v2.4S // .....................................................................................................................*........ 
+ // mls v7.4S, v11.4S, v8.S[0] // ..............................................................................................*............................... + // sub v14.4S, v16.4S, v17.4S // ....................................................................................................*......................... + // add v21.4S, v16.4S, v17.4S // ...................................................................................................*.......................... + // mls v19.4S, v10.4S, v8.S[0] // .........................................................................................................................*.... + // sub v10.4S, v27.4S, v0.4S // .................................................................................................*............................ + // sqrdmulh v2.4S, v29.4S, v9.S[1] // .......................................................................................................*...................... + // mul v5.4S, v29.4S, v9.S[0] // ........................................................................................................*..................... + // str q3, [x1], #(16*4) // .............................................................................................................................* + // sub v1.4S, v20.4S, v7.4S // .........................................................................................................*.................... + // add v18.4S, v20.4S, v7.4S // .....................................................................................................*........................ + // mul v22.4S, v14.4S, v9.S[2] // ............................................................................................................*................. + // sqrdmulh v25.4S, v14.4S, v9.S[3] // ................................................................................................................*............. 
+ // mul v4.4S, v10.4S, v31.S[0] // ..........................................................................................................*................... + // sqrdmulh v29.4S, v10.4S, v31.S[1] // ......................................................................................................*....................... + // sqrdmulh v28.4S, v1.4S, v31.S[1] // ..............................................................................................................*............... + // mul v3.4S, v1.4S, v31.S[0] // ...............................................................................................................*.............. + // sub v0.4S, v21.4S, v18.4S // ...........................................................................................................*.................. + // add v7.4S, v21.4S, v18.4S // ..........................................................................................................................*... + // mls v22.4S, v25.4S, v8.S[0] // .......................................................................................................................*...... + // mls v5.4S, v2.4S, v8.S[0] // .................................................................................................................*............ + // mul v24.4S, v0.4S, v9.S[0] // ...................................................................................................................*.......... + // sqrdmulh v10.4S, v0.4S, v9.S[1] // ....................................................................................................................*......... + // mls v3.4S, v28.4S, v8.S[0] // ........................................................................................................................*..... + // mls v4.4S, v29.4S, v8.S[0] // ......................................................................................................................*....... 
+ // str q5, [x2], #(16*4) // ...........................................................................................................................*.. + // mls v24.4S, v10.4S, v8.S[0] // ............................................................................................................................*. + + sub count, count, #1 +layer45678_start: + ldr q14, [x4, #48] // .........................................................................e...................................................................... + add v18.4S, v22.4S, v3.4S // ..................................................................................................................................*............. + add v16.4S, v19.4S, v4.4S // .............................................................................................................................*.................. + sub v2.4S, v19.4S, v4.4S // ............................................................................................................................*................... + sub v12.4S, v22.4S, v3.4S // .................................................................................................................................*.............. + ldr q31, [x4, #16] // .......................................................................e........................................................................ + // gap // ................................................................................................................................................ + ldr q13, [x5, #96] // ............................e................................................................................................................... + ldr q11, [x5, #112] // .............................e.................................................................................................................. 
+ ldr q22, [x5, #144] // ...............................e................................................................................................................ + ldr q25, [x5, #176] // .................................e.............................................................................................................. + str q7, [x1, #-48] // .......................................................................................................................................*........ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + str q16, [x1, #-32] // ........................................................................................................................................*....... + mul v10.4S, v12.4S, v9.S[0] // ...................................................................................................................................*............ + mul v3.4S, v2.4S, v9.S[0] // ..............................................................................................................................*................. + sqrdmulh v2.4S, v2.4S, v9.S[1] // ...............................................................................................................................*................ 
+ ldr q29, [x5, #32] // ....e........................................................................................................................................... + str q18, [x1, #-16] // .........................................................................................................................................*...... + sqrdmulh v15.4S, v12.4S, v9.S[1] // ....................................................................................................................................*........... + add x1, x1, #64 // ..............................................................................................................................................*. + ld4 {v17.4S, v18.4S, v19.4S, v20.4S}, [x1] // e............................................................................................................................................... + ldr q5, [x5, #64] // ......e......................................................................................................................................... + ldr q0, [x5, #48] // .....e.......................................................................................................................................... + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ 
+ // gap // ................................................................................................................................................ + ldr q28, [x5, #16] // ...e............................................................................................................................................ + ldr q9, [x4], #64 // ......................................................................e......................................................................... + str q24, [x2, #-48] // ...........................................................................................................................................*.... + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + ldr q26, [x5, #160] // ................................e............................................................................................................... + mls v3.4S, v2.4S, v8.S[0] // ................................................................................................................................*............... + mls v10.4S, v15.4S, v8.S[0] // .....................................................................................................................................*.......... 
+ // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + ldr q6, [x5], #(12*16) // ..e............................................................................................................................................. + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ 
+ // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + ldr q16, [x5, #-112] // .......e........................................................................................................................................ + str q10, [x2, #-16] // .............................................................................................................................................*.. + str q3, [x2, #-32] // ............................................................................................................................................*... 
+ add x2, x2, #64 // ...............................................................................................................................................* + ldr q21, [x4, #-32] // ........................................................................e....................................................................... + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + ld4 {v1.4S, v2.4S, v3.4S, v4.4S}, [x2] // .e.............................................................................................................................................. + sub v24.4S, v17.4S, v18.4S // ........e....................................................................................................................................... + add v15.4S, v17.4S, v18.4S // .........e...................................................................................................................................... + sub v10.4S, v19.4S, v20.4S // .............e.................................................................................................................................. + add v19.4S, v19.4S, v20.4S // ..............e................................................................................................................................. 
+ ldr q27, [x5, #-64] // ..............................e................................................................................................................. + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + sub v17.4S, v15.4S, v19.4S // ..................e............................................................................................................................. 
+ add v30.4S, v15.4S, v19.4S // ...................e............................................................................................................................ + mul v5.4S, v10.4S, v5.4S // ...............e................................................................................................................................ + sqrdmulh v7.4S, v10.4S, v16.4S // ................e............................................................................................................................... + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + mul v16.4S, v24.4S, v29.4S // ..........e..................................................................................................................................... + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ 
+ // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + sqrdmulh v29.4S, v24.4S, v0.4S // ...........e.................................................................................................................................... + mul v23.4S, v17.4S, v6.4S // ....................e........................................................................................................................... + sqrdmulh v19.4S, v17.4S, v28.4S // .....................e.......................................................................................................................... + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + mls v5.4S, v7.4S, v8.S[0] // .................e.............................................................................................................................. 
+ // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + sub v0.4S, v3.4S, v4.4S // .......................................e........................................................................................................ + sub v10.4S, v1.4S, v2.4S // ..................................e............................................................................................................. + mls v16.4S, v29.4S, v8.S[0] // ............e................................................................................................................................... + add v3.4S, v3.4S, v4.4S // ........................................e....................................................................................................... + // gap // ................................................................................................................................................ 
+ // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + mls v23.4S, v19.4S, v8.S[0] // ......................e......................................................................................................................... + add v20.4S, v1.4S, v2.4S // ...................................e............................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + mul v1.4S, v0.4S, v26.4S // .........................................e...................................................................................................... 
+ sqrdmulh v24.4S, v0.4S, v25.4S // ..........................................e..................................................................................................... + sqrdmulh v0.4S, v10.4S, v22.4S // .....................................e.......................................................................................................... + mul v4.4S, v10.4S, v27.4S // ....................................e........................................................................................................... + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + sub v18.4S, v20.4S, v3.4S // ............................................e................................................................................................... + add v12.4S, v20.4S, v3.4S // .............................................e.................................................................................................. + sub v17.4S, v16.4S, v5.4S // .......................e........................................................................................................................ + // gap // ................................................................................................................................................ 
+ // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + add v7.4S, v16.4S, v5.4S // ........................e....................................................................................................................... + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ 
+ mul v6.4S, v17.4S, v6.4S // .........................e...................................................................................................................... + sqrdmulh v27.4S, v17.4S, v28.4S // ..........................e..................................................................................................................... + mls v4.4S, v0.4S, v8.S[0] // ......................................e......................................................................................................... + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + mls v1.4S, v24.4S, v8.S[0] // ...........................................e.................................................................................................... + trn1 v25.4S, v30.4S, v7.4S // ......................................................e......................................................................................... + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ 
+ // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + mul v5.4S, v18.4S, v13.4S // ..............................................e................................................................................................. + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ 
+ add v10.4S, v4.4S, v1.4S // ..................................................e............................................................................................. + sub v29.4S, v4.4S, v1.4S // .................................................e.............................................................................................. + mls v6.4S, v27.4S, v8.S[0] // ...........................e.................................................................................................................... + sqrdmulh v16.4S, v18.4S, v11.4S // ...............................................e................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + trn2 v19.4S, v30.4S, v7.4S // .......................................................e........................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ 
+ // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + trn1 v15.4S, v12.4S, v10.4S // ..............................................................e................................................................................. + trn2 v4.4S, v12.4S, v10.4S // ...............................................................e................................................................................ + mul v27.4S, v29.4S, v13.4S // ...................................................e............................................................................................ + sqrdmulh v11.4S, v29.4S, v11.4S // ....................................................e........................................................................................... + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ 
+ trn2 v2.4S, v23.4S, v6.4S // .........................................................e...................................................................................... + trn1 v24.4S, v23.4S, v6.4S // ........................................................e....................................................................................... + mls v5.4S, v16.4S, v8.S[0] // ................................................e............................................................................................... + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ 
+ // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + mls v27.4S, v11.4S, v8.S[0] // .....................................................e.......................................................................................... + trn2 v20.2D, v25.2D, v24.2D // ..........................................................e..................................................................................... + trn1 v18.2D, v25.2D, v24.2D // ............................................................e................................................................................... + trn2 v0.2D, v19.2D, v2.2D // ...........................................................e.................................................................................... + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ 
+ trn1 v13.2D, v19.2D, v2.2D // .............................................................e.................................................................................. + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + sub v24.4S, v20.4S, v0.4S // ...............................................................................e................................................................ + add v29.4S, v20.4S, v0.4S // ................................................................................e............................................................... + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ 
+ // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + trn2 v2.4S, v5.4S, v27.4S // .................................................................e.............................................................................. + add v1.4S, v18.4S, v13.4S // ...........................................................................e.................................................................... + sub v18.4S, v18.4S, v13.4S // ..........................................................................e..................................................................... + trn1 v3.4S, v5.4S, v27.4S // ................................................................e............................................................................... + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ 
+ mul v17.4S, v24.4S, v21.S[0] // .................................................................................e.............................................................. + sqrdmulh v22.4S, v24.4S, v21.S[1] // ..................................................................................e............................................................. + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + trn2 v26.2D, v4.2D, v2.2D // ...................................................................e............................................................................ + trn1 v19.2D, v4.2D, v2.2D // .....................................................................e.......................................................................... + trn2 v25.2D, v15.2D, v3.2D // ..................................................................e............................................................................. 
+ trn1 v30.2D, v15.2D, v3.2D // ....................................................................e........................................................................... + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + sqrdmulh v24.4S, v18.4S, v31.S[3] // .............................................................................e.................................................................. + mul v16.4S, v18.4S, v31.S[2] // ............................................................................e................................................................... + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ 
+ // gap // ................................................................................................................................................ + sub v10.4S, v30.4S, v19.4S // ....................................................................................e........................................................... + sub v3.4S, v1.4S, v29.4S // ..............................................................................................e................................................. + add v27.4S, v30.4S, v19.4S // .....................................................................................e.......................................................... + sub v2.4S, v25.4S, v26.4S // .........................................................................................e...................................................... + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + add v0.4S, v25.4S, v26.4S // ..........................................................................................e..................................................... + mls v17.4S, v22.4S, v8.S[0] // ...................................................................................e............................................................ 
+ // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + mul v20.4S, v10.4S, v21.S[2] // ......................................................................................e......................................................... + sqrdmulh v28.4S, v10.4S, v21.S[3] // .......................................................................................e........................................................ + sqrdmulh v11.4S, v2.4S, v14.S[1] // ............................................................................................e................................................... + mul v7.4S, v2.4S, v14.S[0] // ...........................................................................................e.................................................... + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ 
+ // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + mls v16.4S, v24.4S, v8.S[0] // ..............................................................................e................................................................. + sqrdmulh v10.4S, v3.4S, v9.S[3] // .................................................................................................e.............................................. + add v2.4S, v27.4S, v0.4S // .........................................................................................................e...................................... + add v14.4S, v1.4S, v29.4S // ...............................................................................................e................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + mul v19.4S, v3.4S, v9.S[2] // ................................................................................................e............................................... + // gap // ................................................................................................................................................ 
+ // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + mls v20.4S, v28.4S, v8.S[0] // ........................................................................................e....................................................... + sub v29.4S, v14.4S, v2.4S // ..................................................................................................................e............................. + add v3.4S, v14.4S, v2.4S // ...................................................................................................................e............................ + mls v7.4S, v11.4S, v8.S[0] // .............................................................................................e.................................................. + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ 
+ // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + sub v14.4S, v16.4S, v17.4S // ...................................................................................................e............................................ + add v21.4S, v16.4S, v17.4S // ....................................................................................................e........................................... + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + mls v19.4S, v10.4S, v8.S[0] // ..................................................................................................e............................................. + sub v10.4S, v27.4S, v0.4S // ........................................................................................................e....................................... 
+ sqrdmulh v2.4S, v29.4S, v9.S[1] // .....................................................................................................................e.......................... + mul v5.4S, v29.4S, v9.S[0] // ....................................................................................................................e........................... + str q3, [x1], #(16*4) // ......................................................................................................................................e......... + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + sub v1.4S, v20.4S, v7.4S // .............................................................................................................e.................................. + add v18.4S, v20.4S, v7.4S // ..............................................................................................................e................................. + mul v22.4S, v14.4S, v9.S[2] // .....................................................................................................e.......................................... + sqrdmulh v25.4S, v14.4S, v9.S[3] // ......................................................................................................e......................................... + // gap // ................................................................................................................................................ 
+ // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + mul v4.4S, v10.4S, v31.S[0] // ..........................................................................................................e..................................... + sqrdmulh v29.4S, v10.4S, v31.S[1] // ...........................................................................................................e.................................... + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + sqrdmulh v28.4S, v1.4S, v31.S[1] // ................................................................................................................e............................... 
+ mul v3.4S, v1.4S, v31.S[0] // ...............................................................................................................e................................ + sub v0.4S, v21.4S, v18.4S // .......................................................................................................................e........................ + add v7.4S, v21.4S, v18.4S // ........................................................................................................................e....................... + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + mls v22.4S, v25.4S, v8.S[0] // .......................................................................................................e........................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ 
+ // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + mls v5.4S, v2.4S, v8.S[0] // ......................................................................................................................e......................... + mul v24.4S, v0.4S, v9.S[0] // .........................................................................................................................e...................... + sqrdmulh v10.4S, v0.4S, v9.S[1] // ..........................................................................................................................e..................... + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + mls v3.4S, v28.4S, v8.S[0] // .................................................................................................................e.............................. 
+ mls v4.4S, v29.4S, v8.S[0] // ............................................................................................................e................................... + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ 
+ // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + str q5, [x2], #(16*4) // ..........................................................................................................................................e..... + mls v24.4S, v10.4S, v8.S[0] // ...........................................................................................................................e.................... + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ 
+ + // original source code + // ld4 {v9.4S, v10.4S, v11.4S, v12.4S}, [x1] // ...................e............................................................................................................................|..................e............. + // ld4 {v13.4S, v14.4S, v15.4S, v16.4S}, [x2] // ..................................e.............................................................................................................|................................ + // ldr q0, [x5], #(12*16) // ............................e...................................................................................................................|...........................e.... + // ldr q4, [x5, #(-12*16 + 1*16)] // ......................e.........................................................................................................................|.....................e.......... + // ldr q1, [x5, #(-12*16 + 2*16)] // ...............e................................................................................................................................|..............e................. + // ldr q5, [x5, #(-12*16 + 3*16)] // .....................e..........................................................................................................................|....................e........... + // ldr q2, [x5, #(-12*16 + 4*16)] // ....................e...........................................................................................................................|...................e............ + // ldr q6, [x5, #(-12*16 + 5*16)] // .............................e..................................................................................................................|............................e... + // sub v24.4s, v9.4s, v10.4s // ...................................e............................................................................................................|................................ 
+ // add v9.4s, v9.4s, v10.4s // ....................................e...........................................................................................................|................................ + // mul v10.4s, v24.4s, v1.4s // ............................................e...................................................................................................|................................ + // sqrdmulh v24.4s, v24.4s, v5.4s // .............................................e..................................................................................................|................................ + // mls v10.4s, v24.4s, v8.s[0] // ...................................................e............................................................................................|................................ + // sub v24.4s, v11.4s, v12.4s // .....................................e..........................................................................................................|................................ + // add v11.4s, v11.4s, v12.4s // ......................................e.........................................................................................................|................................ + // mul v12.4s, v24.4s, v2.4s // ..........................................e.....................................................................................................|................................ + // sqrdmulh v24.4s, v24.4s, v6.4s // ...........................................e....................................................................................................|................................ + // mls v12.4s, v24.4s, v8.s[0] // ................................................e...............................................................................................|................................ 
+ // sub v24.4s, v9.4s, v11.4s // ........................................e.......................................................................................................|................................ + // add v9.4s, v9.4s, v11.4s // .........................................e......................................................................................................|................................ + // mul v11.4s, v24.4s, v0.4s // ..............................................e.................................................................................................|................................ + // sqrdmulh v24.4s, v24.4s, v4.4s // ...............................................e................................................................................................|................................ + // mls v11.4s, v24.4s, v8.s[0] // .....................................................e..........................................................................................|................................ + // sub v24.4s, v10.4s, v12.4s // .............................................................e..................................................................................|................................ + // add v10.4s, v10.4s, v12.4s // ..............................................................e.................................................................................|................................ + // mul v12.4s, v24.4s, v0.4s // ...............................................................e................................................................................|................................ + // sqrdmulh v24.4s, v24.4s, v4.4s // ................................................................e...............................................................................|................................ 
+ // mls v12.4s, v24.4s, v8.s[0] // .......................................................................e........................................................................|................................ + // ldr q0, [x5, #(-12*16 + 6*16)] // ......e.........................................................................................................................................|.....e.......................... + // ldr q4, [x5, #(-12*16 + 7*16)] // .......e........................................................................................................................................|......e......................... + // ldr q1, [x5, #(-12*16 + 8*16)] // .......................................e........................................................................................................|................................ + // ldr q5, [x5, #(-12*16 + 9*16)] // ........e.......................................................................................................................................|.......e........................ + // ldr q2, [x5, #(-12*16 + 10*16)] // .........................e......................................................................................................................|........................e....... + // ldr q6, [x5, #(-12*16 + 11*16)] // .........e......................................................................................................................................|........e....................... + // sub v24.4s, v13.4s, v14.4s // ..................................................e.............................................................................................|................................ + // add v13.4s, v13.4s, v14.4s // ......................................................e.........................................................................................|................................ 
+ // mul v14.4s, v24.4s, v1.4s // ..........................................................e.....................................................................................|................................ + // sqrdmulh v24.4s, v24.4s, v5.4s // .........................................................e......................................................................................|................................ + // mls v14.4s, v24.4s, v8.s[0] // .................................................................e..............................................................................|................................ + // sub v24.4s, v15.4s, v16.4s // .................................................e..............................................................................................|................................ + // add v15.4s, v15.4s, v16.4s // ....................................................e...........................................................................................|................................ + // mul v16.4s, v24.4s, v2.4s // .......................................................e........................................................................................|................................ + // sqrdmulh v24.4s, v24.4s, v6.4s // ........................................................e.......................................................................................|................................ + // mls v16.4s, v24.4s, v8.s[0] // ..................................................................e.............................................................................|................................ + // sub v24.4s, v13.4s, v15.4s // ...........................................................e....................................................................................|................................ 
+ // add v13.4s, v13.4s, v15.4s // ............................................................e...................................................................................|................................ + // mul v15.4s, v24.4s, v0.4s // ....................................................................e...........................................................................|................................ + // sqrdmulh v24.4s, v24.4s, v4.4s // ........................................................................e.......................................................................|................................ + // mls v15.4s, v24.4s, v8.s[0] // ................................................................................e...............................................................|................................ + // sub v24.4s, v14.4s, v16.4s // ......................................................................e.........................................................................|................................ + // add v14.4s, v14.4s, v16.4s // .....................................................................e..........................................................................|................................ + // mul v16.4s, v24.4s, v0.4s // ............................................................................e...................................................................|................................ + // sqrdmulh v24.4s, v24.4s, v4.4s // .............................................................................e..................................................................|................................ + // mls v16.4s, v24.4s, v8.s[0] // .................................................................................e..............................................................|................................ 
+ // trn1 v25.4s, v9.4s, v10.4s // ...................................................................e............................................................................|................................ + // trn2 v26.4s, v9.4s, v10.4s // .........................................................................e......................................................................|................................ + // trn1 v27.4s, v11.4s, v12.4s // ...............................................................................e................................................................|................................ + // trn2 v28.4s, v11.4s, v12.4s // ..............................................................................e.................................................................|................................ + // trn2 v11.2d, v25.2d, v27.2d // ..................................................................................e.............................................................|................................ + // trn2 v12.2d, v26.2d, v28.2d // ....................................................................................e...........................................................|................................ + // trn1 v9.2d, v25.2d, v27.2d // ...................................................................................e............................................................|................................ + // trn1 v10.2d, v26.2d, v28.2d // .....................................................................................e..........................................................|................................ + // trn1 v25.4s, v13.4s, v14.4s // ..........................................................................e.....................................................................|................................ 
+ // trn2 v26.4s, v13.4s, v14.4s // ...........................................................................e....................................................................|................................ + // trn1 v27.4s, v15.4s, v16.4s // ...........................................................................................e....................................................|................................ + // trn2 v28.4s, v15.4s, v16.4s // ........................................................................................e.......................................................|................................ + // trn2 v15.2d, v25.2d, v27.2d // ................................................................................................e...............................................|................................ + // trn2 v16.2d, v26.2d, v28.2d // ..............................................................................................e.................................................|................................ + // trn1 v13.2d, v25.2d, v27.2d // .................................................................................................e..............................................|................................ + // trn1 v14.2d, v26.2d, v28.2d // ...............................................................................................e................................................|................................ + // ldr q0, [x4], #64 // .......................e........................................................................................................................|......................e......... + // ldr q1, [x4, #(-64 + 16)] // .....e..........................................................................................................................................|....e........................... 
+ // ldr q2, [x4, #(-64 + 32)] // .................................e..............................................................................................................|................................ + // ldr q3, [x4, #(-64 + 48)] // e...............................................................................................................................................e................................ + // sub v24.4s, v9.4s, v10.4s // ..........................................................................................e.....................................................|................................ + // add v9.4s, v9.4s, v10.4s // .........................................................................................e......................................................|................................ + // mul v10.4s, v24.4s, v1.s[2] // ...................................................................................................e............................................|................................ + // sqrdmulh v24.4s, v24.4s, v1.s[3] // ..................................................................................................e.............................................|................................ + // mls v10.4s, v24.4s, v8.s[0] // ..............................................................................................................e.................................|................................ + // sub v24.4s, v11.4s, v12.4s // ......................................................................................e.........................................................|................................ + // add v11.4s, v11.4s, v12.4s // .......................................................................................e........................................................|................................ 
+ // mul v12.4s, v24.4s, v2.s[0] // ............................................................................................e...................................................|................................ + // sqrdmulh v24.4s, v24.4s, v2.s[1] // .............................................................................................e..................................................|................................ + // mls v12.4s, v24.4s, v8.s[0] // .........................................................................................................e......................................|................................ + // sub v24.4s, v13.4s, v14.4s // ....................................................................................................e...........................................|................................ + // add v13.4s, v13.4s, v14.4s // ......................................................................................................e.........................................|................................ + // mul v14.4s, v24.4s, v2.s[2] // ..........................................................................................................e.....................................|................................ + // sqrdmulh v24.4s, v24.4s, v2.s[3] // ...........................................................................................................e....................................|................................ + // mls v14.4s, v24.4s, v8.s[0] // ...................................................................................................................e............................|................................ + // sub v24.4s, v15.4s, v16.4s // .......................................................................................................e........................................|................................ 
+ // add v15.4s, v15.4s, v16.4s // ........................................................................................................e.......................................|................................ + // mul v16.4s, v24.4s, v3.s[0] // .............................................................................................................e..................................|................................ + // sqrdmulh v24.4s, v24.4s, v3.s[1] // ............................................................................................................e...................................|................................ + // mls v16.4s, v24.4s, v8.s[0] // ......................................................................................................................e.........................|................................ + // sub v24.4s, v9.4s, v11.4s // .....................................................................................................e..........................................|................................ + // add v9.4s, v9.4s, v11.4s // .................................................................................................................e..............................|................................ + // mul v11.4s, v24.4s, v0.s[2] // ..................................................................................................................e.............................|................................ + // sqrdmulh v24.4s, v24.4s, v0.s[3] // ...............................................................................................................e................................|................................ + // mls v11.4s, v24.4s, v8.s[0] // .........................................................................................................................e......................|................................ 
+ // sub v24.4s, v10.4s, v12.4s // .......................................................................................................................e........................|................................ + // add v10.4s, v10.4s, v12.4s // ........................................................................................................................e.......................|................................ + // mul v12.4s, v24.4s, v0.s[2] // ................................................................................................................................e...............|................................ + // sqrdmulh v24.4s, v24.4s, v0.s[3] // .................................................................................................................................e..............|................................ + // mls v12.4s, v24.4s, v8.s[0] // ........................................................................................................................................e.......|................................ + // sub v24.4s, v13.4s, v15.4s // ..........................................................................................................................e.....................|................................ + // add v13.4s, v13.4s, v15.4s // ................................................................................................................e...............................|................................ + // mul v15.4s, v24.4s, v1.s[0] // ..................................................................................................................................e.............|................................ + // sqrdmulh v24.4s, v24.4s, v1.s[1] // ...................................................................................................................................e............|................................ 
+ // mls v15.4s, v24.4s, v8.s[0] // .............................................................................................................................................e..|................................ + // sub v24.4s, v14.4s, v16.4s // ..............................................................................................................................e.................|................................ + // add v14.4s, v14.4s, v16.4s // ...............................................................................................................................e................|................................ + // mul v16.4s, v24.4s, v1.s[0] // .....................................................................................................................................e..........|................................ + // sqrdmulh v24.4s, v24.4s, v1.s[1] // ....................................................................................................................................e...........|................................ + // mls v16.4s, v24.4s, v8.s[0] // ............................................................................................................................................e...|................................ + // sub v24.4s, v9.4s, v13.4s // ....................................................................................................................e...........................|................................ + // add v9.4s, v9.4s, v13.4s // .....................................................................................................................e..........................|................................ + // mul v13.4s, v24.4s, v0.s[0] // ............................................................................................................................e...................|................................ 
+ // sqrdmulh v24.4s, v24.4s, v0.s[1] // ...........................................................................................................................e....................|................................ + // mls v13.4s, v24.4s, v8.s[0] // .........................................................................................................................................e......|................................ + // sub v24.4s, v10.4s, v14.4s // ......................................................................................................................................e.........|................................ + // add v10.4s, v10.4s, v14.4s // .......................................................................................................................................e........|................................ + // mul v14.4s, v24.4s, v0.s[0] // ..........................................................................................................................................e.....|................................ + // sqrdmulh v24.4s, v24.4s, v0.s[1] // ...........................................................................................................................................e....|................................ + // mls v14.4s, v24.4s, v8.s[0] // ...............................................................................................................................................e|................................ + // sub v24.4s, v11.4s, v15.4s // ...*............................................................................................................................................|..*............................. + // add v11.4s, v11.4s, v15.4s // ..*.............................................................................................................................................|.*.............................. 
+ // mul v15.4s, v24.4s, v0.s[0] // .............*..................................................................................................................................|............*................... + // sqrdmulh v24.4s, v24.4s, v0.s[1] // ..............*.................................................................................................................................|.............*.................. + // mls v15.4s, v24.4s, v8.s[0] // ..........................*.....................................................................................................................|.........................*...... + // sub v24.4s, v12.4s, v16.4s // ....*...........................................................................................................................................|...*............................ + // add v12.4s, v12.4s, v16.4s // .*..............................................................................................................................................|*............................... + // mul v16.4s, v24.4s, v0.s[0] // ............*...................................................................................................................................|...........*.................... + // sqrdmulh v24.4s, v24.4s, v0.s[1] // .................*..............................................................................................................................|................*............... + // mls v16.4s, v24.4s, v8.s[0] // ...........................*....................................................................................................................|..........................*..... + // str q9, [x1], #(16*4) // .............................................................................................................................e..................|................................ 
+ // str q10, [x1, #(-16*4 + 1*16)] // ..........*.....................................................................................................................................|.........*...................... + // str q11, [x1, #(-16*4 + 2*16)] // ...........*....................................................................................................................................|..........*..................... + // str q12, [x1, #(-16*4 + 3*16)] // ................*...............................................................................................................................|...............*................ + // str q13, [x2], #(16*4) // ..............................................................................................................................................e.|................................ + // str q14, [x2, #(-16*4 + 1*16)] // ........................*.......................................................................................................................|.......................*........ + // str q15, [x2, #(-16*4 + 2*16)] // ...............................*................................................................................................................|..............................*. + // str q16, [x2, #(-16*4 + 3*16)] // ..............................*.................................................................................................................|.............................*.. + // add x1, x1, #64 // ..................*.............................................................................................................................|.................*.............. 
+ // add x2, x2, #64 // ................................*...............................................................................................................|...............................* + + sub count, count, #1 + cbnz count, layer45678_start + sub v17.4S, v19.4S, v4.4S // ..*............... + add v31.4S, v22.4S, v3.4S // *................. + sub v6.4S, v22.4S, v3.4S // ...*.............. + str q24, [x2, #-48] // ............*..... + // gap // .................. + // gap // .................. + // gap // .................. + // gap // .................. + str q7, [x1, #-48] // ....*............. + // gap // .................. + // gap // .................. + // gap // .................. + // gap // .................. + // gap // .................. + // gap // .................. + // gap // .................. + mul v26.4S, v17.4S, v9.S[0] // .......*.......... + sqrdmulh v0.4S, v17.4S, v9.S[1] // ........*......... + str q31, [x1, #-16] // .........*........ + mul v22.4S, v6.4S, v9.S[0] // ......*........... + sqrdmulh v28.4S, v6.4S, v9.S[1] // ..........*....... + // gap // .................. + // gap // .................. + // gap // .................. + add v17.4S, v19.4S, v4.4S // .*................ + // gap // .................. + // gap // .................. + // gap // .................. + // gap // .................. + // gap // .................. + // gap // .................. + // gap // .................. + // gap // .................. + // gap // .................. + // gap // .................. + // gap // .................. + // gap // .................. + // gap // .................. + // gap // .................. + // gap // .................. + mls v26.4S, v0.4S, v8.S[0] // .............*.... + mls v22.4S, v28.4S, v8.S[0] // ..............*... + str q17, [x1, #-32] // .....*............ + // gap // .................. + // gap // .................. + // gap // .................. + // gap // .................. 
+ // gap // .................. + // gap // .................. + // gap // .................. + // gap // .................. + // gap // .................. + // gap // .................. + // gap // .................. + // gap // .................. + // gap // .................. + // gap // .................. + add x1, x1, #64 // ...........*...... + // gap // .................. + // gap // .................. + // gap // .................. + // gap // .................. + // gap // .................. + // gap // .................. + str q26, [x2, #-32] // ................*. + str q22, [x2, #-16] // ...............*.. + add x2, x2, #64 // .................* + // gap // .................. + // gap // .................. + // gap // .................. + // gap // .................. + // gap // .................. + + // original source code + // add v18.4S, v22.4S, v3.4S // .*................ + // add v16.4S, v19.4S, v4.4S // ..........*....... + // sub v2.4S, v19.4S, v4.4S // *................. + // sub v12.4S, v22.4S, v3.4S // ..*............... + // str q7, [x1, #-48] // ....*............. + // str q16, [x1, #-32] // .............*.... + // mul v10.4S, v12.4S, v9.S[0] // ........*......... + // mul v3.4S, v2.4S, v9.S[0] // .....*............ + // sqrdmulh v2.4S, v2.4S, v9.S[1] // ......*........... + // str q18, [x1, #-16] // .......*.......... + // sqrdmulh v15.4S, v12.4S, v9.S[1] // .........*........ + // add x1, x1, #64 // ..............*... + // str q24, [x2, #-48] // ...*.............. + // mls v3.4S, v2.4S, v8.S[0] // ...........*...... + // mls v10.4S, v15.4S, v8.S[0] // ............*..... + // str q10, [x2, #-16] // ................*. + // str q3, [x2, #-32] // ...............*.. 
+ // add x2, x2, #64 // .................* + + +// ----------------------------------------------------------------------------- + + ninv .req v25 // register alias: ninv names vector register v25 + ninv_tw .req v26 // register alias: ninv_tw names vector register v26 + + ASM_LOAD(xtmp, ninv_addr) + ld1r {ninv.4s}, [xtmp] // load one 32-bit word from [xtmp] and replicate it into all four lanes of ninv; NOTE(review): name suggests an n^-1 scaling constant -- confirm against ninv_addr + ASM_LOAD(xtmp, ninv_tw_addr) + ld1r {ninv_tw.4s}, [xtmp] // same replicate-load, into all four lanes of ninv_tw + + mov count, #8 // count = 8; NOTE(review): presumably the trip count of the loop that follows -- confirm + ASM_LOAD(r_ptr0, roots_l012) + load_roots_123 + + .p2align 2 // align the next instruction to a 2^2 = 4-byte boundary + ldr q19, [x0, #512] // .......*.................................................. + ldr q18, [x0, #384] // ..*....................................................... + // gap // .......................................................... + // gap // .......................................................... + // gap // .......................................................... + // gap // .......................................................... + // gap // .......................................................... + ldr q9, [x0, #256] // *......................................................... + ldr q13, [x0, #0] // ......*................................................... + ldr q27, [x0, #128] // ....*..................................................... + ldr q15, [x0, #640] // ...*...................................................... + // gap // .......................................................... + // gap // .......................................................... + // gap // .......................................................... + // gap // .......................................................... + // gap // .......................................................... + ldr q14, [x0, #768] // .....*.................................................... + ldr q12, [x0, #896] // .*........................................................ + // gap // .......................................................... + // gap // .......................................................... + // gap // ..........................................................
+ // gap // .......................................................... + // gap // .......................................................... + // gap // .......................................................... + // gap // .......................................................... + // gap // .......................................................... + // gap // .......................................................... + // gap // .......................................................... + // gap // .......................................................... + // gap // .......................................................... + // gap // .......................................................... + // gap // .......................................................... + sub v6.4S, v9.4S, v18.4S // .........*................................................ + add v9.4S, v9.4S, v18.4S // ........*................................................. + // gap // .......................................................... + // gap // .......................................................... + // gap // .......................................................... + // gap // .......................................................... + // gap // .......................................................... + // gap // .......................................................... + add v24.4S, v13.4S, v27.4S // .................*........................................ + sub v29.4S, v19.4S, v15.4S // ..............*........................................... + add v10.4S, v19.4S, v15.4S // ................*......................................... + sub v31.4S, v13.4S, v27.4S // ...........*.............................................. + // gap // .......................................................... + // gap // .......................................................... + // gap // .......................................................... 
+ // gap // .......................................................... + mul v19.4S, v6.4S, v2.S[0] // ...............*.......................................... + sqrdmulh v18.4S, v6.4S, v2.S[1] // ............*............................................. + add v13.4S, v14.4S, v12.4S // ..........*............................................... + sub v5.4S, v14.4S, v12.4S // .............*............................................ + // gap // .......................................................... + // gap // .......................................................... + // gap // .......................................................... + // gap // .......................................................... + add v4.4S, v24.4S, v9.4S // ........................*................................. + sub v15.4S, v24.4S, v9.4S // ...........................*.............................. + sqrdmulh v12.4S, v29.4S, v2.S[3] // .....................*.................................... + mul v7.4S, v29.4S, v2.S[2] // .........................*................................ + // gap // .......................................................... + // gap // .......................................................... + // gap // .......................................................... + // gap // .......................................................... + add v22.4S, v10.4S, v13.4S // .......................*.................................. + sub v13.4S, v10.4S, v13.4S // ..........................*............................... + sqrdmulh v10.4S, v31.4S, v1.S[3] // ...................*...................................... + mul v23.4S, v31.4S, v1.S[2] // ....................*..................................... + // gap // .......................................................... + // gap // .......................................................... + // gap // .......................................................... 
+ // gap // .......................................................... + sqrdmulh v30.4S, v5.4S, v3.S[1] // ..................*....................................... + mul v9.4S, v5.4S, v3.S[0] // ......................*................................... + sqrdmulh v29.4S, v15.4S, v0.S[3] // ......................................*................... + mul v15.4S, v15.4S, v0.S[2] // .....................................*.................... + // gap // .......................................................... + // gap // .......................................................... + // gap // .......................................................... + // gap // .......................................................... + mls v7.4S, v12.4S, v8.S[0] // ..................................*....................... + sub v20.4S, v4.4S, v22.4S // .............................*............................ + add v17.4S, v4.4S, v22.4S // ....................................................*..... + sqrdmulh v27.4S, v13.4S, v1.S[1] // ...............................*.......................... + // gap // .......................................................... + // gap // .......................................................... + // gap // .......................................................... + // gap // .......................................................... + mul v6.4S, v13.4S, v1.S[0] // ................................*......................... + mls v19.4S, v18.4S, v8.S[0] // ..............................*........................... + mls v23.4S, v10.4S, v8.S[0] // ............................*............................. + // gap // .......................................................... + // gap // .......................................................... + // gap // .......................................................... + // gap // .......................................................... 
+ // gap // .......................................................... + mul v14.4S, v20.4S, v0.S[0] // ....................................*..................... + sqrdmulh v24.4S, v20.4S, v0.S[1] // ...................................*...................... + mls v9.4S, v30.4S, v8.S[0] // .................................*........................ + // gap // .......................................................... + // gap // .......................................................... + // gap // .......................................................... + // gap // .......................................................... + // gap // .......................................................... + // gap // .......................................................... + // gap // .......................................................... + // gap // .......................................................... + // gap // .......................................................... + // gap // .......................................................... + // gap // .......................................................... + // gap // .......................................................... + // gap // .......................................................... + mls v6.4S, v27.4S, v8.S[0] // .......................................*.................. + sub v16.4S, v23.4S, v19.4S // .........................................*................ + add v28.4S, v23.4S, v19.4S // ........................................*................. + // gap // .......................................................... + // gap // .......................................................... + // gap // .......................................................... + // gap // .......................................................... + // gap // .......................................................... + add v11.4S, v7.4S, v9.4S // ...........................................*.............. 
+ sub v21.4S, v7.4S, v9.4S // ..........................................*............... + // gap // .......................................................... + // gap // .......................................................... + // gap // .......................................................... + // gap // .......................................................... + // gap // .......................................................... + mls v14.4S, v24.4S, v8.S[0] // ............................................*............. + mul v24.4S, v16.4S, v0.S[2] // ..............................................*........... + sqrdmulh v9.4S, v16.4S, v0.S[3] // ...............................................*.......... + // gap // .......................................................... + // gap // .......................................................... + // gap // .......................................................... + // gap // .......................................................... + // gap // .......................................................... + mls v15.4S, v29.4S, v8.S[0] // .............................................*............ + sub v12.4S, v28.4S, v11.4S // ..................................................*....... + add v30.4S, v28.4S, v11.4S // ...................................................*...... + mul v13.4S, v21.4S, v1.S[0] // ................................................*......... + sqrdmulh v22.4S, v21.4S, v1.S[1] // .................................................*........ + // gap // .......................................................... + // gap // .......................................................... + // gap // .......................................................... + // gap // .......................................................... + // gap // .......................................................... + // gap // .......................................................... 
+ // gap // .......................................................... + // gap // .......................................................... + // gap // .......................................................... + // gap // .......................................................... + // gap // .......................................................... + srshr v4.4S, v14.4S, #23 // .....................................................*.... + mls v24.4S, v9.4S, v8.S[0] // ......................................................*... + mul v10.4S, v12.4S, v0.S[0] // ........................................................*. + sqrdmulh v21.4S, v12.4S, v0.S[1] // .........................................................* + // gap // .......................................................... + // gap // .......................................................... + // gap // .......................................................... + // gap // .......................................................... + sub v28.4S, v15.4S, v6.4S // .......................................................*.. + + // original source code + // ldr q11, [x0, #256] // ..*....................................................... + // ldr q19, [x0, #896] // .......*.................................................. + // ldr q12, [x0, #384] // .*........................................................ + // ldr q16, [x0, #640] // .....*.................................................... + // ldr q7, [x0, #128] // ....*..................................................... + // ldr q9, [x0, #768] // ......*................................................... + // ldr q6, [x0, #0] // ...*...................................................... + // ldr q15, [x0, #512] // *......................................................... + // add v21.4S, v11.4S, v12.4S // .........*................................................ 
+ // sub v12.4S, v11.4S, v12.4S // ........*................................................. + // add v24.4S, v9.4S, v19.4S // ................*......................................... + // sub v13.4S, v6.4S, v7.4S // .............*............................................ + // sqrdmulh v14.4S, v12.4S, v2.S[1] // ...............*.......................................... + // sub v9.4S, v9.4S, v19.4S // .................*........................................ + // sub v27.4S, v15.4S, v16.4S // ...........*.............................................. + // mul v30.4S, v12.4S, v2.S[0] // ..............*........................................... + // add v16.4S, v15.4S, v16.4S // ............*............................................. + // add v6.4S, v6.4S, v7.4S // ..........*............................................... + // sqrdmulh v12.4S, v9.4S, v3.S[1] // ..........................*............................... + // sqrdmulh v28.4S, v13.4S, v1.S[3] // ........................*................................. + // mul v7.4S, v13.4S, v1.S[2] // .........................*................................ + // sqrdmulh v15.4S, v27.4S, v2.S[3] // ....................*..................................... + // mul v13.4S, v9.4S, v3.S[0] // ...........................*.............................. + // add v19.4S, v16.4S, v24.4S // ......................*................................... + // add v9.4S, v6.4S, v21.4S // ..................*....................................... + // mul v27.4S, v27.4S, v2.S[2] // .....................*.................................... + // sub v16.4S, v16.4S, v24.4S // .......................*.................................. + // sub v17.4S, v6.4S, v21.4S // ...................*...................................... + // mls v7.4S, v28.4S, v8.S[0] // ....................................*..................... + // sub v24.4S, v9.4S, v19.4S // ...............................*.......................... 
+ // mls v30.4S, v14.4S, v8.S[0] // ...................................*...................... + // sqrdmulh v21.4S, v16.4S, v1.S[1] // .................................*........................ + // mul v6.4S, v16.4S, v1.S[0] // ..................................*....................... + // mls v13.4S, v12.4S, v8.S[0] // .......................................*.................. + // mls v27.4S, v15.4S, v8.S[0] // ..............................*........................... + // sqrdmulh v29.4S, v24.4S, v0.S[1] // ......................................*................... + // mul v14.4S, v24.4S, v0.S[0] // .....................................*.................... + // mul v15.4S, v17.4S, v0.S[2] // .............................*............................ + // sqrdmulh v28.4S, v17.4S, v0.S[3] // ............................*............................. + // mls v6.4S, v21.4S, v8.S[0] // ........................................*................. + // add v5.4S, v7.4S, v30.4S // ..........................................*............... + // sub v20.4S, v7.4S, v30.4S // .........................................*................ + // sub v10.4S, v27.4S, v13.4S // ............................................*............. + // add v16.4S, v27.4S, v13.4S // ...........................................*.............. + // mls v14.4S, v29.4S, v8.S[0] // .............................................*............ + // mls v15.4S, v28.4S, v8.S[0] // ................................................*......... + // mul v24.4S, v20.4S, v0.S[2] // ..............................................*........... + // sqrdmulh v12.4S, v20.4S, v0.S[3] // ...............................................*.......... + // mul v13.4S, v10.4S, v1.S[0] // ...................................................*...... + // sqrdmulh v22.4S, v10.4S, v1.S[1] // ....................................................*..... 
+ // sub v21.4S, v5.4S, v16.4S // .................................................*........ + // add v30.4S, v5.4S, v16.4S // ..................................................*....... + // add v17.4S, v9.4S, v19.4S // ................................*......................... + // srshr v4.4S, v14.4S, #23 // .....................................................*.... + // mls v24.4S, v12.4S, v8.S[0] // ......................................................*... + // sub v28.4S, v15.4S, v6.4S // .........................................................* + // mul v10.4S, v21.4S, v0.S[0] // .......................................................*.. + // sqrdmulh v21.4S, v21.4S, v0.S[1] // ........................................................*. + + sub count, count, #1 +layer123_start: + ldr q11, [x0, #272] // ..e............................................................................................. + ldr q19, [x0, #912] // .......e........................................................................................ + ldr q12, [x0, #400] // ...e............................................................................................ + mls v13.4S, v22.4S, v8.S[0] // ...............................................*................................................ + sqrdmulh v29.4S, v17.4S, v26.4S // .................................................................................*.............. + mul v23.4S, v17.4S, v25.4S // ................................................................................*............... + // gap // ................................................................................................ + sqrdmulh v22.4S, v30.4S, v26.4S // ....................................................................................*........... + ldr q16, [x0, #656] // .....e.......................................................................................... 
+ ldr q7, [x0, #144] // .e.............................................................................................. + mul v18.4S, v28.4S, v0.S[0] // ............................................................*................................... + // gap // ................................................................................................ + // gap // ................................................................................................ + mls v14.4S, v4.4S, v8.4S // .....................................................................*.......................... + add v20.4S, v15.4S, v6.4S // ...........................................................*.................................... + mul v27.4S, v30.4S, v25.4S // ...................................................................................*............ + ldr q9, [x0, #784] // ......e......................................................................................... + ldr q6, [x0, #16] // e............................................................................................... + sqrdmulh v17.4S, v28.4S, v0.S[1] // .............................................................*.................................. + mls v10.4S, v21.4S, v8.S[0] // .........................................................*...................................... + // gap // ................................................................................................ + // gap // ................................................................................................ + // gap // ................................................................................................ + // gap // ................................................................................................ + ldr q15, [x0, #528] // ....e........................................................................................... 
+ mul v31.4S, v20.4S, v25.4S // ......................................................................................*......... + add v5.4S, v24.4S, v13.4S // ................................................................*............................... + sqrdmulh v20.4S, v20.4S, v26.4S // .......................................................................................*........ + // gap // ................................................................................................ + // gap // ................................................................................................ + // gap // ................................................................................................ + sub v28.4S, v24.4S, v13.4S // ...............................................................*................................ + str q14, [x0, #512] // ............................................................................*................... + add v21.4S, v11.4S, v12.4S // ..............e................................................................................. + sub v12.4S, v11.4S, v12.4S // .............e.................................................................................. + mls v27.4S, v22.4S, v8.S[0] // .....................................................................................*.......... + mls v23.4S, v29.4S, v8.S[0] // ..................................................................................*............. + // gap // ................................................................................................ + // gap // ................................................................................................ + // gap // ................................................................................................ + mul v22.4S, v28.4S, v0.S[0] // .................................................................*.............................. 
+ // gap // ................................................................................................ + sqrdmulh v4.4S, v5.4S, v26.4S // ..........................................................................................*..... + srshr v29.4S, v10.4S, #23 // ......................................................................*......................... + // gap // ................................................................................................ + // gap // ................................................................................................ + sqrdmulh v11.4S, v28.4S, v0.S[1] // ..................................................................*............................. + // gap // ................................................................................................ + add v24.4S, v9.4S, v19.4S // ........................e....................................................................... + // gap // ................................................................................................ + sub v13.4S, v6.4S, v7.4S // ........e....................................................................................... + // gap // ................................................................................................ + // gap // ................................................................................................ + sqrdmulh v14.4S, v12.4S, v2.S[1] // ................e............................................................................... + // gap // ................................................................................................ + sub v9.4S, v9.4S, v19.4S // .......................e........................................................................ + str q27, [x0, #128] // .............................................................................................*.. 
+ sub v27.4S, v15.4S, v16.4S // ..................e............................................................................. + mul v30.4S, v12.4S, v2.S[0] // ...............e................................................................................ + // gap // ................................................................................................ + // gap // ................................................................................................ + // gap // ................................................................................................ + add v16.4S, v15.4S, v16.4S // ...................e............................................................................ + add v6.4S, v6.4S, v7.4S // .........e...................................................................................... + sqrdmulh v12.4S, v9.4S, v3.S[1] // ..........................e..................................................................... + mls v22.4S, v11.4S, v8.S[0] // ...................................................................*............................ + sqrdmulh v28.4S, v13.4S, v1.S[3] // ...........e.................................................................................... + mul v7.4S, v13.4S, v1.S[2] // ..........e..................................................................................... + // gap // ................................................................................................ + // gap // ................................................................................................ + // gap // ................................................................................................ + // gap // ................................................................................................ + sqrdmulh v15.4S, v27.4S, v2.S[3] // .....................e.......................................................................... 
+ mul v13.4S, v9.4S, v3.S[0] // .........................e...................................................................... + add v19.4S, v16.4S, v24.4S // .......................................e........................................................ + add v9.4S, v6.4S, v21.4S // .............................e.................................................................. + // gap // ................................................................................................ + // gap // ................................................................................................ + // gap // ................................................................................................ + // gap // ................................................................................................ + mul v27.4S, v27.4S, v2.S[2] // ....................e........................................................................... + // gap // ................................................................................................ + // gap // ................................................................................................ + // gap // ................................................................................................ + // gap // ................................................................................................ + mls v18.4S, v17.4S, v8.S[0] // ..............................................................*................................. + sub v16.4S, v16.4S, v24.4S // ......................................e......................................................... + sub v17.4S, v6.4S, v21.4S // ............................e................................................................... + mls v7.4S, v28.4S, v8.S[0] // ............e................................................................................... 
+ sub v24.4S, v9.4S, v19.4S // ................................................e............................................... + srshr v11.4S, v22.4S, #23 // ..........................................................................*..................... + mls v30.4S, v14.4S, v8.S[0] // .................e.............................................................................. + // gap // ................................................................................................ + // gap // ................................................................................................ + // gap // ................................................................................................ + // gap // ................................................................................................ + sqrdmulh v21.4S, v16.4S, v1.S[1] // .........................................e...................................................... + mul v6.4S, v16.4S, v1.S[0] // ........................................e....................................................... + mls v13.4S, v12.4S, v8.S[0] // ...........................e.................................................................... + mls v10.4S, v29.4S, v8.4S // .......................................................................*........................ + // gap // ................................................................................................ + // gap // ................................................................................................ + // gap // ................................................................................................ + // gap // ................................................................................................ + mls v27.4S, v15.4S, v8.S[0] // ......................e......................................................................... 
+ sqrdmulh v29.4S, v24.4S, v0.S[1] // ...................................................e............................................ + mul v14.4S, v24.4S, v0.S[0] // ..................................................e............................................. + str q23, [x0], #(16) // ............................................................................................*... + // gap // ................................................................................................ + // gap // ................................................................................................ + // gap // ................................................................................................ + srshr v12.4S, v18.4S, #23 // ........................................................................*....................... + mls v22.4S, v11.4S, v8.4S // ...........................................................................*.................... + mul v15.4S, v17.4S, v0.S[2] // ..............................e................................................................. + mul v11.4S, v5.4S, v25.4S // .........................................................................................*...... + // gap // ................................................................................................ + // gap // ................................................................................................ + // gap // ................................................................................................ + sqrdmulh v28.4S, v17.4S, v0.S[3] // ...............................e................................................................ + // gap // ................................................................................................ + str q10, [x0, #624] // .............................................................................*.................. 
+ mls v6.4S, v21.4S, v8.S[0] // ..........................................e..................................................... + mls v31.4S, v20.4S, v8.S[0] // ........................................................................................*....... + // gap // ................................................................................................ + // gap // ................................................................................................ + // gap // ................................................................................................ + add v5.4S, v7.4S, v30.4S // ..................................e............................................................. + sub v20.4S, v7.4S, v30.4S // .................................e.............................................................. + sub v10.4S, v27.4S, v13.4S // ...........................................e.................................................... + add v16.4S, v27.4S, v13.4S // ............................................e................................................... + mls v14.4S, v29.4S, v8.S[0] // ....................................................e........................................... + mls v18.4S, v12.4S, v8.4S // .........................................................................*...................... + // gap // ................................................................................................ + // gap // ................................................................................................ + // gap // ................................................................................................ + // gap // ................................................................................................ + mls v15.4S, v28.4S, v8.S[0] // ................................e............................................................... 
+ mul v24.4S, v20.4S, v0.S[2] // ...................................e............................................................ + sqrdmulh v12.4S, v20.4S, v0.S[3] // ....................................e........................................................... + str q22, [x0, #880] // ...............................................................................*................ + mls v11.4S, v4.4S, v8.S[0] // ...........................................................................................*.... + // gap // ................................................................................................ + // gap // ................................................................................................ + // gap // ................................................................................................ + mul v13.4S, v10.4S, v1.S[0] // .............................................e.................................................. + sqrdmulh v22.4S, v10.4S, v1.S[1] // ..............................................e................................................. + sub v21.4S, v5.4S, v16.4S // .....................................................e.......................................... + add v30.4S, v5.4S, v16.4S // ......................................................e......................................... + // gap // ................................................................................................ + str q31, [x0, #240] // ..............................................................................................*. + // gap // ................................................................................................ + // gap // ................................................................................................ + str q18, [x0, #752] // ..............................................................................*................. 
+ // gap // ................................................................................................ + // gap // ................................................................................................ + // gap // ................................................................................................ + add v17.4S, v9.4S, v19.4S // .................................................e.............................................. + // gap // ................................................................................................ + // gap // ................................................................................................ + srshr v4.4S, v14.4S, #23 // ....................................................................e........................... + mls v24.4S, v12.4S, v8.S[0] // .....................................e.......................................................... + sub v28.4S, v15.4S, v6.4S // ..........................................................e..................................... + // gap // ................................................................................................ + mul v10.4S, v21.4S, v0.S[0] // .......................................................e........................................ + sqrdmulh v21.4S, v21.4S, v0.S[1] // ........................................................e....................................... + // gap // ................................................................................................ + // gap // ................................................................................................ 
+ str q11, [x0, #368] // ...............................................................................................* + + // original source code + // ldr q9, [x0, #0] // ..............e.................................................................................|.............e................................................................................. + // ldr q10, [x0, #(1*(1024/8))] // ........e.......................................................................................|.......e....................................................................................... + // ldr q11, [x0, #(2*(1024/8))] // e...............................................................................................e............................................................................................... + // ldr q12, [x0, #(3*(1024/8))] // ..e.............................................................................................|.e............................................................................................. + // ldr q13, [x0, #(4*(1024/8))] // .................e..............................................................................|................e.............................................................................. + // ldr q14, [x0, #(5*(1024/8))] // .......e........................................................................................|......e........................................................................................ + // ldr q15, [x0, #(6*(1024/8))] // .............e..................................................................................|............e.................................................................................. + // ldr q16, [x0, #(7*(1024/8))] // .e..............................................................................................|e.............................................................................................. 
+ // sub v24.4s, v9.4s, v10.4s // ................................e...............................................................|...............................e............................................................... + // add v9.4s, v9.4s, v10.4s // .......................................e........................................................|......................................e........................................................ + // mul v10.4s, v24.4s, v1.s[2] // ...........................................e....................................................|..........................................e.................................................... + // sqrdmulh v24.4s, v24.4s, v1.s[3] // ..........................................e.....................................................|.........................................e..................................................... + // mls v10.4s, v24.4s, v8.s[0] // ....................................................e...........................................|...................................................e........................................... + // sub v24.4s, v11.4s, v12.4s // ........................e.......................................................................|.......................e....................................................................... + // add v11.4s, v11.4s, v12.4s // .......................e........................................................................|......................e........................................................................ + // mul v12.4s, v24.4s, v2.s[0] // .....................................e..........................................................|....................................e.......................................................... 
+ // sqrdmulh v24.4s, v24.4s, v2.s[1] // .................................e..............................................................|................................e.............................................................. + // mls v12.4s, v24.4s, v8.s[0] // .......................................................e........................................|......................................................e........................................ + // sub v24.4s, v13.4s, v14.4s // ....................................e...........................................................|...................................e........................................................... + // add v13.4s, v13.4s, v14.4s // ......................................e.........................................................|.....................................e......................................................... + // mul v14.4s, v24.4s, v2.s[2] // ................................................e...............................................|...............................................e............................................... + // sqrdmulh v24.4s, v24.4s, v2.s[3] // ............................................e...................................................|...........................................e................................................... + // mls v14.4s, v24.4s, v8.s[0] // ............................................................e...................................|...........................................................e................................... + // sub v24.4s, v15.4s, v16.4s // ..................................e.............................................................|.................................e............................................................. 
+ // add v15.4s, v15.4s, v16.4s // ...............................e................................................................|..............................e................................................................ + // mul v16.4s, v24.4s, v3.s[0] // .............................................e..................................................|............................................e.................................................. + // sqrdmulh v24.4s, v24.4s, v3.s[1] // ........................................e.......................................................|.......................................e....................................................... + // mls v16.4s, v24.4s, v8.s[0] // ..........................................................e.....................................|.........................................................e..................................... + // sub v24.4s, v9.4s, v11.4s // ...................................................e............................................|..................................................e............................................ + // add v9.4s, v9.4s, v11.4s // ...............................................e................................................|..............................................e................................................ + // mul v11.4s, v24.4s, v0.s[2] // ..................................................................e.............................|.................................................................e............................. + // sqrdmulh v24.4s, v24.4s, v0.s[3] // ....................................................................e...........................|...................................................................e........................... 
+ // mls v11.4s, v24.4s, v8.s[0] // ..............................................................................e.................|.............................................................................e................. + // sub v24.4s, v10.4s, v12.4s // .........................................................................e......................|........................................................................e...................... + // add v10.4s, v10.4s, v12.4s // ........................................................................e.......................|.......................................................................e....................... + // mul v12.4s, v24.4s, v0.s[2] // ...............................................................................e................|..............................................................................e................ + // sqrdmulh v24.4s, v24.4s, v0.s[3] // ................................................................................e...............|...............................................................................e............... + // mls v12.4s, v24.4s, v8.s[0] // ...........................................................................................e....|..........................................................................................e.... + // sub v24.4s, v13.4s, v15.4s // ..................................................e.............................................|.................................................e............................................. + // add v13.4s, v13.4s, v15.4s // ..............................................e.................................................|.............................................e................................................. 
+ // mul v15.4s, v24.4s, v1.s[0] // .........................................................e......................................|........................................................e...................................... + // sqrdmulh v24.4s, v24.4s, v1.s[1] // ........................................................e.......................................|.......................................................e....................................... + // mls v15.4s, v24.4s, v8.s[0] // ......................................................................e.........................|.....................................................................e......................... + // sub v24.4s, v14.4s, v16.4s // ..........................................................................e.....................|.........................................................................e..................... + // add v14.4s, v14.4s, v16.4s // ...........................................................................e....................|..........................................................................e.................... + // mul v16.4s, v24.4s, v1.s[0] // ...................................................................................e............|..................................................................................e............ + // sqrdmulh v24.4s, v24.4s, v1.s[1] // ....................................................................................e...........|...................................................................................e........... + // mls v16.4s, v24.4s, v8.s[0] // ...*............................................................................................|..*............................................................................................ 
+ // sub v24.4s, v9.4s, v13.4s // .....................................................e..........................................|....................................................e.......................................... + // add v9.4s, v9.4s, v13.4s // .........................................................................................e......|........................................................................................e...... + // mul v13.4s, v24.4s, v0.s[0] // ..............................................................e.................................|.............................................................e................................. + // sqrdmulh v24.4s, v24.4s, v0.s[1] // .............................................................e..................................|............................................................e.................................. + // mls v13.4s, v24.4s, v8.s[0] // ............................................................................e...................|...........................................................................e................... + // sub v24.4s, v10.4s, v14.4s // .....................................................................................e..........|....................................................................................e.......... + // add v10.4s, v10.4s, v14.4s // ......................................................................................e.........|.....................................................................................e......... + // mul v14.4s, v24.4s, v0.s[0] // .............................................................................................e..|............................................................................................e.. 
+ // sqrdmulh v24.4s, v24.4s, v0.s[1] // ..............................................................................................e.|.............................................................................................e. + // mls v14.4s, v24.4s, v8.s[0] // ................*...............................................................................|...............*............................................................................... + // sub v24.4s, v11.4s, v15.4s // ............................................................................................e...|...........................................................................................e... + // add v11.4s, v11.4s, v15.4s // ...........*....................................................................................|..........*.................................................................................... + // mul v15.4s, v24.4s, v0.s[0] // .........*......................................................................................|........*...................................................................................... + // sqrdmulh v24.4s, v24.4s, v0.s[1] // ...............*................................................................................|..............*................................................................................ + // mls v15.4s, v24.4s, v8.s[0] // .................................................*..............................................|................................................*.............................................. + // sub v24.4s, v12.4s, v16.4s // .....................*..........................................................................|....................*.......................................................................... 
+ // add v12.4s, v12.4s, v16.4s // ...................*............................................................................|..................*............................................................................ + // mul v16.4s, v24.4s, v0.s[0] // ...........................*....................................................................|..........................*.................................................................... + // sqrdmulh v24.4s, v24.4s, v0.s[1] // ..............................*.................................................................|.............................*................................................................. + // mls v16.4s, v24.4s, v8.s[0] // .........................................*......................................................|........................................*...................................................... + // srshr v24.4S, v13.4S, #23 // ..........................................................................................e.....|.........................................................................................e..... + // mls v13.4s, v24.4s, v8.4s // ..........*.....................................................................................|.........*..................................................................................... + // srshr v24.4S, v14.4S, #23 // .............................*..................................................................|............................*.................................................................. + // mls v14.4s, v24.4s, v8.4s // ...........................................................*....................................|..........................................................*.................................... 
+ // srshr v24.4S, v15.4S, #23 // ................................................................*...............................|...............................................................*............................... + // mls v15.4s, v24.4s, v8.4s // .............................................................................*..................|............................................................................*.................. + // srshr v24.4S, v16.4S, #23 // ......................................................*.........................................|.....................................................*......................................... + // mls v16.4s, v24.4s, v8.4s // .................................................................*..............................|................................................................*.............................. + // str q13, [x0, #(4*(1024/8))] // ......................*.........................................................................|.....................*......................................................................... + // str q14, [x0, #(5*(1024/8))] // .....................................................................*..........................|....................................................................*.......................... + // str q15, [x0, #(6*(1024/8))] // ........................................................................................*.......|.......................................................................................*....... + // str q16, [x0, #(7*(1024/8))] // .................................................................................*..............|................................................................................*.............. 
+ // mul v13.4s, v9.4s, v25.4s // .....*..........................................................................................|....*.......................................................................................... + // sqrdmulh v9.4s, v9.4s, v26.4s // ....*...........................................................................................|...*........................................................................................... + // mls v13.4s, v9.4s, v8.s[0] // ..........................*.....................................................................|.........................*..................................................................... + // mul v14.4s, v10.4s, v25.4s // ............*...................................................................................|...........*................................................................................... + // sqrdmulh v10.4s, v10.4s, v26.4s // ......*.........................................................................................|.....*......................................................................................... + // mls v14.4s, v10.4s, v8.s[0] // .........................*......................................................................|........................*...................................................................... + // mul v15.4s, v11.4s, v25.4s // ..................*.............................................................................|.................*............................................................................. + // sqrdmulh v11.4s, v11.4s, v26.4s // ....................*...........................................................................|...................*........................................................................... 
+ // mls v15.4s, v11.4s, v8.s[0] // .......................................................................*........................|......................................................................*........................ + // mul v16.4s, v12.4s, v25.4s // ...................................................................*............................|..................................................................*............................ + // sqrdmulh v12.4s, v12.4s, v26.4s // ............................*...................................................................|...........................*................................................................... + // mls v16.4s, v12.4s, v8.s[0] // ..................................................................................*.............|.................................................................................*............. + // str q13, [x0], #(16) // ...............................................................*................................|..............................................................*................................ + // str q14, [x0, #(-16 + 1*(1024/8))] // ...................................*............................................................|..................................*............................................................ + // str q15, [x0, #(-16 + 2*(1024/8))] // .......................................................................................*........|......................................................................................*........ + // str q16, [x0, #(-16 + 3*(1024/8))] // ...............................................................................................*|..............................................................................................* + + sub count, count, #1 + cbnz count, layer123_start + mul v9.4S, v30.4S, v25.4S // .......*.............................. 
+ sqrdmulh v31.4S, v17.4S, v26.4S // .*.................................... + sqrdmulh v7.4S, v30.4S, v26.4S // ...*.................................. + mls v13.4S, v22.4S, v8.S[0] // *..................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + mul v22.4S, v17.4S, v25.4S // ..*................................... + mul v5.4S, v28.4S, v0.S[0] // ....*................................. + sqrdmulh v27.4S, v28.4S, v0.S[1] // ........*............................. + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + mls v14.4S, v4.4S, v8.4S // .....*................................ + mls v10.4S, v21.4S, v8.S[0] // .........*............................ + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + mls v9.4S, v7.4S, v8.S[0] // ...............*...................... + sub v19.4S, v24.4S, v13.4S // .............*........................ + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + add v17.4S, v24.4S, v13.4S // ...........*.......................... + // gap // ...................................... + mls v22.4S, v31.4S, v8.S[0] // ................*..................... + mls v5.4S, v27.4S, v8.S[0] // .......................*.............. 
+ // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + str q14, [x0, #512] // ..............*....................... + mul v31.4S, v19.4S, v0.S[0] // .................*.................... + sqrdmulh v4.4S, v19.4S, v0.S[1] // ....................*................. + sqrdmulh v27.4S, v17.4S, v26.4S // ..................*................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + str q9, [x0, #128] // .....................*................ + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + add v7.4S, v15.4S, v6.4S // ......*............................... + // gap // ...................................... + // gap // ...................................... + srshr v15.4S, v10.4S, #23 // ...................*.................. + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + mul v18.4S, v17.4S, v25.4S // .............................*........ + str q22, [x0], #(16) // ..........................*........... + mls v31.4S, v4.4S, v8.S[0] // ......................*............... + srshr v20.4S, v5.4S, #23 // ...........................*.......... + // gap // ...................................... 
+ // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + mul v23.4S, v7.4S, v25.4S // ..........*........................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + sqrdmulh v12.4S, v7.4S, v26.4S // ............*......................... + mls v10.4S, v15.4S, v8.4S // .........................*............ + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + mls v18.4S, v27.4S, v8.S[0] // ..................................*... + srshr v6.4S, v31.4S, #23 // ........................*............. + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + mls v5.4S, v20.4S, v8.4S // ................................*..... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + mls v23.4S, v12.4S, v8.S[0] // ...............................*...... 
+ str q10, [x0, #624] // ..............................*....... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + mls v31.4S, v6.4S, v8.4S // ............................*......... + str q18, [x0, #368] // .....................................* + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + str q5, [x0, #752] // ....................................*. + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + str q23, [x0, #240] // ...................................*.. + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + str q31, [x0, #880] // .................................*.... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... 
+ // gap // ...................................... + // gap // ...................................... + + // original source code + // mls v13.4S, v22.4S, v8.S[0] // ...*.................................. + // sqrdmulh v29.4S, v17.4S, v26.4S // .*.................................... + // mul v23.4S, v17.4S, v25.4S // ....*................................. + // sqrdmulh v22.4S, v30.4S, v26.4S // ..*................................... + // mul v18.4S, v28.4S, v0.S[0] // .....*................................ + // mls v14.4S, v4.4S, v8.4S // .......*.............................. + // add v20.4S, v15.4S, v6.4S // ...................*.................. + // mul v27.4S, v30.4S, v25.4S // *..................................... + // sqrdmulh v17.4S, v28.4S, v0.S[1] // ......*............................... + // mls v10.4S, v21.4S, v8.S[0] // ........*............................. + // mul v31.4S, v20.4S, v25.4S // .........................*............ + // add v5.4S, v24.4S, v13.4S // ...........*.......................... + // sqrdmulh v20.4S, v20.4S, v26.4S // ..........................*........... + // sub v28.4S, v24.4S, v13.4S // ..........*........................... + // str q14, [x0, #512] // ..............*....................... + // mls v27.4S, v22.4S, v8.S[0] // .........*............................ + // mls v23.4S, v29.4S, v8.S[0] // ............*......................... + // mul v22.4S, v28.4S, v0.S[0] // ...............*...................... + // sqrdmulh v4.4S, v5.4S, v26.4S // .................*.................... + // srshr v29.4S, v10.4S, #23 // ....................*................. + // sqrdmulh v11.4S, v28.4S, v0.S[1] // ................*..................... + // str q27, [x0, #128] // ..................*................... + // mls v22.4S, v11.4S, v8.S[0] // .......................*.............. + // mls v18.4S, v17.4S, v8.S[0] // .............*........................ + // srshr v11.4S, v22.4S, #23 // .............................*........ 
+ // mls v10.4S, v29.4S, v8.4S // ...........................*.......... + // str q23, [x0], #(16) // ......................*............... + // srshr v12.4S, v18.4S, #23 // ........................*............. + // mls v22.4S, v11.4S, v8.4S // .................................*.... + // mul v11.4S, v5.4S, v25.4S // .....................*................ + // str q10, [x0, #624] // ................................*..... + // mls v31.4S, v20.4S, v8.S[0] // ...............................*...... + // mls v18.4S, v12.4S, v8.4S // ..............................*....... + // str q22, [x0, #880] // .....................................* + // mls v11.4S, v4.4S, v8.S[0] // ............................*......... + // str q31, [x0, #240] // ....................................*. + // str q18, [x0, #752] // ...................................*.. + // str q11, [x0, #368] // ..................................*... + + + pop_stack + ret \ No newline at end of file diff --git a/tests/ntt_dilithium/manual/pqclean.h b/tests/ntt_dilithium/manual/pqclean.h index 84e997e..500b934 100644 --- a/tests/ntt_dilithium/manual/pqclean.h +++ b/tests/ntt_dilithium/manual/pqclean.h @@ -76,6 +76,6 @@ const int32_t streamlined_GS_itable_Q1_jump_extended[((NTT_N - 1) + (1 << 0) + ( // #define ntt DILITHIUM_NAMESPACE(pqclean_ntt) void pqclean_ntt(int32_t a[ARRAY_N]); // #define pqclean_invntt_tomont DILITHIUM_NAMESPACE(invntt_tomont) -void invntt_tomont(int32_t a[ARRAY_N]); +void pqclean_invntt_tomont(int32_t a[ARRAY_N]); #endif \ No newline at end of file From 7ecd33ac47aa196a2011e23486d0e513a6afeb53 Mon Sep 17 00:00:00 2001 From: Amin Abdulrahman Date: Sat, 23 Mar 2024 17:46:22 -0400 Subject: [PATCH 12/18] Add invNTT Kyber tests --- .../intt_kyber_123_4567_manual_ld4_opt_a55.s | 1 + .../intt_kyber_123_4567_manual_ld4_opt_a72.s | 1 + ...ber_123_4567_manual_ld4_opt_m1_firestorm.s | 1 + ...yber_123_4567_manual_ld4_opt_m1_icestorm.s | 1 + .../ntt_kyber/intt_kyber_123_4567_opt_a55.s | 1 + 
.../ntt_kyber/intt_kyber_123_4567_opt_a72.s | 1 + .../intt_kyber_123_4567_opt_m1_firestorm.s | 1 + .../intt_kyber_123_4567_opt_m1_icestorm.s | 1 + tests/ntt_kyber/main.c | 64 +- .../intt_kyber_123_4567_manual_ld4_opt_a55.s | 1482 +++++++++++++ .../intt_kyber_123_4567_manual_ld4_opt_a72.s | 1823 ++++++++++++++++ ...ber_123_4567_manual_ld4_opt_m1_firestorm.s | 1922 +++++++++++++++++ ...yber_123_4567_manual_ld4_opt_m1_icestorm.s | 1470 +++++++++++++ .../manual/intt_kyber_123_4567_opt_a55.s | 1516 +++++++++++++ .../manual/intt_kyber_123_4567_opt_a72.s | 1845 ++++++++++++++++ .../intt_kyber_123_4567_opt_m1_firestorm.s | 1728 +++++++++++++++ .../intt_kyber_123_4567_opt_m1_icestorm.s | 1420 ++++++++++++ 17 files changed, 13271 insertions(+), 7 deletions(-) create mode 120000 asm/manual/ntt_kyber/intt_kyber_123_4567_manual_ld4_opt_a55.s create mode 120000 asm/manual/ntt_kyber/intt_kyber_123_4567_manual_ld4_opt_a72.s create mode 120000 asm/manual/ntt_kyber/intt_kyber_123_4567_manual_ld4_opt_m1_firestorm.s create mode 120000 asm/manual/ntt_kyber/intt_kyber_123_4567_manual_ld4_opt_m1_icestorm.s create mode 120000 asm/manual/ntt_kyber/intt_kyber_123_4567_opt_a55.s create mode 120000 asm/manual/ntt_kyber/intt_kyber_123_4567_opt_a72.s create mode 120000 asm/manual/ntt_kyber/intt_kyber_123_4567_opt_m1_firestorm.s create mode 120000 asm/manual/ntt_kyber/intt_kyber_123_4567_opt_m1_icestorm.s create mode 100644 tests/ntt_kyber/manual/intt_kyber_123_4567_manual_ld4_opt_a55.s create mode 100644 tests/ntt_kyber/manual/intt_kyber_123_4567_manual_ld4_opt_a72.s create mode 100644 tests/ntt_kyber/manual/intt_kyber_123_4567_manual_ld4_opt_m1_firestorm.s create mode 100644 tests/ntt_kyber/manual/intt_kyber_123_4567_manual_ld4_opt_m1_icestorm.s create mode 100644 tests/ntt_kyber/manual/intt_kyber_123_4567_opt_a55.s create mode 100644 tests/ntt_kyber/manual/intt_kyber_123_4567_opt_a72.s create mode 100644 tests/ntt_kyber/manual/intt_kyber_123_4567_opt_m1_firestorm.s create mode 100644 
tests/ntt_kyber/manual/intt_kyber_123_4567_opt_m1_icestorm.s diff --git a/asm/manual/ntt_kyber/intt_kyber_123_4567_manual_ld4_opt_a55.s b/asm/manual/ntt_kyber/intt_kyber_123_4567_manual_ld4_opt_a55.s new file mode 120000 index 0000000..d334a2c --- /dev/null +++ b/asm/manual/ntt_kyber/intt_kyber_123_4567_manual_ld4_opt_a55.s @@ -0,0 +1 @@ +../../../slothy/examples/opt/aarch64/intt_kyber_123_4567_manual_ld4_opt_a55.s \ No newline at end of file diff --git a/asm/manual/ntt_kyber/intt_kyber_123_4567_manual_ld4_opt_a72.s b/asm/manual/ntt_kyber/intt_kyber_123_4567_manual_ld4_opt_a72.s new file mode 120000 index 0000000..a5d1a18 --- /dev/null +++ b/asm/manual/ntt_kyber/intt_kyber_123_4567_manual_ld4_opt_a72.s @@ -0,0 +1 @@ +../../../slothy/examples/opt/aarch64/intt_kyber_123_4567_manual_ld4_opt_a72.s \ No newline at end of file diff --git a/asm/manual/ntt_kyber/intt_kyber_123_4567_manual_ld4_opt_m1_firestorm.s b/asm/manual/ntt_kyber/intt_kyber_123_4567_manual_ld4_opt_m1_firestorm.s new file mode 120000 index 0000000..ef3f0dd --- /dev/null +++ b/asm/manual/ntt_kyber/intt_kyber_123_4567_manual_ld4_opt_m1_firestorm.s @@ -0,0 +1 @@ +../../../slothy/examples/opt/aarch64/intt_kyber_123_4567_manual_ld4_opt_m1_firestorm.s \ No newline at end of file diff --git a/asm/manual/ntt_kyber/intt_kyber_123_4567_manual_ld4_opt_m1_icestorm.s b/asm/manual/ntt_kyber/intt_kyber_123_4567_manual_ld4_opt_m1_icestorm.s new file mode 120000 index 0000000..181fe62 --- /dev/null +++ b/asm/manual/ntt_kyber/intt_kyber_123_4567_manual_ld4_opt_m1_icestorm.s @@ -0,0 +1 @@ +../../../slothy/examples/opt/aarch64/intt_kyber_123_4567_manual_ld4_opt_m1_icestorm.s \ No newline at end of file diff --git a/asm/manual/ntt_kyber/intt_kyber_123_4567_opt_a55.s b/asm/manual/ntt_kyber/intt_kyber_123_4567_opt_a55.s new file mode 120000 index 0000000..ca8fbf8 --- /dev/null +++ b/asm/manual/ntt_kyber/intt_kyber_123_4567_opt_a55.s @@ -0,0 +1 @@ +../../../slothy/examples/opt/aarch64/intt_kyber_123_4567_opt_a55.s \ No newline 
at end of file diff --git a/asm/manual/ntt_kyber/intt_kyber_123_4567_opt_a72.s b/asm/manual/ntt_kyber/intt_kyber_123_4567_opt_a72.s new file mode 120000 index 0000000..d5a6fe5 --- /dev/null +++ b/asm/manual/ntt_kyber/intt_kyber_123_4567_opt_a72.s @@ -0,0 +1 @@ +../../../slothy/examples/opt/aarch64/intt_kyber_123_4567_opt_a72.s \ No newline at end of file diff --git a/asm/manual/ntt_kyber/intt_kyber_123_4567_opt_m1_firestorm.s b/asm/manual/ntt_kyber/intt_kyber_123_4567_opt_m1_firestorm.s new file mode 120000 index 0000000..77f81da --- /dev/null +++ b/asm/manual/ntt_kyber/intt_kyber_123_4567_opt_m1_firestorm.s @@ -0,0 +1 @@ +../../../slothy/examples/opt/aarch64/intt_kyber_123_4567_opt_m1_firestorm.s \ No newline at end of file diff --git a/asm/manual/ntt_kyber/intt_kyber_123_4567_opt_m1_icestorm.s b/asm/manual/ntt_kyber/intt_kyber_123_4567_opt_m1_icestorm.s new file mode 120000 index 0000000..9fc1c83 --- /dev/null +++ b/asm/manual/ntt_kyber/intt_kyber_123_4567_opt_m1_icestorm.s @@ -0,0 +1 @@ +../../../slothy/examples/opt/aarch64/intt_kyber_123_4567_opt_m1_icestorm.s \ No newline at end of file diff --git a/tests/ntt_kyber/main.c b/tests/ntt_kyber/main.c index ef5833e..52bd561 100644 --- a/tests/ntt_kyber/main.c +++ b/tests/ntt_kyber/main.c @@ -55,12 +55,16 @@ void ntt_kyber_123_4567_opt_a55(int16_t *); void ntt_kyber_123_4567_scalar_load_opt_a55(int16_t *); void ntt_kyber_123_4567_scalar_load_store_opt_a55(int16_t *); void ntt_kyber_123_4567_scalar_store_opt_a55(int16_t *); +void intt_kyber_123_4567_opt_a55(int16_t *); +void intt_kyber_123_4567_manual_ld4_opt_a55(int16_t *); // A72 void ntt_kyber_123_4567_manual_st4_opt_a72(int16_t *); void ntt_kyber_123_4567_opt_a72(int16_t *); void ntt_kyber_123_4567_scalar_load_opt_a72(int16_t *); void ntt_kyber_123_4567_scalar_load_store_opt_a72(int16_t *); void ntt_kyber_123_4567_scalar_store_opt_a72(int16_t *); +void intt_kyber_123_4567_opt_a72(int16_t *); +void intt_kyber_123_4567_manual_ld4_opt_a72(int16_t *); // M1 Firestorm 
void ntt_kyber_123_4567_opt_m1_firestorm(int16_t *); void ntt_kyber_123_4567_scalar_load_opt_m1_firestorm(int16_t *); @@ -69,6 +73,8 @@ void ntt_kyber_123_4567_manual_st4_opt_m1_firestorm(int16_t *); void ntt_kyber_123_4567_scalar_store_opt_m1_firestorm(int16_t *); /* void ntt_kyber_1234_567_opt_m1_firestorm(int16_t *); */ /* void ntt_kyber_1234_567_manual_st4_opt_m1_firestorm(int16_t *); */ +void intt_kyber_123_4567_opt_m1_firestorm(int16_t *); +void intt_kyber_123_4567_manual_ld4_opt_m1_firestorm(int16_t *); // M1 Icestorm void ntt_kyber_123_4567_manual_st4_opt_m1_icestorm(int16_t *); @@ -78,6 +84,8 @@ void ntt_kyber_123_4567_scalar_load_store_opt_m1_icestorm(int16_t *); void ntt_kyber_123_4567_scalar_store_opt_m1_icestorm(int16_t *); /* void ntt_kyber_1234_567_opt_m1_icestorm(int16_t *); */ /* void ntt_kyber_1234_567_manual_st4_opt_m1_icestorm(int16_t *); */ +void intt_kyber_123_4567_opt_m1_icestorm(int16_t *); +void intt_kyber_123_4567_manual_ld4_opt_m1_icestorm(int16_t *); #define NTT_LAYERS 8 #define NTT_SIZE (1u << NTT_LAYERS) @@ -366,19 +374,25 @@ MAKE_TEST_FWD(asm_1234_567, 0, ntt_kyber_1234_567, ntt_ct,0,1,1) MAKE_TEST_FWD(asm_123_4567_inv, 1, intt_kyber_123_4567, invntt_gs,0,0,1) MAKE_TEST_FWD(asm_123_4567_inv_manual_ld4, 1, intt_kyber_123_4567_manual_ld4, invntt_gs,0,0,1) // Check against neon-ntt for comparability -MAKE_TEST_FWD(asm_vs_neonntt_123_4567_inv, 1, intt_kyber_123_4567, invntt,0,1,0) +// (both results are not additionally reduced for comparison reasons) +MAKE_TEST_FWD(asm_vs_pqclean_123_4567_inv, 1, intt_kyber_123_4567, pqclean_invntt,0,1,0) +MAKE_TEST_FWD(asm_vs_pqclean_123_4567_inv_manual_ld4, 1, intt_kyber_123_4567_manual_ld4, pqclean_invntt,0,1,0) // A55 MAKE_TEST_FWD(asm_123_4567_manual_st4_opt_a55, 0, ntt_kyber_123_4567_manual_st4_opt_a55, ntt_ct,0,1,1) MAKE_TEST_FWD(asm_123_4567_opt_a55, 0, ntt_kyber_123_4567_opt_a55, ntt_ct,0,1,1) MAKE_TEST_FWD(asm_123_4567_scalar_load_opt_a55, 0, ntt_kyber_123_4567_scalar_load_opt_a55, ntt_ct,0,1,1) 
MAKE_TEST_FWD(asm_123_4567_scalar_load_store_opt_a55, 0, ntt_kyber_123_4567_scalar_load_store_opt_a55, ntt_ct,0,1,1) MAKE_TEST_FWD(asm_123_4567_scalar_store_opt_a55, 0, ntt_kyber_123_4567_scalar_store_opt_a55, ntt_ct,0,1,1) +MAKE_TEST_FWD(asm_123_4567_inv_opt_a55, 0, intt_kyber_123_4567_opt_a55, invntt_gs,0,0,1) +MAKE_TEST_FWD(asm_123_4567_inv_manual_ld4_opt_a55, 0, intt_kyber_123_4567_manual_ld4_opt_a55, invntt_gs,0,0,1) // A72 MAKE_TEST_FWD(asm_123_4567_manual_st4_opt_a72, 0, ntt_kyber_123_4567_manual_st4_opt_a72, ntt_ct,0,1,1) MAKE_TEST_FWD(asm_123_4567_opt_a72, 0, ntt_kyber_123_4567_opt_a72, ntt_ct,0,1,1) MAKE_TEST_FWD(asm_123_4567_scalar_load_opt_a72, 0, ntt_kyber_123_4567_scalar_load_opt_a72, ntt_ct,0,1,1) MAKE_TEST_FWD(asm_123_4567_scalar_load_store_opt_a72, 0, ntt_kyber_123_4567_scalar_load_store_opt_a72, ntt_ct,0,1,1) MAKE_TEST_FWD(asm_123_4567_scalar_store_opt_a72, 0, ntt_kyber_123_4567_scalar_store_opt_a72, ntt_ct,0,1,1) +MAKE_TEST_FWD(asm_123_4567_inv_opt_a72, 0, intt_kyber_123_4567_opt_a72, invntt_gs,0,0,1) +MAKE_TEST_FWD(asm_123_4567_inv_manual_ld4_opt_a72, 0, intt_kyber_123_4567_manual_ld4_opt_a72, invntt_gs,0,0,1) // M1 Firestorm MAKE_TEST_FWD(asm_123_4567_opt_m1_firestorm, 0, ntt_kyber_123_4567_opt_m1_firestorm,ntt_ct,0,1,1) MAKE_TEST_FWD(asm_123_4567_scalar_load_opt_m1_firestorm, 0, ntt_kyber_123_4567_scalar_load_opt_m1_firestorm,ntt_ct,0,1,1) @@ -387,6 +401,8 @@ MAKE_TEST_FWD(asm_123_4567_scalar_load_store_opt_m1_firestorm, 0, ntt_kyber_123_ MAKE_TEST_FWD(asm_123_4567_manual_st4_opt_m1_firestorm, 0, ntt_kyber_123_4567_manual_st4_opt_m1_firestorm,ntt_ct,0,1,1) MAKE_TEST_FWD(asm_123_4567_scalar_store_opt_m1_firestorm, 0, ntt_kyber_123_4567_scalar_store_opt_m1_firestorm,ntt_ct,0,1,1) /* MAKE_TEST_FWD(asm_1234_567_manual_st4_opt_m1_firestorm, ntt_kyber_1234_567_manual_st4_opt_m1_firestorm,0,1) */ +MAKE_TEST_FWD(asm_123_4567_inv_opt_m1_firestorm, 0, intt_kyber_123_4567_opt_m1_firestorm, invntt_gs,0,0,1) 
+MAKE_TEST_FWD(asm_123_4567_inv_manual_ld4_opt_m1_firestorm, 0, intt_kyber_123_4567_manual_ld4_opt_m1_firestorm, invntt_gs,0,0,1) // M1 Icestorm MAKE_TEST_FWD(asm_123_4567_manual_st4_opt_m1_icestorm, 0, ntt_kyber_123_4567_manual_st4_opt_m1_icestorm,ntt_ct,0,1,1) MAKE_TEST_FWD(asm_123_4567_opt_m1_icestorm, 0, ntt_kyber_123_4567_opt_m1_icestorm,ntt_ct,0,1,1) @@ -395,10 +411,13 @@ MAKE_TEST_FWD(asm_123_4567_scalar_load_store_opt_m1_icestorm, 0, ntt_kyber_123_4 MAKE_TEST_FWD(asm_123_4567_scalar_store_opt_m1_icestorm, 0, ntt_kyber_123_4567_scalar_store_opt_m1_icestorm,ntt_ct,0,1,1) /* MAKE_TEST_FWD(asm_1234_567_opt_m1_icestorm, ntt_kyber_1234_567_opt_m1_icestorm,0,1) */ /* MAKE_TEST_FWD(asm_1234_567_manual_st4_opt_m1_icestorm, ntt_kyber_1234_567_manual_st4_opt_m1_icestorm,0,1) */ +MAKE_TEST_FWD(asm_123_4567_inv_opt_m1_icestorm, 0, intt_kyber_123_4567_opt_m1_icestorm, invntt_gs,0,0,1) +MAKE_TEST_FWD(asm_123_4567_inv_manual_ld4_opt_m1_icestorm, 0, intt_kyber_123_4567_manual_ld4_opt_m1_icestorm, invntt_gs,0,0,1) // other MAKE_TEST_FWD(neonntt, 0, ntt, ntt_ct,0,1,1) MAKE_TEST_FWD(pqclean, 0, pqclean_ntt, ntt_ct,0,1,1) MAKE_TEST_FWD(neonntt_inv, 1, invntt, invntt_gs,0,0,1) +MAKE_TEST_FWD(pqclean_inv, 1, pqclean_invntt, invntt_gs,0,0,1) uint64_t t0, t1; uint64_t cycles[TEST_COUNT]; @@ -443,12 +462,20 @@ MAKE_BENCH(asm_123_4567_opt_a55, ntt_kyber_123_4567_opt_a55) MAKE_BENCH(asm_123_4567_scalar_load_opt_a55, ntt_kyber_123_4567_scalar_load_opt_a55) MAKE_BENCH(asm_123_4567_scalar_load_store_opt_a55, ntt_kyber_123_4567_scalar_load_store_opt_a55) MAKE_BENCH(asm_123_4567_scalar_store_opt_a55, ntt_kyber_123_4567_scalar_store_opt_a55) +// inv +MAKE_BENCH(asm_123_4567_inv_opt_a55, intt_kyber_123_4567_opt_a55) +MAKE_BENCH(asm_123_4567_inv_manual_ld4_opt_a55, intt_kyber_123_4567_manual_ld4_opt_a55) + // A72 MAKE_BENCH(asm_123_4567_manual_st4_opt_a72, ntt_kyber_123_4567_manual_st4_opt_a72) MAKE_BENCH(asm_123_4567_opt_a72, ntt_kyber_123_4567_opt_a72) 
MAKE_BENCH(asm_123_4567_scalar_load_opt_a72, ntt_kyber_123_4567_scalar_load_opt_a72) MAKE_BENCH(asm_123_4567_scalar_load_store_opt_a72, ntt_kyber_123_4567_scalar_load_store_opt_a72) MAKE_BENCH(asm_123_4567_scalar_store_opt_a72, ntt_kyber_123_4567_scalar_store_opt_a72) +// inv +MAKE_BENCH(asm_123_4567_inv_opt_a72, intt_kyber_123_4567_opt_a72) +MAKE_BENCH(asm_123_4567_inv_manual_ld4_opt_a72, intt_kyber_123_4567_manual_ld4_opt_a72) + // M1 Firestorm MAKE_BENCH(asm_123_4567_opt_m1_firestorm, ntt_kyber_123_4567_opt_m1_firestorm) MAKE_BENCH(asm_123_4567_scalar_load_opt_m1_firestorm, ntt_kyber_123_4567_scalar_load_opt_m1_firestorm) @@ -457,6 +484,10 @@ MAKE_BENCH(asm_123_4567_scalar_load_store_opt_m1_firestorm, ntt_kyber_123_4567_s MAKE_BENCH(asm_123_4567_manual_st4_opt_m1_firestorm, ntt_kyber_123_4567_manual_st4_opt_m1_firestorm) MAKE_BENCH(asm_123_4567_scalar_store_opt_m1_firestorm, ntt_kyber_123_4567_scalar_store_opt_m1_firestorm) /* MAKE_BENCH(asm_1234_567_manual_st4_opt_m1_firestorm, ntt_kyber_1234_567_manual_st4_opt_m1_firestorm) */ +// inv +MAKE_BENCH(asm_123_4567_inv_opt_m1_firestorm, intt_kyber_123_4567_opt_m1_firestorm) +MAKE_BENCH(asm_123_4567_inv_manual_ld4_opt_m1_firestorm, intt_kyber_123_4567_manual_ld4_opt_m1_firestorm) + // M1 Icestorm MAKE_BENCH(asm_123_4567_manual_st4_opt_m1_icestorm, ntt_kyber_123_4567_manual_st4_opt_m1_icestorm) MAKE_BENCH(asm_123_4567_opt_m1_icestorm, ntt_kyber_123_4567_opt_m1_icestorm) @@ -465,10 +496,15 @@ MAKE_BENCH(asm_123_4567_scalar_load_store_opt_m1_icestorm, ntt_kyber_123_4567_sc MAKE_BENCH(asm_123_4567_scalar_store_opt_m1_icestorm, ntt_kyber_123_4567_scalar_store_opt_m1_icestorm) /* MAKE_BENCH(asm_1234_567_opt_m1_icestorm, ntt_kyber_1234_567_opt_m1_icestorm) */ /* MAKE_BENCH(asm_1234_567_manual_st4_opt_m1_icestorm, ntt_kyber_1234_567_manual_st4_opt_m1_icestorm) */ +// inv +MAKE_BENCH(asm_123_4567_inv_opt_m1_icestorm, intt_kyber_123_4567_opt_m1_icestorm) +MAKE_BENCH(asm_123_4567_inv_manual_ld4_opt_m1_icestorm, 
intt_kyber_123_4567_manual_ld4_opt_m1_icestorm) + // other MAKE_BENCH(neonntt,ntt) MAKE_BENCH(pqclean,pqclean_ntt) MAKE_BENCH(neonntt_inv,invntt) +MAKE_BENCH(pqclean_inv,pqclean_invntt) int main( void ) { @@ -514,10 +550,8 @@ int main( void ) return (1); } - if (test_ntt_asm_vs_neonntt_123_4567_inv() != 0) - { - return (1); - } + if (test_ntt_asm_vs_pqclean_123_4567_inv() != 0){return (1);} + if (test_ntt_asm_vs_pqclean_123_4567_inv_manual_ld4() != 0){return (1);} if (test_ntt_asm_123_4567_manual_st4_opt_a55() != 0) { @@ -544,6 +578,9 @@ int main( void ) return (1); } + if (test_ntt_asm_123_4567_inv_opt_a55() != 0){return (1);} + if (test_ntt_asm_123_4567_inv_manual_ld4_opt_a55() != 0){return (1);} + if (test_ntt_asm_123_4567_manual_st4_opt_a72() != 0) { return (1); @@ -568,6 +605,8 @@ int main( void ) { return (1); } + if (test_ntt_asm_123_4567_inv_opt_a72() != 0){return (1);} + if (test_ntt_asm_123_4567_inv_manual_ld4_opt_a72() != 0){return (1);} // M1 Firestorm if(test_ntt_asm_123_4567_opt_m1_firestorm() != 0){return (1);} if(test_ntt_asm_123_4567_scalar_load_opt_m1_firestorm() != 0){return (1);} @@ -576,6 +615,8 @@ int main( void ) if(test_ntt_asm_123_4567_scalar_store_opt_m1_firestorm() != 0){return (1);} /* if(test_ntt_asm_1234_567_opt_m1_firestorm() != 0){return (1);} */ /* if(test_ntt_asm_1234_567_manual_st4_opt_m1_firestorm() != 0){return (1);} */ + if (test_ntt_asm_123_4567_inv_opt_m1_firestorm() != 0){return (1);} + if (test_ntt_asm_123_4567_inv_manual_ld4_opt_m1_firestorm() != 0){return (1);} // M1 Icestorm if(test_ntt_asm_123_4567_manual_st4_opt_m1_icestorm() != 0){return (1);} @@ -596,8 +637,8 @@ int main( void ) { return(1); } - if( test_ntt_pqclean()!= 0 ) - return(1); + if( test_ntt_pqclean()!= 0 ){return(1);} + if( test_ntt_pqclean_inv()!= 0 ){return(1);} #endif /* DO_TEST */ #if defined(DO_BENCH) @@ -617,12 +658,16 @@ int main( void ) bench_ntt_asm_123_4567_scalar_load_opt_a55(); bench_ntt_asm_123_4567_scalar_load_store_opt_a55(); 
bench_ntt_asm_123_4567_scalar_store_opt_a55(); + bench_ntt_asm_123_4567_inv_opt_a55(); + bench_ntt_asm_123_4567_inv_manual_ld4_opt_a55(); /* A72 */ bench_ntt_asm_123_4567_manual_st4_opt_a72(); bench_ntt_asm_123_4567_opt_a72(); bench_ntt_asm_123_4567_scalar_load_opt_a72(); bench_ntt_asm_123_4567_scalar_load_store_opt_a72(); bench_ntt_asm_123_4567_scalar_store_opt_a72(); + bench_ntt_asm_123_4567_inv_opt_a72(); + bench_ntt_asm_123_4567_inv_manual_ld4_opt_a72(); // M1 Firestorm bench_ntt_asm_123_4567_opt_m1_firestorm(); bench_ntt_asm_123_4567_scalar_load_opt_m1_firestorm(); @@ -631,6 +676,8 @@ int main( void ) bench_ntt_asm_123_4567_scalar_store_opt_m1_firestorm(); /* bench_ntt_asm_1234_567_opt_m1_firestorm(); */ /* bench_ntt_asm_1234_567_manual_st4_opt_m1_firestorm(); */ + bench_ntt_asm_123_4567_inv_opt_m1_firestorm(); + bench_ntt_asm_123_4567_inv_manual_ld4_opt_m1_firestorm(); // M1 Icestorm bench_ntt_asm_123_4567_manual_st4_opt_m1_icestorm(); bench_ntt_asm_123_4567_opt_m1_icestorm(); @@ -639,11 +686,14 @@ int main( void ) bench_ntt_asm_123_4567_scalar_store_opt_m1_icestorm(); /* bench_ntt_asm_1234_567_opt_m1_icestorm(); */ /* bench_ntt_asm_1234_567_manual_st4_opt_m1_icestorm(); */ + bench_ntt_asm_123_4567_inv_opt_m1_icestorm(); + bench_ntt_asm_123_4567_inv_manual_ld4_opt_m1_icestorm(); bench_ntt_neonntt(); bench_ntt_pqclean(); bench_ntt_neonntt_inv(); + bench_ntt_pqclean_inv(); #endif /* DO_BENCH */ debug_printf( "- Disable cycle counter ..." 
); diff --git a/tests/ntt_kyber/manual/intt_kyber_123_4567_manual_ld4_opt_a55.s b/tests/ntt_kyber/manual/intt_kyber_123_4567_manual_ld4_opt_a55.s new file mode 100644 index 0000000..1795f87 --- /dev/null +++ b/tests/ntt_kyber/manual/intt_kyber_123_4567_manual_ld4_opt_a55.s @@ -0,0 +1,1482 @@ +/// +/// Copyright (c) 2022 Arm Limited +/// Copyright (c) 2022 Hanno Becker +/// Copyright (c) 2023 Amin Abdulrahman, Matthias Kannwischer +/// SPDX-License-Identifier: MIT +/// +/// Permission is hereby granted, free of charge, to any person obtaining a copy +/// of this software and associated documentation files (the "Software"), to deal +/// in the Software without restriction, including without limitation the rights +/// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +/// copies of the Software, and to permit persons to whom the Software is +/// furnished to do so, subject to the following conditions: +/// +/// The above copyright notice and this permission notice shall be included in all +/// copies or substantial portions of the Software. +/// +/// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +/// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +/// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +/// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +/// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +/// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +/// SOFTWARE. +/// + +// Needed to provide ASM_LOAD directive +#include + +// NOTE +// We use a lot of trivial macros to simplify the parsing burden for Slothy +// The macros are not unfolded by Slothy and thus interpreted as instructions, +// which are easier to parse due to e.g. the lack of size specifiers and simpler +// syntax for pre and post increment for loads and stores. 
+
+// Eventually, NeLight should include a proper parser for AArch64,
+// but for initial investigations, the below is enough.
+
+// Q-register load/store wrappers.
+// \vec is a vector alias name; the qform_ prefix selects its q-register
+// view (the qform_* aliases are declared with .req in the function prologue).
+//   ldr_vo / str_vo: access [\base + #\offset]; \base is left unchanged.
+//   ldr_vi / str_vi: access [\base], then post-increment \base by #\inc.
+.macro ldr_vo vec, base, offset
+        ldr qform_\vec, [\base, #\offset]
+.endm
+
+.macro ldr_vi vec, base, inc
+        ldr qform_\vec, [\base], #\inc
+.endm
+
+.macro str_vo vec, base, offset
+        str qform_\vec, [\base, #\offset]
+.endm
+.macro str_vi vec, base, inc
+        str qform_\vec, [\base], #\inc
+.endm
+
+// Arithmetic wrappers that append the .8h arrangement (eight 16-bit lanes).
+// vqrdmulh multiplies vector by vector; the *q variants take the multiplier
+// from the single 16-bit lane \i of \b.
+.macro vqrdmulh d,a,b
+        sqrdmulh \d\().8h, \a\().8h, \b\().8h
+.endm
+.macro vmlsq d,a,b,i
+        mls \d\().8h, \a\().8h, \b\().h[\i]
+.endm
+.macro vqrdmulhq d,a,b,i
+        sqrdmulh \d\().8h, \a\().8h, \b\().h[\i]
+.endm
+.macro vqdmulhq d,a,b,i
+        sqdmulh \d\().8h, \a\().8h, \b\().h[\i]
+.endm
+.macro vmulq d,a,b,i
+        mul \d\().8h, \a\().8h, \b\().h[\i]
+.endm
+
+// Modular multiplication by a constant held in lanes of \const:
+//   \dst  = \src * \const.h[\idx0]                  (low halves)
+//   \src  = sqrdmulh(\src, \const.h[\idx1])         (\src is clobbered)
+//   \dst -= \src * consts.h[0]
+// NOTE(review): lane \idx1 is presumably the precomputed "twisted" companion
+// of the constant in lane \idx0 -- confirm against the twiddle table layout.
+.macro mulmodq dst, src, const, idx0, idx1
+        vmulq \dst, \src, \const, \idx0
+        vqrdmulhq \src, \src, \const, \idx1
+        vmlsq \dst, \src, consts, 0
+.endm
+
+// Same computation as mulmodq, but with a full vector of constants (\const)
+// and a full vector of their companions (\const_twisted). \src is clobbered.
+.macro mulmod dst, src, const, const_twisted
+        mul \dst\().8h, \src\().8h, \const\().8h
+        vqrdmulh \src, \src, \const_twisted
+        vmlsq \dst, \src, consts, 0
+.endm
+
+// Butterfly with a lane-indexed root:
+//   tmp = \a - \b;  \a = \a + \b;  \b = mulmodq(tmp, root lanes idx0/idx1)
+// Clobbers tmp.
+.macro gs_butterfly a, b, root, idx0, idx1
+        sub tmp.8h, \a\().8h, \b\().8h
+        add \a\().8h, \a\().8h, \b\().8h
+        mulmodq \b, tmp, \root, \idx0, \idx1
+.endm
+
+// Expands to exactly the same instructions as mulmod.
+.macro mulmod_v dst, src, const, const_twisted
+        mul \dst\().8h, \src\().8h, \const\().8h
+        vqrdmulh \src, \src, \const_twisted
+        vmlsq \dst, \src, consts, 0
+.endm
+
+// Butterfly with full-vector roots (\root and its companion \root_twisted).
+// Clobbers tmp.
+.macro gs_butterfly_v a, b, root, root_twisted
+        sub tmp.8h, \a\().8h, \b\().8h
+        add \a\().8h, \a\().8h, \b\().8h
+        mulmod \b, tmp, \root, \root_twisted
+.endm
+
+// Multiplies four vectors by ninv (companion ninv_tw) via mulmod.
+// Each \srcN is clobbered; the products land in \dstN.
+.macro mul_ninv dst0, dst1, dst2, dst3, src0, src1, src2, src3
+        mulmod \dst0, \src0, ninv, ninv_tw
+        mulmod \dst1, \src1, ninv, ninv_tw
+        mulmod \dst2, \src2, ninv, ninv_tw
+        mulmod \dst3, \src3, ninv, ninv_tw
+.endm
+
+// Reduction of \a in place:
+//   t0 = srshr(sqdmulh(\a, consts.h[1]), 11);  \a -= t0 * consts.h[0]
+// Clobbers t0.
+.macro barrett_reduce a
+        vqdmulhq t0, \a, consts, 1
+        srshr t0.8h, t0.8h, #11
+        vmlsq \a, t0, consts, 0
+.endm
+
+// Loads root0 and root1 from two consecutive 16-byte slots at r_ptr0 and
+// advances r_ptr0 by 32 bytes.
+.macro load_roots_123
+        ldr_vi root0, r_ptr0, 32
+        ldr_vo root1, r_ptr0, -16
+.endm
+
+// Loads root0 from r_ptr0 and advances r_ptr0 by 16 bytes.
+.macro load_next_roots_45
+        ldr_vi root0, r_ptr0, 16
+.endm
+
+// Loads six consecutive 16-byte vectors from r_ptr1, in memory order
+// root0, root0_tw, root1, root1_tw, root2, root2_tw, and advances r_ptr1
+// by 6*16 bytes. The negative offsets compensate for the post-increment
+// already applied by the first load.
+.macro load_next_roots_67
+        ldr_vi root0, r_ptr1, (6*16)
+        ldr_vo root0_tw, r_ptr1, (-6*16 + 1*16)
+        ldr_vo root1, r_ptr1, (-6*16 + 2*16)
+        ldr_vo root1_tw, r_ptr1, (-6*16 + 3*16)
+        ldr_vo root2, r_ptr1, (-6*16 + 4*16)
+        ldr_vo root2_tw, r_ptr1, (-6*16 + 5*16)
+.endm
+
+// In-place 4x4 transpose of 32-bit elements across \data0..\data3
+// (\data is a name prefix). Clobbers t0-t3.
+.macro transpose4 data
+        trn1 t0.4s, \data\()0.4s, \data\()1.4s
+        trn2 t1.4s, \data\()0.4s, \data\()1.4s
+        trn1 t2.4s, \data\()2.4s, \data\()3.4s
+        trn2 t3.4s, \data\()2.4s, \data\()3.4s
+
+        trn2 \data\()2.2d, t0.2d, t2.2d
+        trn2 \data\()3.2d, t1.2d, t3.2d
+        trn1 \data\()0.2d, t0.2d, t2.2d
+        trn1 \data\()1.2d, t1.2d, t3.2d
+.endm
+
+// Only the first (32-bit trn1/trn2) stage of the transpose above, reading
+// \data_in0..3 and writing \data_out0..3.
+.macro transpose_single data_out, data_in
+        trn1 \data_out\()0.4s, \data_in\()0.4s, \data_in\()1.4s
+        trn2 \data_out\()1.4s, \data_in\()0.4s, \data_in\()1.4s
+        trn1 \data_out\()2.4s, \data_in\()2.4s, \data_in\()3.4s
+        trn2 \data_out\()3.4s, \data_in\()2.4s, \data_in\()3.4s
+.endm
+
+// Reserves 6*16 bytes of stack and stores x19-x29 there; each register
+// pair is written exactly once. restore_gprs is the exact inverse.
+.macro save_gprs // slothy:no-unfold
+        sub sp, sp, #(16*6)
+        stp x19, x20, [sp, #16*0]
+        stp x21, x22, [sp, #16*1]
+        stp x23, x24, [sp, #16*2]
+        stp x25, x26, [sp, #16*3]
+        stp x27, x28, [sp, #16*4]
+        str x29, [sp, #16*5]
+.endm
+
+// Reloads x19-x29 from the area written by save_gprs and releases it.
+.macro restore_gprs // slothy:no-unfold
+        ldp x19, x20, [sp, #16*0]
+        ldp x21, x22, [sp, #16*1]
+        ldp x23, x24, [sp, #16*2]
+        ldp x25, x26, [sp, #16*3]
+        ldp x27, x28, [sp, #16*4]
+        ldr x29, [sp, #16*5]
+        add sp, sp, #(16*6)
+.endm
+
+// Reserves 4*16 bytes of stack and stores d8-d15 (the low 64 bits of
+// v8-v15) there. restore_vregs is the exact inverse.
+.macro save_vregs // slothy:no-unfold
+        sub sp, sp, #(16*4)
+        stp d8, d9, [sp, #16*0]
+        stp d10, d11, [sp, #16*1]
+        stp d12, d13, [sp, #16*2]
+        stp d14, d15, [sp, #16*3]
+.endm
+
+// Reloads d8-d15 from the area written by save_vregs and releases it.
+.macro restore_vregs // slothy:no-unfold
+        ldp d8, d9, [sp, #16*0]
+        ldp d10, d11, [sp, #16*1]
+        ldp d12, d13, [sp, #16*2]
+        ldp d14, d15, [sp, #16*3]
+        add sp, sp, #(16*4)
+.endm
+
+// Scratch area reserved below the saved registers by push_stack:
+// STACK_SIZE bytes in total, STACK0 is the byte offset of its first slot.
+#define STACK_SIZE 16
+#define STACK0 0
+
+// restore/save: load/store register \a from/to [sp + #\loc].
+// Note the differing argument order (restore a, loc  vs.  save loc, a).
+.macro restore a, loc // slothy:no-unfold
+        ldr \a, [sp, #\loc\()]
+.endm
+.macro save loc, a // slothy:no-unfold
+        str \a, [sp, #\loc\()]
+.endm
+// Function prologue. Resulting frame, from the final sp upwards:
+//   [sp, sp+16)       scratch (STACK_SIZE)
+//   [sp+16, sp+80)    d8-d15
+//   [sp+80, sp+176)   x19-x29
+.macro push_stack // slothy:no-unfold
+        save_gprs
+        save_vregs
+        sub sp, sp, #STACK_SIZE
+.endm
+
+// Function epilogue: undoes push_stack in reverse order.
+.macro pop_stack // slothy:no-unfold
+        add sp, sp, #STACK_SIZE
+        restore_vregs
+        restore_gprs
+.endm
+
+// Twiddle-factor table, pulled in from a separate file into .data.
+.data
+.p2align 4
+roots:
+#include "intt_kyber_123_45_67_twiddles.s"
+.text
+
+        .global intt_kyber_123_4567_manual_ld4_opt_a55
+        .global _intt_kyber_123_4567_manual_ld4_opt_a55
+
+// 16-byte-aligned constant vectors (eight 16-bit lanes each).
+// const_addr is loaded into `consts` by the function body:
+//   lane 0 = 3329  (the Kyber modulus; subtracted in multiples by vmlsq ..., consts, 0)
+//   lane 1 = 20159 (multiplier used by barrett_reduce together with its #11 shift)
+// ninv_addr / ninv_tw_addr hold 512 and 5040 replicated across all lanes.
+// NOTE(review): presumably the sources for the ninv / ninv_tw operands of
+// mul_ninv -- confirm where they are loaded.
+.p2align 4
+const_addr: .short 3329
+        .short 20159
+        .short 0
+        .short 0
+        .short 0
+        .short 0
+        .short 0
+        .short 0
+ninv_addr: .short 512
+        .short 512
+        .short 512
+        .short 512
+        .short 512
+        .short 512
+        .short 512
+        .short 512
+ninv_tw_addr: .short 5040
+        .short 5040
+        .short 5040
+        .short 5040
+        .short 5040
+        .short 5040
+        .short 5040
+        .short 5040
+
+intt_kyber_123_4567_manual_ld4_opt_a55: +_intt_kyber_123_4567_manual_ld4_opt_a55: + push_stack + + in .req x0 + inp .req x1 + count .req x2 + r_ptr0 .req x3 + r_ptr1 .req x4 + xtmp .req x5 + + qform_v0 .req q0 + qform_v1 .req q1 + qform_v2 .req q2 + qform_v3 .req q3 + qform_v4 .req q4 + qform_v5 .req q5 + qform_v6 .req q6 + qform_v7 .req q7 + qform_v8 .req q8 + qform_v9 .req q9 + qform_v10 .req q10 + qform_v11 .req q11 + qform_v12 .req q12 + qform_v13 .req q13 + qform_v14 .req q14 + qform_v15 .req q15 + qform_v16 .req q16 + qform_v17 .req q17 + qform_v18 .req q18 + qform_v19 .req q19 + qform_v20 .req q20 + qform_v21 .req q21 + qform_v22 .req q22 + qform_v23 .req q23 + qform_v24 .req q24 + qform_v25 .req q25 + qform_v26 .req q26 + qform_v27 .req q27 + qform_v28 .req q28 + qform_v29 .req q29 + qform_v30 .req q30 + qform_v31 .req q31 + + data0 .req v8 + data1 .req v9 + data2 .req v10 + data3 .req v11 + data4 .req v12 + data5 .req v13 + data6 .req v14 + data7 .req v15 + + x_00 .req x10 + x_01 .req x11 + x_10 .req x12 + x_11 .req x13 + x_20 .req x14 + x_21 .req x15 + x_30 .req x16 + x_31 .req x17 + + xt_00 .req x_00 + xt_01 .req x_20 + xt_10 .req x_10 + xt_11 .req x_30 + xt_20 .req x_01 + xt_21 .req x_21 + xt_30 .req
x_11 + xt_31 .req x_31 + + qform_data0 .req q8 + qform_data1 .req q9 + qform_data2 .req q10 + qform_data3 .req q11 + qform_data4 .req q12 + qform_data5 .req q13 + qform_data6 .req q14 + qform_data7 .req q15 + + root0 .req v0 + root1 .req v1 + root2 .req v2 + root0_tw .req v4 + root1_tw .req v5 + root2_tw .req v6 + + consts .req v7 + qform_consts .req q7 + + qform_root0 .req q0 + qform_root1 .req q1 + qform_root2 .req q2 + qform_root0_tw .req q4 + qform_root1_tw .req q5 + qform_root2_tw .req q6 + + tmp .req v24 + t0 .req v25 + t1 .req v26 + t2 .req v27 + t3 .req v28 + + ASM_LOAD(r_ptr0, roots_l34) + ASM_LOAD(r_ptr1, roots_l56) + + ASM_LOAD(xtmp, const_addr) + ld1 {consts.8h}, [xtmp] + + save STACK0, in + + mov inp, in + mov count, #8 + + .p2align 2 + ld4 {v26.4S, v27.4S, v28.4S, v29.4S}, [x1] // ..*.............................................. + // gap // ................................................. + // gap // ................................................. + // gap // ................................................. + // gap // ................................................. + // gap // ................................................. + // gap // ................................................. + // gap // ................................................. + // gap // ................................................. + // gap // ................................................. + // gap // ................................................. + // gap // ................................................. + // gap // ................................................. + // gap // ................................................. + // gap // ................................................. + // gap // ................................................. + // gap // ................................................. + // gap // ................................................. + ldr q2, [x4, #48] // ........*........................................ 
+ // gap // ................................................. + // gap // ................................................. + // gap // ................................................. + sub v16.8H, v28.8H, v29.8H // .....*........................................... + // gap // ................................................. + sub v21.8H, v26.8H, v27.8H // ....*............................................ + // gap // ................................................. + ldr q23, [x4, #64] // ...*............................................. + // gap // ................................................. + // gap // ................................................. + // gap // ................................................. + sqrdmulh v0.8H, v21.8H, v2.8H // ...........*..................................... + // gap // ................................................. + ldr q2, [x4, #80] // ......*.......................................... + // gap // ................................................. + // gap // ................................................. + // gap // ................................................. + ldr q20, [x4, #32] // .*............................................... + // gap // ................................................. + // gap // ................................................. + // gap // ................................................. + sqrdmulh v2.8H, v16.8H, v2.8H // .........*....................................... + // gap // ................................................. + mul v25.8H, v16.8H, v23.8H // .......*......................................... + // gap // ................................................. + mul v4.8H, v21.8H, v20.8H // ..........*...................................... + // gap // ................................................. + add v17.8H, v26.8H, v27.8H // .............*................................... + // gap // ................................................. 
+ add v30.8H, v28.8H, v29.8H // ............*.................................... + // gap // ................................................. + mls v25.8H, v2.8H, v7.H[0] // ..............*.................................. + // gap // ................................................. + mls v4.8H, v0.8H, v7.H[0] // ...............*................................. + // gap // ................................................. + sub v21.8H, v17.8H, v30.8H // ................*................................ + // gap // ................................................. + ldr q2, [x4, #16] // *................................................ + // gap // ................................................. + // gap // ................................................. + // gap // ................................................. + sub v0.8H, v4.8H, v25.8H // ..................*.............................. + // gap // ................................................. + ldr q23, [x4], #(6*16) // .................*............................... + // gap // ................................................. + // gap // ................................................. + // gap // ................................................. + sqrdmulh v16.8H, v21.8H, v2.8H // ......................*.......................... + // gap // ................................................. + sqrdmulh v2.8H, v0.8H, v2.8H // .......................*......................... + // gap // ................................................. + mul v26.8H, v0.8H, v23.8H // ....................*............................ + // gap // ................................................. + mul v20.8H, v21.8H, v23.8H // ...................*............................. + // gap // ................................................. + add v0.8H, v4.8H, v25.8H // ........................*........................ + // gap // ................................................. 
+ add v21.8H, v17.8H, v30.8H // .....................*........................... + // gap // ................................................. + mls v26.8H, v2.8H, v7.H[0] // ............................*.................... + // gap // ................................................. + mls v20.8H, v16.8H, v7.H[0] // .........................*....................... + // gap // ................................................. + trn1 v23.4S, v21.4S, v0.4S // ..........................*...................... + // gap // ................................................. + trn2 v16.4S, v21.4S, v0.4S // ...........................*..................... + // gap // ................................................. + // gap // ................................................. + // gap // ................................................. + trn2 v21.4S, v20.4S, v26.4S // ..............................*.................. + // gap // ................................................. + trn1 v2.4S, v20.4S, v26.4S // .............................*................... + // gap // ................................................. + // gap // ................................................. + // gap // ................................................. + trn1 v0.2D, v16.2D, v21.2D // ..................................*.............. + // gap // ................................................. + trn2 v26.2D, v16.2D, v21.2D // ................................*................ + // gap // ................................................. + trn2 v20.2D, v23.2D, v2.2D // ...............................*................. + // gap // ................................................. + trn1 v2.2D, v23.2D, v2.2D // .................................*............... + // gap // ................................................. + add v23.8H, v20.8H, v26.8H // .....................................*........... + // gap // ................................................. 
+ add v21.8H, v2.8H, v0.8H // ....................................*............ + // gap // ................................................. + sub v0.8H, v2.8H, v0.8H // ........................................*........ + // gap // ................................................. + sqdmulh v2.8H, v23.8H, v7.H[1] // .......................................*......... + // gap // ................................................. + sqdmulh v16.8H, v21.8H, v7.H[1] // ......................................*.......... + // gap // ................................................. + ldr q11, [x3], #16 // .............................................*... + // gap // ................................................. + // gap // ................................................. + // gap // ................................................. + srshr v2.8H, v2.8H, #11 // ..........................................*...... + // gap // ................................................. + srshr v16.8H, v16.8H, #11 // .........................................*....... + // gap // ................................................. + sub v15.8H, v20.8H, v26.8H // ...................................*............. + // gap // ................................................. + mls v23.8H, v2.8H, v7.H[0] // ............................................*.... + // gap // ................................................. + mls v21.8H, v16.8H, v7.H[0] // ...........................................*..... + // gap // ................................................. + // gap // ................................................. + // gap // ................................................. + // gap // ................................................. + // gap // ................................................. + // gap // ................................................. + // gap // ................................................. 
+ add v2.8H, v21.8H, v23.8H // ..............................................*.. + // gap // ................................................. + sub v31.8H, v21.8H, v23.8H // ...............................................*. + // gap // ................................................. + // gap // ................................................. + // gap // ................................................. + str q2, [x1], #(64) // ................................................* + // gap // ................................................. + + // original source code + // ldr q13, [x4, #16] // ................*................................ + // ldr q10, [x4, #32] // .......*......................................... + // ld4 {v3.4S, v4.4S, v5.4S, v6.4S}, [x1] // *................................................ + // ldr q28, [x4, #64] // ....*............................................ + // sub v30.8H, v3.8H, v4.8H // ...*............................................. + // sub v16.8H, v5.8H, v6.8H // ..*.............................................. + // ldr q21, [x4, #80] // ......*.......................................... + // mul v27.8H, v16.8H, v28.8H // .........*....................................... + // ldr q20, [x4, #48] // .*............................................... + // sqrdmulh v17.8H, v16.8H, v21.8H // ........*........................................ + // mul v16.8H, v30.8H, v10.8H // ..........*...................................... + // sqrdmulh v20.8H, v30.8H, v20.8H // .....*........................................... + // add v8.8H, v5.8H, v6.8H // ............*.................................... + // add v4.8H, v3.8H, v4.8H // ...........*..................................... + // mls v27.8H, v17.8H, v7.H[0] // .............*................................... + // mls v16.8H, v20.8H, v7.H[0] // ..............*.................................. + // sub v26.8H, v4.8H, v8.8H // ...............*................................. 
+ // ldr q25, [x4], #(6*16) // ..................*.............................. + // sub v12.8H, v16.8H, v27.8H // .................*............................... + // mul v24.8H, v26.8H, v25.8H // ......................*.......................... + // mul v14.8H, v12.8H, v25.8H // .....................*........................... + // add v20.8H, v4.8H, v8.8H // ........................*........................ + // sqrdmulh v26.8H, v26.8H, v13.8H // ...................*............................. + // sqrdmulh v0.8H, v12.8H, v13.8H // ....................*............................ + // add v16.8H, v16.8H, v27.8H // .......................*......................... + // mls v24.8H, v26.8H, v7.H[0] // ..........................*...................... + // trn1 v2.4S, v20.4S, v16.4S // ...........................*..................... + // trn2 v28.4S, v20.4S, v16.4S // ............................*.................... + // mls v14.8H, v0.8H, v7.H[0] // .........................*....................... + // trn1 v23.4S, v24.4S, v14.4S // ..............................*.................. + // trn2 v0.4S, v24.4S, v14.4S // .............................*................... + // trn2 v21.2D, v2.2D, v23.2D // .................................*............... + // trn2 v29.2D, v28.2D, v0.2D // ................................*................ + // trn1 v2.2D, v2.2D, v23.2D // ..................................*.............. + // trn1 v0.2D, v28.2D, v0.2D // ...............................*................. + // sub v15.8H, v21.8H, v29.8H // ...........................................*..... + // add v23.8H, v2.8H, v0.8H // ....................................*............ + // add v16.8H, v21.8H, v29.8H // ...................................*............. + // sqdmulh v17.8H, v23.8H, v7.H[1] // .......................................*......... + // sqdmulh v20.8H, v16.8H, v7.H[1] // ......................................*.......... 
+ // sub v0.8H, v2.8H, v0.8H // .....................................*........... + // srshr v17.8H, v17.8H, #11 // ..........................................*...... + // srshr v20.8H, v20.8H, #11 // .........................................*....... + // mls v23.8H, v17.8H, v7.H[0] // .............................................*... + // mls v16.8H, v20.8H, v7.H[0] // ............................................*.... + // ldr q11, [x3], #16 // ........................................*........ + // add v20.8H, v23.8H, v16.8H // ..............................................*.. + // sub v31.8H, v23.8H, v16.8H // ...............................................*. + // str q20, [x1], #(64) // ................................................* + + sub count, count, #1 +layer4567_start: + ldr q13, [x4, #16] // ..e............................................................... + // gap // .................................................................. + // gap // .................................................................. + // gap // .................................................................. + ldr q10, [x4, #32] // ...e.............................................................. + // gap // .................................................................. + // gap // .................................................................. + // gap // .................................................................. + ld4 {v3.4S, v4.4S, v5.4S, v6.4S}, [x1] // e................................................................. + // gap // .................................................................. + // gap // .................................................................. + // gap // .................................................................. + // gap // .................................................................. + // gap // .................................................................. 
+ // gap // .................................................................. + // gap // .................................................................. + // gap // .................................................................. + // gap // .................................................................. + // gap // .................................................................. + // gap // .................................................................. + // gap // .................................................................. + // gap // .................................................................. + // gap // .................................................................. + // gap // .................................................................. + // gap // .................................................................. + // gap // .................................................................. + ldr q28, [x4, #64] // .....e............................................................ + // gap // .................................................................. + // gap // .................................................................. + // gap // .................................................................. + sub v30.8H, v3.8H, v4.8H // .......e.......................................................... + // gap // .................................................................. + sub v16.8H, v5.8H, v6.8H // ............e..................................................... + // gap // .................................................................. + ldr q21, [x4, #80] // ......e........................................................... + // gap // .................................................................. + // gap // .................................................................. + // gap // .................................................................. 
+ mul v27.8H, v16.8H, v28.8H // ..............e................................................... + // gap // .................................................................. + ldr q20, [x4, #48] // ....e............................................................. + // gap // .................................................................. + // gap // .................................................................. + // gap // .................................................................. + sqrdmulh v17.8H, v16.8H, v21.8H // ...............e.................................................. + // gap // .................................................................. + mul v16.8H, v30.8H, v10.8H // .........e........................................................ + // gap // .................................................................. + sqrdmulh v20.8H, v30.8H, v20.8H // ..........e....................................................... + // gap // .................................................................. + add v8.8H, v5.8H, v6.8H // .............e.................................................... + // gap // .................................................................. + add v4.8H, v3.8H, v4.8H // ........e......................................................... + // gap // .................................................................. + mls v27.8H, v17.8H, v7.H[0] // ................e................................................. + // gap // .................................................................. + mls v16.8H, v20.8H, v7.H[0] // ...........e...................................................... + // gap // .................................................................. + sub v26.8H, v4.8H, v8.8H // .................e................................................ + // gap // .................................................................. 
+ ldr q25, [x4], #(6*16) // .e................................................................ + // gap // .................................................................. + // gap // .................................................................. + // gap // .................................................................. + sub v12.8H, v16.8H, v27.8H // ......................e........................................... + // gap // .................................................................. + mul v21.8H, v15.8H, v11.H[4] // ...........................................*...................... + // gap // .................................................................. + mul v24.8H, v26.8H, v25.8H // ...................e.............................................. + // gap // .................................................................. + mul v14.8H, v12.8H, v25.8H // ........................e......................................... + // gap // .................................................................. + sqrdmulh v2.8H, v15.8H, v11.H[5] // ............................................*..................... + // gap // .................................................................. + mul v23.8H, v0.8H, v11.H[2] // ......................................*........................... + // gap // .................................................................. + sqrdmulh v0.8H, v0.8H, v11.H[3] // .......................................*.......................... + // gap // .................................................................. + add v20.8H, v4.8H, v8.8H // ..................e............................................... + // gap // .................................................................. + mls v21.8H, v2.8H, v7.H[0] // .............................................*.................... + // gap // .................................................................. 
+ sqrdmulh v26.8H, v26.8H, v13.8H // ....................e............................................. + // gap // .................................................................. + mls v23.8H, v0.8H, v7.H[0] // ........................................*......................... + // gap // .................................................................. + sqrdmulh v0.8H, v12.8H, v13.8H // .........................e........................................ + // gap // .................................................................. + add v16.8H, v16.8H, v27.8H // .......................e.......................................... + // gap // .................................................................. + mls v24.8H, v26.8H, v7.H[0] // .....................e............................................ + // gap // .................................................................. + add v30.8H, v23.8H, v21.8H // ..........................................................*....... + // gap // .................................................................. + trn1 v2.4S, v20.4S, v16.4S // ...........................e...................................... + // gap // .................................................................. + trn2 v28.4S, v20.4S, v16.4S // ............................e..................................... + // gap // .................................................................. + sqrdmulh v16.8H, v31.8H, v11.H[1] // .......................................................*.......... + // gap // .................................................................. + mls v14.8H, v0.8H, v7.H[0] // ..........................e....................................... + // gap // .................................................................. + sub v26.8H, v23.8H, v21.8H // .........................................................*........ + // gap // .................................................................. 
+ mul v22.8H, v31.8H, v11.H[0] // ......................................................*........... + // gap // .................................................................. + str q30, [x1, #-48] // ...............................................................*.. + // gap // .................................................................. + trn1 v23.4S, v24.4S, v14.4S // .............................e.................................... + // gap // .................................................................. + trn2 v0.4S, v24.4S, v14.4S // ..............................e................................... + // gap // .................................................................. + mls v22.8H, v16.8H, v7.H[0] // ........................................................*......... + // gap // .................................................................. + trn2 v21.2D, v2.2D, v23.2D // ...............................e.................................. + // gap // .................................................................. + trn2 v29.2D, v28.2D, v0.2D // ................................e................................. + // gap // .................................................................. + trn1 v2.2D, v2.2D, v23.2D // .................................e................................ + // gap // .................................................................. + trn1 v0.2D, v28.2D, v0.2D // ..................................e............................... + // gap // .................................................................. + sub v15.8H, v21.8H, v29.8H // .........................................e........................ + // gap // .................................................................. + add v23.8H, v2.8H, v0.8H // .....................................e............................ + // gap // .................................................................. 
+ add v16.8H, v21.8H, v29.8H // ..........................................e....................... + // gap // .................................................................. + mul v21.8H, v26.8H, v11.H[0] // ...........................................................*...... + // gap // .................................................................. + sqdmulh v17.8H, v23.8H, v7.H[1] // ..............................................e................... + // gap // .................................................................. + sqdmulh v20.8H, v16.8H, v7.H[1] // .................................................e................ + // gap // .................................................................. + sub v0.8H, v2.8H, v0.8H // ....................................e............................. + // gap // .................................................................. + sqrdmulh v2.8H, v26.8H, v11.H[1] // ............................................................*..... + // gap // .................................................................. + srshr v17.8H, v17.8H, #11 // ...............................................e.................. + // gap // .................................................................. + srshr v20.8H, v20.8H, #11 // ..................................................e............... + // gap // .................................................................. + str q22, [x1, #-32] // ................................................................*. + // gap // .................................................................. + mls v23.8H, v17.8H, v7.H[0] // ................................................e................. + // gap // .................................................................. + mls v16.8H, v20.8H, v7.H[0] // ...................................................e.............. + // gap // .................................................................. 
+ mls v21.8H, v2.8H, v7.H[0] // .............................................................*.... + // gap // .................................................................. + ldr q11, [x3], #16 // ...................................e.............................. + // gap // .................................................................. + // gap // .................................................................. + // gap // .................................................................. + add v20.8H, v23.8H, v16.8H // .....................................................e............ + // gap // .................................................................. + str q21, [x1, #-16] // .................................................................* + // gap // .................................................................. + sub v31.8H, v23.8H, v16.8H // ....................................................e............. + // gap // .................................................................. + str q20, [x1], #(64) // ..............................................................e... + // gap // .................................................................. + + // original source code + // ld4 {v8.4S, v9.4S, v10.4S, v11.4S}, [x1] // ..e...............................................................|.e............................................................. + // ldr q0, [x4], #(6*16) // .................e................................................|................e.............................................. + // ldr q4, [x4, #(-6*16 + 1*16)] // e.................................................................e............................................................... + // ldr q1, [x4, #(-6*16 + 2*16)] // .e................................................................|e.............................................................. 
+ // ldr q5, [x4, #(-6*16 + 3*16)] // ........e.........................................................|.......e....................................................... + // ldr q2, [x4, #(-6*16 + 4*16)] // ...e..............................................................|..e............................................................ + // ldr q6, [x4, #(-6*16 + 5*16)] // ......e...........................................................|.....e......................................................... + // sub v24.8h, v8.8h, v9.8h // ....e.............................................................|...e........................................................... + // add v8.8h, v8.8h, v9.8h // .............e....................................................|............e.................................................. + // mul v9.8h, v24.8h, v1.8h // ..........e.......................................................|.........e..................................................... + // sqrdmulh v24.8h, v24.8h, v5.8h // ...........e......................................................|..........e.................................................... + // mls v9.8h, v24.8h, v7.h[0] // ...............e..................................................|..............e................................................ + // sub v24.8h, v10.8h, v11.8h // .....e............................................................|....e.......................................................... + // add v10.8h, v10.8h, v11.8h // ............e.....................................................|...........e................................................... + // mul v11.8h, v24.8h, v2.8h // .......e..........................................................|......e........................................................ 
+ // sqrdmulh v24.8h, v24.8h, v6.8h // .........e........................................................|........e...................................................... + // mls v11.8h, v24.8h, v7.h[0] // ..............e...................................................|.............e................................................. + // sub v24.8h, v8.8h, v10.8h // ................e.................................................|...............e............................................... + // add v8.8h, v8.8h, v10.8h // .........................e........................................|........................e...................................... + // mul v10.8h, v24.8h, v0.8h // ....................e.............................................|...................e........................................... + // sqrdmulh v24.8h, v24.8h, v4.8h // ...........................e......................................|..........................e.................................... + // mls v10.8h, v24.8h, v7.h[0] // ...............................e..................................|..............................e................................ + // sub v24.8h, v9.8h, v11.8h // ..................e...............................................|.................e............................................. + // add v9.8h, v9.8h, v11.8h // ..............................e...................................|.............................e................................. + // mul v11.8h, v24.8h, v0.8h // .....................e............................................|....................e.......................................... + // sqrdmulh v24.8h, v24.8h, v4.8h // .............................e....................................|............................e.................................. + // mls v11.8h, v24.8h, v7.h[0] // ....................................e.............................|...................................e........................... 
+ // trn1 v25.4s, v8.4s, v9.4s // .................................e................................|................................e.............................. + // trn2 v26.4s, v8.4s, v9.4s // ..................................e...............................|.................................e............................. + // trn1 v27.4s, v10.4s, v11.4s // ........................................e.........................|.......................................e....................... + // trn2 v28.4s, v10.4s, v11.4s // .........................................e........................|........................................e...................... + // trn2 v10.2d, v25.2d, v27.2d // ...........................................e......................|..........................................e.................... + // trn2 v11.2d, v26.2d, v28.2d // ............................................e.....................|...........................................e................... + // trn1 v8.2d, v25.2d, v27.2d // .............................................e....................|............................................e.................. + // trn1 v9.2d, v26.2d, v28.2d // ..............................................e...................|.............................................e................. + // ldr q0, [x3], #16 // .............................................................e....|............................................................e.. + // sub v24.8h, v8.8h, v9.8h // .....................................................e............|....................................................e.......... + // add v8.8h, v8.8h, v9.8h // ................................................e.................|...............................................e............... + // mul v9.8h, v24.8h, v0.h[2] // .......................*..........................................|......................*........................................ 
+ // sqrdmulh v24.8h, v24.8h, v0.h[3] // ........................*.........................................|.......................*....................................... + // mls v9.8h, v24.8h, v7.h[0] // ............................*.....................................|...........................*................................... + // sub v24.8h, v10.8h, v11.8h // ...............................................e..................|..............................................e................ + // add v10.8h, v10.8h, v11.8h // .................................................e................|................................................e.............. + // mul v11.8h, v24.8h, v0.h[4] // ...................*..............................................|..................*............................................ + // sqrdmulh v24.8h, v24.8h, v0.h[5] // ......................*...........................................|.....................*......................................... + // mls v11.8h, v24.8h, v7.h[0] // ..........................*.......................................|.........................*..................................... + // sqdmulh v25.8h, v8.8h, v7.h[1] // ...................................................e..............|..................................................e............ + // srshr v25.8h, v25.8h, #11 // .......................................................e..........|......................................................e........ + // mls v8.8h, v25.8h, v7.h[0] // ..........................................................e.......|.........................................................e..... + // sqdmulh v25.8h, v10.8h, v7.h[1] // ....................................................e.............|...................................................e........... 
+ // srshr v25.8h, v25.8h, #11 // ........................................................e.........|.......................................................e....... + // mls v10.8h, v25.8h, v7.h[0] // ...........................................................e......|..........................................................e.... + // sub v24.8h, v8.8h, v10.8h // ................................................................e.|............................................................... + // add v8.8h, v8.8h, v10.8h // ..............................................................e...|.............................................................e. + // mul v10.8h, v24.8h, v0.h[0] // ......................................*...........................|.....................................*......................... + // sqrdmulh v24.8h, v24.8h, v0.h[1] // ...................................*..............................|..................................*............................ + // mls v10.8h, v24.8h, v7.h[0] // ..........................................*.......................|.........................................*..................... + // sub v24.8h, v9.8h, v11.8h // .....................................*............................|....................................*.......................... + // add v9.8h, v9.8h, v11.8h // ................................*.................................|...............................*............................... + // mul v11.8h, v24.8h, v0.h[0] // ..................................................*...............|.................................................*............. + // sqrdmulh v24.8h, v24.8h, v0.h[1] // ......................................................*...........|.....................................................*......... 
+        // mls v11.8h, v24.8h, v7.h[0] // ............................................................*.....|...........................................................*...
+        // str q8, [x1], #(64) // .................................................................e|...............................................................
+        // str q9, [x1, #(-64 + 16*1)] // .......................................*..........................|......................................*........................
+        // str q10, [x1, #(-64 + 16*2)] // .........................................................*........|........................................................*......
+        // str q11, [x1, #(-64 + 16*3)] // ...............................................................*..|..............................................................*
+
+        // End of the layer4567 loop body: decrement the iteration counter and
+        // branch back to layer4567_start while it is still non-zero.
+        sub count, count, #1
+        cbnz count, layer4567_start
+        // Straight-line tail executed once after the layer4567 loop falls through.
+        // It is the rescheduled, register-renamed form of the 17 instructions
+        // listed under "original source code" below: four mul/sqrdmulh/mls
+        // triplets (multiplying by lanes of v11 and correcting with v7.H[0]),
+        // one sub/add pair, and three stores to x1-48, x1-32 and x1-16.
+        // v15, v0 and v31 are read here before being written, so they must be
+        // live on exit from the loop.
+        // NOTE(review): presumably this drains the values left in flight by the
+        // software-pipelined loop, and mul/sqrdmulh/mls is a fixed-point modular
+        // multiplication with the modulus in v7.H[0] -- confirm against the
+        // loop body and the constant table.
+        mul v2.8H, v15.8H, v11.H[4] // *................
+        // gap // .................
+        sqrdmulh v16.8H, v15.8H, v11.H[5] // .*...............
+        // gap // .................
+        mul v23.8H, v0.8H, v11.H[2] // ..*..............
+        // gap // .................
+        sqrdmulh v0.8H, v0.8H, v11.H[3] // ...*.............
+        // gap // .................
+        sqrdmulh v21.8H, v31.8H, v11.H[1] // .......*.........
+        // gap // .................
+        mul v26.8H, v31.8H, v11.H[0] // .........*.......
+        // gap // .................
+        mls v2.8H, v16.8H, v7.H[0] // ....*............
+        // gap // .................
+        mls v23.8H, v0.8H, v7.H[0] // .....*...........
+        // gap // .................
+        // gap // .................
+        // gap // .................
+        mls v26.8H, v21.8H, v7.H[0] // ...........*.....
+        // gap // .................
+        // gap // .................
+        // gap // .................
+        sub v0.8H, v23.8H, v2.8H // ........*........
+        // gap // .................
+        add v2.8H, v23.8H, v2.8H // ......*..........
+        // gap // .................
+        str q26, [x1, #-32] // ..............*..
+        // gap // .................
+        mul v16.8H, v0.8H, v11.H[0] // ............*....
+        // gap // .................
+        sqrdmulh v0.8H, v0.8H, v11.H[1] // .............*...
+        // gap // .................
+        str q2, [x1, #-48] // ..........*......
+        // gap // .................
+        // gap // .................
+        // gap // .................
+        // gap // .................
+        // gap // .................
+        mls v16.8H, v0.8H, v7.H[0] // ...............*.
+        // gap // .................
+        // gap // .................
+        // gap // .................
+        // gap // .................
+        // gap // .................
+        // gap // .................
+        // gap // .................
+        str q16, [x1, #-16] // ................*
+        // gap // .................
+
+        // original source code
+        // mul v21.8H, v15.8H, v11.H[4] // *................
+        // sqrdmulh v2.8H, v15.8H, v11.H[5] // .*...............
+        // mul v23.8H, v0.8H, v11.H[2] // ..*..............
+        // sqrdmulh v0.8H, v0.8H, v11.H[3] // ...*.............
+        // mls v21.8H, v2.8H, v7.H[0] // ......*..........
+        // mls v23.8H, v0.8H, v7.H[0] // .......*.........
+        // add v30.8H, v23.8H, v21.8H // ..........*......
+        // sqrdmulh v16.8H, v31.8H, v11.H[1] // ....*............
+        // sub v26.8H, v23.8H, v21.8H // .........*.......
+        // mul v22.8H, v31.8H, v11.H[0] // .....*...........
+        // str q30, [x1, #-48] // ..............*..
+        // mls v22.8H, v16.8H, v7.H[0] // ........*........
+        // mul v21.8H, v26.8H, v11.H[0] // ............*....
+        // sqrdmulh v2.8H, v26.8H, v11.H[1] // .............*...
+        // str q22, [x1, #-32] // ...........*.....
+        // mls v21.8H, v2.8H, v7.H[0] // ...............*.
+        // str q21, [x1, #-16] // ................*
+
+
+        // ---------------------------------------------------------------------
+
+        // Register aliases for the two broadcast constants of the layer123 loop;
+        // the scheduled loop body refers to them by their raw names v29 and v30.
+        ninv .req v29
+        ninv_tw .req v30
+
+        // ASM_LOAD is a macro defined outside this excerpt.
+        // NOTE(review): presumably it places the address of the named symbol in
+        // its first argument -- confirm. Each ld1r then reads a single 16-bit
+        // element from [xtmp] and replicates it into all eight lanes.
+        ASM_LOAD(xtmp, ninv_addr)
+        ld1r {ninv.8h}, [xtmp]
+        ASM_LOAD(xtmp, ninv_tw_addr)
+        ld1r {ninv_tw.8h}, [xtmp]
+
+        // Iteration counter for the layer123 loop; it is decremented once more
+        // just before layer123_start.
+        // load_roots_123 is a macro defined outside this excerpt.
+        // NOTE(review): presumably it loads the constants that the loop indexes
+        // as v0.H[..] and v1.H[..] from r_ptr0 -- confirm.
+        mov count, #4
+        ASM_LOAD(r_ptr0, roots_l012)
+        load_roots_123
+
+        // Align the following code to a 4-byte (2^2) boundary.
+        .p2align 2
+
+        // Seven 128-bit loads from x0+64 .. x0+448 (stride 64 bytes), issued
+        // ahead of the loop. The same seven loads, into the same registers,
+        // reappear at the end of the loop body (marked 'e' in its schedule
+        // column) after x0 has been post-incremented by 16, so each iteration
+        // starts with these inputs already in registers.
+        ldr q20, [x0, #64] // *......
+        // gap // .......
+        // gap // .......
+        // gap // .......
+        ldr q23, [x0, #128] // .*.....
+        // gap // .......
+        // gap // .......
+        // gap // .......
+        ldr q21, [x0, #192] // ..*....
+        // gap // .......
+        // gap // .......
+        // gap // .......
+        ldr q17, [x0, #256] // ...*...
+        // gap // .......
+        // gap // .......
+        // gap // .......
+        ldr q4, [x0, #320] // ....*..
+        // gap // .......
+        // gap // .......
+        // gap // .......
+        ldr q13, [x0, #384] // .....*.
+        // gap // .......
+        // gap // .......
+        // gap // .......
+        ldr q3, [x0, #448] // ......*
+        // gap // .......
+
+        // original source code
+        // ldr q20, [x0, #64] // *......
+        // ldr q23, [x0, #128] // .*.....
+        // ldr q21, [x0, #192] // ..*....
+        // ldr q17, [x0, #256] // ...*...
+        // ldr q4, [x0, #320] // ....*..
+        // ldr q13, [x0, #384] // .....*.
+        // ldr q3, [x0, #448] // ......*
+
+        // NOTE(review): the counter is reduced by one before the loop is
+        // entered, presumably because the hoisted loads above account for part
+        // of one iteration -- confirm against the loop's closing branch.
+        sub count, count, #1
+layer123_start:
+        // Only the vector at [x0, #0] is loaded at the top of the body; the
+        // other seven inputs (q20, q23, q21, q17, q4, q13, q3) are already in
+        // registers.
+        ldr q16, [x0, #0] // *.............................................................................................
+        // gap // ..............................................................................................
+        // gap // ..............................................................................................
+        // gap // ..............................................................................................
+        sub v2.8H, v23.8H, v21.8H // .............*................................................................................
+        // gap // ..............................................................................................
+        add v23.8H, v23.8H, v21.8H // ..............*...............................................................................
+        // gap // ..............................................................................................
+        sub v21.8H, v16.8H, v20.8H // ........*.....................................................................................
+ // gap // .............................................................................................. + add v16.8H, v16.8H, v20.8H // .........*.................................................................................... + // gap // .............................................................................................. + mul v26.8H, v2.8H, v1.H[0] // ...............*.............................................................................. + // gap // .............................................................................................. + sqrdmulh v2.8H, v2.8H, v1.H[1] // ................*............................................................................. + // gap // .............................................................................................. + sub v20.8H, v16.8H, v23.8H // ............................*................................................................. + // gap // .............................................................................................. + add v16.8H, v16.8H, v23.8H // .............................*................................................................ + // gap // .............................................................................................. + mul v23.8H, v21.8H, v0.H[6] // ..........*................................................................................... + // gap // .............................................................................................. + sqrdmulh v21.8H, v21.8H, v0.H[7] // ...........*.................................................................................. + // gap // .............................................................................................. + mls v26.8H, v2.8H, v7.H[0] // .................*............................................................................ + // gap // .............................................................................................. 
+ sub v2.8H, v17.8H, v4.8H // ..................*........................................................................... + // gap // .............................................................................................. + add v17.8H, v17.8H, v4.8H // ...................*.......................................................................... + // gap // .............................................................................................. + mls v23.8H, v21.8H, v7.H[0] // ............*................................................................................. + // gap // .............................................................................................. + mul v21.8H, v2.8H, v1.H[2] // ....................*......................................................................... + // gap // .............................................................................................. + mul v4.8H, v20.8H, v0.H[2] // ..............................*............................................................... + // gap // .............................................................................................. + sqrdmulh v20.8H, v20.8H, v0.H[3] // ...............................*.............................................................. + // gap // .............................................................................................. + sqdmulh v25.8H, v16.8H, v7.H[1] // ................................................*............................................. + // gap // .............................................................................................. + sqrdmulh v2.8H, v2.8H, v1.H[3] // .....................*........................................................................ + // gap // .............................................................................................. 
+ sub v11.8H, v13.8H, v3.8H // .......................*...................................................................... + // gap // .............................................................................................. + add v13.8H, v13.8H, v3.8H // ........................*..................................................................... + // gap // .............................................................................................. + srshr v25.8H, v25.8H, #11 // .................................................*............................................ + // gap // .............................................................................................. + mls v21.8H, v2.8H, v7.H[0] // ......................*....................................................................... + // gap // .............................................................................................. + sub v2.8H, v23.8H, v26.8H // .................................*............................................................ + // gap // .............................................................................................. + mls v16.8H, v25.8H, v7.H[0] // ..................................................*........................................... + // gap // .............................................................................................. + add v23.8H, v23.8H, v26.8H // ..................................*........................................................... + // gap // .............................................................................................. + mul v26.8H, v11.8H, v1.H[4] // .........................*.................................................................... + // gap // .............................................................................................. + sub v25.8H, v17.8H, v13.8H // ......................................*....................................................... 
+ // gap // .............................................................................................. + add v17.8H, v17.8H, v13.8H // .......................................*...................................................... + // gap // .............................................................................................. + sqrdmulh v11.8H, v11.8H, v1.H[5] // ..........................*................................................................... + // gap // .............................................................................................. + mls v4.8H, v20.8H, v7.H[0] // ................................*............................................................. + // gap // .............................................................................................. + mul v20.8H, v2.8H, v0.H[2] // ...................................*.......................................................... + // gap // .............................................................................................. + sqrdmulh v2.8H, v2.8H, v0.H[3] // ....................................*......................................................... + // gap // .............................................................................................. + mls v26.8H, v11.8H, v7.H[0] // ...........................*.................................................................. + // gap // .............................................................................................. + mul v11.8H, v25.8H, v0.H[4] // ........................................*..................................................... + // gap // .............................................................................................. + sqrdmulh v25.8H, v25.8H, v0.H[5] // .........................................*.................................................... + // gap // .............................................................................................. 
+ mls v20.8H, v2.8H, v7.H[0] // .....................................*........................................................ + // gap // .............................................................................................. + sub v2.8H, v21.8H, v26.8H // ...........................................*.................................................. + // gap // .............................................................................................. + add v21.8H, v21.8H, v26.8H // ............................................*................................................. + // gap // .............................................................................................. + mls v11.8H, v25.8H, v7.H[0] // ..........................................*................................................... + // gap // .............................................................................................. + mul v26.8H, v2.8H, v0.H[4] // .............................................*................................................ + // gap // .............................................................................................. + sqrdmulh v2.8H, v2.8H, v0.H[5] // ..............................................*............................................... + // gap // .............................................................................................. + sqdmulh v25.8H, v17.8H, v7.H[1] // ...................................................*.......................................... + // gap // .............................................................................................. + sub v13.8H, v23.8H, v21.8H // ...........................................................*.................................. + // gap // .............................................................................................. 
+ add v23.8H, v23.8H, v21.8H // ............................................................*................................. + // gap // .............................................................................................. + mls v26.8H, v2.8H, v7.H[0] // ...............................................*.............................................. + // gap // .............................................................................................. + srshr v2.8H, v25.8H, #11 // ....................................................*......................................... + // gap // .............................................................................................. + mul v21.8H, v13.8H, v0.H[0] // .............................................................*................................ + // gap // .............................................................................................. + sqrdmulh v25.8H, v13.8H, v0.H[1] // ..............................................................*............................... + // gap // .............................................................................................. + mls v17.8H, v2.8H, v7.H[0] // .....................................................*........................................ + // gap // .............................................................................................. + sub v2.8H, v4.8H, v11.8H // ................................................................*............................. + // gap // .............................................................................................. + add v4.8H, v4.8H, v11.8H // .................................................................*............................ + // gap // .............................................................................................. 
+ mls v21.8H, v25.8H, v7.H[0] // ...............................................................*.............................. + // gap // .............................................................................................. + sub v25.8H, v16.8H, v17.8H // ......................................................*....................................... + // gap // .............................................................................................. + add v16.8H, v16.8H, v17.8H // .......................................................*...................................... + // gap // .............................................................................................. + mul v17.8H, v2.8H, v0.H[0] // ..................................................................*........................... + // gap // .............................................................................................. + mul v11.8H, v25.8H, v0.H[0] // ........................................................*..................................... + // gap // .............................................................................................. + sqrdmulh v25.8H, v25.8H, v0.H[1] // .........................................................*.................................... + // gap // .............................................................................................. + sqrdmulh v2.8H, v2.8H, v0.H[1] // ...................................................................*.......................... + // gap // .............................................................................................. + sub v13.8H, v20.8H, v26.8H // .....................................................................*........................ + // gap // .............................................................................................. 
+ add v26.8H, v20.8H, v26.8H // ......................................................................*....................... + // gap // .............................................................................................. + mls v11.8H, v25.8H, v7.H[0] // ..........................................................*................................... + // gap // .............................................................................................. + mls v17.8H, v2.8H, v7.H[0] // ....................................................................*......................... + // gap // .............................................................................................. + mul v2.8H, v13.8H, v0.H[0] // .......................................................................*...................... + // gap // .............................................................................................. + sqrdmulh v20.8H, v13.8H, v0.H[1] // ........................................................................*..................... + // gap // .............................................................................................. + str q11, [x0, #256] // ..........................................................................*................... + // gap // .............................................................................................. + mul v25.8H, v16.8H, v29.8H // ..............................................................................*............... + // gap // .............................................................................................. + str q21, [x0, #320] // ...........................................................................*.................. + // gap // .............................................................................................. + mls v2.8H, v20.8H, v7.H[0] // .........................................................................*.................... 
+ // gap // .............................................................................................. + str q17, [x0, #384] // ............................................................................*................. + // gap // .............................................................................................. + sqrdmulh v16.8H, v16.8H, v30.8H // ...............................................................................*.............. + // gap // .............................................................................................. + mul v21.8H, v23.8H, v29.8H // .................................................................................*............ + // gap // .............................................................................................. + str q2, [x0, #448] // .............................................................................*................ + // gap // .............................................................................................. + sqrdmulh v2.8H, v23.8H, v30.8H // ..................................................................................*........... + // gap // .............................................................................................. + mls v25.8H, v16.8H, v7.H[0] // ................................................................................*............. + // gap // .............................................................................................. + mul v16.8H, v4.8H, v29.8H // ....................................................................................*......... + // gap // .............................................................................................. + sqrdmulh v23.8H, v4.8H, v30.8H // .....................................................................................*........ + // gap // .............................................................................................. 
+ mls v21.8H, v2.8H, v7.H[0] // ...................................................................................*.......... + // gap // .............................................................................................. + mul v2.8H, v26.8H, v29.8H // .......................................................................................*...... + // gap // .............................................................................................. + sqrdmulh v26.8H, v26.8H, v30.8H // ........................................................................................*..... + // gap // .............................................................................................. + mls v16.8H, v23.8H, v7.H[0] // ......................................................................................*....... + // gap // .............................................................................................. + str q25, [x0], #(16) // ..........................................................................................*... + // gap // .............................................................................................. + ldr q20, [x0, #64] // .e............................................................................................ + // gap // .............................................................................................. + // gap // .............................................................................................. + // gap // .............................................................................................. + mls v2.8H, v26.8H, v7.H[0] // .........................................................................................*.... + // gap // .............................................................................................. + str q21, [x0, #48] // ...........................................................................................*.. 
+ // gap // .............................................................................................. + ldr q23, [x0, #128] // ..e........................................................................................... + // gap // .............................................................................................. + // gap // .............................................................................................. + // gap // .............................................................................................. + str q16, [x0, #112] // ............................................................................................*. + // gap // .............................................................................................. + ldr q21, [x0, #192] // ...e.......................................................................................... + // gap // .............................................................................................. + // gap // .............................................................................................. + // gap // .............................................................................................. + str q2, [x0, #176] // .............................................................................................* + // gap // .............................................................................................. + ldr q17, [x0, #256] // ....e......................................................................................... + // gap // .............................................................................................. + // gap // .............................................................................................. + // gap // .............................................................................................. 
+ ldr q4, [x0, #320] // .....e........................................................................................ + // gap // .............................................................................................. + // gap // .............................................................................................. + // gap // .............................................................................................. + ldr q13, [x0, #384] // ......e....................................................................................... + // gap // .............................................................................................. + // gap // .............................................................................................. + // gap // .............................................................................................. + ldr q3, [x0, #448] // .......e...................................................................................... + // gap // .............................................................................................. + // gap // .............................................................................................. + // gap // .............................................................................................. + + // original source code + // ldr q8, [x0, #0] // ...........*......................................................................................... + // ldr q9, [x0, #(1*(512/8))] // e..........|..................................................................................e...... + // ldr q10, [x0, #(2*(512/8))] // ...e.......|.....................................................................................e... + // ldr q11, [x0, #(3*(512/8))] // .....e.....|.......................................................................................e. 
+ // ldr q12, [x0, #(4*(512/8))] // .......e...|......................................................................................... + // ldr q13, [x0, #(5*(512/8))] // ........e..|......................................................................................... + // ldr q14, [x0, #(6*(512/8))] // .........e.|......................................................................................... + // ldr q15, [x0, #(7*(512/8))] // ..........e|......................................................................................... + // sub v24.8h, v8.8h, v9.8h // ...........|..*...................................................................................... + // add v8.8h, v8.8h, v9.8h // ...........|...*..................................................................................... + // mul v9.8h, v24.8h, v0.h[6] // ...........|........*................................................................................ + // sqrdmulh v24.8h, v24.8h, v0.h[7] // ...........|.........*............................................................................... + // mls v9.8h, v24.8h, v7.h[0] // ...........|.............*........................................................................... + // sub v24.8h, v10.8h, v11.8h // ...........|*........................................................................................ + // add v10.8h, v10.8h, v11.8h // ...........|.*....................................................................................... + // mul v11.8h, v24.8h, v1.h[0] // ...........|....*.................................................................................... + // sqrdmulh v24.8h, v24.8h, v1.h[1] // ...........|.....*................................................................................... + // mls v11.8h, v24.8h, v7.h[0] // ...........|..........*.............................................................................. 
+ // sub v24.8h, v12.8h, v13.8h // ...........|...........*............................................................................. + // add v12.8h, v12.8h, v13.8h // ...........|............*............................................................................ + // mul v13.8h, v24.8h, v1.h[2] // ...........|..............*.......................................................................... + // sqrdmulh v24.8h, v24.8h, v1.h[3] // ...........|..................*...................................................................... + // mls v13.8h, v24.8h, v7.h[0] // ...........|......................*.................................................................. + // sub v24.8h, v14.8h, v15.8h // ...........|...................*..................................................................... + // add v14.8h, v14.8h, v15.8h // ...........|....................*.................................................................... + // mul v15.8h, v24.8h, v1.h[4] // ...........|..........................*.............................................................. + // sqrdmulh v24.8h, v24.8h, v1.h[5] // ...........|.............................*........................................................... + // mls v15.8h, v24.8h, v7.h[0] // ...........|.................................*....................................................... + // sub v24.8h, v8.8h, v10.8h // ...........|......*.................................................................................. + // add v8.8h, v8.8h, v10.8h // ...........|.......*................................................................................. + // mul v10.8h, v24.8h, v0.h[2] // ...........|...............*......................................................................... + // sqrdmulh v24.8h, v24.8h, v0.h[3] // ...........|................*........................................................................ 
+ // mls v10.8h, v24.8h, v7.h[0] // ...........|..............................*.......................................................... + // sub v24.8h, v9.8h, v11.8h // ...........|.......................*................................................................. + // add v9.8h, v9.8h, v11.8h // ...........|.........................*............................................................... + // mul v11.8h, v24.8h, v0.h[2] // ...........|...............................*......................................................... + // sqrdmulh v24.8h, v24.8h, v0.h[3] // ...........|................................*........................................................ + // mls v11.8h, v24.8h, v7.h[0] // ...........|....................................*.................................................... + // sub v24.8h, v12.8h, v14.8h // ...........|...........................*............................................................. + // add v12.8h, v12.8h, v14.8h // ...........|............................*............................................................ + // mul v14.8h, v24.8h, v0.h[4] // ...........|..................................*...................................................... + // sqrdmulh v24.8h, v24.8h, v0.h[5] // ...........|...................................*..................................................... + // mls v14.8h, v24.8h, v7.h[0] // ...........|.......................................*................................................. + // sub v24.8h, v13.8h, v15.8h // ...........|.....................................*................................................... + // add v13.8h, v13.8h, v15.8h // ...........|......................................*.................................................. + // mul v15.8h, v24.8h, v0.h[4] // ...........|........................................*................................................ 
+ // sqrdmulh v24.8h, v24.8h, v0.h[5] // ...........|.........................................*............................................... + // mls v15.8h, v24.8h, v7.h[0] // ...........|.............................................*........................................... + // sqdmulh v25.8h, v8.8h, v7.h[1] // ...........|.................*....................................................................... + // srshr v25.8h, v25.8h, #11 // ...........|.....................*................................................................... + // mls v8.8h, v25.8h, v7.h[0] // ...........|........................*................................................................ + // sqdmulh v25.8h, v12.8h, v7.h[1] // ...........|..........................................*.............................................. + // srshr v25.8h, v25.8h, #11 // ...........|..............................................*.......................................... + // mls v12.8h, v25.8h, v7.h[0] // ...........|.................................................*....................................... + // sub v24.8h, v8.8h, v12.8h // ...........|.....................................................*................................... + // add v8.8h, v8.8h, v12.8h // ...........|......................................................*.................................. + // mul v12.8h, v24.8h, v0.h[0] // ...........|........................................................*................................ + // sqrdmulh v24.8h, v24.8h, v0.h[1] // ...........|.........................................................*............................... + // mls v12.8h, v24.8h, v7.h[0] // ...........|.............................................................*........................... + // sub v24.8h, v9.8h, v13.8h // ...........|...........................................*............................................. 
+ // add v9.8h, v9.8h, v13.8h // ...........|............................................*............................................ + // mul v13.8h, v24.8h, v0.h[0] // ...........|...............................................*......................................... + // sqrdmulh v24.8h, v24.8h, v0.h[1] // ...........|................................................*........................................ + // mls v13.8h, v24.8h, v7.h[0] // ...........|....................................................*.................................... + // sub v24.8h, v10.8h, v14.8h // ...........|..................................................*...................................... + // add v10.8h, v10.8h, v14.8h // ...........|...................................................*..................................... + // mul v14.8h, v24.8h, v0.h[0] // ...........|.......................................................*................................. + // sqrdmulh v24.8h, v24.8h, v0.h[1] // ...........|..........................................................*.............................. + // mls v14.8h, v24.8h, v7.h[0] // ...........|..............................................................*.......................... + // sub v24.8h, v11.8h, v15.8h // ...........|...........................................................*............................. + // add v11.8h, v11.8h, v15.8h // ...........|............................................................*............................ + // mul v15.8h, v24.8h, v0.h[0] // ...........|...............................................................*......................... + // sqrdmulh v24.8h, v24.8h, v0.h[1] // ...........|................................................................*........................ + // mls v15.8h, v24.8h, v7.h[0] // ...........|....................................................................*.................... 
+ // str q12, [x0, #(4*(512/8))] // ...........|.................................................................*....................... + // str q13, [x0, #(5*(512/8))] // ...........|...................................................................*..................... + // str q14, [x0, #(6*(512/8))] // ...........|.....................................................................*................... + // str q15, [x0, #(7*(512/8))] // ...........|........................................................................*................ + // mul v12.8h, v8.8h, v29.8h // ...........|..................................................................*...................... + // sqrdmulh v8.8h, v8.8h, v30.8h // ...........|......................................................................*.................. + // mls v12.8h, v8.8h, v7.h[0] // ...........|..........................................................................*.............. + // mul v13.8h, v9.8h, v29.8h // ...........|.......................................................................*................. + // sqrdmulh v9.8h, v9.8h, v30.8h // ...........|.........................................................................*............... + // mls v13.8h, v9.8h, v7.h[0] // ...........|.............................................................................*........... + // mul v14.8h, v10.8h, v29.8h // ...........|...........................................................................*............. + // sqrdmulh v10.8h, v10.8h, v30.8h // ...........|............................................................................*............ + // mls v14.8h, v10.8h, v7.h[0] // ...........|................................................................................*........ + // mul v15.8h, v11.8h, v29.8h // ...........|..............................................................................*.......... 
+ // sqrdmulh v11.8h, v11.8h, v30.8h // ...........|...............................................................................*......... + // mls v15.8h, v11.8h, v7.h[0] // .*.........|...................................................................................*..... + // str q12, [x0], #(16) // ...........|.................................................................................*....... + // str q13, [x0, #(-16 + 1*(512/8))] // ..*........|....................................................................................*.... + // str q14, [x0, #(-16 + 2*(512/8))] // ....*......|......................................................................................*.. + // str q15, [x0, #(-16 + 3*(512/8))] // ......*....|........................................................................................* + + sub count, count, #1 + cbnz count, layer123_start + ldr q28, [x0, #0] // *...................................................................................... + // gap // ....................................................................................... + // gap // ....................................................................................... + // gap // ....................................................................................... + sub v5.8H, v23.8H, v21.8H // .*..................................................................................... + // gap // ....................................................................................... + add v6.8H, v23.8H, v21.8H // ..*.................................................................................... + // gap // ....................................................................................... + add v2.8H, v28.8H, v20.8H // ....*.................................................................................. + // gap // ....................................................................................... 
+ mul v23.8H, v5.8H, v1.H[0] // .....*................................................................................. + // gap // ....................................................................................... + sub v11.8H, v28.8H, v20.8H // ...*................................................................................... + // gap // ....................................................................................... + add v8.8H, v2.8H, v6.8H // ........*.............................................................................. + // gap // ....................................................................................... + sub v16.8H, v2.8H, v6.8H // .......*............................................................................... + // gap // ....................................................................................... + sqrdmulh v24.8H, v11.8H, v0.H[7] // ..........*............................................................................ + // gap // ....................................................................................... + sqdmulh v18.8H, v8.8H, v7.H[1] // ..................*.................................................................... + // gap // ....................................................................................... + mul v21.8H, v16.8H, v0.H[2] // ................*...................................................................... + // gap // ....................................................................................... + sqrdmulh v25.8H, v16.8H, v0.H[3] // .................*..................................................................... + // gap // ....................................................................................... + sub v6.8H, v17.8H, v4.8H // ............*.......................................................................... + // gap // ....................................................................................... 
+ sqrdmulh v2.8H, v5.8H, v1.H[1] // ......*................................................................................ + // gap // ....................................................................................... + sub v26.8H, v13.8H, v3.8H // ....................*.................................................................. + // gap // ....................................................................................... + mls v21.8H, v25.8H, v7.H[0] // ...............................*....................................................... + // gap // ....................................................................................... + mul v20.8H, v6.8H, v1.H[2] // ...............*....................................................................... + // gap // ....................................................................................... + mls v23.8H, v2.8H, v7.H[0] // ...........*........................................................................... + // gap // ....................................................................................... + sqrdmulh v2.8H, v6.8H, v1.H[3] // ...................*................................................................... + // gap // ....................................................................................... + sqrdmulh v16.8H, v26.8H, v1.H[5] // ..............................*........................................................ + // gap // ....................................................................................... + mul v25.8H, v26.8H, v1.H[4] // ...........................*........................................................... + // gap // ....................................................................................... + add v10.8H, v13.8H, v3.8H // .....................*................................................................. + // gap // ....................................................................................... 
+ mls v20.8H, v2.8H, v7.H[0] // .......................*............................................................... + // gap // ....................................................................................... + add v19.8H, v17.8H, v4.8H // .............*......................................................................... + // gap // ....................................................................................... + mls v25.8H, v16.8H, v7.H[0] // ..................................*.................................................... + // gap // ....................................................................................... + mul v3.8H, v11.8H, v0.H[6] // .........*............................................................................. + // gap // ....................................................................................... + sub v2.8H, v19.8H, v10.8H // ............................*.......................................................... + // gap // ....................................................................................... + add v10.8H, v19.8H, v10.8H // .............................*......................................................... + // gap // ....................................................................................... + sub v16.8H, v20.8H, v25.8H // ......................................*................................................ + // gap // ....................................................................................... + sqrdmulh v17.8H, v2.8H, v0.H[5] // ....................................*.................................................. + // gap // ....................................................................................... + mul v11.8H, v2.8H, v0.H[4] // ...................................*................................................... + // gap // ....................................................................................... 
+ mul v6.8H, v16.8H, v0.H[4] // .........................................*............................................. + // gap // ....................................................................................... + sqrdmulh v2.8H, v16.8H, v0.H[5] // ..........................................*............................................ + // gap // ....................................................................................... + sqdmulh v26.8H, v10.8H, v7.H[1] // ...........................................*........................................... + // gap // ....................................................................................... + mls v11.8H, v17.8H, v7.H[0] // ........................................*.............................................. + // gap // ....................................................................................... + srshr v16.8H, v18.8H, #11 // ......................*................................................................ + // gap // ....................................................................................... + mls v6.8H, v2.8H, v7.H[0] // ..............................................*........................................ + // gap // ....................................................................................... + srshr v2.8H, v26.8H, #11 // ...............................................*....................................... + // gap // ....................................................................................... + sub v17.8H, v21.8H, v11.8H // ...................................................*................................... + // gap // ....................................................................................... + mls v8.8H, v16.8H, v7.H[0] // .........................*............................................................. + // gap // ....................................................................................... 
+ mls v10.8H, v2.8H, v7.H[0] // ..................................................*.................................... + // gap // ....................................................................................... + mls v3.8H, v24.8H, v7.H[0] // ..............*........................................................................ + // gap // ....................................................................................... + add v31.8H, v20.8H, v25.8H // .......................................*............................................... + // gap // ....................................................................................... + add v25.8H, v21.8H, v11.8H // ....................................................*.................................. + // gap // ....................................................................................... + add v13.8H, v8.8H, v10.8H // .......................................................*............................... + // gap // ....................................................................................... + sub v16.8H, v3.8H, v23.8H // ........................*.............................................................. + // gap // ....................................................................................... + sqrdmulh v2.8H, v25.8H, v30.8H // .............................................................................*......... + // gap // ....................................................................................... + mul v21.8H, v25.8H, v29.8H // ............................................................................*.......... + // gap // ....................................................................................... + sqrdmulh v26.8H, v16.8H, v0.H[3] // .................................*..................................................... + // gap // ....................................................................................... 
+ mul v19.8H, v16.8H, v0.H[2] // ................................*...................................................... + // gap // ....................................................................................... + sub v16.8H, v8.8H, v10.8H // ......................................................*................................ + // gap // ....................................................................................... + mls v21.8H, v2.8H, v7.H[0] // .................................................................................*..... + // gap // ....................................................................................... + add v3.8H, v3.8H, v23.8H // ..........................*............................................................ + // gap // ....................................................................................... + mls v19.8H, v26.8H, v7.H[0] // .....................................*................................................. + // gap // ....................................................................................... + mul v25.8H, v16.8H, v0.H[0] // .........................................................*............................. + // gap // ....................................................................................... + str q21, [x0, #128] // .....................................................................................*. + // gap // ....................................................................................... + sqrdmulh v4.8H, v16.8H, v0.H[1] // ..........................................................*............................ + // gap // ....................................................................................... + sub v26.8H, v19.8H, v6.8H // ............................................................*.......................... + // gap // ....................................................................................... 
+ add v20.8H, v19.8H, v6.8H // .............................................................*......................... + // gap // ....................................................................................... + mul v11.8H, v17.8H, v0.H[0] // ........................................................*.............................. + // gap // ....................................................................................... + sub v15.8H, v3.8H, v31.8H // ............................................*.......................................... + // gap // ....................................................................................... + sqrdmulh v2.8H, v20.8H, v30.8H // ................................................................................*...... + // gap // ....................................................................................... + mul v16.8H, v20.8H, v29.8H // ...............................................................................*....... + // gap // ....................................................................................... + sqrdmulh v23.8H, v15.8H, v0.H[1] // .................................................*..................................... + // gap // ....................................................................................... + mul v20.8H, v15.8H, v0.H[0] // ................................................*...................................... + // gap // ....................................................................................... + sqrdmulh v17.8H, v17.8H, v0.H[1] // ...........................................................*........................... + // gap // ....................................................................................... + mls v16.8H, v2.8H, v7.H[0] // ...................................................................................*... + // gap // ....................................................................................... 
+ mul v21.8H, v13.8H, v29.8H // ...................................................................*................... + // gap // ....................................................................................... + mls v20.8H, v23.8H, v7.H[0] // .....................................................*................................. + // gap // ....................................................................................... + sqrdmulh v23.8H, v13.8H, v30.8H // .......................................................................*............... + // gap // ....................................................................................... + str q16, [x0, #192] // ......................................................................................* + // gap // ....................................................................................... + add v13.8H, v3.8H, v31.8H // .............................................*......................................... + // gap // ....................................................................................... + str q20, [x0, #320] // ....................................................................*.................. + // gap // ....................................................................................... + mls v21.8H, v23.8H, v7.H[0] // ...........................................................................*........... + // gap // ....................................................................................... + sqrdmulh v2.8H, v13.8H, v30.8H // ..........................................................................*............ + // gap // ....................................................................................... + sqrdmulh v23.8H, v26.8H, v0.H[1] // .................................................................*..................... + // gap // ....................................................................................... 
+ mul v26.8H, v26.8H, v0.H[0] // ................................................................*...................... + // gap // ....................................................................................... + mul v16.8H, v13.8H, v29.8H // ........................................................................*.............. + // gap // ....................................................................................... + mls v11.8H, v17.8H, v7.H[0] // ...............................................................*....................... + // gap // ....................................................................................... + str q21, [x0], #(16) // ..................................................................................*.... + // gap // ....................................................................................... + mls v26.8H, v23.8H, v7.H[0] // .....................................................................*................. + // gap // ....................................................................................... + mls v25.8H, v4.8H, v7.H[0] // ..............................................................*........................ + // gap // ....................................................................................... + str q11, [x0, #368] // ......................................................................*................ + // gap // ....................................................................................... + mls v16.8H, v2.8H, v7.H[0] // ..............................................................................*........ + // gap // ....................................................................................... + str q26, [x0, #432] // .........................................................................*............. + // gap // ....................................................................................... 
+ // gap // ....................................................................................... + // gap // ....................................................................................... + str q25, [x0, #240] // ..................................................................*.................... + // gap // ....................................................................................... + // gap // ....................................................................................... + // gap // ....................................................................................... + str q16, [x0, #48] // ....................................................................................*.. + // gap // ....................................................................................... + + // original source code + // ldr q16, [x0, #0] // *...................................................................................... + // sub v2.8H, v23.8H, v21.8H // .*..................................................................................... + // add v23.8H, v23.8H, v21.8H // ..*.................................................................................... + // sub v21.8H, v16.8H, v20.8H // .....*................................................................................. + // add v16.8H, v16.8H, v20.8H // ...*................................................................................... + // mul v26.8H, v2.8H, v1.H[0] // ....*.................................................................................. + // sqrdmulh v2.8H, v2.8H, v1.H[1] // .............*......................................................................... + // sub v20.8H, v16.8H, v23.8H // .......*............................................................................... + // add v16.8H, v16.8H, v23.8H // ......*................................................................................ 
+ // mul v23.8H, v21.8H, v0.H[6] // .........................*............................................................. + // sqrdmulh v21.8H, v21.8H, v0.H[7] // ........*.............................................................................. + // mls v26.8H, v2.8H, v7.H[0] // .................*..................................................................... + // sub v2.8H, v17.8H, v4.8H // ............*.......................................................................... + // add v17.8H, v17.8H, v4.8H // .......................*............................................................... + // mls v23.8H, v21.8H, v7.H[0] // .........................................*............................................. + // mul v21.8H, v2.8H, v1.H[2] // ................*...................................................................... + // mul v4.8H, v20.8H, v0.H[2] // ..........*............................................................................ + // sqrdmulh v20.8H, v20.8H, v0.H[3] // ...........*........................................................................... + // sqdmulh v25.8H, v16.8H, v7.H[1] // .........*............................................................................. + // sqrdmulh v2.8H, v2.8H, v1.H[3] // ..................*.................................................................... + // sub v11.8H, v13.8H, v3.8H // ..............*........................................................................ + // add v13.8H, v13.8H, v3.8H // .....................*................................................................. + // srshr v25.8H, v25.8H, #11 // ...................................*................................................... + // mls v21.8H, v2.8H, v7.H[0] // ......................*................................................................ + // sub v2.8H, v23.8H, v26.8H // .............................................*......................................... 
+ // mls v16.8H, v25.8H, v7.H[0] // .......................................*............................................... + // add v23.8H, v23.8H, v26.8H // ....................................................*.................................. + // mul v26.8H, v11.8H, v1.H[4] // ....................*.................................................................. + // sub v25.8H, v17.8H, v13.8H // ..........................*............................................................ + // add v17.8H, v17.8H, v13.8H // ...........................*........................................................... + // sqrdmulh v11.8H, v11.8H, v1.H[5] // ...................*................................................................... + // mls v4.8H, v20.8H, v7.H[0] // ...............*....................................................................... + // mul v20.8H, v2.8H, v0.H[2] // .................................................*..................................... + // sqrdmulh v2.8H, v2.8H, v0.H[3] // ................................................*...................................... + // mls v26.8H, v11.8H, v7.H[0] // ........................*.............................................................. + // mul v11.8H, v25.8H, v0.H[4] // ..............................*........................................................ + // sqrdmulh v25.8H, v25.8H, v0.H[5] // .............................*......................................................... + // mls v20.8H, v2.8H, v7.H[0] // .....................................................*................................. + // sub v2.8H, v21.8H, v26.8H // ............................*.......................................................... + // add v21.8H, v21.8H, v26.8H // ..........................................*............................................ + // mls v11.8H, v25.8H, v7.H[0] // ..................................*.................................................... 
+ // mul v26.8H, v2.8H, v0.H[4] // ...............................*....................................................... + // sqrdmulh v2.8H, v2.8H, v0.H[5] // ................................*...................................................... + // sqdmulh v25.8H, v17.8H, v7.H[1] // .................................*..................................................... + // sub v13.8H, v23.8H, v21.8H // ............................................................*.......................... + // add v23.8H, v23.8H, v21.8H // .......................................................................*............... + // mls v26.8H, v2.8H, v7.H[0] // ....................................*.................................................. + // srshr v2.8H, v25.8H, #11 // .....................................*................................................. + // mul v21.8H, v13.8H, v0.H[0] // ................................................................*...................... + // sqrdmulh v25.8H, v13.8H, v0.H[1] // ...............................................................*....................... + // mls v17.8H, v2.8H, v7.H[0] // ........................................*.............................................. + // sub v2.8H, v4.8H, v11.8H // ......................................*................................................ + // add v4.8H, v4.8H, v11.8H // ...........................................*........................................... + // mls v21.8H, v25.8H, v7.H[0] // ....................................................................*.................. + // sub v25.8H, v16.8H, v17.8H // ..................................................*.................................... + // add v16.8H, v16.8H, v17.8H // ............................................*.......................................... + // mul v17.8H, v2.8H, v0.H[0] // ...........................................................*........................... 
+ // mul v11.8H, v25.8H, v0.H[0] // ......................................................*................................ + // sqrdmulh v25.8H, v25.8H, v0.H[1] // ........................................................*.............................. + // sqrdmulh v2.8H, v2.8H, v0.H[1] // .................................................................*..................... + // sub v13.8H, v20.8H, v26.8H // .........................................................*............................. + // add v26.8H, v20.8H, v26.8H // ..........................................................*............................ + // mls v11.8H, v25.8H, v7.H[0] // .................................................................................*..... + // mls v17.8H, v2.8H, v7.H[0] // ..............................................................................*........ + // mul v2.8H, v13.8H, v0.H[0] // ............................................................................*.......... + // sqrdmulh v20.8H, v13.8H, v0.H[1] // ...........................................................................*........... + // str q11, [x0, #256] // .....................................................................................*. + // mul v25.8H, v16.8H, v29.8H // ...................................................................*................... + // str q21, [x0, #320] // ........................................................................*.............. + // mls v2.8H, v20.8H, v7.H[0] // ................................................................................*...... + // str q17, [x0, #384] // ..................................................................................*.... + // sqrdmulh v16.8H, v16.8H, v30.8H // .....................................................................*................. + // mul v21.8H, v23.8H, v29.8H // .............................................................................*......... 
+ // str q2, [x0, #448] // ....................................................................................*.. + // sqrdmulh v2.8H, v23.8H, v30.8H // ..........................................................................*............ + // mls v25.8H, v16.8H, v7.H[0] // .........................................................................*............. + // mul v16.8H, v4.8H, v29.8H // ...............................................*....................................... + // sqrdmulh v23.8H, v4.8H, v30.8H // ..............................................*........................................ + // mls v21.8H, v2.8H, v7.H[0] // ...................................................................................*... + // mul v2.8H, v26.8H, v29.8H // ..............................................................*........................ + // sqrdmulh v26.8H, v26.8H, v30.8H // .............................................................*......................... + // mls v16.8H, v23.8H, v7.H[0] // ...................................................*................................... + // str q25, [x0], #(16) // ...............................................................................*....... + // mls v2.8H, v26.8H, v7.H[0] // ..................................................................*.................... + // str q21, [x0, #48] // ......................................................................................* + // str q16, [x0, #112] // .......................................................*............................... + // str q2, [x0, #176] // ......................................................................*................ 
+
+
+        pop_stack
+        ret
\ No newline at end of file
diff --git a/tests/ntt_kyber/manual/intt_kyber_123_4567_manual_ld4_opt_a72.s b/tests/ntt_kyber/manual/intt_kyber_123_4567_manual_ld4_opt_a72.s
new file mode 100644
index 0000000..820a734
--- /dev/null
+++ b/tests/ntt_kyber/manual/intt_kyber_123_4567_manual_ld4_opt_a72.s
@@ -0,0 +1,1823 @@
+///
+/// Copyright (c) 2022 Arm Limited
+/// Copyright (c) 2022 Hanno Becker
+/// Copyright (c) 2023 Amin Abdulrahman, Matthias Kannwischer
+/// SPDX-License-Identifier: MIT
+///
+/// Permission is hereby granted, free of charge, to any person obtaining a copy
+/// of this software and associated documentation files (the "Software"), to deal
+/// in the Software without restriction, including without limitation the rights
+/// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+/// copies of the Software, and to permit persons to whom the Software is
+/// furnished to do so, subject to the following conditions:
+///
+/// The above copyright notice and this permission notice shall be included in all
+/// copies or substantial portions of the Software.
+///
+/// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+/// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+/// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+/// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+/// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+/// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+/// SOFTWARE.
+///
+
+// Needed to provide ASM_LOAD directive
+#include <hal_env.h>
+
+// NOTE
+// We use a lot of trivial macros to simplify the parsing burden for Slothy
+// The macros are not unfolded by Slothy and thus interpreted as instructions,
+// which are easier to parse due to e.g. the lack of size specifiers and simpler
+// syntax for pre and post increment for loads and stores.
+
+// Eventually, NeLight should include a proper parser for AArch64,
+// but for initial investigations, the below is enough.
+
+.macro ldr_vo vec, base, offset  // load the qform_ alias of vec from [base + offset]; base is not modified
+        ldr qform_\vec, [\base, #\offset]
+.endm
+
+.macro ldr_vi vec, base, inc  // load the qform_ alias of vec from [base], then advance base by inc (post-index)
+        ldr qform_\vec, [\base], #\inc
+.endm
+
+.macro str_vo vec, base, offset  // store the qform_ alias of vec to [base + offset]; base is not modified
+        str qform_\vec, [\base, #\offset]
+.endm
+.macro str_vi vec, base, inc  // store the qform_ alias of vec to [base], then advance base by inc (post-index)
+        str qform_\vec, [\base], #\inc
+.endm
+
+.macro vqrdmulh d,a,b  // d = sqrdmulh(a, b), vector by vector, eight 16-bit lanes
+        sqrdmulh \d\().8h, \a\().8h, \b\().8h
+.endm
+.macro vmlsq d,a,b,i  // d -= a * b[i], eight 16-bit lanes, multiplier taken from halfword lane i of b
+        mls \d\().8h, \a\().8h, \b\().h[\i]
+.endm
+.macro vqrdmulhq d,a,b,i  // d = sqrdmulh(a, b[i]), by-lane form of vqrdmulh
+        sqrdmulh \d\().8h, \a\().8h, \b\().h[\i]
+.endm
+.macro vqdmulhq d,a,b,i  // d = sqdmulh(a, b[i]), non-rounding doubling high multiply by lane
+        sqdmulh \d\().8h, \a\().8h, \b\().h[\i]
+.endm
+.macro vmulq d,a,b,i  // d = low 16 bits of a * b[i]
+        mul \d\().8h, \a\().8h, \b\().h[\i]
+.endm
+
+.macro mulmodq dst, src, const, idx0, idx1  // three-step multiply by the lane pair const[idx0] / const[idx1]; overwrites src
+        vmulq \dst, \src, \const, \idx0  // dst = low half of src * const[idx0]
+        vqrdmulhq \src, \src, \const, \idx1  // src = sqrdmulh(src, const[idx1]); presumably const[idx1] is the precomputed companion of const[idx0] - confirm against the twiddle table
+        vmlsq \dst, \src, consts, 0  // dst -= src * consts[0]; lane 0 of const_addr in this file is 3329
+.endm
+
+.macro mulmod dst, src, const, const_twisted  // same three steps as mulmodq with full-vector operands; overwrites src
+        mul \dst\().8h, \src\().8h, \const\().8h  // dst = low half of src * const, lane by lane
+        vqrdmulh \src, \src, \const_twisted  // src = sqrdmulh(src, const_twisted)
+        vmlsq \dst, \src, consts, 0  // dst -= src * consts[0]
+.endm
+
+.macro gs_butterfly a, b, root, idx0, idx1  // a = a + b, b = mulmodq(a - b, root[idx0], root[idx1]); uses the tmp alias as scratch
+        sub tmp.8h, \a\().8h, \b\().8h  // difference is taken before a is overwritten
+        add \a\().8h, \a\().8h, \b\().8h
+        mulmodq \b, tmp, \root, \idx0, \idx1  // tmp is clobbered by mulmodq
+.endm
+
+.macro mulmod_v dst, src, const, const_twisted  // NOTE(review): body is identical to mulmod, and gs_butterfly_v below expands mulmod, not this macro
+        mul \dst\().8h, \src\().8h, \const\().8h
+        vqrdmulh \src, \src, \const_twisted
+        vmlsq \dst, \src, consts, 0
+.endm
+
+.macro gs_butterfly_v a, b, root, root_twisted  // vector-operand variant of gs_butterfly; uses the tmp alias as scratch
+        sub tmp.8h, \a\().8h, \b\().8h
+        add \a\().8h, \a\().8h, \b\().8h
+        mulmod \b, tmp, \root, \root_twisted
+.endm
+
+.macro mul_ninv dst0, dst1, dst2, dst3, src0, src1, src2, src3  // four mulmod expansions against the ninv / ninv_tw aliases; each src is overwritten. NOTE(review): confirm ninv and ninv_tw are bound with .req before this macro is expanded
+        mulmod \dst0, \src0, ninv, ninv_tw
+        mulmod \dst1, \src1, ninv, ninv_tw
+        mulmod \dst2, \src2, ninv, ninv_tw
+        mulmod \dst3, \src3, ninv, ninv_tw
+.endm
+
+.macro barrett_reduce a  // reduces a in place using consts[1] and consts[0]; clobbers the t0 alias
+        vqdmulhq t0, \a, consts, 1  // t0 = sqdmulh(a, consts[1]); lane 1 of const_addr in this file is 20159
+        srshr t0.8h, t0.8h, #11  // rounding arithmetic shift right by 11
+        vmlsq \a, t0, consts, 0  // a -= t0 * consts[0]
+.endm
+
+.macro load_roots_123  // loads root0 and root1 through r_ptr0 and advances r_ptr0 by 32 bytes
+        ldr_vi root0, r_ptr0, 32
+        ldr_vo root1, 
r_ptr0, -16
+.endm
+
+.macro load_next_roots_45  // loads one 16-byte vector into root0 and advances r_ptr0 by 16
+        ldr_vi root0, r_ptr0, 16
+.endm
+
+.macro load_next_roots_67  // loads six consecutive 16-byte vectors through r_ptr1 and advances r_ptr1 by 96
+        ldr_vi root0, r_ptr1, (6*16)  // post-index: the offsets below are relative to the already advanced pointer
+        ldr_vo root0_tw, r_ptr1, (-6*16 + 1*16)
+        ldr_vo root1, r_ptr1, (-6*16 + 2*16)
+        ldr_vo root1_tw, r_ptr1, (-6*16 + 3*16)
+        ldr_vo root2, r_ptr1, (-6*16 + 4*16)
+        ldr_vo root2_tw, r_ptr1, (-6*16 + 5*16)
+.endm
+
+.macro transpose4 data  // in-place 4x4 transpose of 32-bit lanes across the aliases data0..data3 (digit is appended to the argument); clobbers t0-t3
+        trn1 t0.4s, \data\()0.4s, \data\()1.4s  // stage 1: interleave 32-bit lanes of rows 0/1 and 2/3
+        trn2 t1.4s, \data\()0.4s, \data\()1.4s
+        trn1 t2.4s, \data\()2.4s, \data\()3.4s
+        trn2 t3.4s, \data\()2.4s, \data\()3.4s
+
+        trn2 \data\()2.2d, t0.2d, t2.2d  // stage 2: interleave 64-bit halves of the stage-1 results
+        trn2 \data\()3.2d, t1.2d, t3.2d
+        trn1 \data\()0.2d, t0.2d, t2.2d
+        trn1 \data\()1.2d, t1.2d, t3.2d
+.endm
+
+.macro transpose_single data_out, data_in  // 32-bit interleave stage only, written to data_out0..3; no 64-bit stage and no scratch registers
+        trn1 \data_out\()0.4s, \data_in\()0.4s, \data_in\()1.4s
+        trn2 \data_out\()1.4s, \data_in\()0.4s, \data_in\()1.4s
+        trn1 \data_out\()2.4s, \data_in\()2.4s, \data_in\()3.4s
+        trn2 \data_out\()3.4s, \data_in\()2.4s, \data_in\()3.4s
+.endm
+
+.macro save_gprs // slothy:no-unfold
+        sub sp, sp, #(16*6)  // six 16-byte slots
+        // one store per pair: x19-x28 fill slots 0-4, x29 alone takes slot 5
+        stp x19, x20, [sp, #16*0]
+        stp x21, x22, [sp, #16*1]
+        stp x23, x24, [sp, #16*2]
+        stp x25, x26, [sp, #16*3]
+        stp x27, x28, [sp, #16*4]
+        str x29, [sp, #16*5]
+.endm
+
+.macro restore_gprs // slothy:no-unfold
+        ldp x19, x20, [sp, #16*0]  // mirror of save_gprs: same slots, same order
+        ldp x21, x22, [sp, #16*1]
+        ldp x23, x24, [sp, #16*2]
+        ldp x25, x26, [sp, #16*3]
+        ldp x27, x28, [sp, #16*4]
+        ldr x29, [sp, #16*5]
+        add sp, sp, #(16*6)  // release the six slots
+.endm
+
+.macro save_vregs // slothy:no-unfold
+        sub sp, sp, #(16*4)  // four 16-byte slots for d8-d15 (only the low 64 bits of v8-v15 are stored)
+        stp d8, d9, [sp, #16*0]
+        stp d10, d11, [sp, #16*1]
+        stp d12, d13, [sp, #16*2]
+        stp d14, d15, [sp, #16*3]
+.endm
+
+.macro restore_vregs // slothy:no-unfold
+        ldp d8, d9, [sp, #16*0]  // mirror of save_vregs
+        ldp d10, d11, [sp, #16*1]
+        ldp d12, d13, [sp, #16*2]
+        ldp d14, d15, [sp, #16*3]
+        add sp, sp, #(16*4)
+.endm
+
+#define STACK_SIZE 16
+#define STACK0 0
+
+.macro restore a, loc // slothy:no-unfold
+        ldr \a, [sp, #\loc\()]  // reload a register from the scratch slot at sp + loc
+.endm
+.macro save loc, a // slothy:no-unfold
+        str \a, 
[sp, #\loc\()]
+.endm
+.macro push_stack // slothy:no-unfold
+        save_gprs  // 16*6 bytes: x19-x29
+        save_vregs  // 16*4 bytes: d8-d15
+        sub sp, sp, #STACK_SIZE  // scratch area addressed by the save / restore macros
+.endm
+
+.macro pop_stack // slothy:no-unfold
+        add sp, sp, #STACK_SIZE  // exact reverse of push_stack
+        restore_vregs
+        restore_gprs
+.endm
+
+.data
+.p2align 4
+roots:  // twiddle table; the entry code loads roots_l34 and roots_l56, presumably labels inside the included file - confirm
+#include "intt_kyber_123_45_67_twiddles.s"
+.text
+
+        .global intt_kyber_123_4567_manual_ld4_opt_a72
+        .global _intt_kyber_123_4567_manual_ld4_opt_a72
+
+.p2align 4
+const_addr: .short 3329  // lane 0: multiplier of every vmlsq ... consts, 0 (3329 is the Kyber modulus q)
+        .short 20159  // lane 1: multiplier of barrett_reduce; equals round(2^26 / 3329)
+        .short 0  // lanes 2-7 are zero padding up to one 128-bit vector
+        .short 0
+        .short 0
+        .short 0
+        .short 0
+        .short 0
+ninv_addr: .short 512  // 512 in all eight lanes; NOTE(review): 512 = 2^16 * 128^-1 mod 3329, presumably the scaled inverse of n - confirm
+        .short 512
+        .short 512
+        .short 512
+        .short 512
+        .short 512
+        .short 512
+        .short 512
+ninv_tw_addr: .short 5040  // 5040 in all eight lanes; NOTE(review): matches round(512 * 2^15 / 3329), presumably the sqrdmulh companion of ninv - confirm
+        .short 5040
+        .short 5040
+        .short 5040
+        .short 5040
+        .short 5040
+        .short 5040
+        .short 5040
+
+intt_kyber_123_4567_manual_ld4_opt_a72:
+_intt_kyber_123_4567_manual_ld4_opt_a72:
+        push_stack
+
+        in .req x0
+        inp .req x1
+        count .req x2
+        r_ptr0 .req x3
+        r_ptr1 .req x4
+        xtmp .req x5
+
+        qform_v0 .req q0
+        qform_v1 .req q1
+        qform_v2 .req q2
+        qform_v3 .req q3
+        qform_v4 .req q4
+        qform_v5 .req q5
+        qform_v6 .req q6
+        qform_v7 .req q7
+        qform_v8 .req q8
+        qform_v9 .req q9
+        qform_v10 .req q10
+        qform_v11 .req q11
+        qform_v12 .req q12
+        qform_v13 .req q13
+        qform_v14 .req q14
+        qform_v15 .req q15
+        qform_v16 .req q16
+        qform_v17 .req q17
+        qform_v18 .req q18
+        qform_v19 .req q19
+        qform_v20 .req q20
+        qform_v21 .req q21
+        qform_v22 .req q22
+        qform_v23 .req q23
+        qform_v24 .req q24
+        qform_v25 .req q25
+        qform_v26 .req q26
+        qform_v27 .req q27
+        qform_v28 .req q28
+        qform_v29 .req q29
+        qform_v30 .req q30
+        qform_v31 .req q31
+
+        data0 .req v8
+        data1 .req v9
+        data2 .req v10
+        data3 .req v11
+        data4 .req v12
+        data5 .req v13
+        data6 .req v14
+        data7 .req v15
+
+        x_00 .req x10
+        x_01 .req x11
+        x_10 .req x12
+        x_11 .req x13
+        x_20 .req x14
+        x_21 .req x15
+        x_30 .req x16
+        x_31 .req x17
+
+        xt_00 .req x_00
+        xt_01 .req x_20
+        xt_10 .req x_10
+        xt_11 .req x_30
+        xt_20 .req x_01
+        xt_21 .req x_21
+        xt_30 .req 
x_11 + xt_31 .req x_31 + + qform_data0 .req q8 + qform_data1 .req q9 + qform_data2 .req q10 + qform_data3 .req q11 + qform_data4 .req q12 + qform_data5 .req q13 + qform_data6 .req q14 + qform_data7 .req q15 + + root0 .req v0 + root1 .req v1 + root2 .req v2 + root0_tw .req v4 + root1_tw .req v5 + root2_tw .req v6 + + consts .req v7 + qform_consts .req q7 + + qform_root0 .req q0 + qform_root1 .req q1 + qform_root2 .req q2 + qform_root0_tw .req q4 + qform_root1_tw .req q5 + qform_root2_tw .req q6 + + tmp .req v24 + t0 .req v25 + t1 .req v26 + t2 .req v27 + t3 .req v28 + + ASM_LOAD(r_ptr0, roots_l34) + ASM_LOAD(r_ptr1, roots_l56) + + ASM_LOAD(xtmp, const_addr) + ld1 {consts.8h}, [xtmp] + + save STACK0, in + + mov inp, in + mov count, #8 + + .p2align 2 + ld4 {v25.4S, v26.4S, v27.4S, v28.4S}, [x1] // *.................................................... + ldr q10, [x4, #64] // .*................................................... + // gap // ..................................................... + ldr q6, [x4], #(6*16) // ...*................................................. + ldr q22, [x4, #-80] // ....*................................................ + // gap // ..................................................... + ldr q15, [x4, #-16] // .........*........................................... + // gap // ..................................................... + // gap // ..................................................... + ldr q4, [x3], #16 // ...............................*..................... + // gap // ..................................................... + // gap // ..................................................... + add v13.8H, v25.8H, v26.8H // ........*............................................ + ldr q1, [x4, #-48] // ..*.................................................. + sub v12.8H, v27.8H, v28.8H // ......*.............................................. + sub v8.8H, v25.8H, v26.8H // .....*............................................... 
+ // gap // ..................................................... + // gap // ..................................................... + add v25.8H, v27.8H, v28.8H // ..........*.......................................... + // gap // ..................................................... + // gap // ..................................................... + sqrdmulh v21.8H, v12.8H, v15.8H // ...............*..................................... + ldr q15, [x4, #-64] // .......*............................................. + // gap // ..................................................... + // gap // ..................................................... + // gap // ..................................................... + // gap // ..................................................... + sqrdmulh v24.8H, v8.8H, v1.8H // ...........*......................................... + sub v18.8H, v13.8H, v25.8H // ..............*...................................... + // gap // ..................................................... + // gap // ..................................................... + // gap // ..................................................... + // gap // ..................................................... + mul v11.8H, v12.8H, v10.8H // .................*................................... + // gap // ..................................................... + // gap // ..................................................... + // gap // ..................................................... + // gap // ..................................................... + // gap // ..................................................... + mul v0.8H, v8.8H, v15.8H // ............*........................................ + // gap // ..................................................... + // gap // ..................................................... + // gap // ..................................................... + // gap // ..................................................... 
+ // gap // ..................................................... + mls v0.8H, v24.8H, v7.H[0] // ................*.................................... + // gap // ..................................................... + // gap // ..................................................... + // gap // ..................................................... + // gap // ..................................................... + // gap // ..................................................... + mls v11.8H, v21.8H, v7.H[0] // ..................*.................................. + // gap // ..................................................... + // gap // ..................................................... + // gap // ..................................................... + // gap // ..................................................... + // gap // ..................................................... + sqrdmulh v17.8H, v18.8H, v22.8H // ...................*................................. + add v15.8H, v13.8H, v25.8H // .............*....................................... + // gap // ..................................................... + // gap // ..................................................... + // gap // ..................................................... + // gap // ..................................................... + mul v5.8H, v18.8H, v6.8H // ....................*................................ + // gap // ..................................................... + // gap // ..................................................... + sub v29.8H, v0.8H, v11.8H // .....................*............................... + // gap // ..................................................... + // gap // ..................................................... + // gap // ..................................................... + // gap // ..................................................... + // gap // ..................................................... 
+ // gap // ..................................................... + // gap // ..................................................... + // gap // ..................................................... + sqrdmulh v18.8H, v29.8H, v22.8H // ........................*............................ + // gap // ..................................................... + // gap // ..................................................... + // gap // ..................................................... + // gap // ..................................................... + // gap // ..................................................... + mul v29.8H, v29.8H, v6.8H // ...........................*......................... + // gap // ..................................................... + // gap // ..................................................... + // gap // ..................................................... + // gap // ..................................................... + // gap // ..................................................... + mls v5.8H, v17.8H, v7.H[0] // .......................*............................. + // gap // ..................................................... + // gap // ..................................................... + // gap // ..................................................... + // gap // ..................................................... + // gap // ..................................................... + mls v29.8H, v18.8H, v7.H[0] // ............................*........................ + // gap // ..................................................... + // gap // ..................................................... + add v19.8H, v0.8H, v11.8H // ......................*.............................. + // gap // ..................................................... + // gap // ..................................................... + // gap // ..................................................... 
+ // gap // ..................................................... + // gap // ..................................................... + // gap // ..................................................... + // gap // ..................................................... + // gap // ..................................................... + trn2 v20.4S, v15.4S, v19.4S // ..........................*.......................... + trn1 v25.4S, v15.4S, v19.4S // .........................*........................... + // gap // ..................................................... + trn2 v17.4S, v5.4S, v29.4S // ..............................*...................... + trn1 v15.4S, v5.4S, v29.4S // .............................*....................... + // gap // ..................................................... + // gap // ..................................................... + // gap // ..................................................... + // gap // ..................................................... + // gap // ..................................................... + // gap // ..................................................... + // gap // ..................................................... + trn1 v31.2D, v20.2D, v17.2D // ...................................*................. + trn1 v30.2D, v25.2D, v15.2D // ..................................*.................. + // gap // ..................................................... + trn2 v17.2D, v20.2D, v17.2D // .................................*................... + trn2 v12.2D, v25.2D, v15.2D // ................................*.................... + // gap // ..................................................... + // gap // ..................................................... + // gap // ..................................................... + // gap // ..................................................... + add v20.8H, v30.8H, v31.8H // .....................................*............... 
+ sub v27.8H, v30.8H, v31.8H // ......................................*.............. + // gap // ..................................................... + add v21.8H, v12.8H, v17.8H // ....................................*................ + // gap // ..................................................... + // gap // ..................................................... + // gap // ..................................................... + // gap // ..................................................... + // gap // ..................................................... + sqdmulh v18.8H, v20.8H, v7.H[1] // ........................................*............ + // gap // ..................................................... + // gap // ..................................................... + // gap // ..................................................... + // gap // ..................................................... + // gap // ..................................................... + sqdmulh v2.8H, v21.8H, v7.H[1] // .......................................*............. + // gap // ..................................................... + // gap // ..................................................... + // gap // ..................................................... + // gap // ..................................................... + // gap // ..................................................... + sqrdmulh v13.8H, v27.8H, v4.H[3] // ............................................*........ + // gap // ..................................................... + // gap // ..................................................... + srshr v15.8H, v18.8H, #11 // .............................................*....... + // gap // ..................................................... + // gap // ..................................................... + // gap // ..................................................... + // gap // ..................................................... 
+ // gap // ..................................................... + srshr v22.8H, v2.8H, #11 // ...........................................*......... + // gap // ..................................................... + // gap // ..................................................... + mls v20.8H, v15.8H, v7.H[0] // ...............................................*..... + // gap // ..................................................... + // gap // ..................................................... + sub v23.8H, v12.8H, v17.8H // .........................................*........... + // gap // ..................................................... + // gap // ..................................................... + mls v21.8H, v22.8H, v7.H[0] // ..............................................*...... + // gap // ..................................................... + // gap // ..................................................... + // gap // ..................................................... + // gap // ..................................................... + // gap // ..................................................... + mul v15.8H, v23.8H, v4.H[4] // ................................................*.... + // gap // ..................................................... + // gap // ..................................................... + // gap // ..................................................... + // gap // ..................................................... + // gap // ..................................................... + // gap // ..................................................... + mul v18.8H, v27.8H, v4.H[2] // ..........................................*.......... + // gap // ..................................................... + add v30.8H, v20.8H, v21.8H // ..................................................*.. + // gap // ..................................................... + // gap // ..................................................... 
+ sqrdmulh v0.8H, v23.8H, v4.H[5] // .................................................*... + // gap // ..................................................... + // gap // ..................................................... + // gap // ..................................................... + // gap // ..................................................... + // gap // ..................................................... + mls v18.8H, v13.8H, v7.H[0] // ...................................................*. + str q30, [x1], #(64) // ....................................................* + // gap // ..................................................... + + // original source code + // ld4 {v9.4S, v10.4S, v11.4S, v12.4S}, [x1] // *.................................................... + // ldr q3, [x4, #64] // .*................................................... + // ldr q1, [x4, #48] // .......*............................................. + // ldr q29, [x4], #(6*16) // ..*.................................................. + // ldr q19, [x4, #-80] // ...*................................................. + // sub v5.8H, v9.8H, v10.8H // .........*........................................... + // sub v27.8H, v11.8H, v12.8H // ........*............................................ + // ldr q23, [x4, #-64] // ............*........................................ + // add v25.8H, v9.8H, v10.8H // ......*.............................................. + // ldr q24, [x4, #-16] // ....*................................................ + // add v0.8H, v11.8H, v12.8H // ..........*.......................................... + // sqrdmulh v17.8H, v5.8H, v1.8H // .............*....................................... + // mul v14.8H, v5.8H, v23.8H // ................*.................................... + // add v22.8H, v25.8H, v0.8H // ....................*................................ + // sub v30.8H, v25.8H, v0.8H // ..............*...................................... 
+ // sqrdmulh v9.8H, v27.8H, v24.8H // ...........*......................................... + // mls v14.8H, v17.8H, v7.H[0] // .................*................................... + // mul v0.8H, v27.8H, v3.8H // ...............*..................................... + // mls v0.8H, v9.8H, v7.H[0] // ..................*.................................. + // sqrdmulh v31.8H, v30.8H, v19.8H // ...................*................................. + // mul v3.8H, v30.8H, v29.8H // .....................*............................... + // sub v8.8H, v14.8H, v0.8H // ......................*.............................. + // add v9.8H, v14.8H, v0.8H // ...........................*......................... + // mls v3.8H, v31.8H, v7.H[0] // .........................*........................... + // sqrdmulh v0.8H, v8.8H, v19.8H // .......................*............................. + // trn1 v11.4S, v22.4S, v9.4S // .............................*....................... + // trn2 v27.4S, v22.4S, v9.4S // ............................*........................ + // mul v28.8H, v8.8H, v29.8H // ........................*............................ + // mls v28.8H, v0.8H, v7.H[0] // ..........................*.......................... + // trn1 v21.4S, v3.4S, v28.4S // ...............................*..................... + // trn2 v15.4S, v3.4S, v28.4S // ..............................*...................... + // ldr q4, [x3], #16 // .....*............................................... + // trn2 v26.2D, v11.2D, v21.2D // ...................................*................. + // trn2 v25.2D, v27.2D, v15.2D // ..................................*.................. + // trn1 v30.2D, v11.2D, v21.2D // .................................*................... + // trn1 v17.2D, v27.2D, v15.2D // ................................*.................... + // add v21.8H, v26.8H, v25.8H // ......................................*.............. 
+ // add v20.8H, v30.8H, v17.8H // ....................................*................ + // sub v16.8H, v30.8H, v17.8H // .....................................*............... + // sqdmulh v27.8H, v21.8H, v7.H[1] // ........................................*............ + // sqdmulh v23.8H, v20.8H, v7.H[1] // .......................................*............. + // sub v26.8H, v26.8H, v25.8H // .............................................*....... + // mul v18.8H, v16.8H, v4.H[2] // ................................................*.... + // srshr v2.8H, v27.8H, #11 // ...........................................*......... + // sqrdmulh v14.8H, v16.8H, v4.H[3] // .........................................*........... + // srshr v16.8H, v23.8H, #11 // ..........................................*.......... + // mls v21.8H, v2.8H, v7.H[0] // ..............................................*...... + // mls v20.8H, v16.8H, v7.H[0] // ............................................*........ + // mul v15.8H, v26.8H, v4.H[4] // ...............................................*..... + // sqrdmulh v0.8H, v26.8H, v4.H[5] // ..................................................*.. + // add v2.8H, v20.8H, v21.8H // .................................................*... + // mls v18.8H, v14.8H, v7.H[0] // ...................................................*. + // str q2, [x1], #(64) // ....................................................* + + sub count, count, #1 +layer4567_start: + ld4 {v9.4S, v10.4S, v11.4S, v12.4S}, [x1] // e................................................................. + sub v2.8H, v20.8H, v21.8H // ....................................................*............. + // gap // .................................................................. + ldr q3, [x4, #64] // .....e............................................................ + mls v15.8H, v0.8H, v7.H[0] // .............................................*.................... 
+ // gap // .................................................................. + // gap // .................................................................. + ldr q1, [x4, #48] // ....e............................................................. + // gap // .................................................................. + ldr q29, [x4], #(6*16) // .e................................................................ + sqrdmulh v16.8H, v2.8H, v4.H[1] // .......................................................*.......... + // gap // .................................................................. + ldr q19, [x4, #-80] // ..e............................................................... + sub v5.8H, v9.8H, v10.8H // .......e.......................................................... + // gap // .................................................................. + sub v27.8H, v11.8H, v12.8H // ............e..................................................... + ldr q23, [x4, #-64] // ...e.............................................................. + mul v2.8H, v2.8H, v4.H[0] // ......................................................*........... + add v25.8H, v9.8H, v10.8H // ........e......................................................... + ldr q24, [x4, #-16] // ......e........................................................... + // gap // .................................................................. + add v0.8H, v11.8H, v12.8H // .............e.................................................... + sqrdmulh v17.8H, v5.8H, v1.8H // ..........e....................................................... + // gap // .................................................................. + // gap // .................................................................. + // gap // .................................................................. + // gap // .................................................................. 
+ mul v14.8H, v5.8H, v23.8H // .........e........................................................ + // gap // .................................................................. + // gap // .................................................................. + add v22.8H, v25.8H, v0.8H // ..................e............................................... + // gap // .................................................................. + // gap // .................................................................. + sub v30.8H, v25.8H, v0.8H // .................e................................................ + sqrdmulh v9.8H, v27.8H, v24.8H // ...............e.................................................. + // gap // .................................................................. + sub v23.8H, v18.8H, v15.8H // .........................................................*........ + // gap // .................................................................. + // gap // .................................................................. + mls v14.8H, v17.8H, v7.H[0] // ...........e...................................................... + // gap // .................................................................. + // gap // .................................................................. + // gap // .................................................................. + // gap // .................................................................. + // gap // .................................................................. + mul v0.8H, v27.8H, v3.8H // ..............e................................................... + // gap // .................................................................. + // gap // .................................................................. + // gap // .................................................................. + // gap // .................................................................. 
+ // gap // .................................................................. + mls v0.8H, v9.8H, v7.H[0] // ................e................................................. + // gap // .................................................................. + // gap // .................................................................. + // gap // .................................................................. + // gap // .................................................................. + // gap // .................................................................. + sqrdmulh v31.8H, v30.8H, v19.8H // ....................e............................................. + // gap // .................................................................. + // gap // .................................................................. + // gap // .................................................................. + // gap // .................................................................. + // gap // .................................................................. + mul v3.8H, v30.8H, v29.8H // ...................e.............................................. + // gap // .................................................................. + // gap // .................................................................. + sub v8.8H, v14.8H, v0.8H // ......................e........................................... + // gap // .................................................................. + // gap // .................................................................. + add v9.8H, v14.8H, v0.8H // .......................e.......................................... + // gap // .................................................................. + // gap // .................................................................. + mls v3.8H, v31.8H, v7.H[0] // .....................e............................................ 
+ // gap // .................................................................. + // gap // .................................................................. + // gap // .................................................................. + // gap // .................................................................. + // gap // .................................................................. + sqrdmulh v0.8H, v8.8H, v19.8H // .........................e........................................ + trn1 v11.4S, v22.4S, v9.4S // ...........................e...................................... + // gap // .................................................................. + trn2 v27.4S, v22.4S, v9.4S // ............................e..................................... + // gap // .................................................................. + // gap // .................................................................. + mul v28.8H, v8.8H, v29.8H // ........................e......................................... + // gap // .................................................................. + // gap // .................................................................. + // gap // .................................................................. + // gap // .................................................................. + // gap // .................................................................. + // gap // .................................................................. + // gap // .................................................................. + // gap // .................................................................. + mls v28.8H, v0.8H, v7.H[0] // ..........................e....................................... + add v0.8H, v18.8H, v15.8H // ..........................................................*....... + // gap // .................................................................. 
+ // gap // .................................................................. + // gap // .................................................................. + // gap // .................................................................. + sqrdmulh v10.8H, v23.8H, v4.H[1] // ............................................................*..... + // gap // .................................................................. + // gap // .................................................................. + str q0, [x1, #-48] // ...............................................................*.. + // gap // .................................................................. + // gap // .................................................................. + // gap // .................................................................. + // gap // .................................................................. + // gap // .................................................................. + trn1 v21.4S, v3.4S, v28.4S // .............................e.................................... + trn2 v15.4S, v3.4S, v28.4S // ..............................e................................... + // gap // .................................................................. + mul v0.8H, v23.8H, v4.H[0] // ...........................................................*...... + ldr q4, [x3], #16 // ...................................e.............................. + // gap // .................................................................. + // gap // .................................................................. + // gap // .................................................................. + // gap // .................................................................. + trn2 v26.2D, v11.2D, v21.2D // ...............................e.................................. + trn2 v25.2D, v27.2D, v15.2D // ................................e................................. 
+ // gap // .................................................................. + trn1 v30.2D, v11.2D, v21.2D // .................................e................................ + mls v0.8H, v10.8H, v7.H[0] // .............................................................*.... + // gap // .................................................................. + trn1 v17.2D, v27.2D, v15.2D // ..................................e............................... + // gap // .................................................................. + // gap // .................................................................. + mls v2.8H, v16.8H, v7.H[0] // ........................................................*......... + add v21.8H, v26.8H, v25.8H // ..........................................e....................... + // gap // .................................................................. + // gap // .................................................................. + // gap // .................................................................. + // gap // .................................................................. + add v20.8H, v30.8H, v17.8H // .....................................e............................ + // gap // .................................................................. + // gap // .................................................................. + sub v16.8H, v30.8H, v17.8H // ....................................e............................. + sqdmulh v27.8H, v21.8H, v7.H[1] // .................................................e................ + str q0, [x1, #-16] // .................................................................* + // gap // .................................................................. + // gap // .................................................................. + // gap // .................................................................. + // gap // .................................................................. 
+ sqdmulh v23.8H, v20.8H, v7.H[1] // ..............................................e................... + sub v26.8H, v26.8H, v25.8H // .........................................e........................ + str q2, [x1, #-32] // ................................................................*. + // gap // .................................................................. + // gap // .................................................................. + mul v18.8H, v16.8H, v4.H[2] // ......................................e........................... + // gap // .................................................................. + // gap // .................................................................. + srshr v2.8H, v27.8H, #11 // ..................................................e............... + // gap // .................................................................. + // gap // .................................................................. + sqrdmulh v14.8H, v16.8H, v4.H[3] // .......................................e.......................... + // gap // .................................................................. + // gap // .................................................................. + srshr v16.8H, v23.8H, #11 // ...............................................e.................. + // gap // .................................................................. + // gap // .................................................................. + mls v21.8H, v2.8H, v7.H[0] // ...................................................e.............. + // gap // .................................................................. + // gap // .................................................................. + // gap // .................................................................. + // gap // .................................................................. + // gap // .................................................................. 
+ mls v20.8H, v16.8H, v7.H[0] // ................................................e................. + // gap // .................................................................. + // gap // .................................................................. + // gap // .................................................................. + // gap // .................................................................. + // gap // .................................................................. + mul v15.8H, v26.8H, v4.H[4] // ...........................................e...................... + // gap // .................................................................. + // gap // .................................................................. + // gap // .................................................................. + // gap // .................................................................. + // gap // .................................................................. + // gap // .................................................................. + // gap // .................................................................. + // gap // .................................................................. + // gap // .................................................................. + sqrdmulh v0.8H, v26.8H, v4.H[5] // ............................................e..................... + add v2.8H, v20.8H, v21.8H // .....................................................e............ + // gap // .................................................................. + // gap // .................................................................. + // gap // .................................................................. + mls v18.8H, v14.8H, v7.H[0] // ........................................e......................... + // gap // .................................................................. 
+ // gap // .................................................................. + str q2, [x1], #(64) // ..............................................................e... + // gap // .................................................................. + // gap // .................................................................. + + // original source code + // ld4 {v8.4S, v9.4S, v10.4S, v11.4S}, [x1] // e.................................................................e...................................................... + // ldr q0, [x4], #(6*16) // .....e............................................................|....e................................................. + // ldr q4, [x4, #(-6*16 + 1*16)] // .......e..........................................................|......e............................................... + // ldr q1, [x4, #(-6*16 + 2*16)] // ..........e.......................................................|.........e............................................ + // ldr q5, [x4, #(-6*16 + 3*16)] // ....e.............................................................|...e.................................................. + // ldr q2, [x4, #(-6*16 + 4*16)] // ..e...............................................................|.e.................................................... + // ldr q6, [x4, #(-6*16 + 5*16)] // .............e....................................................|............e......................................... + // sub v24.8h, v8.8h, v9.8h // ........e.........................................................|.......e.............................................. + // add v8.8h, v8.8h, v9.8h // ............e.....................................................|...........e.......................................... + // mul v9.8h, v24.8h, v1.8h // ................e.................................................|...............e...................................... 
+ // sqrdmulh v24.8h, v24.8h, v5.8h // ...............e..................................................|..............e....................................... + // mls v9.8h, v24.8h, v7.h[0] // .....................e............................................|....................e................................. + // sub v24.8h, v10.8h, v11.8h // .........e........................................................|........e............................................. + // add v10.8h, v10.8h, v11.8h // ..............e...................................................|.............e........................................ + // mul v11.8h, v24.8h, v2.8h // ......................e...........................................|.....................e................................ + // sqrdmulh v24.8h, v24.8h, v6.8h // ...................e..............................................|..................e................................... + // mls v11.8h, v24.8h, v7.h[0] // .......................e..........................................|......................e............................... + // sub v24.8h, v8.8h, v10.8h // ..................e...............................................|.................e.................................... + // add v8.8h, v8.8h, v10.8h // .................e................................................|................e..................................... + // mul v10.8h, v24.8h, v0.8h // .........................e........................................|........................e............................. + // sqrdmulh v24.8h, v24.8h, v4.8h // ........................e.........................................|.......................e.............................. + // mls v10.8h, v24.8h, v7.h[0] // ............................e.....................................|...........................e.......................... 
+ // sub v24.8h, v9.8h, v11.8h // ..........................e.......................................|.........................e............................ + // add v9.8h, v9.8h, v11.8h // ...........................e......................................|..........................e........................... + // mul v11.8h, v24.8h, v0.8h // ................................e.................................|...............................e...................... + // sqrdmulh v24.8h, v24.8h, v4.8h // .............................e....................................|............................e......................... + // mls v11.8h, v24.8h, v7.h[0] // .................................e................................|................................e..................... + // trn1 v25.4s, v8.4s, v9.4s // ..............................e...................................|.............................e........................ + // trn2 v26.4s, v8.4s, v9.4s // ...............................e..................................|..............................e....................... + // trn1 v27.4s, v10.4s, v11.4s // .....................................e............................|....................................e................. + // trn2 v28.4s, v10.4s, v11.4s // ......................................e...........................|.....................................e................ + // trn2 v10.2d, v25.2d, v27.2d // .........................................e........................|........................................e............. + // trn2 v11.2d, v26.2d, v28.2d // ..........................................e.......................|.........................................e............ + // trn1 v8.2d, v25.2d, v27.2d // ...........................................e......................|..........................................e........... 
+ // trn1 v9.2d, v26.2d, v28.2d // .............................................e....................|............................................e......... + // ldr q0, [x3], #16 // ........................................e.........................|.......................................e.............. + // sub v24.8h, v8.8h, v9.8h // .................................................e................|................................................e..... + // add v8.8h, v8.8h, v9.8h // ................................................e.................|...............................................e...... + // mul v9.8h, v24.8h, v0.h[2] // .......................................................e..........|...................................................... + // sqrdmulh v24.8h, v24.8h, v0.h[3] // .........................................................e........|...................................................... + // mls v9.8h, v24.8h, v7.h[0] // ................................................................e.|...................................................... + // sub v24.8h, v10.8h, v11.8h // .....................................................e............|....................................................e. + // add v10.8h, v10.8h, v11.8h // ...............................................e..................|..............................................e....... + // mul v11.8h, v24.8h, v0.h[4] // .............................................................e....|...................................................... + // sqrdmulh v24.8h, v24.8h, v0.h[5] // ..............................................................e...|...................................................... + // mls v11.8h, v24.8h, v7.h[0] // ...*..............................................................|..*................................................... 
+ // sqdmulh v25.8h, v8.8h, v7.h[1] // ....................................................e.............|...................................................e.. + // srshr v25.8h, v25.8h, #11 // ..........................................................e.......|...................................................... + // mls v8.8h, v25.8h, v7.h[0] // ............................................................e.....|...................................................... + // sqdmulh v25.8h, v10.8h, v7.h[1] // ..................................................e...............|.................................................e.... + // srshr v25.8h, v25.8h, #11 // ........................................................e.........|...................................................... + // mls v10.8h, v25.8h, v7.h[0] // ...........................................................e......|...................................................... + // sub v24.8h, v8.8h, v10.8h // .*................................................................|*..................................................... + // add v8.8h, v8.8h, v10.8h // ...............................................................e..|...................................................... + // mul v10.8h, v24.8h, v0.h[0] // ...........*......................................................|..........*........................................... + // sqrdmulh v24.8h, v24.8h, v0.h[1] // ......*...........................................................|.....*................................................ + // mls v10.8h, v24.8h, v7.h[0] // ..............................................*...................|.............................................*........ + // sub v24.8h, v9.8h, v11.8h // ....................*.............................................|...................*.................................. 
+ // add v9.8h, v9.8h, v11.8h // ..................................*...............................|.................................*.................... + // mul v11.8h, v24.8h, v0.h[0] // .......................................*..........................|......................................*............... + // sqrdmulh v24.8h, v24.8h, v0.h[1] // ...................................*..............................|..................................*................... + // mls v11.8h, v24.8h, v7.h[0] // ............................................*.....................|...........................................*.......... + // str q8, [x1], #(64) // .................................................................e|...................................................... + // str q9, [x1, #(-64 + 16*1)] // ....................................*.............................|...................................*.................. + // str q10, [x1, #(-64 + 16*2)] // ......................................................*...........|.....................................................* + // str q11, [x1, #(-64 + 16*3)] // ...................................................*..............|..................................................*... + + sub count, count, #1 + cbnz count, layer4567_start + // gap // ............. + // gap // ............. + mls v15.8H, v0.8H, v7.H[0] // .*........... + sub v25.8H, v20.8H, v21.8H // *............ + // gap // ............. + // gap // ............. + // gap // ............. + // gap // ............. + // gap // ............. + // gap // ............. + // gap // ............. + // gap // ............. + sqrdmulh v17.8H, v25.8H, v4.H[1] // ..*.......... + // gap // ............. + // gap // ............. + sub v26.8H, v18.8H, v15.8H // ....*........ + // gap // ............. + // gap // ............. + add v19.8H, v18.8H, v15.8H // .....*....... + mul v23.8H, v25.8H, v4.H[0] // ...*......... + // gap // ............. 
+ // gap // ............. + // gap // ............. + // gap // ............. + sqrdmulh v8.8H, v26.8H, v4.H[1] // ......*...... + // gap // ............. + // gap // ............. + str q19, [x1, #-48] // .......*..... + // gap // ............. + // gap // ............. + // gap // ............. + mul v4.8H, v26.8H, v4.H[0] // ........*.... + // gap // ............. + // gap // ............. + // gap // ............. + // gap // ............. + mls v23.8H, v17.8H, v7.H[0] // ..........*.. + // gap // ............. + // gap // ............. + // gap // ............. + // gap // ............. + // gap // ............. + mls v4.8H, v8.8H, v7.H[0] // .........*... + // gap // ............. + // gap // ............. + // gap // ............. + // gap // ............. + // gap // ............. + // gap // ............. + // gap // ............. + // gap // ............. + str q23, [x1, #-32] // ............* + // gap // ............. + // gap // ............. + // gap // ............. + // gap // ............. + // gap // ............. + str q4, [x1, #-16] // ...........*. + // gap // ............. + // gap // ............. + + // original source code + // sub v2.8H, v20.8H, v21.8H // .*........... + // mls v15.8H, v0.8H, v7.H[0] // *............ + // sqrdmulh v16.8H, v2.8H, v4.H[1] // ..*.......... + // mul v2.8H, v2.8H, v4.H[0] // .....*....... + // sub v23.8H, v18.8H, v15.8H // ...*......... + // add v0.8H, v18.8H, v15.8H // ....*........ + // sqrdmulh v10.8H, v23.8H, v4.H[1] // ......*...... + // str q0, [x1, #-48] // .......*..... + // mul v0.8H, v23.8H, v4.H[0] // ........*.... + // mls v0.8H, v10.8H, v7.H[0] // ..........*.. + // mls v2.8H, v16.8H, v7.H[0] // .........*... + // str q0, [x1, #-16] // ............* + // str q2, [x1, #-32] // ...........*. 
+ + + // --------------------------------------------------------------------- + + ninv .req v29 + ninv_tw .req v30 + + ASM_LOAD(xtmp, ninv_addr) + ld1r {ninv.8h}, [xtmp] + ASM_LOAD(xtmp, ninv_tw_addr) + ld1r {ninv_tw.8h}, [xtmp] + + mov count, #4 + ASM_LOAD(r_ptr0, roots_l012) + load_roots_123 + + .p2align 2 + + ldr q25, [x0, #192] // ...*........... + ldr q17, [x0, #128] // ....*.......... + // gap // ............... + ldr q3, [x0, #448] // *.............. + // gap // ............... + // gap // ............... + ldr q6, [x0, #384] // .*............. + // gap // ............... + // gap // ............... + // gap // ............... + // gap // ............... + // gap // ............... + sub v21.8H, v17.8H, v25.8H // .....*......... + // gap // ............... + // gap // ............... + ldr q22, [x0, #320] // ..*............ + // gap // ............... + // gap // ............... + ldr q4, [x0, #64] // ......*........ + sub v2.8H, v6.8H, v3.8H // .......*....... + // gap // ............... + sqrdmulh v23.8H, v21.8H, v1.H[1] // .........*..... + ldr q16, [x0, #0] // ........*...... + // gap // ............... + // gap // ............... + // gap // ............... + // gap // ............... + mul v13.8H, v21.8H, v1.H[0] // ..........*.... + // gap // ............... + // gap // ............... + // gap // ............... + // gap // ............... + // gap // ............... + mul v15.8H, v2.8H, v1.H[4] // ............*.. + add v28.8H, v16.8H, v4.8H // ..............* + // gap // ............... + // gap // ............... + // gap // ............... + // gap // ............... + mls v13.8H, v23.8H, v7.H[0] // .............*. + ldr q23, [x0, #256] // ...........*... + // gap // ............... + + // original source code + // ldr q3, [x0, #448] // ..*............ + // ldr q6, [x0, #384] // ...*........... + // ldr q22, [x0, #320] // .....*......... + // ldr q25, [x0, #192] // *.............. + // ldr q17, [x0, #128] // .*............. 
+ // sub v31.8H, v17.8H, v25.8H // ....*.......... + // ldr q4, [x0, #64] // ......*........ + // sub v2.8H, v6.8H, v3.8H // .......*....... + // ldr q16, [x0, #0] // .........*..... + // sqrdmulh v26.8H, v31.8H, v1.H[1] // ........*...... + // mul v13.8H, v31.8H, v1.H[0] // ..........*.... + // ldr q23, [x0, #256] // ..............* + // mul v15.8H, v2.8H, v1.H[4] // ...........*... + // mls v13.8H, v26.8H, v7.H[0] // .............*. + // add v28.8H, v16.8H, v4.8H // ............*.. + + sub count, count, #1 +layer123_start: + sub v16.8H, v16.8H, v4.8H // ........*..................................................................................... + // gap // .............................................................................................. + // gap // .............................................................................................. + sub v4.8H, v23.8H, v22.8H // ..................*........................................................................... + // gap // .............................................................................................. + sqrdmulh v2.8H, v2.8H, v1.H[5] // ..........................*................................................................... + add v23.8H, v23.8H, v22.8H // ...................*.......................................................................... + // gap // .............................................................................................. + // gap // .............................................................................................. + mul v19.8H, v16.8H, v0.H[6] // ..........*................................................................................... + add v20.8H, v6.8H, v3.8H // ........................*..................................................................... + ldr q3, [x0, #464] // .......e...................................................................................... 
+ add v17.8H, v17.8H, v25.8H // ..............*............................................................................... + // gap // .............................................................................................. + // gap // .............................................................................................. + sqrdmulh v27.8H, v16.8H, v0.H[7] // ...........*.................................................................................. + ldr q6, [x0, #400] // ......e....................................................................................... + // gap // .............................................................................................. + sub v25.8H, v23.8H, v20.8H // ......................................*....................................................... + // gap // .............................................................................................. + // gap // .............................................................................................. + add v24.8H, v23.8H, v20.8H // .......................................*...................................................... + mul v16.8H, v4.8H, v1.H[2] // ....................*......................................................................... + // gap // .............................................................................................. + add v20.8H, v28.8H, v17.8H // .............................*................................................................ + // gap // .............................................................................................. + // gap // .............................................................................................. + sqrdmulh v21.8H, v4.8H, v1.H[3] // .....................*........................................................................ + // gap // .............................................................................................. 
+ // gap // .............................................................................................. + // gap // .............................................................................................. + // gap // .............................................................................................. + // gap // .............................................................................................. + mls v15.8H, v2.8H, v7.H[0] // ...........................*.................................................................. + sub v2.8H, v28.8H, v17.8H // ............................*................................................................. + // gap // .............................................................................................. + // gap // .............................................................................................. + // gap // .............................................................................................. + // gap // .............................................................................................. + mls v19.8H, v27.8H, v7.H[0] // ............*................................................................................. + // gap // .............................................................................................. + // gap // .............................................................................................. + // gap // .............................................................................................. + // gap // .............................................................................................. + // gap // .............................................................................................. + mls v16.8H, v21.8H, v7.H[0] // ......................*....................................................................... 
+ // gap // .............................................................................................. + // gap // .............................................................................................. + // gap // .............................................................................................. + // gap // .............................................................................................. + // gap // .............................................................................................. + mul v4.8H, v2.8H, v0.H[2] // ..............................*............................................................... + // gap // .............................................................................................. + // gap // .............................................................................................. + sub v26.8H, v19.8H, v13.8H // .................................*............................................................ + // gap // .............................................................................................. + // gap // .............................................................................................. + add v9.8H, v19.8H, v13.8H // ..................................*........................................................... + mul v18.8H, v25.8H, v0.H[4] // ........................................*..................................................... + // gap // .............................................................................................. + sub v10.8H, v16.8H, v15.8H // ...........................................*.................................................. + // gap // .............................................................................................. + // gap // .............................................................................................. 
+ sqrdmulh v31.8H, v25.8H, v0.H[5] // .........................................*.................................................... + add v16.8H, v16.8H, v15.8H // ............................................*................................................. + // gap // .............................................................................................. + // gap // .............................................................................................. + // gap // .............................................................................................. + // gap // .............................................................................................. + sqrdmulh v2.8H, v2.8H, v0.H[3] // ...............................*.............................................................. + // gap // .............................................................................................. + // gap // .............................................................................................. + sub v25.8H, v9.8H, v16.8H // ...........................................................*.................................. + // gap // .............................................................................................. + // gap // .............................................................................................. + add v16.8H, v9.8H, v16.8H // ............................................................*................................. + sqdmulh v21.8H, v20.8H, v7.H[1] // ................................................*............................................. + // gap // .............................................................................................. + // gap // .............................................................................................. + // gap // .............................................................................................. 
+ // gap // .............................................................................................. + sqdmulh v23.8H, v24.8H, v7.H[1] // ...................................................*.......................................... + // gap // .............................................................................................. + // gap // .............................................................................................. + // gap // .............................................................................................. + // gap // .............................................................................................. + // gap // .............................................................................................. + mls v4.8H, v2.8H, v7.H[0] // ................................*............................................................. + // gap // .............................................................................................. + // gap // .............................................................................................. + srshr v2.8H, v21.8H, #11 // .................................................*............................................ + // gap // .............................................................................................. + // gap // .............................................................................................. + mul v17.8H, v26.8H, v0.H[2] // ...................................*.......................................................... + // gap // .............................................................................................. + // gap // .............................................................................................. + srshr v23.8H, v23.8H, #11 // ....................................................*......................................... 
+ // gap // .............................................................................................. + // gap // .............................................................................................. + mls v20.8H, v2.8H, v7.H[0] // ..................................................*........................................... + // gap // .............................................................................................. + // gap // .............................................................................................. + // gap // .............................................................................................. + // gap // .............................................................................................. + // gap // .............................................................................................. + mls v24.8H, v23.8H, v7.H[0] // .....................................................*........................................ + // gap // .............................................................................................. + // gap // .............................................................................................. + // gap // .............................................................................................. + // gap // .............................................................................................. + // gap // .............................................................................................. + sqrdmulh v22.8H, v26.8H, v0.H[3] // ....................................*......................................................... + // gap // .............................................................................................. + // gap // .............................................................................................. + // gap // .............................................................................................. 
+ // gap // .............................................................................................. + // gap // .............................................................................................. + mls v18.8H, v31.8H, v7.H[0] // ..........................................*................................................... + // gap // .............................................................................................. + // gap // .............................................................................................. + sub v2.8H, v20.8H, v24.8H // ......................................................*....................................... + // gap // .............................................................................................. + // gap // .............................................................................................. + add v5.8H, v20.8H, v24.8H // .......................................................*...................................... + mul v21.8H, v10.8H, v0.H[4] // .............................................*................................................ + // gap // .............................................................................................. + // gap // .............................................................................................. + // gap // .............................................................................................. + // gap // .............................................................................................. + mls v17.8H, v22.8H, v7.H[0] // .....................................*........................................................ + ldr q22, [x0, #336] // .....e........................................................................................ + // gap // .............................................................................................. 
+ sub v20.8H, v4.8H, v18.8H // ................................................................*............................. + // gap // .............................................................................................. + // gap // .............................................................................................. + sqrdmulh v31.8H, v10.8H, v0.H[5] // ..............................................*............................................... + add v26.8H, v4.8H, v18.8H // .................................................................*............................ + // gap // .............................................................................................. + // gap // .............................................................................................. + // gap // .............................................................................................. + // gap // .............................................................................................. + mul v11.8H, v2.8H, v0.H[0] // ........................................................*..................................... + // gap // .............................................................................................. + // gap // .............................................................................................. + // gap // .............................................................................................. + // gap // .............................................................................................. + // gap // .............................................................................................. + sqrdmulh v2.8H, v2.8H, v0.H[1] // .........................................................*.................................... + // gap // .............................................................................................. 
+ // gap // .............................................................................................. + // gap // .............................................................................................. + // gap // .............................................................................................. + // gap // .............................................................................................. + mls v21.8H, v31.8H, v7.H[0] // ...............................................*.............................................. + // gap // .............................................................................................. + // gap // .............................................................................................. + // gap // .............................................................................................. + // gap // .............................................................................................. + // gap // .............................................................................................. + sqrdmulh v4.8H, v25.8H, v0.H[1] // ..............................................................*............................... + // gap // .............................................................................................. + // gap // .............................................................................................. + // gap // .............................................................................................. + // gap // .............................................................................................. + // gap // .............................................................................................. + mul v25.8H, v25.8H, v0.H[0] // .............................................................*................................ + // gap // .............................................................................................. 
+ // gap // .............................................................................................. + // gap // .............................................................................................. + // gap // .............................................................................................. + // gap // .............................................................................................. + mls v11.8H, v2.8H, v7.H[0] // ..........................................................*................................... + sub v2.8H, v17.8H, v21.8H // .....................................................................*........................ + // gap // .............................................................................................. + add v21.8H, v17.8H, v21.8H // ......................................................................*....................... + // gap // .............................................................................................. + // gap // .............................................................................................. + mls v25.8H, v4.8H, v7.H[0] // ...............................................................*.............................. + // gap // .............................................................................................. + // gap // .............................................................................................. + // gap // .............................................................................................. + // gap // .............................................................................................. + // gap // .............................................................................................. + mul v17.8H, v20.8H, v0.H[0] // ..................................................................*........................... 
+ // gap // .............................................................................................. + // gap // .............................................................................................. + str q11, [x0, #256] // ..........................................................................*................... + // gap // .............................................................................................. + // gap // .............................................................................................. + sqrdmulh v10.8H, v20.8H, v0.H[1] // ...................................................................*.......................... + // gap // .............................................................................................. + // gap // .............................................................................................. + str q25, [x0, #320] // ...........................................................................*.................. + // gap // .............................................................................................. + ldr q25, [x0, #208] // ...e.......................................................................................... + mul v4.8H, v2.8H, v0.H[0] // .......................................................................*...................... + // gap // .............................................................................................. + // gap // .............................................................................................. + // gap // .............................................................................................. + // gap // .............................................................................................. + // gap // .............................................................................................. 
+ sqrdmulh v2.8H, v2.8H, v0.H[1] // ........................................................................*..................... + // gap // .............................................................................................. + // gap // .............................................................................................. + // gap // .............................................................................................. + // gap // .............................................................................................. + // gap // .............................................................................................. + mls v17.8H, v10.8H, v7.H[0] // ....................................................................*......................... + // gap // .............................................................................................. + // gap // .............................................................................................. + // gap // .............................................................................................. + // gap // .............................................................................................. + // gap // .............................................................................................. + mul v20.8H, v5.8H, v29.8H // ..............................................................................*............... + // gap // .............................................................................................. + // gap // .............................................................................................. + // gap // .............................................................................................. + // gap // .............................................................................................. + // gap // .............................................................................................. 
+ mls v4.8H, v2.8H, v7.H[0] // .........................................................................*.................... + // gap // .............................................................................................. + // gap // .............................................................................................. + str q17, [x0, #384] // ............................................................................*................. + ldr q17, [x0, #144] // ..e........................................................................................... + // gap // .............................................................................................. + sqrdmulh v28.8H, v5.8H, v30.8H // ...............................................................................*.............. + // gap // .............................................................................................. + // gap // .............................................................................................. + // gap // .............................................................................................. + // gap // .............................................................................................. + // gap // .............................................................................................. + mul v2.8H, v16.8H, v29.8H // .................................................................................*............ + // gap // .............................................................................................. + // gap // .............................................................................................. + str q4, [x0, #448] // .............................................................................*................ + sub v31.8H, v17.8H, v25.8H // .............e................................................................................ 
+ ldr q4, [x0, #80] // .e............................................................................................ + sqrdmulh v16.8H, v16.8H, v30.8H // ..................................................................................*........... + // gap // .............................................................................................. + // gap // .............................................................................................. + // gap // .............................................................................................. + // gap // .............................................................................................. + // gap // .............................................................................................. + mls v20.8H, v28.8H, v7.H[0] // ................................................................................*............. + // gap // .............................................................................................. + // gap // .............................................................................................. + // gap // .............................................................................................. + // gap // .............................................................................................. + // gap // .............................................................................................. + mul v23.8H, v26.8H, v29.8H // ....................................................................................*......... + // gap // .............................................................................................. + // gap // .............................................................................................. + // gap // .............................................................................................. 
+ // gap // .............................................................................................. + // gap // .............................................................................................. + mls v2.8H, v16.8H, v7.H[0] // ...................................................................................*.......... + // gap // .............................................................................................. + // gap // .............................................................................................. + str q20, [x0], #(16) // ..........................................................................................*... + // gap // .............................................................................................. + // gap // .............................................................................................. + sqrdmulh v26.8H, v26.8H, v30.8H // .....................................................................................*........ + // gap // .............................................................................................. + // gap // .............................................................................................. + // gap // .............................................................................................. + // gap // .............................................................................................. + // gap // .............................................................................................. + sqrdmulh v16.8H, v21.8H, v30.8H // ........................................................................................*..... + // gap // .............................................................................................. + // gap // .............................................................................................. 
+ str q2, [x0, #48] // ...........................................................................................*.. + sub v2.8H, v6.8H, v3.8H // .......................e...................................................................... + // gap // .............................................................................................. + mul v21.8H, v21.8H, v29.8H // .......................................................................................*...... + // gap // .............................................................................................. + // gap // .............................................................................................. + // gap // .............................................................................................. + // gap // .............................................................................................. + // gap // .............................................................................................. + mls v23.8H, v26.8H, v7.H[0] // ......................................................................................*....... + // gap // .............................................................................................. + // gap // .............................................................................................. + // gap // .............................................................................................. + // gap // .............................................................................................. + // gap // .............................................................................................. + mls v21.8H, v16.8H, v7.H[0] // .........................................................................................*.... + ldr q16, [x0, #0] // e............................................................................................. 
+ // gap // .............................................................................................. + // gap // .............................................................................................. + // gap // .............................................................................................. + // gap // .............................................................................................. + // gap // .............................................................................................. + // gap // .............................................................................................. + sqrdmulh v26.8H, v31.8H, v1.H[1] // ................e............................................................................. + // gap // .............................................................................................. + // gap // .............................................................................................. + // gap // .............................................................................................. + mul v13.8H, v31.8H, v1.H[0] // ...............e.............................................................................. + // gap // .............................................................................................. + // gap // .............................................................................................. + str q21, [x0, #176] // .............................................................................................* + // gap // .............................................................................................. + // gap // .............................................................................................. + str q23, [x0, #112] // ............................................................................................*. 
+ ldr q23, [x0, #256] // ....e......................................................................................... + mul v15.8H, v2.8H, v1.H[4] // .........................e.................................................................... + // gap // .............................................................................................. + // gap // .............................................................................................. + // gap // .............................................................................................. + mls v13.8H, v26.8H, v7.H[0] // .................e............................................................................ + // gap // .............................................................................................. + add v28.8H, v16.8H, v4.8H // .........e.................................................................................... + + // original source code + // ldr q8, [x0, #0] // ...............................................................................e........|....................................................................................e.... + // ldr q9, [x0, #(1*(512/8))] // ..................................................................e.....................|.......................................................................e................. + // ldr q10, [x0, #(2*(512/8))] // .............................................................e..........................|..................................................................e...................... + // ldr q11, [x0, #(3*(512/8))] // ......................................................e.................................|...........................................................e............................. 
+ // ldr q12, [x0, #(4*(512/8))] // ....................................................................................e...|......................................................................................... + // ldr q13, [x0, #(5*(512/8))] // .....................................e..................................................|..........................................e.............................................. + // ldr q14, [x0, #(6*(512/8))] // ...e....................................................................................|........e................................................................................ + // ldr q15, [x0, #(7*(512/8))] // e.......................................................................................|.....e................................................................................... + // sub v24.8h, v8.8h, v9.8h // ........................................................................................*......................................................................................... + // add v8.8h, v8.8h, v9.8h // .......................................................................................e|......................................................................................... + // mul v9.8h, v24.8h, v0.h[6] // ........................................................................................|...*..................................................................................... + // sqrdmulh v24.8h, v24.8h, v0.h[7] // ..*.....................................................................................|.......*................................................................................. + // mls v9.8h, v24.8h, v7.h[0] // ...........*............................................................................|................*........................................................................ 
+ // sub v24.8h, v10.8h, v11.8h // .................................................................e......................|......................................................................e.................. + // add v10.8h, v10.8h, v11.8h // .*......................................................................................|......*.................................................................................. + // mul v11.8h, v24.8h, v1.h[0] // .................................................................................e......|......................................................................................e.. + // sqrdmulh v24.8h, v24.8h, v1.h[1] // ................................................................................e.......|.....................................................................................e... + // mls v11.8h, v24.8h, v7.h[0] // ......................................................................................e.|......................................................................................... + // sub v24.8h, v12.8h, v13.8h // ........................................................................................|*........................................................................................ + // add v12.8h, v12.8h, v13.8h // ........................................................................................|..*...................................................................................... + // mul v13.8h, v24.8h, v1.h[2] // ......*.................................................................................|...........*............................................................................. + // sqrdmulh v24.8h, v24.8h, v1.h[3] // ........*...............................................................................|.............*........................................................................... 
+ // mls v13.8h, v24.8h, v7.h[0] // ............*...........................................................................|.................*....................................................................... + // sub v24.8h, v14.8h, v15.8h // ...........................................................................e............|................................................................................e........ + // add v14.8h, v14.8h, v15.8h // ........................................................................................|....*.................................................................................... + // mul v15.8h, v24.8h, v1.h[4] // .....................................................................................e..|......................................................................................... + // sqrdmulh v24.8h, v24.8h, v1.h[5] // ........................................................................................|.*....................................................................................... + // mls v15.8h, v24.8h, v7.h[0] // .........*..............................................................................|..............*.......................................................................... + // sub v24.8h, v8.8h, v10.8h // ..........*.............................................................................|...............*......................................................................... + // add v8.8h, v8.8h, v10.8h // .......*................................................................................|............*............................................................................ + // mul v10.8h, v24.8h, v0.h[2] // .............*..........................................................................|..................*...................................................................... 
+ // sqrdmulh v24.8h, v24.8h, v0.h[3] // ....................*...................................................................|.........................*............................................................... + // mls v10.8h, v24.8h, v7.h[0] // .........................*..............................................................|..............................*.......................................................... + // sub v24.8h, v9.8h, v11.8h // ..............*.........................................................................|...................*..................................................................... + // add v9.8h, v9.8h, v11.8h // ...............*........................................................................|....................*.................................................................... + // mul v11.8h, v24.8h, v0.h[2] // ...........................*............................................................|................................*........................................................ + // sqrdmulh v24.8h, v24.8h, v0.h[3] // ...............................*........................................................|....................................*.................................................... + // mls v11.8h, v24.8h, v7.h[0] // ....................................*...................................................|.........................................*............................................... + // sub v24.8h, v12.8h, v14.8h // ....*...................................................................................|.........*............................................................................... + // add v12.8h, v12.8h, v14.8h // .....*..................................................................................|..........*.............................................................................. 
+ // mul v14.8h, v24.8h, v0.h[4] // ................*.......................................................................|.....................*................................................................... + // sqrdmulh v24.8h, v24.8h, v0.h[5] // ..................*.....................................................................|.......................*................................................................. + // mls v14.8h, v24.8h, v7.h[0] // ................................*.......................................................|.....................................*................................................... + // sub v24.8h, v13.8h, v15.8h // .................*......................................................................|......................*.................................................................. + // add v13.8h, v13.8h, v15.8h // ...................*....................................................................|........................*................................................................ + // mul v15.8h, v24.8h, v0.h[4] // ...................................*....................................................|........................................*................................................ + // sqrdmulh v24.8h, v24.8h, v0.h[5] // .......................................*................................................|............................................*............................................ + // mls v15.8h, v24.8h, v7.h[0] // ...........................................*............................................|................................................*........................................ + // sqdmulh v25.8h, v8.8h, v7.h[1] // .......................*................................................................|............................*............................................................ 
+ // srshr v25.8h, v25.8h, #11 // ..........................*.............................................................|...............................*......................................................... + // mls v8.8h, v25.8h, v7.h[0] // .............................*..........................................................|..................................*...................................................... + // sqdmulh v25.8h, v12.8h, v7.h[1] // ........................*...............................................................|.............................*........................................................... + // srshr v25.8h, v25.8h, #11 // ............................*...........................................................|.................................*....................................................... + // mls v12.8h, v25.8h, v7.h[0] // ..............................*.........................................................|...................................*..................................................... + // sub v24.8h, v8.8h, v12.8h // .................................*......................................................|......................................*.................................................. + // add v8.8h, v8.8h, v12.8h // ..................................*.....................................................|.......................................*................................................. + // mul v12.8h, v24.8h, v0.h[0] // .........................................*..............................................|..............................................*.......................................... + // sqrdmulh v24.8h, v24.8h, v0.h[1] // ..........................................*.............................................|...............................................*......................................... 
+ // mls v12.8h, v24.8h, v7.h[0] // ..............................................*.........................................|...................................................*..................................... + // sub v24.8h, v9.8h, v13.8h // .....................*..................................................................|..........................*.............................................................. + // add v9.8h, v9.8h, v13.8h // ......................*.................................................................|...........................*............................................................. + // mul v13.8h, v24.8h, v0.h[0] // .............................................*..........................................|..................................................*...................................... + // sqrdmulh v24.8h, v24.8h, v0.h[1] // ............................................*...........................................|.................................................*....................................... + // mls v13.8h, v24.8h, v7.h[0] // .................................................*......................................|......................................................*.................................. + // sub v24.8h, v10.8h, v14.8h // ......................................*.................................................|...........................................*............................................. + // add v10.8h, v10.8h, v14.8h // ........................................*...............................................|.............................................*........................................... + // mul v14.8h, v24.8h, v0.h[0] // ..................................................*.....................................|.......................................................*................................. 
+ // sqrdmulh v24.8h, v24.8h, v0.h[1] // ....................................................*...................................|.........................................................*............................... + // mls v14.8h, v24.8h, v7.h[0] // .........................................................*..............................|..............................................................*.......................... + // sub v24.8h, v11.8h, v15.8h // ...............................................*........................................|....................................................*.................................... + // add v11.8h, v11.8h, v15.8h // ................................................*.......................................|.....................................................*................................... + // mul v15.8h, v24.8h, v0.h[0] // .......................................................*................................|............................................................*............................ + // sqrdmulh v24.8h, v24.8h, v0.h[1] // ........................................................*...............................|.............................................................*........................... + // mls v15.8h, v24.8h, v7.h[0] // ...........................................................*............................|................................................................*........................ + // str q12, [x0, #(4*(512/8))] // ...................................................*....................................|........................................................*................................ + // str q13, [x0, #(5*(512/8))] // .....................................................*..................................|..........................................................*.............................. 
+ // str q14, [x0, #(6*(512/8))] // ............................................................*...........................|.................................................................*....................... + // str q15, [x0, #(7*(512/8))] // ................................................................*.......................|.....................................................................*................... + // mul v12.8h, v8.8h, v29.8h // ..........................................................*.............................|...............................................................*......................... + // sqrdmulh v8.8h, v8.8h, v30.8h // ..............................................................*.........................|...................................................................*..................... + // mls v12.8h, v8.8h, v7.h[0] // ....................................................................*...................|.........................................................................*............... + // mul v13.8h, v9.8h, v29.8h // ...............................................................*........................|....................................................................*.................... + // sqrdmulh v9.8h, v9.8h, v30.8h // ...................................................................*....................|........................................................................*................ + // mls v13.8h, v9.8h, v7.h[0] // ......................................................................*.................|...........................................................................*............. + // mul v14.8h, v10.8h, v29.8h // .....................................................................*..................|..........................................................................*.............. 
+ // sqrdmulh v10.8h, v10.8h, v30.8h // ........................................................................*...............|.............................................................................*........... + // mls v14.8h, v10.8h, v7.h[0] // .............................................................................*..........|..................................................................................*...... + // mul v15.8h, v11.8h, v29.8h // ............................................................................*...........|.................................................................................*....... + // sqrdmulh v11.8h, v11.8h, v30.8h // .........................................................................*..............|..............................................................................*.......... + // mls v15.8h, v11.8h, v7.h[0] // ..............................................................................*.........|...................................................................................*..... + // str q12, [x0], #(16) // .......................................................................*................|............................................................................*............ + // str q13, [x0, #(-16 + 1*(512/8))] // ..........................................................................*.............|...............................................................................*......... + // str q14, [x0, #(-16 + 2*(512/8))] // ...................................................................................*....|........................................................................................* + // str q15, [x0, #(-16 + 3*(512/8))] // ..................................................................................*.....|.......................................................................................*. 
+ + sub count, count, #1 + cbnz count, layer123_start + sub v16.8H, v16.8H, v4.8H // *.............................................................................. + sqrdmulh v2.8H, v2.8H, v1.H[5] // ..*............................................................................ + // gap // ............................................................................... + add v26.8H, v17.8H, v25.8H // ......*........................................................................ + // gap // ............................................................................... + // gap // ............................................................................... + add v21.8H, v6.8H, v3.8H // .....*......................................................................... + sub v27.8H, v23.8H, v22.8H // .*............................................................................. + // gap // ............................................................................... + mul v25.8H, v16.8H, v0.H[6] // ....*.......................................................................... + add v23.8H, v23.8H, v22.8H // ...*........................................................................... + // gap // ............................................................................... + sub v31.8H, v28.8H, v26.8H // ..............*................................................................ + // gap // ............................................................................... + // gap // ............................................................................... + sqrdmulh v17.8H, v16.8H, v0.H[7] // .......*....................................................................... + add v26.8H, v28.8H, v26.8H // ...........*................................................................... + // gap // ............................................................................... 
+ sub v5.8H, v23.8H, v21.8H // ........*...................................................................... + // gap // ............................................................................... + // gap // ............................................................................... + add v23.8H, v23.8H, v21.8H // .........*..................................................................... + mls v15.8H, v2.8H, v7.H[0] // .............*................................................................. + // gap // ............................................................................... + // gap // ............................................................................... + // gap // ............................................................................... + // gap // ............................................................................... + sqdmulh v11.8H, v26.8H, v7.H[1] // ...........................*................................................... + // gap // ............................................................................... + // gap // ............................................................................... + // gap // ............................................................................... + // gap // ............................................................................... + // gap // ............................................................................... + sqrdmulh v20.8H, v27.8H, v1.H[3] // ............*.................................................................. + // gap // ............................................................................... + // gap // ............................................................................... + // gap // ............................................................................... + // gap // ............................................................................... 
+ // gap // ............................................................................... + sqdmulh v3.8H, v23.8H, v7.H[1] // ............................*.................................................. + // gap // ............................................................................... + // gap // ............................................................................... + // gap // ............................................................................... + // gap // ............................................................................... + // gap // ............................................................................... + mul v21.8H, v27.8H, v1.H[2] // ..........*.................................................................... + // gap // ............................................................................... + // gap // ............................................................................... + // gap // ............................................................................... + // gap // ............................................................................... + // gap // ............................................................................... + mls v21.8H, v20.8H, v7.H[0] // ................*.............................................................. + // gap // ............................................................................... + // gap // ............................................................................... + srshr v20.8H, v3.8H, #11 // ................................*.............................................. + // gap // ............................................................................... + // gap // ............................................................................... + mls v25.8H, v17.8H, v7.H[0] // ...............*............................................................... 
+ // gap // ............................................................................... + // gap // ............................................................................... + // gap // ............................................................................... + // gap // ............................................................................... + srshr v11.8H, v11.8H, #11 // ..............................*................................................ + sqrdmulh v4.8H, v31.8H, v0.H[3] // ........................*...................................................... + // gap // ............................................................................... + // gap // ............................................................................... + // gap // ............................................................................... + // gap // ............................................................................... + // gap // ............................................................................... + mls v26.8H, v11.8H, v7.H[0] // .................................*............................................. + // gap // ............................................................................... + // gap // ............................................................................... + add v16.8H, v25.8H, v13.8H // ...................*........................................................... + // gap // ............................................................................... + // gap // ............................................................................... + mul v17.8H, v31.8H, v0.H[2] // .................*............................................................. + sub v2.8H, v25.8H, v13.8H // ..................*............................................................ + // gap // ............................................................................... 
+ // gap // ............................................................................... + // gap // ............................................................................... + // gap // ............................................................................... + mls v23.8H, v20.8H, v7.H[0] // ..................................*............................................ + sub v20.8H, v21.8H, v15.8H // .....................*......................................................... + // gap // ............................................................................... + add v21.8H, v21.8H, v15.8H // .......................*....................................................... + // gap // ............................................................................... + // gap // ............................................................................... + mls v17.8H, v4.8H, v7.H[0] // .............................*................................................. + // gap // ............................................................................... + // gap // ............................................................................... + // gap // ............................................................................... + // gap // ............................................................................... + // gap // ............................................................................... + mul v4.8H, v2.8H, v0.H[2] // ...............................*............................................... + sub v11.8H, v16.8H, v21.8H // .........................*..................................................... + // gap // ............................................................................... + add v16.8H, v16.8H, v21.8H // ..........................*.................................................... + // gap // ............................................................................... 
+ // gap // ............................................................................... + mul v13.8H, v5.8H, v0.H[4] // ....................*.......................................................... + sub v21.8H, v26.8H, v23.8H // .....................................*......................................... + // gap // ............................................................................... + add v23.8H, v26.8H, v23.8H // ......................................*........................................ + // gap // ............................................................................... + // gap // ............................................................................... + sqrdmulh v25.8H, v5.8H, v0.H[5] // ......................*........................................................ + // gap // ............................................................................... + // gap // ............................................................................... + // gap // ............................................................................... + // gap // ............................................................................... + // gap // ............................................................................... + sqrdmulh v2.8H, v2.8H, v0.H[3] // ...................................*........................................... + // gap // ............................................................................... + // gap // ............................................................................... + // gap // ............................................................................... + // gap // ............................................................................... + // gap // ............................................................................... + mul v26.8H, v20.8H, v0.H[4] // .......................................*....................................... 
+ // gap // ............................................................................... + // gap // ............................................................................... + // gap // ............................................................................... + // gap // ............................................................................... + // gap // ............................................................................... + sqrdmulh v20.8H, v20.8H, v0.H[5] // ..........................................*.................................... + // gap // ............................................................................... + // gap // ............................................................................... + // gap // ............................................................................... + // gap // ............................................................................... + // gap // ............................................................................... + mls v4.8H, v2.8H, v7.H[0] // ........................................*...................................... + // gap // ............................................................................... + // gap // ............................................................................... + // gap // ............................................................................... + // gap // ............................................................................... + // gap // ............................................................................... + mls v13.8H, v25.8H, v7.H[0] // ....................................*.......................................... + // gap // ............................................................................... + // gap // ............................................................................... + // gap // ............................................................................... 
+ // gap // ............................................................................... + // gap // ............................................................................... + mls v26.8H, v20.8H, v7.H[0] // ..............................................*................................ + // gap // ............................................................................... + // gap // ............................................................................... + // gap // ............................................................................... + // gap // ............................................................................... + // gap // ............................................................................... + sqrdmulh v20.8H, v11.8H, v0.H[1] // ...............................................*............................... + // gap // ............................................................................... + // gap // ............................................................................... + sub v2.8H, v17.8H, v13.8H // .........................................*..................................... + // gap // ............................................................................... + // gap // ............................................................................... + mul v25.8H, v21.8H, v0.H[0] // ............................................*.................................. + add v17.8H, v17.8H, v13.8H // ...........................................*................................... + // gap // ............................................................................... + // gap // ............................................................................... + // gap // ............................................................................... + // gap // ............................................................................... 
+ sqrdmulh v21.8H, v21.8H, v0.H[1] // .............................................*................................. + // gap // ............................................................................... + // gap // ............................................................................... + // gap // ............................................................................... + // gap // ............................................................................... + // gap // ............................................................................... + // gap // ............................................................................... + mul v11.8H, v11.8H, v0.H[0] // ................................................*.............................. + // gap // ............................................................................... + // gap // ............................................................................... + // gap // ............................................................................... + // gap // ............................................................................... + mls v11.8H, v20.8H, v7.H[0] // ....................................................*.......................... + // gap // ............................................................................... + sub v13.8H, v4.8H, v26.8H // ..................................................*............................ + add v26.8H, v4.8H, v26.8H // ...................................................*........................... + // gap // ............................................................................... + // gap // ............................................................................... + mls v25.8H, v21.8H, v7.H[0] // .................................................*............................. + // gap // ............................................................................... 
+ // gap // ............................................................................... + // gap // ............................................................................... + // gap // ............................................................................... + // gap // ............................................................................... + // gap // ............................................................................... + // gap // ............................................................................... + sqrdmulh v4.8H, v13.8H, v0.H[1] // ..........................................................*.................... + // gap // ............................................................................... + // gap // ............................................................................... + // gap // ............................................................................... + // gap // ............................................................................... + // gap // ............................................................................... + mul v21.8H, v2.8H, v0.H[0] // .....................................................*......................... + // gap // ............................................................................... + // gap // ............................................................................... + // gap // ............................................................................... + // gap // ............................................................................... + // gap // ............................................................................... + sqrdmulh v2.8H, v2.8H, v0.H[1] // .......................................................*....................... + // gap // ............................................................................... 
+ // gap // ............................................................................... + // gap // ............................................................................... + // gap // ............................................................................... + // gap // ............................................................................... + mul v20.8H, v13.8H, v0.H[0] // .........................................................*..................... + // gap // ............................................................................... + // gap // ............................................................................... + // gap // ............................................................................... + mls v20.8H, v4.8H, v7.H[0] // .............................................................*................. + str q25, [x0, #256] // ......................................................*........................ + // gap // ............................................................................... + // gap // ............................................................................... + // gap // ............................................................................... + // gap // ............................................................................... + // gap // ............................................................................... + str q11, [x0, #320] // ........................................................*...................... + mls v21.8H, v2.8H, v7.H[0] // ...........................................................*................... + // gap // ............................................................................... + // gap // ............................................................................... + // gap // ............................................................................... 
+ // gap // ............................................................................... + // gap // ............................................................................... + mul v2.8H, v23.8H, v29.8H // ............................................................*.................. + // gap // ............................................................................... + // gap // ............................................................................... + // gap // ............................................................................... + sqrdmulh v23.8H, v23.8H, v30.8H // ...............................................................*............... + // gap // ............................................................................... + // gap // ............................................................................... + // gap // ............................................................................... + // gap // ............................................................................... + // gap // ............................................................................... + // gap // ............................................................................... + str q21, [x0, #384] // ..............................................................*................ + mul v21.8H, v16.8H, v29.8H // ................................................................*.............. + // gap // ............................................................................... + // gap // ............................................................................... + // gap // ............................................................................... + // gap // ............................................................................... + // gap // ............................................................................... 
+ sqrdmulh v16.8H, v16.8H, v30.8H // ..................................................................*............ + // gap // ............................................................................... + // gap // ............................................................................... + // gap // ............................................................................... + mls v2.8H, v23.8H, v7.H[0] // ...................................................................*........... + // gap // ............................................................................... + // gap // ............................................................................... + // gap // ............................................................................... + // gap // ............................................................................... + // gap // ............................................................................... + mul v23.8H, v17.8H, v29.8H // ....................................................................*.......... + // gap // ............................................................................... + // gap // ............................................................................... + // gap // ............................................................................... + // gap // ............................................................................... + // gap // ............................................................................... + str q20, [x0, #448] // .................................................................*............. + mls v21.8H, v16.8H, v7.H[0] // .....................................................................*......... + // gap // ............................................................................... + str q2, [x0], #(16) // ......................................................................*........ 
+ // gap // ............................................................................... + // gap // ............................................................................... + sqrdmulh v2.8H, v17.8H, v30.8H // .......................................................................*....... + // gap // ............................................................................... + // gap // ............................................................................... + // gap // ............................................................................... + // gap // ............................................................................... + // gap // ............................................................................... + // gap // ............................................................................... + // gap // ............................................................................... + sqrdmulh v16.8H, v26.8H, v30.8H // ........................................................................*...... + str q21, [x0, #48] // .........................................................................*..... + // gap // ............................................................................... + // gap // ............................................................................... + // gap // ............................................................................... + // gap // ............................................................................... + mul v21.8H, v26.8H, v29.8H // ..........................................................................*.... + // gap // ............................................................................... + // gap // ............................................................................... + // gap // ............................................................................... 
+ mls v23.8H, v2.8H, v7.H[0] // ...........................................................................*... + // gap // ............................................................................... + // gap // ............................................................................... + // gap // ............................................................................... + // gap // ............................................................................... + // gap // ............................................................................... + mls v21.8H, v16.8H, v7.H[0] // ............................................................................*.. + // gap // ............................................................................... + // gap // ............................................................................... + // gap // ............................................................................... + // gap // ............................................................................... + // gap // ............................................................................... + // gap // ............................................................................... + // gap // ............................................................................... + // gap // ............................................................................... + str q23, [x0, #112] // ..............................................................................* + // gap // ............................................................................... + // gap // ............................................................................... + // gap // ............................................................................... + // gap // ............................................................................... + // gap // ............................................................................... 
+ str q21, [x0, #176] // .............................................................................*. + // gap // ............................................................................... + // gap // ............................................................................... + + // original source code + // sub v16.8H, v16.8H, v4.8H // *.............................................................................. + // sub v4.8H, v23.8H, v22.8H // ....*.......................................................................... + // sqrdmulh v2.8H, v2.8H, v1.H[5] // .*............................................................................. + // add v23.8H, v23.8H, v22.8H // ......*........................................................................ + // mul v19.8H, v16.8H, v0.H[6] // .....*......................................................................... + // add v20.8H, v6.8H, v3.8H // ...*........................................................................... + // add v17.8H, v17.8H, v25.8H // ..*............................................................................ + // sqrdmulh v27.8H, v16.8H, v0.H[7] // ........*...................................................................... + // sub v25.8H, v23.8H, v20.8H // ..........*.................................................................... + // add v24.8H, v23.8H, v20.8H // ...........*................................................................... + // mul v16.8H, v4.8H, v1.H[2] // ................*.............................................................. + // add v20.8H, v28.8H, v17.8H // .........*..................................................................... + // sqrdmulh v21.8H, v4.8H, v1.H[3] // ..............*................................................................ + // mls v15.8H, v2.8H, v7.H[0] // ............*.................................................................. 
+ // sub v2.8H, v28.8H, v17.8H // .......*....................................................................... + // mls v19.8H, v27.8H, v7.H[0] // ...................*........................................................... + // mls v16.8H, v21.8H, v7.H[0] // .................*............................................................. + // mul v4.8H, v2.8H, v0.H[2] // ........................*...................................................... + // sub v26.8H, v19.8H, v13.8H // .........................*..................................................... + // add v9.8H, v19.8H, v13.8H // .......................*....................................................... + // mul v18.8H, v25.8H, v0.H[4] // .................................*............................................. + // sub v10.8H, v16.8H, v15.8H // ...........................*................................................... + // sqrdmulh v31.8H, v25.8H, v0.H[5] // ....................................*.......................................... + // add v16.8H, v16.8H, v15.8H // ............................*.................................................. + // sqrdmulh v2.8H, v2.8H, v0.H[3] // .....................*......................................................... + // sub v25.8H, v9.8H, v16.8H // ...............................*............................................... + // add v16.8H, v9.8H, v16.8H // ................................*.............................................. + // sqdmulh v21.8H, v20.8H, v7.H[1] // .............*................................................................. + // sqdmulh v23.8H, v24.8H, v7.H[1] // ...............*............................................................... + // mls v4.8H, v2.8H, v7.H[0] // .............................*................................................. + // srshr v2.8H, v21.8H, #11 // ....................*.......................................................... 
+ // mul v17.8H, v26.8H, v0.H[2] // ..............................*................................................ + // srshr v23.8H, v23.8H, #11 // ..................*............................................................ + // mls v20.8H, v2.8H, v7.H[0] // ......................*........................................................ + // mls v24.8H, v23.8H, v7.H[0] // ..........................*.................................................... + // sqrdmulh v22.8H, v26.8H, v0.H[3] // .....................................*......................................... + // mls v18.8H, v31.8H, v7.H[0] // .........................................*..................................... + // sub v2.8H, v20.8H, v24.8H // ..................................*............................................ + // add v5.8H, v20.8H, v24.8H // ...................................*........................................... + // mul v21.8H, v10.8H, v0.H[4] // ......................................*........................................ + // mls v17.8H, v22.8H, v7.H[0] // ........................................*...................................... + // sub v20.8H, v4.8H, v18.8H // ............................................*.................................. + // sqrdmulh v31.8H, v10.8H, v0.H[5] // .......................................*....................................... + // add v26.8H, v4.8H, v18.8H // ..............................................*................................ + // mul v11.8H, v2.8H, v0.H[0] // .............................................*................................. + // sqrdmulh v2.8H, v2.8H, v0.H[1] // ...............................................*............................... + // mls v21.8H, v31.8H, v7.H[0] // ..........................................*.................................... + // sqrdmulh v4.8H, v25.8H, v0.H[1] // ...........................................*................................... 
+ // mul v25.8H, v25.8H, v0.H[0] // ................................................*.............................. + // mls v11.8H, v2.8H, v7.H[0] // ....................................................*.......................... + // sub v2.8H, v17.8H, v21.8H // ..................................................*............................ + // add v21.8H, v17.8H, v21.8H // ...................................................*........................... + // mls v25.8H, v4.8H, v7.H[0] // .................................................*............................. + // mul v17.8H, v20.8H, v0.H[0] // ......................................................*........................ + // str q11, [x0, #256] // ..........................................................*.................... + // sqrdmulh v10.8H, v20.8H, v0.H[1] // .......................................................*....................... + // str q25, [x0, #320] // ...........................................................*................... + // mul v4.8H, v2.8H, v0.H[0] // ........................................................*...................... + // sqrdmulh v2.8H, v2.8H, v0.H[1] // .....................................................*......................... + // mls v17.8H, v10.8H, v7.H[0] // ............................................................*.................. + // mul v20.8H, v5.8H, v29.8H // .............................................................*................. + // mls v4.8H, v2.8H, v7.H[0] // .........................................................*..................... + // str q17, [x0, #384] // ...............................................................*............... + // sqrdmulh v28.8H, v5.8H, v30.8H // ..............................................................*................ + // mul v2.8H, v16.8H, v29.8H // ................................................................*.............. 
+ // str q4, [x0, #448] // ....................................................................*.......... + // sqrdmulh v16.8H, v16.8H, v30.8H // .................................................................*............. + // mls v20.8H, v28.8H, v7.H[0] // ..................................................................*............ + // mul v23.8H, v26.8H, v29.8H // ...................................................................*........... + // mls v2.8H, v16.8H, v7.H[0] // .....................................................................*......... + // str q20, [x0], #(16) // ......................................................................*........ + // sqrdmulh v26.8H, v26.8H, v30.8H // .......................................................................*....... + // sqrdmulh v16.8H, v21.8H, v30.8H // ........................................................................*...... + // str q2, [x0, #48] // .........................................................................*..... + // mul v21.8H, v21.8H, v29.8H // ..........................................................................*.... + // mls v23.8H, v26.8H, v7.H[0] // ...........................................................................*... + // mls v21.8H, v16.8H, v7.H[0] // ............................................................................*.. + // str q21, [x0, #176] // ..............................................................................* + // str q23, [x0, #112] // .............................................................................*. 
+ + + pop_stack + ret \ No newline at end of file diff --git a/tests/ntt_kyber/manual/intt_kyber_123_4567_manual_ld4_opt_m1_firestorm.s b/tests/ntt_kyber/manual/intt_kyber_123_4567_manual_ld4_opt_m1_firestorm.s new file mode 100644 index 0000000..a8f3f08 --- /dev/null +++ b/tests/ntt_kyber/manual/intt_kyber_123_4567_manual_ld4_opt_m1_firestorm.s @@ -0,0 +1,1922 @@ +/// +/// Copyright (c) 2022 Arm Limited +/// Copyright (c) 2022 Hanno Becker +/// Copyright (c) 2023 Amin Abdulrahman, Matthias Kannwischer +/// SPDX-License-Identifier: MIT +/// +/// Permission is hereby granted, free of charge, to any person obtaining a copy +/// of this software and associated documentation files (the "Software"), to deal +/// in the Software without restriction, including without limitation the rights +/// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +/// copies of the Software, and to permit persons to whom the Software is +/// furnished to do so, subject to the following conditions: +/// +/// The above copyright notice and this permission notice shall be included in all +/// copies or substantial portions of the Software. +/// +/// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +/// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +/// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +/// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +/// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +/// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +/// SOFTWARE. +/// + +// Needed to provide ASM_LOAD directive +#include + +// NOTE +// We use a lot of trivial macros to simplify the parsing burden for Slothy +// The macros are not unfolded by Slothy and thus interpreted as instructions, +// which are easier to parse due to e.g. 
the lack of size specifiers and simpler
+// syntax for pre and post increment for loads and stores.
+
+// Eventually, NeLight should include a proper parser for AArch64,
+// but for initial investigations, the below is enough.
+
+.macro ldr_vo vec, base, offset // load 128 bits into the qform_ alias of vec from base + offset, base is left unchanged
+        ldr qform_\vec, [\base, #\offset]
+.endm
+
+.macro ldr_vi vec, base, inc // load 128 bits into the qform_ alias of vec from base, then add inc to base (post-index)
+        ldr qform_\vec, [\base], #\inc
+.endm
+
+.macro str_vo vec, base, offset // store the qform_ alias of vec to base + offset, base is left unchanged
+        str qform_\vec, [\base, #\offset]
+.endm
+.macro str_vi vec, base, inc // store the qform_ alias of vec to base, then add inc to base (post-index)
+        str qform_\vec, [\base], #\inc
+.endm
+
+.macro vqrdmulh d,a,b // d = sqrdmulh(a, b), vector by vector on eight 16-bit lanes
+        sqrdmulh \d\().8h, \a\().8h, \b\().8h
+.endm
+.macro vmlsq d,a,b,i // d -= a * b.h[i] on eight 16-bit lanes
+        mls \d\().8h, \a\().8h, \b\().h[\i]
+.endm
+.macro vqrdmulhq d,a,b,i // d = sqrdmulh(a, b.h[i]), vector by single lane
+        sqrdmulh \d\().8h, \a\().8h, \b\().h[\i]
+.endm
+.macro vqdmulhq d,a,b,i // d = sqdmulh(a, b.h[i]), same as vqrdmulhq but without rounding
+        sqdmulh \d\().8h, \a\().8h, \b\().h[\i]
+.endm
+.macro vmulq d,a,b,i // d = low 16 bits of a * b.h[i]
+        mul \d\().8h, \a\().8h, \b\().h[\i]
+.endm
+
+.macro mulmodq dst, src, const, idx0, idx1 // three-step multiply of src by lane constants of const, result in dst, src is overwritten
+        vmulq \dst, \src, \const, \idx0 // dst = low half of src * const.h[idx0]
+        vqrdmulhq \src, \src, \const, \idx1 // src = rounded doubling high half of src * const.h[idx1], presumably the twisted companion of lane idx0 - TODO confirm
+        vmlsq \dst, \src, consts, 0 // dst -= src * consts.h[0], lane 0 is 3329 in the const_addr table of this file
+.endm
+
+.macro mulmod dst, src, const, const_twisted // same three steps as mulmodq with full-vector constants, src is overwritten
+        mul \dst\().8h, \src\().8h, \const\().8h // dst = low half of src * const, lane by lane
+        vqrdmulh \src, \src, \const_twisted // src = rounded doubling high half of src * const_twisted
+        vmlsq \dst, \src, consts, 0 // dst -= src * consts.h[0]
+.endm
+
+.macro gs_butterfly a, b, root, idx0, idx1 // a = a + b and b = (a - b) times root lanes idx0 and idx1, uses tmp as scratch
+        sub tmp.8h, \a\().8h, \b\().8h // keep the difference before a is overwritten
+        add \a\().8h, \a\().8h, \b\().8h
+        mulmodq \b, tmp, \root, \idx0, \idx1 // tmp is clobbered by mulmodq
+.endm
+
+.macro mulmod_v dst, src, const, const_twisted // body is instruction-for-instruction the same as mulmod
+        mul \dst\().8h, \src\().8h, \const\().8h
+        vqrdmulh \src, \src, \const_twisted
+        vmlsq \dst, \src, consts, 0
+.endm
+
+.macro gs_butterfly_v a, b, root, root_twisted // gs_butterfly with full-vector root and root_twisted, uses tmp as scratch
+        sub tmp.8h, \a\().8h, \b\().8h
+        add \a\().8h, \a\().8h, \b\().8h
+        mulmod \b, tmp, \root, \root_twisted // expands mulmod rather than mulmod_v, the two have the same body
+.endm
+
+.macro mul_ninv dst0, dst1, dst2, dst3, src0, src1, src2, src3 // mulmod of four vectors by ninv and ninv_tw, each src is overwritten
+        mulmod \dst0, \src0, ninv, ninv_tw // NOTE(review): no .req for ninv or ninv_tw is visible in this part of the file - confirm before use
+        mulmod \dst1, \src1, ninv, ninv_tw
+        mulmod \dst2, \src2, ninv, ninv_tw
+        mulmod \dst3, \src3, ninv, ninv_tw
+.endm
+
+.macro barrett_reduce a // reduces a in place using lanes 0 and 1 of consts, clobbers t0
+        vqdmulhq t0, \a, consts, 1 // t0 = doubling high half of a * consts.h[1], lane 1 is 20159 in the const_addr table
+        srshr t0.8h, t0.8h, #11 // rounding shift right by 11, so t0 is about a * 20159 / 2^26
+
vmlsq \a, t0, consts, 0 // last step of barrett_reduce: a -= t0 * consts.h[0]
+.endm
+
+.macro load_roots_123 // load two consecutive 16-byte vectors at r_ptr0 into root0 and root1, r_ptr0 advances by 32
+        ldr_vi root0, r_ptr0, 32
+        ldr_vo root1, r_ptr0, -16 // r_ptr0 has already moved, so -16 addresses the second vector
+.endm
+
+.macro load_next_roots_45 // load one 16-byte vector at r_ptr0 into root0, r_ptr0 advances by 16
+        ldr_vi root0, r_ptr0, 16
+.endm
+
+.macro load_next_roots_67 // load six consecutive 16-byte vectors at r_ptr1, r_ptr1 advances by 96
+        ldr_vi root0, r_ptr1, (6*16) // vector 0, the offsets below are relative to the advanced pointer
+        ldr_vo root0_tw, r_ptr1, (-6*16 + 1*16) // vector 1
+        ldr_vo root1, r_ptr1, (-6*16 + 2*16) // vector 2
+        ldr_vo root1_tw, r_ptr1, (-6*16 + 3*16) // vector 3
+        ldr_vo root2, r_ptr1, (-6*16 + 4*16) // vector 4
+        ldr_vo root2_tw, r_ptr1, (-6*16 + 5*16) // vector 5
+.endm
+
+.macro transpose4 data // in-place 4x4 transpose of the 32-bit elements of data0..data3, clobbers t0-t3
+        trn1 t0.4s, \data\()0.4s, \data\()1.4s // even 32-bit elements of rows 0 and 1, interleaved
+        trn2 t1.4s, \data\()0.4s, \data\()1.4s // odd 32-bit elements of rows 0 and 1, interleaved
+        trn1 t2.4s, \data\()2.4s, \data\()3.4s // even 32-bit elements of rows 2 and 3, interleaved
+        trn2 t3.4s, \data\()2.4s, \data\()3.4s // odd 32-bit elements of rows 2 and 3, interleaved
+
+        trn2 \data\()2.2d, t0.2d, t2.2d // column 2 = upper 64-bit halves of t0 and t2
+        trn2 \data\()3.2d, t1.2d, t3.2d // column 3 = upper 64-bit halves of t1 and t3
+        trn1 \data\()0.2d, t0.2d, t2.2d // column 0 = lower 64-bit halves of t0 and t2
+        trn1 \data\()1.2d, t1.2d, t3.2d // column 1 = lower 64-bit halves of t1 and t3
+.endm
+
+.macro transpose_single data_out, data_in // only the 32-bit trn1/trn2 stage of transpose4, from data_in0..3 to data_out0..3
+        trn1 \data_out\()0.4s, \data_in\()0.4s, \data_in\()1.4s
+        trn2 \data_out\()1.4s, \data_in\()0.4s, \data_in\()1.4s
+        trn1 \data_out\()2.4s, \data_in\()2.4s, \data_in\()3.4s
+        trn2 \data_out\()3.4s, \data_in\()2.4s, \data_in\()3.4s
+.endm
+
+.macro save_gprs // slothy:no-unfold
+        // Spill x19-x28 and x29 into a new 96-byte frame, restore_gprs reads the same slots back.
+        sub sp, sp, #(16*6)
+        stp x19, x20, [sp, #16*0]
+        stp x21, x22, [sp, #16*1]
+        stp x23, x24, [sp, #16*2]
+        stp x25, x26, [sp, #16*3]
+        stp x27, x28, [sp, #16*4]
+        str x29, [sp, #16*5] // x29 takes the sixth 16-byte slot on its own
+.endm
+
+.macro restore_gprs // slothy:no-unfold
+        ldp x19, x20, [sp, #16*0] // reload from the slots written by save_gprs
+        ldp x21, x22, [sp, #16*1]
+        ldp x23, x24, [sp, #16*2]
+        ldp x25, x26, [sp, #16*3]
+        ldp x27, x28, [sp, #16*4]
+        ldr x29, [sp, #16*5]
+        add sp, sp, #(16*6) // release the 96-byte frame
+.endm
+
+.macro save_vregs // slothy:no-unfold
+        sub sp, sp, #(16*4) // 64-byte frame, one 16-byte slot per pair of d registers
+        stp d8, d9, [sp, #16*0] // only the d view (low 64 bits) of v8-v15 is stored
+        stp d10, d11, [sp, #16*1]
+        stp d12, d13, [sp, #16*2]
+        stp d14, d15, [sp, #16*3]
+.endm
+
+.macro restore_vregs // slothy:no-unfold
+        ldp d8, d9, [sp, #16*0] // reload from the slots written by save_vregs
+        ldp d10, d11, [sp, #16*1]
+        ldp d12, d13, [sp, #16*2]
+        ldp d14, d15, [sp, #16*3]
+        add sp, sp, #(16*4) // release the 64-byte frame
+.endm
+
+#define STACK_SIZE 16
+#define STACK0 0
+
+.macro restore a, loc //
slothy:no-unfold
+        ldr \a, [sp, #\loc\()] // reload register a from byte offset loc above sp
+.endm
+.macro save loc, a // slothy:no-unfold
+        str \a, [sp, #\loc\()] // store register a at byte offset loc above sp, note the operand order is the reverse of restore
+.endm
+.macro push_stack // slothy:no-unfold
+        save_gprs // 96-byte frame holding x19-x29
+        save_vregs // 64-byte frame holding d8-d15
+        sub sp, sp, #STACK_SIZE // extra STACK_SIZE bytes of scratch, addressed through save and restore
+.endm
+
+.macro pop_stack // slothy:no-unfold
+        add sp, sp, #STACK_SIZE // undo push_stack in reverse order
+        restore_vregs
+        restore_gprs
+.endm
+
+.data
+.p2align 4 // 16-byte alignment for the table that follows
+roots: // the twiddle table is pulled in from the file included on the next line
+#include "intt_kyber_123_45_67_twiddles.s"
+.text
+
+        .global intt_kyber_123_4567_manual_ld4_opt_m1_firestorm
+        .global _intt_kyber_123_4567_manual_ld4_opt_m1_firestorm
+
+.p2align 4
+const_addr:     .short 3329 // lane 0, the multiplier of every vmlsq against consts (the Kyber modulus)
+                .short 20159 // lane 1, used by barrett_reduce, equals round(2^26 / 3329)
+                .short 0 // remaining six lanes are zero padding up to one 8-lane vector
+                .short 0
+                .short 0
+                .short 0
+                .short 0
+                .short 0
+ninv_addr:      .short 512 // eight equal lanes, presumably the scaled inverse of the transform length - TODO confirm
+                .short 512
+                .short 512
+                .short 512
+                .short 512
+                .short 512
+                .short 512
+                .short 512
+ninv_tw_addr:   .short 5040 // eight equal lanes, 5040 equals round(512 * 2^15 / 3329)
+                .short 5040
+                .short 5040
+                .short 5040
+                .short 5040
+                .short 5040
+                .short 5040
+                .short 5040
+
+intt_kyber_123_4567_manual_ld4_opt_m1_firestorm:
+_intt_kyber_123_4567_manual_ld4_opt_m1_firestorm:
+        push_stack
+
+        in .req x0
+        inp .req x1
+        count .req x2
+        r_ptr0 .req x3
+        r_ptr1 .req x4
+        xtmp .req x5
+
+        qform_v0 .req q0
+        qform_v1 .req q1
+        qform_v2 .req q2
+        qform_v3 .req q3
+        qform_v4 .req q4
+        qform_v5 .req q5
+        qform_v6 .req q6
+        qform_v7 .req q7
+        qform_v8 .req q8
+        qform_v9 .req q9
+        qform_v10 .req q10
+        qform_v11 .req q11
+        qform_v12 .req q12
+        qform_v13 .req q13
+        qform_v14 .req q14
+        qform_v15 .req q15
+        qform_v16 .req q16
+        qform_v17 .req q17
+        qform_v18 .req q18
+        qform_v19 .req q19
+        qform_v20 .req q20
+        qform_v21 .req q21
+        qform_v22 .req q22
+        qform_v23 .req q23
+        qform_v24 .req q24
+        qform_v25 .req q25
+        qform_v26 .req q26
+        qform_v27 .req q27
+        qform_v28 .req q28
+        qform_v29 .req q29
+        qform_v30 .req q30
+        qform_v31 .req q31
+
+        data0 .req v8
+        data1 .req v9
+        data2 .req v10
+        data3 .req v11
+        data4 .req v12
+        data5 .req v13
+        data6 .req v14
+        data7 .req v15
+
+        x_00 .req x10
+        x_01 .req x11
+        x_10 .req x12
+        x_11 .req x13
+        x_20 .req x14
+        x_21 .req x15
+        x_30 .req x16
+
x_31 .req x17 + + xt_00 .req x_00 + xt_01 .req x_20 + xt_10 .req x_10 + xt_11 .req x_30 + xt_20 .req x_01 + xt_21 .req x_21 + xt_30 .req x_11 + xt_31 .req x_31 + + qform_data0 .req q8 + qform_data1 .req q9 + qform_data2 .req q10 + qform_data3 .req q11 + qform_data4 .req q12 + qform_data5 .req q13 + qform_data6 .req q14 + qform_data7 .req q15 + + root0 .req v0 + root1 .req v1 + root2 .req v2 + root0_tw .req v4 + root1_tw .req v5 + root2_tw .req v6 + + consts .req v7 + qform_consts .req q7 + + qform_root0 .req q0 + qform_root1 .req q1 + qform_root2 .req q2 + qform_root0_tw .req q4 + qform_root1_tw .req q5 + qform_root2_tw .req q6 + + tmp .req v24 + t0 .req v25 + t1 .req v26 + t2 .req v27 + t3 .req v28 + + ASM_LOAD(r_ptr0, roots_l34) + ASM_LOAD(r_ptr1, roots_l56) + + ASM_LOAD(xtmp, const_addr) + ld1 {consts.8h}, [xtmp] + + save STACK0, in + + mov inp, in + mov count, #8 + + .p2align 2 + ldr q0, [x4, #64] // ....*............................................. + ldr q14, [x4, #16] // ........*......................................... + ldr q6, [x4, #80] // ...*.............................................. + ld4 {v16.4S, v17.4S, v18.4S, v19.4S}, [x1] // .*................................................ + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + ldr q9, [x4, #48] // *................................................. + ldr q2, [x4], #(6*16) // .....*............................................ + ldr q3, [x3], #16 // ...................................*.............. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. 
+ // gap // .................................................. + ldr q11, [x4, #-64] // ..*............................................... + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. 
+ // gap // .................................................. + // gap // .................................................. + add v22.8H, v18.8H, v19.8H // .......*.......................................... + sub v15.8H, v18.8H, v19.8H // ......*........................................... + add v31.8H, v16.8H, v17.8H // .........*........................................ + sub v5.8H, v16.8H, v17.8H // ..........*....................................... + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + mul v18.8H, v15.8H, v0.8H // ...........*...................................... + sqrdmulh v21.8H, v15.8H, v6.8H // ..............*................................... + mul v27.8H, v5.8H, v11.8H // .............*.................................... + sqrdmulh v11.8H, v5.8H, v9.8H // ............*..................................... + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + sub v16.8H, v31.8H, v22.8H // ...............*.................................. + // gap // .................................................. 
+ // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + add v0.8H, v31.8H, v22.8H // ....................*............................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + mls v27.8H, v11.8H, v7.H[0] // ...................*.............................. + mls v18.8H, v21.8H, v7.H[0] // ..................*............................... + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + mul v8.8H, v16.8H, v2.8H // .................*................................ + sqrdmulh v9.8H, v16.8H, v14.8H // ................*................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. 
+ // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + add v25.8H, v27.8H, v18.8H // ......................*........................... + sub v28.8H, v27.8H, v18.8H // .....................*............................ + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + mls v8.8H, v9.8H, v7.H[0] // .......................*.......................... + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + trn1 v22.4S, v0.4S, v25.4S // .........................*........................ + trn2 v20.4S, v0.4S, v25.4S // ........................*......................... + sqrdmulh v18.8H, v28.8H, v14.8H // ...........................*...................... + mul v29.8H, v28.8H, v2.8H // ..........................*....................... + // gap // .................................................. 
+ // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + mls v29.8H, v18.8H, v7.H[0] // ............................*..................... + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. 
+ // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + trn1 v4.4S, v8.4S, v29.4S // ..............................*................... + trn2 v29.4S, v8.4S, v29.4S // .............................*.................... + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + trn2 v8.2D, v22.2D, v4.2D // ................................*................. + trn1 v1.2D, v22.2D, v4.2D // ...............................*.................. 
+ trn2 v12.2D, v20.2D, v29.2D // ..................................*............... + trn1 v19.2D, v20.2D, v29.2D // .................................*................ + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + sub v6.8H, v8.8H, v12.8H // ......................................*........... + add v26.8H, v8.8H, v12.8H // .....................................*............ + sub v23.8H, v1.8H, v19.8H // ...............................................*.. + add v17.8H, v1.8H, v19.8H // ....................................*............. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. 
+ // gap // .................................................. + mul v27.8H, v6.8H, v3.H[4] // .........................................*........ + sqrdmulh v5.8H, v6.8H, v3.H[5] // ..........................................*....... + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + sqdmulh v24.8H, v26.8H, v7.H[1] // ........................................*......... + sqdmulh v8.8H, v17.8H, v7.H[1] // .......................................*.......... + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + srshr v14.8H, v8.8H, #11 // ...........................................*...... + srshr v6.8H, v24.8H, #11 // ............................................*..... + // gap // .................................................. + // gap // .................................................. 
+ // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + mls v26.8H, v6.8H, v7.H[0] // ..............................................*... + mls v17.8H, v14.8H, v7.H[0] // .............................................*.... + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. 
+ // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + add v11.8H, v17.8H, v26.8H // ................................................*. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + str q11, [x1], #(64) // .................................................* + // gap // .................................................. 
+ // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + + // original source code + // ldr q22, [x4, #48] // ....*............................................. + // ld4 {v11.4S, v12.4S, v13.4S, v14.4S}, [x1] // ...*.............................................. + // ldr q6, [x4, #32] // .......*.......................................... + // ldr q25, [x4, #80] // ..*............................................... + // ldr q19, [x4, #64] // *................................................. + // ldr q15, [x4], #(6*16) // .....*............................................ + // sub v8.8H, v13.8H, v14.8H // .........*........................................ + // add v17.8H, v13.8H, v14.8H // ........*......................................... + // ldr q18, [x4, #-80] // .*................................................ + // add v16.8H, v11.8H, v12.8H // ..........*....................................... + // sub v1.8H, v11.8H, v12.8H // ...........*...................................... + // mul v21.8H, v8.8H, v19.8H // ............*..................................... + // sqrdmulh v0.8H, v1.8H, v22.8H // ...............*.................................. + // mul v13.8H, v1.8H, v6.8H // ..............*................................... + // sqrdmulh v23.8H, v8.8H, v25.8H // .............*.................................... + // sub v24.8H, v16.8H, v17.8H // ................*................................. + // sqrdmulh v6.8H, v24.8H, v18.8H // .....................*............................ + // mul v1.8H, v24.8H, v15.8H // ....................*............................. 
+ // mls v21.8H, v23.8H, v7.H[0] // ...................*.............................. + // mls v13.8H, v0.8H, v7.H[0] // ..................*............................... + // add v0.8H, v16.8H, v17.8H // .................*................................ + // sub v9.8H, v13.8H, v21.8H // .......................*.......................... + // add v14.8H, v13.8H, v21.8H // ......................*........................... + // mls v1.8H, v6.8H, v7.H[0] // ........................*......................... + // trn2 v11.4S, v0.4S, v14.4S // ..........................*....................... + // trn1 v2.4S, v0.4S, v14.4S // .........................*........................ + // mul v25.8H, v9.8H, v15.8H // ............................*..................... + // sqrdmulh v23.8H, v9.8H, v18.8H // ...........................*...................... + // mls v25.8H, v23.8H, v7.H[0] // .............................*.................... + // trn2 v3.4S, v1.4S, v25.4S // ...............................*.................. + // trn1 v6.4S, v1.4S, v25.4S // ..............................*................... + // trn1 v4.2D, v2.2D, v6.2D // .................................*................ + // trn2 v13.2D, v2.2D, v6.2D // ................................*................. + // trn1 v30.2D, v11.2D, v3.2D // ...................................*.............. + // trn2 v25.2D, v11.2D, v3.2D // ..................................*............... + // ldr q3, [x3], #16 // ......*........................................... + // add v17.8H, v4.8H, v30.8H // .......................................*.......... + // add v26.8H, v13.8H, v25.8H // .....................................*............ + // sub v16.8H, v13.8H, v25.8H // ....................................*............. + // sqdmulh v23.8H, v17.8H, v7.H[1] // ...........................................*...... + // sqdmulh v20.8H, v26.8H, v7.H[1] // ..........................................*....... 
+ // mul v27.8H, v16.8H, v3.H[4] // ........................................*......... + // sqrdmulh v5.8H, v16.8H, v3.H[5] // .........................................*........ + // srshr v23.8H, v23.8H, #11 // ............................................*..... + // srshr v16.8H, v20.8H, #11 // .............................................*.... + // mls v17.8H, v23.8H, v7.H[0] // ...............................................*.. + // mls v26.8H, v16.8H, v7.H[0] // ..............................................*... + // sub v23.8H, v4.8H, v30.8H // ......................................*........... + // add v2.8H, v17.8H, v26.8H // ................................................*. + // str q2, [x1], #(64) // .................................................* + + sub count, count, #1 +layer4567_start: + ldr q22, [x4, #48] // ....e............................................................. + ld4 {v11.4S, v12.4S, v13.4S, v14.4S}, [x1] // e................................................................. + mls v27.8H, v5.8H, v7.H[0] // .............................................*.................... + sqrdmulh v5.8H, v23.8H, v3.H[3] // .......................................*.......................... + // gap // .................................................................. + // gap // .................................................................. + // gap // .................................................................. + sub v2.8H, v17.8H, v26.8H // ....................................................*............. + ldr q6, [x4, #32] // ...e.............................................................. + // gap // .................................................................. + // gap // .................................................................. + // gap // .................................................................. + // gap // .................................................................. 
+ // gap // .................................................................. + // gap // .................................................................. + mul v20.8H, v23.8H, v3.H[2] // ......................................*........................... + ldr q25, [x4, #80] // ......e........................................................... + mul v31.8H, v2.8H, v3.H[0] // ......................................................*........... + // gap // .................................................................. + // gap // .................................................................. + // gap // .................................................................. + // gap // .................................................................. + // gap // .................................................................. + // gap // .................................................................. + ldr q19, [x4, #64] // .....e............................................................ + sqrdmulh v9.8H, v2.8H, v3.H[1] // .......................................................*.......... + // gap // .................................................................. + // gap // .................................................................. + // gap // .................................................................. + // gap // .................................................................. + // gap // .................................................................. + // gap // .................................................................. + mls v20.8H, v5.8H, v7.H[0] // ........................................*......................... + // gap // .................................................................. + // gap // .................................................................. + // gap // .................................................................. 
+ // gap // .................................................................. + // gap // .................................................................. + // gap // .................................................................. + // gap // .................................................................. + ldr q15, [x4], #(6*16) // .e................................................................ + // gap // .................................................................. + // gap // .................................................................. + // gap // .................................................................. + // gap // .................................................................. + // gap // .................................................................. + // gap // .................................................................. + // gap // .................................................................. + sub v8.8H, v13.8H, v14.8H // ............e..................................................... + add v17.8H, v13.8H, v14.8H // .............e.................................................... + ldr q18, [x4, #-80] // ..e............................................................... + add v16.8H, v11.8H, v12.8H // ........e......................................................... + sub v1.8H, v11.8H, v12.8H // .......e.......................................................... + // gap // .................................................................. + // gap // .................................................................. + // gap // .................................................................. + mls v31.8H, v9.8H, v7.H[0] // ........................................................*......... + // gap // .................................................................. + // gap // .................................................................. 
+ // gap // .................................................................. + // gap // .................................................................. + // gap // .................................................................. + // gap // .................................................................. + sub v2.8H, v20.8H, v27.8H // .........................................................*........ + mul v21.8H, v8.8H, v19.8H // ..............e................................................... + sqrdmulh v0.8H, v1.8H, v22.8H // ..........e....................................................... + mul v13.8H, v1.8H, v6.8H // .........e........................................................ + sqrdmulh v23.8H, v8.8H, v25.8H // ...............e.................................................. + // gap // .................................................................. + // gap // .................................................................. + // gap // .................................................................. + // gap // .................................................................. + sqrdmulh v29.8H, v2.8H, v3.H[1] // ............................................................*..... + mul v8.8H, v2.8H, v3.H[0] // ...........................................................*...... + sub v24.8H, v16.8H, v17.8H // .................e................................................ + // gap // .................................................................. + // gap // .................................................................. + // gap // .................................................................. + // gap // .................................................................. + // gap // .................................................................. + // gap // .................................................................. + // gap // .................................................................. 
+ // gap // .................................................................. + // gap // .................................................................. + // gap // .................................................................. + // gap // .................................................................. + // gap // .................................................................. + // gap // .................................................................. + sqrdmulh v6.8H, v24.8H, v18.8H // ....................e............................................. + mul v1.8H, v24.8H, v15.8H // ...................e.............................................. + mls v21.8H, v23.8H, v7.H[0] // ................e................................................. + mls v13.8H, v0.8H, v7.H[0] // ...........e...................................................... + // gap // .................................................................. + // gap // .................................................................. + // gap // .................................................................. + // gap // .................................................................. + add v0.8H, v16.8H, v17.8H // ..................e............................................... + // gap // .................................................................. + // gap // .................................................................. + // gap // .................................................................. + // gap // .................................................................. + // gap // .................................................................. + // gap // .................................................................. + // gap // .................................................................. + mls v8.8H, v29.8H, v7.H[0] // .............................................................*.... 
+ // gap // .................................................................. + // gap // .................................................................. + // gap // .................................................................. + // gap // .................................................................. + // gap // .................................................................. + // gap // .................................................................. + // gap // .................................................................. + sub v9.8H, v13.8H, v21.8H // ......................e........................................... + add v14.8H, v13.8H, v21.8H // .......................e.......................................... + mls v1.8H, v6.8H, v7.H[0] // .....................e............................................ + // gap // .................................................................. + // gap // .................................................................. + // gap // .................................................................. + // gap // .................................................................. + // gap // .................................................................. + // gap // .................................................................. + // gap // .................................................................. + // gap // .................................................................. + // gap // .................................................................. + // gap // .................................................................. + // gap // .................................................................. + // gap // .................................................................. + // gap // .................................................................. + trn2 v11.4S, v0.4S, v14.4S // ............................e..................................... 
+ trn1 v2.4S, v0.4S, v14.4S // ...........................e...................................... + mul v25.8H, v9.8H, v15.8H // ........................e......................................... + sqrdmulh v23.8H, v9.8H, v18.8H // .........................e........................................ + // gap // .................................................................. + // gap // .................................................................. + // gap // .................................................................. + // gap // .................................................................. + add v0.8H, v20.8H, v27.8H // ..........................................................*....... + // gap // .................................................................. + // gap // .................................................................. + // gap // .................................................................. + // gap // .................................................................. + // gap // .................................................................. + // gap // .................................................................. + // gap // .................................................................. + // gap // .................................................................. + // gap // .................................................................. + // gap // .................................................................. + // gap // .................................................................. + // gap // .................................................................. + // gap // .................................................................. + // gap // .................................................................. + // gap // .................................................................. + mls v25.8H, v23.8H, v7.H[0] // ..........................e....................................... 
+ str q0, [x1, #-48] // ...............................................................*.. + // gap // .................................................................. + // gap // .................................................................. + // gap // .................................................................. + // gap // .................................................................. + // gap // .................................................................. + // gap // .................................................................. + str q31, [x1, #-32] // ................................................................*. + // gap // .................................................................. + // gap // .................................................................. + // gap // .................................................................. + // gap // .................................................................. + // gap // .................................................................. + // gap // .................................................................. + // gap // .................................................................. + str q8, [x1, #-16] // .................................................................* + // gap // .................................................................. + // gap // .................................................................. + // gap // .................................................................. + // gap // .................................................................. + // gap // .................................................................. + // gap // .................................................................. + // gap // .................................................................. + trn2 v3.4S, v1.4S, v25.4S // ..............................e................................... 
+ trn1 v6.4S, v1.4S, v25.4S // .............................e.................................... + // gap // .................................................................. + // gap // .................................................................. + // gap // .................................................................. + // gap // .................................................................. + // gap // .................................................................. + // gap // .................................................................. + // gap // .................................................................. + // gap // .................................................................. + // gap // .................................................................. + // gap // .................................................................. + // gap // .................................................................. + // gap // .................................................................. + // gap // .................................................................. + // gap // .................................................................. + trn1 v4.2D, v2.2D, v6.2D // .................................e................................ + trn2 v13.2D, v2.2D, v6.2D // ...............................e.................................. + // gap // .................................................................. + // gap // .................................................................. + // gap // .................................................................. + trn1 v30.2D, v11.2D, v3.2D // ..................................e............................... + trn2 v25.2D, v11.2D, v3.2D // ................................e................................. + ldr q3, [x3], #16 // ...................................e.............................. 
+ // gap // .................................................................. + // gap // .................................................................. + // gap // .................................................................. + // gap // .................................................................. + // gap // .................................................................. + // gap // .................................................................. + // gap // .................................................................. + // gap // .................................................................. + add v17.8H, v4.8H, v30.8H // .....................................e............................ + // gap // .................................................................. + // gap // .................................................................. + // gap // .................................................................. + // gap // .................................................................. + // gap // .................................................................. + // gap // .................................................................. + add v26.8H, v13.8H, v25.8H // ..........................................e....................... + sub v16.8H, v13.8H, v25.8H // .........................................e........................ + // gap // .................................................................. + // gap // .................................................................. + // gap // .................................................................. + // gap // .................................................................. + // gap // .................................................................. + // gap // .................................................................. + // gap // .................................................................. 
+ sqdmulh v23.8H, v17.8H, v7.H[1] // ..............................................e................... + // gap // .................................................................. + // gap // .................................................................. + // gap // .................................................................. + // gap // .................................................................. + // gap // .................................................................. + // gap // .................................................................. + sqdmulh v20.8H, v26.8H, v7.H[1] // .................................................e................ + mul v27.8H, v16.8H, v3.H[4] // ...........................................e...................... + // gap // .................................................................. + // gap // .................................................................. + // gap // .................................................................. + // gap // .................................................................. + // gap // .................................................................. + // gap // .................................................................. + // gap // .................................................................. + sqrdmulh v5.8H, v16.8H, v3.H[5] // ............................................e..................... + // gap // .................................................................. + // gap // .................................................................. + // gap // .................................................................. + // gap // .................................................................. + // gap // .................................................................. + // gap // .................................................................. + // gap // .................................................................. 
+ srshr v23.8H, v23.8H, #11 // ...............................................e.................. + srshr v16.8H, v20.8H, #11 // ..................................................e............... + // gap // .................................................................. + // gap // .................................................................. + // gap // .................................................................. + // gap // .................................................................. + // gap // .................................................................. + // gap // .................................................................. + // gap // .................................................................. + // gap // .................................................................. + // gap // .................................................................. + // gap // .................................................................. + // gap // .................................................................. + // gap // .................................................................. + // gap // .................................................................. + // gap // .................................................................. + // gap // .................................................................. + // gap // .................................................................. + // gap // .................................................................. + // gap // .................................................................. + // gap // .................................................................. + // gap // .................................................................. + // gap // .................................................................. + // gap // .................................................................. 
+ mls v17.8H, v23.8H, v7.H[0] // ................................................e................. + mls v26.8H, v16.8H, v7.H[0] // ...................................................e.............. + // gap // .................................................................. + // gap // .................................................................. + // gap // .................................................................. + // gap // .................................................................. + // gap // .................................................................. + // gap // .................................................................. + sub v23.8H, v4.8H, v30.8H // ....................................e............................. + // gap // .................................................................. + // gap // .................................................................. + // gap // .................................................................. + // gap // .................................................................. + // gap // .................................................................. + // gap // .................................................................. + // gap // .................................................................. + // gap // .................................................................. + // gap // .................................................................. + // gap // .................................................................. + // gap // .................................................................. + // gap // .................................................................. + // gap // .................................................................. + // gap // .................................................................. + // gap // .................................................................. 
+ // gap // .................................................................. + // gap // .................................................................. + // gap // .................................................................. + // gap // .................................................................. + // gap // .................................................................. + // gap // .................................................................. + // gap // .................................................................. + add v2.8H, v17.8H, v26.8H // .....................................................e............ + // gap // .................................................................. + // gap // .................................................................. + // gap // .................................................................. + // gap // .................................................................. + // gap // .................................................................. + // gap // .................................................................. + // gap // .................................................................. + // gap // .................................................................. + str q2, [x1], #(64) // ..............................................................e... + // gap // .................................................................. + // gap // .................................................................. + // gap // .................................................................. + // gap // .................................................................. + // gap // .................................................................. + // gap // .................................................................. + // gap // .................................................................. 
+ + // original source code + // ld4 {v8.4S, v9.4S, v10.4S, v11.4S}, [x1] // .e................................................................|e........................................... + // ldr q0, [x4], #(6*16) // ............e.....................................................|...........e................................ + // ldr q4, [x4, #(-6*16 + 1*16)] // ...............e..................................................|..............e............................. + // ldr q1, [x4, #(-6*16 + 2*16)] // .....e............................................................|....e....................................... + // ldr q5, [x4, #(-6*16 + 3*16)] // e.................................................................e............................................ + // ldr q2, [x4, #(-6*16 + 4*16)] // .........e........................................................|........e................................... + // ldr q6, [x4, #(-6*16 + 5*16)] // .......e..........................................................|......e..................................... + // sub v24.8h, v8.8h, v9.8h // .................e................................................|................e........................... + // add v8.8h, v8.8h, v9.8h // ................e.................................................|...............e............................ + // mul v9.8h, v24.8h, v1.8h // ......................e...........................................|.....................e...................... + // sqrdmulh v24.8h, v24.8h, v5.8h // .....................e............................................|....................e....................... + // mls v9.8h, v24.8h, v7.h[0] // ..............................e...................................|.............................e.............. + // sub v24.8h, v10.8h, v11.8h // .............e....................................................|............e............................... 
+ // add v10.8h, v10.8h, v11.8h // ..............e...................................................|.............e.............................. + // mul v11.8h, v24.8h, v2.8h // ....................e.............................................|...................e........................ + // sqrdmulh v24.8h, v24.8h, v6.8h // .......................e..........................................|......................e..................... + // mls v11.8h, v24.8h, v7.h[0] // .............................e....................................|............................e............... + // sub v24.8h, v8.8h, v10.8h // ..........................e.......................................|.........................e.................. + // add v8.8h, v8.8h, v10.8h // ...............................e..................................|..............................e............. + // mul v10.8h, v24.8h, v0.8h // ............................e.....................................|...........................e................ + // sqrdmulh v24.8h, v24.8h, v4.8h // ...........................e......................................|..........................e................. + // mls v10.8h, v24.8h, v7.h[0] // ...................................e..............................|..................................e......... + // sub v24.8h, v9.8h, v11.8h // .................................e................................|................................e........... + // add v9.8h, v9.8h, v11.8h // ..................................e...............................|.................................e.......... + // mul v11.8h, v24.8h, v0.8h // ......................................e...........................|.....................................e...... + // sqrdmulh v24.8h, v24.8h, v4.8h // .......................................e..........................|......................................e..... 
+ // mls v11.8h, v24.8h, v7.h[0] // .........................................e........................|........................................e... + // trn1 v25.4s, v8.4s, v9.4s // .....................................e............................|....................................e....... + // trn2 v26.4s, v8.4s, v9.4s // ....................................e.............................|...................................e........ + // trn1 v27.4s, v10.4s, v11.4s // ..............................................e...................|............................................ + // trn2 v28.4s, v10.4s, v11.4s // .............................................e....................|............................................ + // trn2 v10.2d, v25.2d, v27.2d // ................................................e.................|............................................ + // trn2 v11.2d, v26.2d, v28.2d // ..................................................e...............|............................................ + // trn1 v8.2d, v25.2d, v27.2d // ...............................................e..................|............................................ + // trn1 v9.2d, v26.2d, v28.2d // .................................................e................|............................................ + // ldr q0, [x3], #16 // ...................................................e..............|............................................ + // sub v24.8h, v8.8h, v9.8h // ...............................................................e..|............................................ + // add v8.8h, v8.8h, v9.8h // ....................................................e.............|............................................ + // mul v9.8h, v24.8h, v0.h[2] // ......*...........................................................|.....*...................................... 
+ // sqrdmulh v24.8h, v24.8h, v0.h[3] // ...*..............................................................|..*......................................... + // mls v9.8h, v24.8h, v7.h[0] // ...........*......................................................|..........*................................. + // sub v24.8h, v10.8h, v11.8h // ......................................................e...........|............................................ + // add v10.8h, v10.8h, v11.8h // .....................................................e............|............................................ + // mul v11.8h, v24.8h, v0.h[4] // .........................................................e........|............................................ + // sqrdmulh v24.8h, v24.8h, v0.h[5] // ..........................................................e.......|............................................ + // mls v11.8h, v24.8h, v7.h[0] // ..*...............................................................|.*.......................................... + // sqdmulh v25.8h, v8.8h, v7.h[1] // .......................................................e..........|............................................ + // srshr v25.8h, v25.8h, #11 // ...........................................................e......|............................................ + // mls v8.8h, v25.8h, v7.h[0] // .............................................................e....|............................................ + // sqdmulh v25.8h, v10.8h, v7.h[1] // ........................................................e.........|............................................ + // srshr v25.8h, v25.8h, #11 // ............................................................e.....|............................................ + // mls v10.8h, v25.8h, v7.h[0] // ..............................................................e...|............................................ 
+ // sub v24.8h, v8.8h, v10.8h // ....*.............................................................|...*........................................ + // add v8.8h, v8.8h, v10.8h // ................................................................e.|............................................ + // mul v10.8h, v24.8h, v0.h[0] // ........*.........................................................|.......*.................................... + // sqrdmulh v24.8h, v24.8h, v0.h[1] // ..........*.......................................................|.........*.................................. + // mls v10.8h, v24.8h, v7.h[0] // ..................*...............................................|.................*.......................... + // sub v24.8h, v9.8h, v11.8h // ...................*..............................................|..................*......................... + // add v9.8h, v9.8h, v11.8h // ........................................*.........................|.......................................*.... + // mul v11.8h, v24.8h, v0.h[0] // .........................*........................................|........................*................... + // sqrdmulh v24.8h, v24.8h, v0.h[1] // ........................*.........................................|.......................*.................... + // mls v11.8h, v24.8h, v7.h[0] // ................................*.................................|...............................*............ + // str q8, [x1], #(64) // .................................................................e|............................................ + // str q9, [x1, #(-64 + 16*1)] // ..........................................*.......................|.........................................*.. + // str q10, [x1, #(-64 + 16*2)] // ...........................................*......................|..........................................*. 
+ // str q11, [x1, #(-64 + 16*3)] // ............................................*.....................|...........................................* + + sub count, count, #1 + cbnz count, layer4567_start + mul v20.8H, v23.8H, v3.H[2] // ...*............ + // gap // ................ + // gap // ................ + sqrdmulh v10.8H, v23.8H, v3.H[3] // .*.............. + // gap // ................ + // gap // ................ + // gap // ................ + // gap // ................ + mls v27.8H, v5.8H, v7.H[0] // *............... + // gap // ................ + // gap // ................ + // gap // ................ + // gap // ................ + // gap // ................ + // gap // ................ + // gap // ................ + sub v23.8H, v17.8H, v26.8H // ..*............. + // gap // ................ + // gap // ................ + // gap // ................ + // gap // ................ + // gap // ................ + // gap // ................ + // gap // ................ + // gap // ................ + mls v20.8H, v10.8H, v7.H[0] // ......*......... + // gap // ................ + // gap // ................ + // gap // ................ + // gap // ................ + // gap // ................ + // gap // ................ + // gap // ................ + // gap // ................ + // gap // ................ + // gap // ................ + // gap // ................ + // gap // ................ + // gap // ................ + // gap // ................ + mul v15.8H, v23.8H, v3.H[0] // ....*........... + // gap // ................ + // gap // ................ + // gap // ................ + // gap // ................ + // gap // ................ + // gap // ................ + // gap // ................ + add v13.8H, v20.8H, v27.8H // ............*... + sub v20.8H, v20.8H, v27.8H // ........*....... + // gap // ................ + // gap // ................ + // gap // ................ + // gap // ................ + // gap // ................ + // gap // ................ 
+ // gap // ................ + sqrdmulh v2.8H, v23.8H, v3.H[1] // .....*.......... + // gap // ................ + // gap // ................ + // gap // ................ + // gap // ................ + // gap // ................ + // gap // ................ + mul v29.8H, v20.8H, v3.H[0] // ..........*..... + sqrdmulh v22.8H, v20.8H, v3.H[1] // .........*...... + // gap // ................ + // gap // ................ + // gap // ................ + // gap // ................ + // gap // ................ + // gap // ................ + // gap // ................ + // gap // ................ + // gap // ................ + // gap // ................ + // gap // ................ + // gap // ................ + // gap // ................ + // gap // ................ + mls v15.8H, v2.8H, v7.H[0] // .......*........ + // gap // ................ + // gap // ................ + // gap // ................ + // gap // ................ + // gap // ................ + // gap // ................ + // gap // ................ + str q13, [x1, #-48] // .............*.. + mls v29.8H, v22.8H, v7.H[0] // ...........*.... + // gap // ................ + // gap // ................ + // gap // ................ + // gap // ................ + // gap // ................ + // gap // ................ + // gap // ................ + // gap // ................ + // gap // ................ + // gap // ................ + // gap // ................ + // gap // ................ + // gap // ................ + // gap // ................ + str q15, [x1, #-32] // ..............*. + // gap // ................ + // gap // ................ + // gap // ................ + // gap // ................ + // gap // ................ + // gap // ................ + // gap // ................ + str q29, [x1, #-16] // ...............* + // gap // ................ + // gap // ................ + // gap // ................ + // gap // ................ + // gap // ................ + // gap // ................ 
+ // gap // ................ + + // original source code + // mls v27.8H, v5.8H, v7.H[0] // ..*............. + // sqrdmulh v5.8H, v23.8H, v3.H[3] // .*.............. + // sub v2.8H, v17.8H, v26.8H // ...*............ + // mul v20.8H, v23.8H, v3.H[2] // *............... + // mul v31.8H, v2.8H, v3.H[0] // .....*.......... + // sqrdmulh v9.8H, v2.8H, v3.H[1] // ........*....... + // mls v20.8H, v5.8H, v7.H[0] // ....*........... + // mls v31.8H, v9.8H, v7.H[0] // ...........*.... + // sub v2.8H, v20.8H, v27.8H // .......*........ + // sqrdmulh v29.8H, v2.8H, v3.H[1] // ..........*..... + // mul v8.8H, v2.8H, v3.H[0] // .........*...... + // mls v8.8H, v29.8H, v7.H[0] // .............*.. + // add v0.8H, v20.8H, v27.8H // ......*......... + // str q0, [x1, #-48] // ............*... + // str q31, [x1, #-32] // ..............*. + // str q8, [x1, #-16] // ...............* + + + // --------------------------------------------------------------------- + + ninv .req v29 + ninv_tw .req v30 + + ASM_LOAD(xtmp, ninv_addr) + ld1r {ninv.8h}, [xtmp] + ASM_LOAD(xtmp, ninv_tw_addr) + ld1r {ninv_tw.8h}, [xtmp] + + mov count, #4 + ASM_LOAD(r_ptr0, roots_l012) + load_roots_123 + + .p2align 2 + + ldr q2, [x0, #320] // .*........................................ + ldr q16, [x0, #256] // ..*....................................... + ldr q23, [x0, #384] // *......................................... + // gap // .......................................... + // gap // .......................................... + // gap // .......................................... + // gap // .......................................... + // gap // .......................................... + ldr q21, [x0, #448] // .....*.................................... + ldr q26, [x0, #192] // ...*...................................... + ldr q20, [x0, #128] // ....*..................................... + // gap // .......................................... + // gap // .......................................... 
+ // gap // .......................................... + // gap // .......................................... + // gap // .......................................... + ldr q17, [x0, #0] // ......*................................... + ldr q4, [x0, #64] // .......*.................................. + // gap // .......................................... + // gap // .......................................... + // gap // .......................................... + // gap // .......................................... + // gap // .......................................... + // gap // .......................................... + // gap // .......................................... + // gap // .......................................... + // gap // .......................................... + // gap // .......................................... + // gap // .......................................... + // gap // .......................................... + // gap // .......................................... + // gap // .......................................... + sub v25.8H, v16.8H, v2.8H // .........*................................ + add v2.8H, v16.8H, v2.8H // ........*................................. + // gap // .......................................... + // gap // .......................................... + // gap // .......................................... + // gap // .......................................... + // gap // .......................................... + // gap // .......................................... + sub v16.8H, v23.8H, v21.8H // ..........*............................... + add v23.8H, v23.8H, v21.8H // ............*............................. + sub v21.8H, v20.8H, v26.8H // ...........*.............................. + add v26.8H, v20.8H, v26.8H // ...........................*.............. + // gap // .......................................... + // gap // .......................................... 
+ // gap // .......................................... + // gap // .......................................... + sub v20.8H, v17.8H, v4.8H // .............*............................ + add v17.8H, v17.8H, v4.8H // ..............*........................... + sqrdmulh v4.8H, v25.8H, v1.H[3] // ...............*.......................... + mul v13.8H, v25.8H, v1.H[2] // ...................*...................... + // gap // .......................................... + // gap // .......................................... + // gap // .......................................... + // gap // .......................................... + mul v3.8H, v16.8H, v1.H[4] // .................*........................ + sqrdmulh v16.8H, v16.8H, v1.H[5] // ..................*....................... + sqrdmulh v11.8H, v21.8H, v1.H[1] // ................*......................... + mul v21.8H, v21.8H, v1.H[0] // .......................*.................. + // gap // .......................................... + // gap // .......................................... + // gap // .......................................... + // gap // .......................................... + sub v10.8H, v2.8H, v23.8H // ............................*............. + add v25.8H, v2.8H, v23.8H // .............................*............ + mul v19.8H, v20.8H, v0.H[6] // ....................*..................... + sqrdmulh v23.8H, v20.8H, v0.H[7] // .....................*.................... + // gap // .......................................... + // gap // .......................................... + // gap // .......................................... + // gap // .......................................... + add v28.8H, v17.8H, v26.8H // ...............................*.......... + sub v9.8H, v17.8H, v26.8H // ....................................*..... + mls v13.8H, v4.8H, v7.H[0] // ......................*................... + // gap // .......................................... 
+ // gap // .......................................... + // gap // .......................................... + // gap // .......................................... + // gap // .......................................... + mls v3.8H, v16.8H, v7.H[0] // ........................*................. + mls v21.8H, v11.8H, v7.H[0] // ..........................*............... + sqdmulh v2.8H, v25.8H, v7.H[1] // ...................................*...... + sqrdmulh v22.8H, v10.8H, v0.H[5] // .....................................*.... + // gap // .......................................... + // gap // .......................................... + // gap // .......................................... + // gap // .......................................... + mul v11.8H, v10.8H, v0.H[4] // .........................................* + mls v19.8H, v23.8H, v7.H[0] // .........................*................ + // gap // .......................................... + // gap // .......................................... + // gap // .......................................... + // gap // .......................................... + // gap // .......................................... + // gap // .......................................... + // gap // .......................................... + // gap // .......................................... + // gap // .......................................... + // gap // .......................................... + // gap // .......................................... + // gap // .......................................... + // gap // .......................................... + // gap // .......................................... + add v23.8H, v13.8H, v3.8H // ..............................*........... + sub v26.8H, v13.8H, v3.8H // .................................*........ + // gap // .......................................... + // gap // .......................................... + // gap // .......................................... 
+ // gap // .......................................... + // gap // .......................................... + // gap // .......................................... + sub v16.8H, v19.8H, v21.8H // ..................................*....... + add v12.8H, v19.8H, v21.8H // ................................*......... + // gap // .......................................... + // gap // .......................................... + // gap // .......................................... + // gap // .......................................... + // gap // .......................................... + // gap // .......................................... + // gap // .......................................... + // gap // .......................................... + // gap // .......................................... + // gap // .......................................... + // gap // .......................................... + // gap // .......................................... + // gap // .......................................... + // gap // .......................................... + sqrdmulh v18.8H, v16.8H, v0.H[3] // ......................................*... + mul v3.8H, v16.8H, v0.H[2] // .......................................*.. + add v20.8H, v12.8H, v23.8H // ........................................*. + // gap // .......................................... + // gap // .......................................... + // gap // .......................................... + // gap // .......................................... + // gap // .......................................... + + // original source code + // ldr q6, [x0, #384] // ..*....................................... + // ldr q9, [x0, #320] // *......................................... + // ldr q21, [x0, #256] // .*........................................ + // ldr q8, [x0, #192] // ....*..................................... + // ldr q17, [x0, #128] // .....*.................................... 
+ // ldr q13, [x0, #448] // ...*...................................... + // ldr q5, [x0, #0] // ......*................................... + // ldr q22, [x0, #64] // .......*.................................. + // add v24.8H, v21.8H, v9.8H // .........*................................ + // sub v26.8H, v21.8H, v9.8H // ........*................................. + // sub v9.8H, v6.8H, v13.8H // ..........*............................... + // sub v14.8H, v17.8H, v8.8H // ............*............................. + // add v27.8H, v6.8H, v13.8H // ...........*.............................. + // sub v31.8H, v5.8H, v22.8H // ..............*........................... + // add v6.8H, v5.8H, v22.8H // ...............*.......................... + // sqrdmulh v12.8H, v26.8H, v1.H[3] // ................*......................... + // sqrdmulh v13.8H, v14.8H, v1.H[1] // ....................*..................... + // mul v5.8H, v9.8H, v1.H[4] // ..................*....................... + // sqrdmulh v21.8H, v9.8H, v1.H[5] // ...................*...................... + // mul v9.8H, v26.8H, v1.H[2] // .................*........................ + // mul v2.8H, v31.8H, v0.H[6] // ........................*................. + // sqrdmulh v26.8H, v31.8H, v0.H[7] // .........................*................ + // mls v9.8H, v12.8H, v7.H[0] // ............................*............. + // mul v14.8H, v14.8H, v1.H[0] // .....................*.................... + // mls v5.8H, v21.8H, v7.H[0] // .............................*............ + // mls v2.8H, v26.8H, v7.H[0] // ..................................*....... + // mls v14.8H, v13.8H, v7.H[0] // ..............................*........... + // add v8.8H, v17.8H, v8.8H // .............*............................ + // sub v13.8H, v24.8H, v27.8H // ......................*................... + // add v25.8H, v24.8H, v27.8H // .......................*.................. 
+ // add v23.8H, v9.8H, v5.8H // ...................................*...... + // add v28.8H, v6.8H, v8.8H // ..........................*............... + // add v12.8H, v2.8H, v14.8H // ......................................*... + // sub v26.8H, v9.8H, v5.8H // ....................................*..... + // sub v19.8H, v2.8H, v14.8H // .....................................*.... + // sqdmulh v2.8H, v25.8H, v7.H[1] // ...............................*.......... + // sub v9.8H, v6.8H, v8.8H // ...........................*.............. + // sqrdmulh v22.8H, v13.8H, v0.H[5] // ................................*......... + // sqrdmulh v18.8H, v19.8H, v0.H[3] // .......................................*.. + // mul v3.8H, v19.8H, v0.H[2] // ........................................*. + // add v20.8H, v12.8H, v23.8H // .........................................* + // mul v11.8H, v13.8H, v0.H[4] // .................................*........ + + sub count, count, #1 +layer123_start: + sub v27.8H, v12.8H, v23.8H // ...........................................................*.................................. + sqrdmulh v16.8H, v9.8H, v0.H[3] // ...............................*.............................................................. + sqdmulh v24.8H, v28.8H, v7.H[1] // ................................................*............................................. + mul v4.8H, v9.8H, v0.H[2] // ..............................*............................................................... + ldr q6, [x0, #400] // ......e....................................................................................... + ldr q9, [x0, #336] // .....e........................................................................................ + // gap // .............................................................................................. + ldr q21, [x0, #272] // ....e......................................................................................... 
+ mul v10.8H, v20.8H, v29.8H // .................................................................................*............ + sqrdmulh v23.8H, v20.8H, v30.8H // ..................................................................................*........... + mul v20.8H, v26.8H, v0.H[4] // .............................................*................................................ + ldr q8, [x0, #208] // ...e.......................................................................................... + ldr q17, [x0, #144] // ..e........................................................................................... + ldr q13, [x0, #464] // .......e...................................................................................... + // gap // .............................................................................................. + srshr v12.8H, v2.8H, #11 // ....................................................*......................................... + sqrdmulh v19.8H, v27.8H, v0.H[1] // ..............................................................*............................... + ldr q5, [x0, #16] // e............................................................................................. + sqrdmulh v31.8H, v26.8H, v0.H[5] // ..............................................*............................................... + mls v11.8H, v22.8H, v7.H[0] // ..........................................*................................................... + ldr q22, [x0, #80] // .e............................................................................................ + // gap // .............................................................................................. + // gap // .............................................................................................. + // gap // .............................................................................................. 
+ mul v15.8H, v27.8H, v0.H[0] // .............................................................*................................ + mls v4.8H, v16.8H, v7.H[0] // ................................*............................................................. + mls v3.8H, v18.8H, v7.H[0] // .....................................*........................................................ + // gap // .............................................................................................. + srshr v16.8H, v24.8H, #11 // .................................................*............................................ + // gap // .............................................................................................. + // gap // .............................................................................................. + // gap // .............................................................................................. + add v24.8H, v21.8H, v9.8H // ...................e.......................................................................... + sub v26.8H, v21.8H, v9.8H // ..................e........................................................................... + mls v25.8H, v12.8H, v7.H[0] // .....................................................*........................................ + // gap // .............................................................................................. + // gap // .............................................................................................. + // gap // .............................................................................................. + // gap // .............................................................................................. + mls v10.8H, v23.8H, v7.H[0] // ...................................................................................*.......... + sub v9.8H, v6.8H, v13.8H // .......................e...................................................................... 
+ mls v20.8H, v31.8H, v7.H[0] // ...............................................*.............................................. + sub v14.8H, v17.8H, v8.8H // .............e................................................................................ + add v27.8H, v6.8H, v13.8H // ........................e..................................................................... + // gap // .............................................................................................. + // gap // .............................................................................................. + // gap // .............................................................................................. + // gap // .............................................................................................. + add v18.8H, v4.8H, v11.8H // .................................................................*............................ + sub v31.8H, v5.8H, v22.8H // ........e..................................................................................... + add v6.8H, v5.8H, v22.8H // .........e.................................................................................... + sqrdmulh v12.8H, v26.8H, v1.H[3] // .....................e........................................................................ + // gap // .............................................................................................. + // gap // .............................................................................................. + // gap // .............................................................................................. + // gap // .............................................................................................. + sqrdmulh v13.8H, v14.8H, v1.H[1] // ................e............................................................................. 
+ mul v5.8H, v9.8H, v1.H[4] // .........................e.................................................................... + str q10, [x0, #64] // ...........................................................................................*.. + sqrdmulh v21.8H, v9.8H, v1.H[5] // ..........................e................................................................... + mul v9.8H, v26.8H, v1.H[2] // ....................e......................................................................... + // gap // .............................................................................................. + // gap // .............................................................................................. + // gap // .............................................................................................. + sub v22.8H, v3.8H, v20.8H // .....................................................................*........................ + add v20.8H, v3.8H, v20.8H // ......................................................................*....................... + sub v11.8H, v4.8H, v11.8H // ................................................................*............................. + // gap // .............................................................................................. + // gap // .............................................................................................. + mls v28.8H, v16.8H, v7.H[0] // ..................................................*........................................... + // gap // .............................................................................................. + // gap // .............................................................................................. + mul v2.8H, v31.8H, v0.H[6] // ..........e................................................................................... 
+ mul v23.8H, v18.8H, v29.8H // ....................................................................................*......... + sqrdmulh v16.8H, v18.8H, v30.8H // .....................................................................................*........ + sqrdmulh v26.8H, v31.8H, v0.H[7] // ...........e.................................................................................. + // gap // .............................................................................................. + // gap // .............................................................................................. + // gap // .............................................................................................. + // gap // .............................................................................................. + sqrdmulh v10.8H, v20.8H, v30.8H // ........................................................................................*..... + mul v3.8H, v20.8H, v29.8H // .......................................................................................*...... + sqrdmulh v4.8H, v11.8H, v0.H[1] // ...................................................................*.......................... + mls v9.8H, v12.8H, v7.H[0] // ......................e....................................................................... + // gap // .............................................................................................. + // gap // .............................................................................................. + // gap // .............................................................................................. + // gap // .............................................................................................. + sqrdmulh v18.8H, v22.8H, v0.H[1] // ........................................................................*..................... 
+ add v12.8H, v28.8H, v25.8H // .......................................................*...................................... + sub v25.8H, v28.8H, v25.8H // ......................................................*....................................... + // gap // .............................................................................................. + // gap // .............................................................................................. + // gap // .............................................................................................. + // gap // .............................................................................................. + mul v14.8H, v14.8H, v1.H[0] // ...............e.............................................................................. + mls v5.8H, v21.8H, v7.H[0] // ...........................e.................................................................. + // gap // .............................................................................................. + mul v20.8H, v22.8H, v0.H[0] // .......................................................................*...................... + // gap // .............................................................................................. + // gap // .............................................................................................. + // gap // .............................................................................................. + mul v22.8H, v11.8H, v0.H[0] // ..................................................................*........................... + mls v23.8H, v16.8H, v7.H[0] // ......................................................................................*....... + mul v11.8H, v12.8H, v29.8H // ..............................................................................*............... 
+ sqrdmulh v31.8H, v25.8H, v0.H[1] // .........................................................*.................................... + mul v16.8H, v25.8H, v0.H[0] // ........................................................*..................................... + sqrdmulh v12.8H, v12.8H, v30.8H // ...............................................................................*.............. + // gap // .............................................................................................. + // gap // .............................................................................................. + // gap // .............................................................................................. + // gap // .............................................................................................. + mls v15.8H, v19.8H, v7.H[0] // ...............................................................*.............................. + mls v3.8H, v10.8H, v7.H[0] // .........................................................................................*.... + mls v2.8H, v26.8H, v7.H[0] // ............e................................................................................. + mls v14.8H, v13.8H, v7.H[0] // .................e............................................................................ + // gap // .............................................................................................. + // gap // .............................................................................................. + // gap // .............................................................................................. + // gap // .............................................................................................. + mls v22.8H, v4.8H, v7.H[0] // ....................................................................*......................... 
+ add v8.8H, v17.8H, v8.8H // ..............e............................................................................... + str q23, [x0, #128] // ............................................................................................*. + mls v20.8H, v18.8H, v7.H[0] // .........................................................................*.................... + sub v13.8H, v24.8H, v27.8H // ......................................e....................................................... + // gap // .............................................................................................. + // gap // .............................................................................................. + // gap // .............................................................................................. + mls v16.8H, v31.8H, v7.H[0] // ..........................................................*................................... + mls v11.8H, v12.8H, v7.H[0] // ................................................................................*............. + add v25.8H, v24.8H, v27.8H // .......................................e...................................................... + add v23.8H, v9.8H, v5.8H // ............................................e................................................. + // gap // .............................................................................................. + // gap // .............................................................................................. + // gap // .............................................................................................. + // gap // .............................................................................................. + add v28.8H, v6.8H, v8.8H // .............................e................................................................ 
+ str q3, [x0, #192] // .............................................................................................* + add v12.8H, v2.8H, v14.8H // ..................................e........................................................... + sub v26.8H, v9.8H, v5.8H // ...........................................e.................................................. + str q15, [x0, #320] // ...........................................................................*.................. + sub v19.8H, v2.8H, v14.8H // .................................e............................................................ + // gap // .............................................................................................. + // gap // .............................................................................................. + str q20, [x0, #448] // .............................................................................*................ + sqdmulh v2.8H, v25.8H, v7.H[1] // ...................................................e.......................................... + sub v9.8H, v6.8H, v8.8H // ............................e................................................................. + str q22, [x0, #384] // ............................................................................*................. + sqrdmulh v22.8H, v13.8H, v0.H[5] // .........................................e.................................................... + // gap // .............................................................................................. + // gap // .............................................................................................. + // gap // .............................................................................................. + sqrdmulh v18.8H, v19.8H, v0.H[3] // ....................................e......................................................... 
+ mul v3.8H, v19.8H, v0.H[2] // ...................................e.......................................................... + add v20.8H, v12.8H, v23.8H // ............................................................e................................. + str q16, [x0, #256] // ..........................................................................*................... + str q11, [x0], #(16) // ..........................................................................................*... + mul v11.8H, v13.8H, v0.H[4] // ........................................e..................................................... + // gap // .............................................................................................. + // gap // .............................................................................................. + + // original source code + // ldr q8, [x0, #0] // ...........e..............................................................................|..............e............................................................................. + // ldr q9, [x0, #(1*(512/8))] // ..............e...........................................................................|.................e.......................................................................... + // ldr q10, [x0, #(2*(512/8))] // .......e..................................................................................|..........e................................................................................. + // ldr q11, [x0, #(3*(512/8))] // ......e...................................................................................|.........e.................................................................................. + // ldr q12, [x0, #(4*(512/8))] // ..e.......................................................................................|.....e...................................................................................... 
+ // ldr q13, [x0, #(5*(512/8))] // .e........................................................................................|....e....................................................................................... + // ldr q14, [x0, #(6*(512/8))] // e.........................................................................................|...e........................................................................................ + // ldr q15, [x0, #(7*(512/8))] // ........e.................................................................................|...........e................................................................................ + // sub v24.8h, v8.8h, v9.8h // ............................e.............................................................|...............................e............................................................ + // add v8.8h, v8.8h, v9.8h // .............................e............................................................|................................e........................................................... + // mul v9.8h, v24.8h, v0.h[6] // ........................................e.................................................|...........................................e................................................ + // sqrdmulh v24.8h, v24.8h, v0.h[7] // ...........................................e..............................................|..............................................e............................................. + // mls v9.8h, v24.8h, v7.h[0] // ..............................................................e...........................|.................................................................e.......................... + // sub v24.8h, v10.8h, v11.8h // .........................e................................................................|............................e............................................................... 
+ // add v10.8h, v10.8h, v11.8h // .................................................................e........................|....................................................................e....................... + // mul v11.8h, v24.8h, v1.h[0] // ...................................................e......................................|......................................................e..................................... + // sqrdmulh v24.8h, v24.8h, v1.h[1] // ...............................e..........................................................|..................................e......................................................... + // mls v11.8h, v24.8h, v7.h[0] // ...............................................................e..........................|..................................................................e......................... + // sub v24.8h, v12.8h, v13.8h // ....................e.....................................................................|.......................e.................................................................... + // add v12.8h, v12.8h, v13.8h // ...................e......................................................................|......................e..................................................................... + // mul v13.8h, v24.8h, v1.h[2] // ...................................e......................................................|......................................e..................................................... + // sqrdmulh v24.8h, v24.8h, v1.h[3] // ..............................e...........................................................|.................................e.......................................................... + // mls v13.8h, v24.8h, v7.h[0] // ...............................................e..........................................|..................................................e......................................... 
+ // sub v24.8h, v14.8h, v15.8h // .......................e..................................................................|..........................e................................................................. + // add v14.8h, v14.8h, v15.8h // ..........................e...............................................................|.............................e.............................................................. + // mul v15.8h, v24.8h, v1.h[4] // ................................e.........................................................|...................................e........................................................ + // sqrdmulh v24.8h, v24.8h, v1.h[5] // ..................................e.......................................................|.....................................e...................................................... + // mls v15.8h, v24.8h, v7.h[0] // ....................................................e.....................................|.......................................................e.................................... + // sub v24.8h, v8.8h, v10.8h // .................................................................................e........|....................................................................................e....... + // add v8.8h, v8.8h, v10.8h // .........................................................................e................|............................................................................e............... + // mul v10.8h, v24.8h, v0.h[2] // ..........................................................................................|..*......................................................................................... + // sqrdmulh v24.8h, v24.8h, v0.h[3] // ..........................................................................................|*........................................................................................... 
+ // mls v10.8h, v24.8h, v7.h[0] // ................*.........................................................................|...................*........................................................................ + // sub v24.8h, v9.8h, v11.8h // ..............................................................................e...........|.................................................................................e.......... + // add v9.8h, v9.8h, v11.8h // ...........................................................................e..............|..............................................................................e............. + // mul v11.8h, v24.8h, v0.h[2] // .....................................................................................e....|........................................................................................e... + // sqrdmulh v24.8h, v24.8h, v0.h[3] // ....................................................................................e.....|.......................................................................................e.... + // mls v11.8h, v24.8h, v7.h[0] // .................*........................................................................|....................*....................................................................... + // sub v24.8h, v12.8h, v14.8h // ....................................................................e.....................|.......................................................................e.................... + // add v12.8h, v12.8h, v14.8h // .......................................................................e..................|..........................................................................e................. + // mul v14.8h, v24.8h, v0.h[4] // .........................................................................................e|............................................................................................ 
+ // sqrdmulh v24.8h, v24.8h, v0.h[5] // ...................................................................................e......|......................................................................................e..... + // mls v14.8h, v24.8h, v7.h[0] // .............*............................................................................|................*........................................................................... + // sub v24.8h, v13.8h, v15.8h // ............................................................................e.............|...............................................................................e............ + // add v13.8h, v13.8h, v15.8h // ........................................................................e.................|...........................................................................e................ + // mul v15.8h, v24.8h, v0.h[4] // .....*....................................................................................|........*................................................................................... + // sqrdmulh v24.8h, v24.8h, v0.h[5] // ............*.............................................................................|...............*............................................................................ + // mls v15.8h, v24.8h, v7.h[0] // ........................*.................................................................|...........................*................................................................ + // sqdmulh v25.8h, v8.8h, v7.h[1] // ..........................................................................................|.*.......................................................................................... + // srshr v25.8h, v25.8h, #11 // ..................*.......................................................................|.....................*...................................................................... 
+ // mls v8.8h, v25.8h, v7.h[0] // .......................................*..................................................|..........................................*................................................. + // sqdmulh v25.8h, v12.8h, v7.h[1] // ................................................................................e.........|...................................................................................e........ + // srshr v25.8h, v25.8h, #11 // .........*................................................................................|............*............................................................................... + // mls v12.8h, v25.8h, v7.h[0] // .....................*....................................................................|........................*................................................................... + // sub v24.8h, v8.8h, v12.8h // ..................................................*.......................................|.....................................................*...................................... + // add v8.8h, v8.8h, v12.8h // .................................................*........................................|....................................................*....................................... + // mul v12.8h, v24.8h, v0.h[0] // ..........................................................*...............................|.............................................................*.............................. + // sqrdmulh v24.8h, v24.8h, v0.h[1] // .........................................................*................................|............................................................*............................... + // mls v12.8h, v24.8h, v7.h[0] // .....................................................................*....................|........................................................................*................... 
+ // sub v24.8h, v9.8h, v13.8h // ..........................................................................................*............................................................................................ + // add v9.8h, v9.8h, v13.8h // ......................................................................................e...|.........................................................................................e.. + // mul v13.8h, v24.8h, v0.h[0] // ...............*..........................................................................|..................*......................................................................... + // sqrdmulh v24.8h, v24.8h, v0.h[1] // ..........*...............................................................................|.............*.............................................................................. + // mls v13.8h, v24.8h, v7.h[0] // ............................................................*.............................|...............................................................*............................ + // sub v24.8h, v10.8h, v14.8h // ......................................*...................................................|.........................................*.................................................. + // add v10.8h, v10.8h, v14.8h // ...........................*..............................................................|..............................*............................................................. + // mul v14.8h, v24.8h, v0.h[0] // ......................................................*...................................|.........................................................*.................................. + // sqrdmulh v24.8h, v24.8h, v0.h[1] // ..............................................*...........................................|.................................................*.......................................... 
+ // mls v14.8h, v24.8h, v7.h[0] // ................................................................*.........................|...................................................................*........................ + // sub v24.8h, v11.8h, v15.8h // ....................................*.....................................................|.......................................*.................................................... + // add v11.8h, v11.8h, v15.8h // .....................................*....................................................|........................................*................................................... + // mul v15.8h, v24.8h, v0.h[0] // .....................................................*....................................|........................................................*................................... + // sqrdmulh v24.8h, v24.8h, v0.h[1] // ................................................*.........................................|...................................................*........................................ + // mls v15.8h, v24.8h, v7.h[0] // ...................................................................*......................|......................................................................*..................... + // str q12, [x0, #(4*(512/8))] // .......................................................................................*..|..........................................................................................*. + // str q13, [x0, #(5*(512/8))] // .............................................................................*............|................................................................................*........... + // str q14, [x0, #(6*(512/8))] // ..................................................................................*.......|.....................................................................................*...... 
+ // str q15, [x0, #(7*(512/8))] // ...............................................................................*..........|..................................................................................*......... + // mul v12.8h, v8.8h, v29.8h // ........................................................*.................................|...........................................................*................................ + // sqrdmulh v8.8h, v8.8h, v30.8h // ...........................................................*..............................|..............................................................*............................. + // mls v12.8h, v8.8h, v7.h[0] // ......................................................................*...................|.........................................................................*.................. + // mul v13.8h, v9.8h, v29.8h // ...*......................................................................................|......*..................................................................................... + // sqrdmulh v9.8h, v9.8h, v30.8h // ....*.....................................................................................|.......*.................................................................................... + // mls v13.8h, v9.8h, v7.h[0] // ......................*...................................................................|.........................*.................................................................. + // mul v14.8h, v10.8h, v29.8h // .........................................*................................................|............................................*............................................... + // sqrdmulh v10.8h, v10.8h, v30.8h // ..........................................*...............................................|.............................................*.............................................. 
+ // mls v14.8h, v10.8h, v7.h[0] // .......................................................*..................................|..........................................................*................................. + // mul v15.8h, v11.8h, v29.8h // .............................................*............................................|................................................*........................................... + // sqrdmulh v11.8h, v11.8h, v30.8h // ............................................*.............................................|...............................................*............................................ + // mls v15.8h, v11.8h, v7.h[0] // .............................................................*............................|................................................................*........................... + // str q12, [x0], #(16) // ........................................................................................*.|...........................................................................................* + // str q13, [x0, #(-16 + 1*(512/8))] // .................................*........................................................|....................................*....................................................... + // str q14, [x0, #(-16 + 2*(512/8))] // ..................................................................*.......................|.....................................................................*...................... + // str q15, [x0, #(-16 + 3*(512/8))] // ..........................................................................*...............|.............................................................................*.............. + + sub count, count, #1 + cbnz count, layer123_start + mul v24.8H, v20.8H, v29.8H // ....*............................................... 
+ sqrdmulh v19.8H, v26.8H, v0.H[5] // .........*.......................................... + mul v27.8H, v26.8H, v0.H[4] // ......*............................................. + sqdmulh v31.8H, v28.8H, v7.H[1] // ..*................................................. + // gap // .................................................... + // gap // .................................................... + // gap // .................................................... + // gap // .................................................... + sqrdmulh v17.8H, v20.8H, v30.8H // .....*.............................................. + srshr v20.8H, v2.8H, #11 // .......*............................................ + sqrdmulh v16.8H, v9.8H, v0.H[3] // .*.................................................. + mul v8.8H, v9.8H, v0.H[2] // ...*................................................ + // gap // .................................................... + // gap // .................................................... + // gap // .................................................... + // gap // .................................................... + mls v3.8H, v18.8H, v7.H[0] // .............*...................................... + sub v23.8H, v12.8H, v23.8H // *................................................... + // gap // .................................................... + // gap // .................................................... + // gap // .................................................... + // gap // .................................................... + // gap // .................................................... + // gap // .................................................... + srshr v9.8H, v31.8H, #11 // ..............*..................................... + mls v11.8H, v22.8H, v7.H[0] // ..........*......................................... + mls v27.8H, v19.8H, v7.H[0] // .................*.................................. 
+ // gap // .................................................... + // gap // .................................................... + // gap // .................................................... + // gap // .................................................... + // gap // .................................................... + mls v25.8H, v20.8H, v7.H[0] // ...............*.................................... + mls v8.8H, v16.8H, v7.H[0] // ............*....................................... + sqrdmulh v14.8H, v23.8H, v0.H[1] // ........*........................................... + mul v10.8H, v23.8H, v0.H[0] // ...........*........................................ + // gap // .................................................... + // gap // .................................................... + // gap // .................................................... + // gap // .................................................... + mls v24.8H, v17.8H, v7.H[0] // ................*................................... + // gap // .................................................... + // gap // .................................................... + // gap // .................................................... + // gap // .................................................... + // gap // .................................................... + // gap // .................................................... + // gap // .................................................... + sub v23.8H, v3.8H, v27.8H // ....................*............................... + add v31.8H, v3.8H, v27.8H // .....................*.............................. + mls v28.8H, v9.8H, v7.H[0] // .......................*............................ + // gap // .................................................... + // gap // .................................................... + // gap // .................................................... + // gap // .................................................... 
+ // gap // .................................................... + mls v10.8H, v14.8H, v7.H[0] // .......................................*............ + sub v2.8H, v8.8H, v11.8H // ......................*............................. + // gap // .................................................... + // gap // .................................................... + // gap // .................................................... + // gap // .................................................... + // gap // .................................................... + // gap // .................................................... + sqrdmulh v13.8H, v31.8H, v30.8H // ..........................*......................... + mul v20.8H, v31.8H, v29.8H // ...........................*........................ + add v31.8H, v8.8H, v11.8H // ..................*................................. + str q24, [x0, #64] // ...................*................................ + mul v16.8H, v23.8H, v0.H[0] // ................................*................... + // gap // .................................................... + // gap // .................................................... + // gap // .................................................... + sub v12.8H, v28.8H, v25.8H // ...............................*.................... + add v21.8H, v28.8H, v25.8H // ..............................*..................... + sqrdmulh v3.8H, v2.8H, v0.H[1] // ............................*....................... + sqrdmulh v11.8H, v23.8H, v0.H[1] // .............................*...................... + // gap // .................................................... + // gap // .................................................... + // gap // .................................................... + // gap // .................................................... + sqrdmulh v24.8H, v31.8H, v30.8H // .........................*.......................... 
+ mul v17.8H, v31.8H, v29.8H // ........................*........................... + str q10, [x0, #320] // ...............................................*.... + mul v10.8H, v2.8H, v0.H[0] // .................................*.................. + // gap // .................................................... + // gap // .................................................... + // gap // .................................................... + // gap // .................................................... + mul v5.8H, v21.8H, v29.8H // ...................................*................ + sqrdmulh v2.8H, v21.8H, v30.8H // ......................................*............. + sqrdmulh v23.8H, v12.8H, v0.H[1] // ....................................*............... + mul v12.8H, v12.8H, v0.H[0] // .....................................*.............. + // gap // .................................................... + // gap // .................................................... + // gap // .................................................... + // gap // .................................................... + mls v20.8H, v13.8H, v7.H[0] // ........................................*........... + mls v16.8H, v11.8H, v7.H[0] // ...........................................*........ + // gap // .................................................... + // gap // .................................................... + // gap // .................................................... + // gap // .................................................... + // gap // .................................................... + // gap // .................................................... + mls v10.8H, v3.8H, v7.H[0] // .........................................*.......... + mls v17.8H, v24.8H, v7.H[0] // ..................................*................. + // gap // .................................................... + // gap // .................................................... 
+ // gap // .................................................... + // gap // .................................................... + // gap // .................................................... + // gap // .................................................... + // gap // .................................................... + // gap // .................................................... + mls v5.8H, v2.8H, v7.H[0] // .............................................*...... + mls v12.8H, v23.8H, v7.H[0] // ............................................*....... + // gap // .................................................... + // gap // .................................................... + // gap // .................................................... + // gap // .................................................... + str q20, [x0, #192] // ..............................................*..... + str q16, [x0, #448] // ................................................*... + // gap // .................................................... + // gap // .................................................... + // gap // .................................................... + // gap // .................................................... + // gap // .................................................... + // gap // .................................................... + str q17, [x0, #128] // ..........................................*......... + str q10, [x0, #384] // .................................................*.. + // gap // .................................................... + // gap // .................................................... + // gap // .................................................... + // gap // .................................................... + // gap // .................................................... + // gap // .................................................... 
+ str q5, [x0], #(16) // ...................................................* + str q12, [x0, #240] // ..................................................*. + // gap // .................................................... + // gap // .................................................... + // gap // .................................................... + // gap // .................................................... + // gap // .................................................... + // gap // .................................................... + + // original source code + // sub v27.8H, v12.8H, v23.8H // .........*.......................................... + // sqrdmulh v16.8H, v9.8H, v0.H[3] // ......*............................................. + // sqdmulh v24.8H, v28.8H, v7.H[1] // ...*................................................ + // mul v4.8H, v9.8H, v0.H[2] // .......*............................................ + // mul v10.8H, v20.8H, v29.8H // *................................................... + // sqrdmulh v23.8H, v20.8H, v30.8H // ....*............................................... + // mul v20.8H, v26.8H, v0.H[4] // ..*................................................. + // srshr v12.8H, v2.8H, #11 // .....*.............................................. + // sqrdmulh v19.8H, v27.8H, v0.H[1] // ...............*.................................... + // sqrdmulh v31.8H, v26.8H, v0.H[5] // .*.................................................. + // mls v11.8H, v22.8H, v7.H[0] // ...........*........................................ + // mul v15.8H, v27.8H, v0.H[0] // ................*................................... + // mls v4.8H, v16.8H, v7.H[0] // ..............*..................................... + // mls v3.8H, v18.8H, v7.H[0] // ........*........................................... + // srshr v16.8H, v24.8H, #11 // ..........*......................................... 
+ // mls v25.8H, v12.8H, v7.H[0] // .............*...................................... + // mls v10.8H, v23.8H, v7.H[0] // .................*.................................. + // mls v20.8H, v31.8H, v7.H[0] // ............*....................................... + // add v18.8H, v4.8H, v11.8H // .........................*.......................... + // str q10, [x0, #64] // ..........................*......................... + // sub v22.8H, v3.8H, v20.8H // ..................*................................. + // add v20.8H, v3.8H, v20.8H // ...................*................................ + // sub v11.8H, v4.8H, v11.8H // ......................*............................. + // mls v28.8H, v16.8H, v7.H[0] // ....................*............................... + // mul v23.8H, v18.8H, v29.8H // .................................*.................. + // sqrdmulh v16.8H, v18.8H, v30.8H // ................................*................... + // sqrdmulh v10.8H, v20.8H, v30.8H // .......................*............................ + // mul v3.8H, v20.8H, v29.8H // ........................*........................... + // sqrdmulh v4.8H, v11.8H, v0.H[1] // ..............................*..................... + // sqrdmulh v18.8H, v22.8H, v0.H[1] // ...............................*.................... + // add v12.8H, v28.8H, v25.8H // .............................*...................... + // sub v25.8H, v28.8H, v25.8H // ............................*....................... + // mul v20.8H, v22.8H, v0.H[0] // ...........................*........................ + // mul v22.8H, v11.8H, v0.H[0] // ...................................*................ + // mls v23.8H, v16.8H, v7.H[0] // ...........................................*........ + // mul v11.8H, v12.8H, v29.8H // ....................................*............... + // sqrdmulh v31.8H, v25.8H, v0.H[1] // ......................................*............. 
+ // mul v16.8H, v25.8H, v0.H[0] // .......................................*............ + // sqrdmulh v12.8H, v12.8H, v30.8H // .....................................*.............. + // mls v15.8H, v19.8H, v7.H[0] // .....................*.............................. + // mls v3.8H, v10.8H, v7.H[0] // ........................................*........... + // mls v22.8H, v4.8H, v7.H[0] // ..........................................*......... + // str q23, [x0, #128] // ................................................*... + // mls v20.8H, v18.8H, v7.H[0] // .........................................*.......... + // mls v16.8H, v31.8H, v7.H[0] // .............................................*...... + // mls v11.8H, v12.8H, v7.H[0] // ............................................*....... + // str q3, [x0, #192] // ..............................................*..... + // str q15, [x0, #320] // ..................................*................. + // str q20, [x0, #448] // ...............................................*.... + // str q22, [x0, #384] // .................................................*.. + // str q16, [x0, #256] // ...................................................* + // str q11, [x0], #(16) // ..................................................*. 
+ + + pop_stack + ret \ No newline at end of file diff --git a/tests/ntt_kyber/manual/intt_kyber_123_4567_manual_ld4_opt_m1_icestorm.s b/tests/ntt_kyber/manual/intt_kyber_123_4567_manual_ld4_opt_m1_icestorm.s new file mode 100644 index 0000000..9129d2f --- /dev/null +++ b/tests/ntt_kyber/manual/intt_kyber_123_4567_manual_ld4_opt_m1_icestorm.s @@ -0,0 +1,1470 @@ +/// +/// Copyright (c) 2022 Arm Limited +/// Copyright (c) 2022 Hanno Becker +/// Copyright (c) 2023 Amin Abdulrahman, Matthias Kannwischer +/// SPDX-License-Identifier: MIT +/// +/// Permission is hereby granted, free of charge, to any person obtaining a copy +/// of this software and associated documentation files (the "Software"), to deal +/// in the Software without restriction, including without limitation the rights +/// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +/// copies of the Software, and to permit persons to whom the Software is +/// furnished to do so, subject to the following conditions: +/// +/// The above copyright notice and this permission notice shall be included in all +/// copies or substantial portions of the Software. +/// +/// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +/// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +/// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +/// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +/// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +/// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +/// SOFTWARE. +/// + +// Needed to provide ASM_LOAD directive +#include + +// NOTE +// We use a lot of trivial macros to simplify the parsing burden for Slothy +// The macros are not unfolded by Slothy and thus interpreted as instructions, +// which are easier to parse due to e.g. 
the lack of size specifiers and simpler +// syntax for pre and post increment for loads and stores. + +// Eventually, Slothy should include a proper parser for AArch64, +// but for initial investigations, the below is enough. + +.macro ldr_vo vec, base, offset + ldr qform_\vec, [\base, #\offset] +.endm + +.macro ldr_vi vec, base, inc + ldr qform_\vec, [\base], #\inc +.endm + +.macro str_vo vec, base, offset + str qform_\vec, [\base, #\offset] +.endm +.macro str_vi vec, base, inc + str qform_\vec, [\base], #\inc +.endm + +.macro vqrdmulh d,a,b + sqrdmulh \d\().8h, \a\().8h, \b\().8h +.endm +.macro vmlsq d,a,b,i + mls \d\().8h, \a\().8h, \b\().h[\i] +.endm +.macro vqrdmulhq d,a,b,i + sqrdmulh \d\().8h, \a\().8h, \b\().h[\i] +.endm +.macro vqdmulhq d,a,b,i + sqdmulh \d\().8h, \a\().8h, \b\().h[\i] +.endm +.macro vmulq d,a,b,i + mul \d\().8h, \a\().8h, \b\().h[\i] +.endm + +.macro mulmodq dst, src, const, idx0, idx1 + vmulq \dst, \src, \const, \idx0 + vqrdmulhq \src, \src, \const, \idx1 + vmlsq \dst, \src, consts, 0 +.endm + +.macro mulmod dst, src, const, const_twisted + mul \dst\().8h, \src\().8h, \const\().8h + vqrdmulh \src, \src, \const_twisted + vmlsq \dst, \src, consts, 0 +.endm + +.macro gs_butterfly a, b, root, idx0, idx1 + sub tmp.8h, \a\().8h, \b\().8h + add \a\().8h, \a\().8h, \b\().8h + mulmodq \b, tmp, \root, \idx0, \idx1 +.endm + +.macro mulmod_v dst, src, const, const_twisted + mul \dst\().8h, \src\().8h, \const\().8h + vqrdmulh \src, \src, \const_twisted + vmlsq \dst, \src, consts, 0 +.endm + +.macro gs_butterfly_v a, b, root, root_twisted + sub tmp.8h, \a\().8h, \b\().8h + add \a\().8h, \a\().8h, \b\().8h + mulmod \b, tmp, \root, \root_twisted +.endm + +.macro mul_ninv dst0, dst1, dst2, dst3, src0, src1, src2, src3 + mulmod \dst0, \src0, ninv, ninv_tw + mulmod \dst1, \src1, ninv, ninv_tw + mulmod \dst2, \src2, ninv, ninv_tw + mulmod \dst3, \src3, ninv, ninv_tw +.endm + +.macro barrett_reduce a + vqdmulhq t0, \a, consts, 1 + srshr t0.8h, t0.8h, #11 + 
vmlsq \a, t0, consts, 0 +.endm + +.macro load_roots_123 + ldr_vi root0, r_ptr0, 32 + ldr_vo root1, r_ptr0, -16 +.endm + +.macro load_next_roots_45 + ldr_vi root0, r_ptr0, 16 +.endm + +.macro load_next_roots_67 + ldr_vi root0, r_ptr1, (6*16) + ldr_vo root0_tw, r_ptr1, (-6*16 + 1*16) + ldr_vo root1, r_ptr1, (-6*16 + 2*16) + ldr_vo root1_tw, r_ptr1, (-6*16 + 3*16) + ldr_vo root2, r_ptr1, (-6*16 + 4*16) + ldr_vo root2_tw, r_ptr1, (-6*16 + 5*16) +.endm + +.macro transpose4 data + trn1 t0.4s, \data\()0.4s, \data\()1.4s + trn2 t1.4s, \data\()0.4s, \data\()1.4s + trn1 t2.4s, \data\()2.4s, \data\()3.4s + trn2 t3.4s, \data\()2.4s, \data\()3.4s + + trn2 \data\()2.2d, t0.2d, t2.2d + trn2 \data\()3.2d, t1.2d, t3.2d + trn1 \data\()0.2d, t0.2d, t2.2d + trn1 \data\()1.2d, t1.2d, t3.2d +.endm + +.macro transpose_single data_out, data_in + trn1 \data_out\()0.4s, \data_in\()0.4s, \data_in\()1.4s + trn2 \data_out\()1.4s, \data_in\()0.4s, \data_in\()1.4s + trn1 \data_out\()2.4s, \data_in\()2.4s, \data_in\()3.4s + trn2 \data_out\()3.4s, \data_in\()2.4s, \data_in\()3.4s +.endm + +.macro save_gprs // slothy:no-unfold + sub sp, sp, #(16*6) + stp x19, x20, [sp, #16*0] + stp x19, x20, [sp, #16*0] + stp x21, x22, [sp, #16*1] + stp x23, x24, [sp, #16*2] + stp x25, x26, [sp, #16*3] + stp x27, x28, [sp, #16*4] + str x29, [sp, #16*5] +.endm + +.macro restore_gprs // slothy:no-unfold + ldp x19, x20, [sp, #16*0] + ldp x21, x22, [sp, #16*1] + ldp x23, x24, [sp, #16*2] + ldp x25, x26, [sp, #16*3] + ldp x27, x28, [sp, #16*4] + ldr x29, [sp, #16*5] + add sp, sp, #(16*6) +.endm + +.macro save_vregs // slothy:no-unfold + sub sp, sp, #(16*4) + stp d8, d9, [sp, #16*0] + stp d10, d11, [sp, #16*1] + stp d12, d13, [sp, #16*2] + stp d14, d15, [sp, #16*3] +.endm + +.macro restore_vregs // slothy:no-unfold + ldp d8, d9, [sp, #16*0] + ldp d10, d11, [sp, #16*1] + ldp d12, d13, [sp, #16*2] + ldp d14, d15, [sp, #16*3] + add sp, sp, #(16*4) +.endm + +#define STACK_SIZE 16 +#define STACK0 0 + +.macro restore a, loc // 
slothy:no-unfold + ldr \a, [sp, #\loc\()] +.endm +.macro save loc, a // slothy:no-unfold + str \a, [sp, #\loc\()] +.endm +.macro push_stack // slothy:no-unfold + save_gprs + save_vregs + sub sp, sp, #STACK_SIZE +.endm + +.macro pop_stack // slothy:no-unfold + add sp, sp, #STACK_SIZE + restore_vregs + restore_gprs +.endm + +.data +.p2align 4 +roots: +#include "intt_kyber_123_45_67_twiddles.s" +.text + + .global intt_kyber_123_4567_manual_ld4_opt_m1_icestorm + .global _intt_kyber_123_4567_manual_ld4_opt_m1_icestorm + +.p2align 4 +const_addr: .short 3329 + .short 20159 + .short 0 + .short 0 + .short 0 + .short 0 + .short 0 + .short 0 +ninv_addr: .short 512 + .short 512 + .short 512 + .short 512 + .short 512 + .short 512 + .short 512 + .short 512 +ninv_tw_addr: .short 5040 + .short 5040 + .short 5040 + .short 5040 + .short 5040 + .short 5040 + .short 5040 + .short 5040 + +intt_kyber_123_4567_manual_ld4_opt_m1_icestorm: +_intt_kyber_123_4567_manual_ld4_opt_m1_icestorm: + push_stack + + in .req x0 + inp .req x1 + count .req x2 + r_ptr0 .req x3 + r_ptr1 .req x4 + xtmp .req x5 + + qform_v0 .req q0 + qform_v1 .req q1 + qform_v2 .req q2 + qform_v3 .req q3 + qform_v4 .req q4 + qform_v5 .req q5 + qform_v6 .req q6 + qform_v7 .req q7 + qform_v8 .req q8 + qform_v9 .req q9 + qform_v10 .req q10 + qform_v11 .req q11 + qform_v12 .req q12 + qform_v13 .req q13 + qform_v14 .req q14 + qform_v15 .req q15 + qform_v16 .req q16 + qform_v17 .req q17 + qform_v18 .req q18 + qform_v19 .req q19 + qform_v20 .req q20 + qform_v21 .req q21 + qform_v22 .req q22 + qform_v23 .req q23 + qform_v24 .req q24 + qform_v25 .req q25 + qform_v26 .req q26 + qform_v27 .req q27 + qform_v28 .req q28 + qform_v29 .req q29 + qform_v30 .req q30 + qform_v31 .req q31 + + data0 .req v8 + data1 .req v9 + data2 .req v10 + data3 .req v11 + data4 .req v12 + data5 .req v13 + data6 .req v14 + data7 .req v15 + + x_00 .req x10 + x_01 .req x11 + x_10 .req x12 + x_11 .req x13 + x_20 .req x14 + x_21 .req x15 + x_30 .req x16 + x_31 
.req x17 + + xt_00 .req x_00 + xt_01 .req x_20 + xt_10 .req x_10 + xt_11 .req x_30 + xt_20 .req x_01 + xt_21 .req x_21 + xt_30 .req x_11 + xt_31 .req x_31 + + qform_data0 .req q8 + qform_data1 .req q9 + qform_data2 .req q10 + qform_data3 .req q11 + qform_data4 .req q12 + qform_data5 .req q13 + qform_data6 .req q14 + qform_data7 .req q15 + + root0 .req v0 + root1 .req v1 + root2 .req v2 + root0_tw .req v4 + root1_tw .req v5 + root2_tw .req v6 + + consts .req v7 + qform_consts .req q7 + + qform_root0 .req q0 + qform_root1 .req q1 + qform_root2 .req q2 + qform_root0_tw .req q4 + qform_root1_tw .req q5 + qform_root2_tw .req q6 + + tmp .req v24 + t0 .req v25 + t1 .req v26 + t2 .req v27 + t3 .req v28 + + ASM_LOAD(r_ptr0, roots_l34) + ASM_LOAD(r_ptr1, roots_l56) + + ASM_LOAD(xtmp, const_addr) + ld1 {consts.8h}, [xtmp] + + save STACK0, in + + mov inp, in + mov count, #8 + + .p2align 2 + ldr q27, [x4, #16] // ........*....................................... + ldr q6, [x3], #16 // .............................................*.. + ld4 {v1.4S, v2.4S, v3.4S, v4.4S}, [x1] // *............................................... + // gap // ................................................ + ldr q9, [x4], #(6*16) // .*.............................................. + ldr q10, [x4, #-32] // ...*............................................ + // gap // ................................................ + // gap // ................................................ + ldr q24, [x4, #-48] // ..*............................................. + ldr q16, [x4, #-64] // ....*........................................... + // gap // ................................................ + // gap // ................................................ + ldr q13, [x4, #-16] // .....*.......................................... + // gap // ................................................ + // gap // ................................................ + // gap // ................................................ 
+ // gap // ................................................ + // gap // ................................................ + // gap // ................................................ + // gap // ................................................ + // gap // ................................................ + // gap // ................................................ + // gap // ................................................ + // gap // ................................................ + sub v21.8H, v1.8H, v2.8H // .......*........................................ + // gap // ................................................ + // gap // ................................................ + // gap // ................................................ + sub v26.8H, v3.8H, v4.8H // .........*...................................... + // gap // ................................................ + // gap // ................................................ + // gap // ................................................ + mul v8.8H, v21.8H, v16.8H // ...........*.................................... + sqrdmulh v22.8H, v21.8H, v24.8H // ..........*..................................... + // gap // ................................................ + // gap // ................................................ + sqrdmulh v13.8H, v26.8H, v13.8H // .............*.................................. + mul v14.8H, v26.8H, v10.8H // ............*................................... + // gap // ................................................ + // gap // ................................................ + add v26.8H, v1.8H, v2.8H // ......*......................................... + // gap // ................................................ + // gap // ................................................ + // gap // ................................................ + mls v8.8H, v22.8H, v7.H[0] // ...............*................................ 
+ add v20.8H, v3.8H, v4.8H // ..............*................................. + // gap // ................................................ + // gap // ................................................ + // gap // ................................................ + // gap // ................................................ + // gap // ................................................ + mls v14.8H, v13.8H, v7.H[0] // ................*............................... + add v30.8H, v26.8H, v20.8H // ..................*............................. + sub v18.8H, v26.8H, v20.8H // .................*.............................. + // gap // ................................................ + // gap // ................................................ + // gap // ................................................ + // gap // ................................................ + // gap // ................................................ + // gap // ................................................ + add v16.8H, v8.8H, v14.8H // ........................*....................... + sub v8.8H, v8.8H, v14.8H // ....................*........................... + // gap // ................................................ + // gap // ................................................ + mul v11.8H, v18.8H, v9.8H // .....................*.......................... + sqrdmulh v31.8H, v18.8H, v27.8H // ...................*............................ + // gap // ................................................ + // gap // ................................................ + mul v19.8H, v8.8H, v9.8H // ......................*......................... + sqrdmulh v29.8H, v8.8H, v27.8H // .......................*........................ + // gap // ................................................ + // gap // ................................................ + trn1 v9.4S, v30.4S, v16.4S // ...........................*.................... 
+ trn2 v18.4S, v30.4S, v16.4S // ............................*................... + // gap // ................................................ + // gap // ................................................ + mls v11.8H, v31.8H, v7.H[0] // .........................*...................... + // gap // ................................................ + // gap // ................................................ + // gap // ................................................ + mls v19.8H, v29.8H, v7.H[0] // ..........................*..................... + // gap // ................................................ + // gap // ................................................ + // gap // ................................................ + // gap // ................................................ + // gap // ................................................ + // gap // ................................................ + // gap // ................................................ + // gap // ................................................ + // gap // ................................................ + // gap // ................................................ + // gap // ................................................ + trn2 v21.4S, v11.4S, v19.4S // .............................*.................. + trn1 v15.4S, v11.4S, v19.4S // ..............................*................. + // gap // ................................................ + // gap // ................................................ + // gap // ................................................ + // gap // ................................................ + // gap // ................................................ + // gap // ................................................ + trn1 v25.2D, v18.2D, v21.2D // .................................*.............. + trn1 v27.2D, v9.2D, v15.2D // ...............................*................ + // gap // ................................................ 
+ // gap // ................................................ + trn2 v1.2D, v18.2D, v21.2D // ..................................*............. + trn2 v14.2D, v9.2D, v15.2D // ................................*............... + // gap // ................................................ + // gap // ................................................ + add v29.8H, v27.8H, v25.8H // ...................................*............ + // gap // ................................................ + // gap // ................................................ + // gap // ................................................ + add v23.8H, v14.8H, v1.8H // ....................................*........... + sub v12.8H, v14.8H, v1.8H // ..........................................*..... + // gap // ................................................ + // gap // ................................................ + sqdmulh v28.8H, v29.8H, v7.H[1] // ......................................*......... + // gap // ................................................ + // gap // ................................................ + // gap // ................................................ + sqdmulh v30.8H, v23.8H, v7.H[1] // .......................................*........ + // gap // ................................................ + // gap // ................................................ + // gap // ................................................ + // gap // ................................................ + // gap // ................................................ + // gap // ................................................ + // gap // ................................................ + srshr v5.8H, v28.8H, #11 // .........................................*...... + // gap // ................................................ + // gap // ................................................ + // gap // ................................................ 
+ srshr v9.8H, v30.8H, #11 // ........................................*....... + // gap // ................................................ + // gap // ................................................ + // gap // ................................................ + // gap // ................................................ + // gap // ................................................ + // gap // ................................................ + // gap // ................................................ + mls v29.8H, v5.8H, v7.H[0] // ...........................................*.... + // gap // ................................................ + // gap // ................................................ + // gap // ................................................ + mls v23.8H, v9.8H, v7.H[0] // ............................................*... + // gap // ................................................ + // gap // ................................................ + // gap // ................................................ + // gap // ................................................ + // gap // ................................................ + // gap // ................................................ + // gap // ................................................ + // gap // ................................................ + // gap // ................................................ + // gap // ................................................ + // gap // ................................................ + add v26.8H, v29.8H, v23.8H // ..............................................*. + // gap // ................................................ + // gap // ................................................ + // gap // ................................................ + // gap // ................................................ + // gap // ................................................ + // gap // ................................................ 
+ // gap // ................................................ + str q26, [x1], #(64) // ...............................................* + sub v26.8H, v27.8H, v25.8H // .....................................*.......... + // gap // ................................................ + // gap // ................................................ + + // original source code + // ld4 {v0.4S, v1.4S, v2.4S, v3.4S}, [x1] // ..*............................................. + // ldr q5, [x4], #(6*16) // ...*............................................ + // ldr q23, [x4, #-48] // .....*.......................................... + // ldr q20, [x4, #-32] // ....*........................................... + // ldr q8, [x4, #-64] // ......*......................................... + // ldr q10, [x4, #-16] // .......*........................................ + // add v21.8H, v0.8H, v1.8H // ..............*................................. + // sub v0.8H, v0.8H, v1.8H // ........*....................................... + // ldr q15, [x4, #-80] // *............................................... + // sub v4.8H, v2.8H, v3.8H // .........*...................................... + // sqrdmulh v13.8H, v0.8H, v23.8H // ...........*.................................... + // mul v1.8H, v0.8H, v8.8H // ..........*..................................... + // mul v28.8H, v4.8H, v20.8H // .............*.................................. + // sqrdmulh v19.8H, v4.8H, v10.8H // ............*................................... + // add v0.8H, v2.8H, v3.8H // ................*............................... + // mls v1.8H, v13.8H, v7.H[0] // ...............*................................ + // mls v28.8H, v19.8H, v7.H[0] // .................*.............................. + // sub v8.8H, v21.8H, v0.8H // ...................*............................ + // add v25.8H, v21.8H, v0.8H // ..................*............................. 
+ // sqrdmulh v16.8H, v8.8H, v15.8H // .......................*........................ + // sub v22.8H, v1.8H, v28.8H // .....................*.......................... + // mul v4.8H, v8.8H, v5.8H // ......................*......................... + // mul v21.8H, v22.8H, v5.8H // ........................*....................... + // sqrdmulh v24.8H, v22.8H, v15.8H // .........................*...................... + // add v15.8H, v1.8H, v28.8H // ....................*........................... + // mls v4.8H, v16.8H, v7.H[0] // ............................*................... + // mls v21.8H, v24.8H, v7.H[0] // .............................*.................. + // trn1 v30.4S, v25.4S, v15.4S // ..........................*..................... + // trn2 v9.4S, v25.4S, v15.4S // ...........................*.................... + // trn2 v16.4S, v4.4S, v21.4S // ..............................*................. + // trn1 v21.4S, v4.4S, v21.4S // ...............................*................ + // trn1 v8.2D, v30.2D, v21.2D // .................................*.............. + // trn2 v30.2D, v30.2D, v21.2D // ...................................*............ + // trn1 v25.2D, v9.2D, v16.2D // ................................*............... + // trn2 v9.2D, v9.2D, v16.2D // ..................................*............. + // add v29.8H, v8.8H, v25.8H // ....................................*........... + // add v23.8H, v30.8H, v9.8H // .....................................*.......... + // sub v26.8H, v8.8H, v25.8H // ...............................................* + // sqdmulh v8.8H, v29.8H, v7.H[1] // .......................................*........ + // sqdmulh v21.8H, v23.8H, v7.H[1] // ........................................*....... + // srshr v4.8H, v21.8H, #11 // ..........................................*..... + // srshr v21.8H, v8.8H, #11 // .........................................*...... 
+ // sub v12.8H, v30.8H, v9.8H // ......................................*......... + // mls v29.8H, v21.8H, v7.H[0] // ...........................................*.... + // mls v23.8H, v4.8H, v7.H[0] // ............................................*... + // ldr q6, [x3], #16 // .*.............................................. + // add v2.8H, v29.8H, v23.8H // .............................................*.. + // str q2, [x1], #(64) // ..............................................*. + + sub count, count, #1 +layer4567_start: + ld4 {v0.4S, v1.4S, v2.4S, v3.4S}, [x1] // e................................................................. + ldr q5, [x4], #(6*16) // .e................................................................ + mul v27.8H, v12.8H, v6.H[4] // ...........................................*...................... + // gap // .................................................................. + // gap // .................................................................. + // gap // .................................................................. + // gap // .................................................................. + sqrdmulh v31.8H, v12.8H, v6.H[5] // ............................................*..................... + sub v18.8H, v29.8H, v23.8H // ....................................................*............. + // gap // .................................................................. + // gap // .................................................................. + ldr q23, [x4, #-48] // ....e............................................................. + // gap // .................................................................. + // gap // .................................................................. + // gap // .................................................................. + ldr q20, [x4, #-32] // .....e............................................................ 
+ // gap // .................................................................. + // gap // .................................................................. + // gap // .................................................................. + ldr q8, [x4, #-64] // ...e.............................................................. + mls v27.8H, v31.8H, v7.H[0] // .............................................*.................... + ldr q10, [x4, #-16] // ......e........................................................... + // gap // .................................................................. + // gap // .................................................................. + add v21.8H, v0.8H, v1.8H // ........e......................................................... + sub v0.8H, v0.8H, v1.8H // .......e.......................................................... + ldr q15, [x4, #-80] // ..e............................................................... + // gap // .................................................................. + sub v4.8H, v2.8H, v3.8H // ............e..................................................... + // gap // .................................................................. + // gap // .................................................................. + // gap // .................................................................. + sqrdmulh v13.8H, v0.8H, v23.8H // ..........e....................................................... + mul v1.8H, v0.8H, v8.8H // .........e........................................................ + // gap // .................................................................. + // gap // .................................................................. + mul v28.8H, v4.8H, v20.8H // ..............e................................................... + sqrdmulh v19.8H, v4.8H, v10.8H // ...............e.................................................. 
+ // gap // .................................................................. + // gap // .................................................................. + add v0.8H, v2.8H, v3.8H // .............e.................................................... + // gap // .................................................................. + // gap // .................................................................. + // gap // .................................................................. + mls v1.8H, v13.8H, v7.H[0] // ...........e...................................................... + // gap // .................................................................. + // gap // .................................................................. + // gap // .................................................................. + mls v28.8H, v19.8H, v7.H[0] // ................e................................................. + // gap // .................................................................. + // gap // .................................................................. + sub v8.8H, v21.8H, v0.8H // .................e................................................ + add v25.8H, v21.8H, v0.8H // ..................e............................................... + // gap // .................................................................. + // gap // .................................................................. + // gap // .................................................................. + // gap // .................................................................. + // gap // .................................................................. + // gap // .................................................................. + sqrdmulh v16.8H, v8.8H, v15.8H // ....................e............................................. + // gap // .................................................................. 
+ // gap // .................................................................. + mul v14.8H, v18.8H, v6.H[0] // ......................................................*........... + sub v22.8H, v1.8H, v28.8H // ......................e........................................... + mul v4.8H, v8.8H, v5.8H // ...................e.............................................. + sqrdmulh v8.8H, v26.8H, v6.H[3] // .......................................*.......................... + // gap // .................................................................. + // gap // .................................................................. + mul v21.8H, v22.8H, v5.8H // ........................e......................................... + sqrdmulh v24.8H, v22.8H, v15.8H // .........................e........................................ + // gap // .................................................................. + // gap // .................................................................. + add v15.8H, v1.8H, v28.8H // .......................e.......................................... + mul v17.8H, v26.8H, v6.H[2] // ......................................*........................... + // gap // .................................................................. + // gap // .................................................................. + sqrdmulh v28.8H, v18.8H, v6.H[1] // .......................................................*.......... + mls v4.8H, v16.8H, v7.H[0] // .....................e............................................ + // gap // .................................................................. + // gap // .................................................................. + mls v21.8H, v24.8H, v7.H[0] // ..........................e....................................... + // gap // .................................................................. + // gap // .................................................................. 
+ // gap // .................................................................. + mls v17.8H, v8.8H, v7.H[0] // ........................................*......................... + // gap // .................................................................. + // gap // .................................................................. + trn1 v30.4S, v25.4S, v15.4S // ...........................e...................................... + trn2 v9.4S, v25.4S, v15.4S // ............................e..................................... + mls v14.8H, v28.8H, v7.H[0] // ........................................................*......... + // gap // .................................................................. + // gap // .................................................................. + trn2 v16.4S, v4.4S, v21.4S // ..............................e................................... + // gap // .................................................................. + // gap // .................................................................. + trn1 v21.4S, v4.4S, v21.4S // .............................e.................................... + // gap // .................................................................. + // gap // .................................................................. + // gap // .................................................................. + sub v19.8H, v17.8H, v27.8H // .........................................................*........ + // gap // .................................................................. + trn1 v8.2D, v30.2D, v21.2D // .................................e................................ + trn2 v30.2D, v30.2D, v21.2D // ...............................e.................................. + // gap // .................................................................. + trn1 v25.2D, v9.2D, v16.2D // ..................................e............................... 
+ trn2 v9.2D, v9.2D, v16.2D // ................................e................................. + // gap // .................................................................. + // gap // .................................................................. + mul v15.8H, v19.8H, v6.H[0] // ...........................................................*...... + // gap // .................................................................. + // gap // .................................................................. + // gap // .................................................................. + add v29.8H, v8.8H, v25.8H // .....................................e............................ + // gap // .................................................................. + // gap // .................................................................. + add v23.8H, v30.8H, v9.8H // ..........................................e....................... + sub v26.8H, v8.8H, v25.8H // ....................................e............................. + // gap // .................................................................. + // gap // .................................................................. + // gap // .................................................................. + sqdmulh v8.8H, v29.8H, v7.H[1] // ..............................................e................... + // gap // .................................................................. + // gap // .................................................................. + sqdmulh v21.8H, v23.8H, v7.H[1] // .................................................e................ + sqrdmulh v0.8H, v19.8H, v6.H[1] // ............................................................*..... + // gap // .................................................................. + // gap // .................................................................. + // gap // .................................................................. 
+ add v12.8H, v17.8H, v27.8H // ..........................................................*....... + // gap // .................................................................. + // gap // .................................................................. + // gap // .................................................................. + srshr v4.8H, v21.8H, #11 // ..................................................e............... + srshr v21.8H, v8.8H, #11 // ...............................................e.................. + // gap // .................................................................. + // gap // .................................................................. + mls v15.8H, v0.8H, v7.H[0] // .............................................................*.... + str q12, [x1, #-48] // ...............................................................*.. + // gap // .................................................................. + // gap // .................................................................. + str q14, [x1, #-32] // ................................................................*. + sub v12.8H, v30.8H, v9.8H // .........................................e........................ + // gap // .................................................................. + // gap // .................................................................. + mls v29.8H, v21.8H, v7.H[0] // ................................................e................. + mls v23.8H, v4.8H, v7.H[0] // ...................................................e.............. + // gap // .................................................................. + // gap // .................................................................. + str q15, [x1, #-16] // .................................................................* + // gap // .................................................................. + // gap // .................................................................. 
+ // gap // .................................................................. + ldr q6, [x3], #16 // ...................................e.............................. + // gap // .................................................................. + // gap // .................................................................. + // gap // .................................................................. + // gap // .................................................................. + // gap // .................................................................. + // gap // .................................................................. + add v2.8H, v29.8H, v23.8H // .....................................................e............ + // gap // .................................................................. + // gap // .................................................................. + // gap // .................................................................. + // gap // .................................................................. + str q2, [x1], #(64) // ..............................................................e... + // gap // .................................................................. + // gap // .................................................................. + // gap // .................................................................. + + // original source code + // ld4 {v8.4S, v9.4S, v10.4S, v11.4S}, [x1] // e.................................................................e.............................................................. + // ldr q0, [x4], #(6*16) // .e................................................................|e............................................................. + // ldr q4, [x4, #(-6*16 + 1*16)] // ............e.....................................................|...........e.................................................. 
+ // ldr q1, [x4, #(-6*16 + 2*16)] // .......e..........................................................|......e....................................................... + // ldr q5, [x4, #(-6*16 + 3*16)] // .....e............................................................|....e......................................................... + // ldr q2, [x4, #(-6*16 + 4*16)] // ......e...........................................................|.....e........................................................ + // ldr q6, [x4, #(-6*16 + 5*16)] // .........e........................................................|........e..................................................... + // sub v24.8h, v8.8h, v9.8h // ...........e......................................................|..........e................................................... + // add v8.8h, v8.8h, v9.8h // ..........e.......................................................|.........e.................................................... + // mul v9.8h, v24.8h, v1.8h // ...............e..................................................|..............e............................................... + // sqrdmulh v24.8h, v24.8h, v5.8h // ..............e...................................................|.............e................................................ + // mls v9.8h, v24.8h, v7.h[0] // ...................e..............................................|..................e........................................... + // sub v24.8h, v10.8h, v11.8h // .............e....................................................|............e................................................. + // add v10.8h, v10.8h, v11.8h // ..................e...............................................|.................e............................................ + // mul v11.8h, v24.8h, v2.8h // ................e.................................................|...............e.............................................. 
+ // sqrdmulh v24.8h, v24.8h, v6.8h // .................e................................................|................e............................................. + // mls v11.8h, v24.8h, v7.h[0] // ....................e.............................................|...................e.......................................... + // sub v24.8h, v8.8h, v10.8h // .....................e............................................|....................e......................................... + // add v8.8h, v8.8h, v10.8h // ......................e...........................................|.....................e........................................ + // mul v10.8h, v24.8h, v0.8h // ..........................e.......................................|.........................e.................................... + // sqrdmulh v24.8h, v24.8h, v4.8h // .......................e..........................................|......................e....................................... + // mls v10.8h, v24.8h, v7.h[0] // .................................e................................|................................e............................. + // sub v24.8h, v9.8h, v11.8h // .........................e........................................|........................e..................................... + // add v9.8h, v9.8h, v11.8h // ..............................e...................................|.............................e................................ + // mul v11.8h, v24.8h, v0.8h // ............................e.....................................|...........................e.................................. + // sqrdmulh v24.8h, v24.8h, v4.8h // .............................e....................................|............................e................................. + // mls v11.8h, v24.8h, v7.h[0] // ..................................e...............................|.................................e............................ 
+ // trn1 v25.4s, v8.4s, v9.4s // ....................................e.............................|...................................e.......................... + // trn2 v26.4s, v8.4s, v9.4s // .....................................e............................|....................................e......................... + // trn1 v27.4s, v10.4s, v11.4s // ........................................e.........................|.......................................e...................... + // trn2 v28.4s, v10.4s, v11.4s // .......................................e..........................|......................................e....................... + // trn2 v10.2d, v25.2d, v27.2d // ...........................................e......................|..........................................e................... + // trn2 v11.2d, v26.2d, v28.2d // .............................................e....................|............................................e................. + // trn1 v8.2d, v25.2d, v27.2d // ..........................................e.......................|.........................................e.................... + // trn1 v9.2d, v26.2d, v28.2d // ............................................e.....................|...........................................e.................. + // ldr q0, [x3], #16 // ...............................................................e..|.............................................................. + // sub v24.8h, v8.8h, v9.8h // .................................................e................|................................................e............. + // add v8.8h, v8.8h, v9.8h // ...............................................e..................|..............................................e............... + // mul v9.8h, v24.8h, v0.h[2] // ...............................*..................................|..............................*............................... 
+ // sqrdmulh v24.8h, v24.8h, v0.h[3] // ...........................*......................................|..........................*................................... + // mls v9.8h, v24.8h, v7.h[0] // ...................................*..............................|..................................*........................... + // sub v24.8h, v10.8h, v11.8h // ...........................................................e......|..........................................................e... + // add v10.8h, v10.8h, v11.8h // ................................................e.................|...............................................e.............. + // mul v11.8h, v24.8h, v0.h[4] // ..*...............................................................|.*............................................................ + // sqrdmulh v24.8h, v24.8h, v0.h[5] // ...*..............................................................|..*........................................................... + // mls v11.8h, v24.8h, v7.h[0] // ........*.........................................................|.......*...................................................... + // sqdmulh v25.8h, v8.8h, v7.h[1] // ..................................................e...............|.................................................e............ + // srshr v25.8h, v25.8h, #11 // .......................................................e..........|......................................................e....... + // mls v8.8h, v25.8h, v7.h[0] // ............................................................e.....|...........................................................e.. + // sqdmulh v25.8h, v10.8h, v7.h[1] // ...................................................e..............|..................................................e........... 
+ // srshr v25.8h, v25.8h, #11 // ......................................................e...........|.....................................................e........ + // mls v10.8h, v25.8h, v7.h[0] // .............................................................e....|............................................................e. + // sub v24.8h, v8.8h, v10.8h // ....*.............................................................|...*.......................................................... + // add v8.8h, v8.8h, v10.8h // ................................................................e.|.............................................................. + // mul v10.8h, v24.8h, v0.h[0] // ........................*.........................................|.......................*...................................... + // sqrdmulh v24.8h, v24.8h, v0.h[1] // ................................*.................................|...............................*.............................. + // mls v10.8h, v24.8h, v7.h[0] // ......................................*...........................|.....................................*........................ + // sub v24.8h, v9.8h, v11.8h // .........................................*........................|........................................*..................... + // add v9.8h, v9.8h, v11.8h // .....................................................*............|....................................................*......... + // mul v11.8h, v24.8h, v0.h[0] // ..............................................*...................|.............................................*................ + // sqrdmulh v24.8h, v24.8h, v0.h[1] // ....................................................*.............|...................................................*.......... + // mls v11.8h, v24.8h, v7.h[0] // ........................................................*.........|.......................................................*...... 
+ // str q8, [x1], #(64) // .................................................................e|.............................................................. + // str q9, [x1, #(-64 + 16*1)] // .........................................................*........|........................................................*..... + // str q10, [x1, #(-64 + 16*2)] // ..........................................................*.......|.........................................................*.... + // str q11, [x1, #(-64 + 16*3)] // ..............................................................*...|.............................................................* + + sub count, count, #1 + cbnz count, layer4567_start + mul v4.8H, v26.8H, v6.H[2] // ......*........... + sqrdmulh v9.8H, v26.8H, v6.H[3] // .....*............ + // gap // .................. + // gap // .................. + sqrdmulh v5.8H, v12.8H, v6.H[5] // .*................ + mul v12.8H, v12.8H, v6.H[4] // *................. + // gap // .................. + // gap // .................. + sub v26.8H, v29.8H, v23.8H // ..*............... + // gap // .................. + // gap // .................. + // gap // .................. + mls v4.8H, v9.8H, v7.H[0] // ........*......... + // gap // .................. + // gap // .................. + // gap // .................. + mls v12.8H, v5.8H, v7.H[0] // ...*.............. + mul v22.8H, v26.8H, v6.H[0] // ....*............. + // gap // .................. + // gap // .................. + sqrdmulh v29.8H, v26.8H, v6.H[1] // .......*.......... + // gap // .................. + // gap // .................. + // gap // .................. + // gap // .................. + // gap // .................. + // gap // .................. + // gap // .................. + sub v1.8H, v4.8H, v12.8H // ..........*....... + add v27.8H, v4.8H, v12.8H // .............*.... + // gap // .................. + // gap // .................. + mls v22.8H, v29.8H, v7.H[0] // .........*........ 
+ // gap // .................. + // gap // .................. + // gap // .................. + sqrdmulh v5.8H, v1.8H, v6.H[1] // ............*..... + mul v6.8H, v1.8H, v6.H[0] // ...........*...... + str q27, [x1, #-48] // ...............*.. + // gap // .................. + // gap // .................. + // gap // .................. + // gap // .................. + // gap // .................. + str q22, [x1, #-32] // ................*. + // gap // .................. + // gap // .................. + // gap // .................. + mls v6.8H, v5.8H, v7.H[0] // ..............*... + // gap // .................. + // gap // .................. + // gap // .................. + // gap // .................. + // gap // .................. + // gap // .................. + // gap // .................. + // gap // .................. + // gap // .................. + // gap // .................. + // gap // .................. + str q6, [x1, #-16] // .................* + // gap // .................. + // gap // .................. + // gap // .................. + + // original source code + // mul v27.8H, v12.8H, v6.H[4] // ...*.............. + // sqrdmulh v31.8H, v12.8H, v6.H[5] // ..*............... + // sub v18.8H, v29.8H, v23.8H // ....*............. + // mls v27.8H, v31.8H, v7.H[0] // ......*........... + // mul v14.8H, v18.8H, v6.H[0] // .......*.......... + // sqrdmulh v8.8H, v26.8H, v6.H[3] // .*................ + // mul v17.8H, v26.8H, v6.H[2] // *................. + // sqrdmulh v28.8H, v18.8H, v6.H[1] // ........*......... + // mls v17.8H, v8.8H, v7.H[0] // .....*............ + // mls v14.8H, v28.8H, v7.H[0] // ...........*...... + // sub v19.8H, v17.8H, v27.8H // .........*........ + // mul v15.8H, v19.8H, v6.H[0] // .............*.... + // sqrdmulh v0.8H, v19.8H, v6.H[1] // ............*..... + // add v12.8H, v17.8H, v27.8H // ..........*....... + // mls v15.8H, v0.8H, v7.H[0] // ................*. + // str q12, [x1, #-48] // ..............*... 
+ // str q14, [x1, #-32] // ...............*.. + // str q15, [x1, #-16] // .................* + + + // --------------------------------------------------------------------- + + ninv .req v29 + ninv_tw .req v30 + + ASM_LOAD(xtmp, ninv_addr) + ld1r {ninv.8h}, [xtmp] + ASM_LOAD(xtmp, ninv_tw_addr) + ld1r {ninv_tw.8h}, [xtmp] + + mov count, #4 + ASM_LOAD(r_ptr0, roots_l012) + load_roots_123 + + .p2align 2 + + ldr q31, [x0, #256] // .....*........................ + ldr q20, [x0, #320] // *............................. + // gap // .............................. + // gap // .............................. + ldr q13, [x0, #128] // ...*.......................... + ldr q26, [x0, #192] // ....*......................... + // gap // .............................. + // gap // .............................. + ldr q23, [x0, #0] // .......*...................... + // gap // .............................. + // gap // .............................. + ldr q3, [x0, #64] // .*............................ + ldr q11, [x0, #384] // ......*....................... + // gap // .............................. + // gap // .............................. + ldr q15, [x0, #448] // ..*........................... + sub v2.8H, v31.8H, v20.8H // ..........*................... + add v21.8H, v31.8H, v20.8H // .........*.................... + // gap // .............................. + // gap // .............................. + sub v20.8H, v13.8H, v26.8H // ........................*..... + add v5.8H, v13.8H, v26.8H // ............*................. + // gap // .............................. + // gap // .............................. + mul v31.8H, v2.8H, v1.H[2] // ...................*.......... + add v16.8H, v23.8H, v3.8H // ........*..................... + // gap // .............................. + // gap // .............................. + sqrdmulh v2.8H, v2.8H, v1.H[3] // .................*............ + add v28.8H, v11.8H, v15.8H // ...........*.................. 
+ // gap // .............................. + // gap // .............................. + sub v17.8H, v16.8H, v5.8H // ................*............. + // gap // .............................. + // gap // .............................. + mul v10.8H, v20.8H, v1.H[0] // ............................*. + add v8.8H, v16.8H, v5.8H // .....................*........ + // gap // .............................. + // gap // .............................. + add v13.8H, v21.8H, v28.8H // .............*................ + sub v5.8H, v23.8H, v3.8H // ..............*............... + mul v3.8H, v17.8H, v0.H[2] // .......................*...... + // gap // .............................. + // gap // .............................. + sqdmulh v16.8H, v13.8H, v7.H[1] // ...............*.............. + mls v31.8H, v2.8H, v7.H[0] // ......................*....... + // gap // .............................. + // gap // .............................. + mul v9.8H, v5.8H, v0.H[6] // ..................*........... + // gap // .............................. + // gap // .............................. + sqdmulh v24.8H, v8.8H, v7.H[1] // ...........................*.. + sqrdmulh v19.8H, v5.8H, v0.H[7] // ..........................*... + // gap // .............................. + // gap // .............................. + sqrdmulh v20.8H, v20.8H, v1.H[1] // .............................* + // gap // .............................. + // gap // .............................. + sqrdmulh v23.8H, v17.8H, v0.H[3] // .........................*.... + srshr v16.8H, v16.8H, #11 // ....................*......... + + // original source code + // ldr q12, [x0, #320] // .*............................ + // ldr q14, [x0, #64] // .....*........................ + // ldr q15, [x0, #448] // .......*...................... + // ldr q4, [x0, #128] // ..*........................... + // ldr q5, [x0, #192] // ...*.......................... + // ldr q17, [x0, #256] // *............................. 
+ // ldr q11, [x0, #384] // ......*....................... + // ldr q10, [x0, #0] // ....*......................... + // add v27.8H, v10.8H, v14.8H // .............*................ + // add v21.8H, v17.8H, v12.8H // .........*.................... + // sub v12.8H, v17.8H, v12.8H // ........*..................... + // add v28.8H, v11.8H, v15.8H // ...............*.............. + // add v18.8H, v4.8H, v5.8H // ...........*.................. + // add v13.8H, v21.8H, v28.8H // ...................*.......... + // sub v17.8H, v10.8H, v14.8H // ....................*......... + // sqdmulh v8.8H, v13.8H, v7.H[1] // ......................*....... + // sub v10.8H, v27.8H, v18.8H // ................*............. + // sqrdmulh v19.8H, v12.8H, v1.H[3] // ..............*............... + // mul v9.8H, v17.8H, v0.H[6] // ........................*..... + // mul v31.8H, v12.8H, v1.H[2] // ............*................. + // srshr v16.8H, v8.8H, #11 // .............................* + // add v8.8H, v27.8H, v18.8H // ..................*........... + // mls v31.8H, v19.8H, v7.H[0] // .......................*...... + // mul v3.8H, v10.8H, v0.H[2] // .....................*........ + // sub v4.8H, v4.8H, v5.8H // ..........*................... + // sqrdmulh v23.8H, v10.8H, v0.H[3] // ............................*. + // sqrdmulh v19.8H, v17.8H, v0.H[7] // ..........................*... + // sqdmulh v24.8H, v8.8H, v7.H[1] // .........................*.... + // mul v10.8H, v4.8H, v1.H[0] // .................*............ + // sqrdmulh v20.8H, v4.8H, v1.H[1] // ...........................*.. + + sub count, count, #1 +layer123_start: + ldr q12, [x0, #336] // .....e........................................................................................ + ldr q14, [x0, #80] // .e............................................................................................ 
+ mls v3.8H, v23.8H, v7.H[0] // ................................*............................................................. + sub v5.8H, v11.8H, v15.8H // .......................*...................................................................... + mls v9.8H, v19.8H, v7.H[0] // ............*................................................................................. + srshr v26.8H, v24.8H, #11 // .................................................*............................................ + ldr q15, [x0, #464] // .......e...................................................................................... + // gap // .............................................................................................. + mul v6.8H, v5.8H, v1.H[4] // .........................*.................................................................... + sqrdmulh v2.8H, v5.8H, v1.H[5] // ..........................*................................................................... + // gap // .............................................................................................. + ldr q4, [x0, #144] // ..e........................................................................................... + // gap // .............................................................................................. + sub v24.8H, v21.8H, v28.8H // ......................................*....................................................... + ldr q5, [x0, #208] // ...e.......................................................................................... + mls v10.8H, v20.8H, v7.H[0] // .................*............................................................................ + mls v8.8H, v26.8H, v7.H[0] // ..................................................*........................................... + ldr q17, [x0, #272] // ....e......................................................................................... 
+ // gap // .............................................................................................. + mls v13.8H, v16.8H, v7.H[0] // .....................................................*........................................ + mls v6.8H, v2.8H, v7.H[0] // ...........................*.................................................................. + mul v16.8H, v24.8H, v0.H[4] // ........................................*..................................................... + ldr q11, [x0, #400] // ......e....................................................................................... + // gap // .............................................................................................. + sqrdmulh v19.8H, v24.8H, v0.H[5] // .........................................*.................................................... + // gap // .............................................................................................. + // gap // .............................................................................................. + sub v28.8H, v9.8H, v10.8H // .................................*............................................................ + add v20.8H, v9.8H, v10.8H // ..................................*........................................................... + add v18.8H, v8.8H, v13.8H // .......................................................*...................................... + ldr q10, [x0, #16] // e............................................................................................. + // gap // .............................................................................................. + sub v2.8H, v8.8H, v13.8H // ......................................................*....................................... + add v26.8H, v31.8H, v6.8H // ............................................*................................................. 
+ // gap // .............................................................................................. + // gap // .............................................................................................. + mls v16.8H, v19.8H, v7.H[0] // ..........................................*................................................... + // gap // .............................................................................................. + // gap // .............................................................................................. + sqrdmulh v13.8H, v18.8H, v30.8H // ...............................................................................*.............. + sub v8.8H, v31.8H, v6.8H // ...........................................*.................................................. + mul v31.8H, v18.8H, v29.8H // ..............................................................................*............... + // gap // .............................................................................................. + // gap // .............................................................................................. + mul v24.8H, v28.8H, v0.H[2] // ...................................*.......................................................... + // gap // .............................................................................................. + // gap // .............................................................................................. + mul v22.8H, v2.8H, v0.H[0] // ........................................................*..................................... + // gap // .............................................................................................. + // gap // .............................................................................................. + sqrdmulh v27.8H, v2.8H, v0.H[1] // .........................................................*.................................... 
+ add v2.8H, v3.8H, v16.8H // .................................................................*............................ + sub v25.8H, v3.8H, v16.8H // ................................................................*............................. + mls v31.8H, v13.8H, v7.H[0] // ................................................................................*............. + // gap // .............................................................................................. + // gap // .............................................................................................. + mul v13.8H, v8.8H, v0.H[4] // .............................................*................................................ + sqrdmulh v23.8H, v2.8H, v30.8H // .....................................................................................*........ + // gap // .............................................................................................. + // gap // .............................................................................................. + sqrdmulh v8.8H, v8.8H, v0.H[5] // ..............................................*............................................... + // gap // .............................................................................................. + // gap // .............................................................................................. + sqrdmulh v3.8H, v25.8H, v0.H[1] // ...................................................................*.......................... + mls v22.8H, v27.8H, v7.H[0] // ..........................................................*................................... + // gap // .............................................................................................. + // gap // .............................................................................................. 
+ sqrdmulh v19.8H, v28.8H, v0.H[3] // ....................................*......................................................... + // gap // .............................................................................................. + // gap // .............................................................................................. + mul v6.8H, v25.8H, v0.H[0] // ..................................................................*........................... + sub v25.8H, v20.8H, v26.8H // ...........................................................*.................................. + add v26.8H, v20.8H, v26.8H // ............................................................*................................. + mls v13.8H, v8.8H, v7.H[0] // ...............................................*.............................................. + // gap // .............................................................................................. + // gap // .............................................................................................. + mls v24.8H, v19.8H, v7.H[0] // .....................................*........................................................ + add v27.8H, v10.8H, v14.8H // .........e.................................................................................... + // gap // .............................................................................................. + // gap // .............................................................................................. + // gap // .............................................................................................. + // gap // .............................................................................................. + add v21.8H, v17.8H, v12.8H // ...................e.......................................................................... 
+ sqrdmulh v16.8H, v26.8H, v30.8H // ..................................................................................*........... + sub v12.8H, v17.8H, v12.8H // ..................e........................................................................... + mul v8.8H, v26.8H, v29.8H // .................................................................................*............ + // gap // .............................................................................................. + // gap // .............................................................................................. + sub v19.8H, v24.8H, v13.8H // .....................................................................*........................ + // gap // .............................................................................................. + // gap // .............................................................................................. + add v28.8H, v11.8H, v15.8H // ........................e..................................................................... + mul v2.8H, v2.8H, v29.8H // ....................................................................................*......... + add v24.8H, v24.8H, v13.8H // ......................................................................*....................... + // gap // .............................................................................................. + // gap // .............................................................................................. + mls v8.8H, v16.8H, v7.H[0] // ...................................................................................*.......... + add v18.8H, v4.8H, v5.8H // ..............e............................................................................... + // gap // .............................................................................................. + // gap // .............................................................................................. 
+ // gap // .............................................................................................. + // gap // .............................................................................................. + add v13.8H, v21.8H, v28.8H // .......................................e...................................................... + sqrdmulh v20.8H, v24.8H, v30.8H // ........................................................................................*..... + sub v17.8H, v10.8H, v14.8H // ........e..................................................................................... + // gap // .............................................................................................. + // gap // .............................................................................................. + mul v24.8H, v24.8H, v29.8H // .......................................................................................*...... + str q8, [x0, #64] // ...........................................................................................*.. + mul v14.8H, v19.8H, v0.H[0] // .......................................................................*...................... + // gap // .............................................................................................. + sqdmulh v8.8H, v13.8H, v7.H[1] // ...................................................e.......................................... + str q31, [x0], #(16) // ..........................................................................................*... + sqrdmulh v16.8H, v19.8H, v0.H[1] // ........................................................................*..................... + // gap // .............................................................................................. + sub v10.8H, v27.8H, v18.8H // ............................e................................................................. 
+ mls v24.8H, v20.8H, v7.H[0] // .........................................................................................*.... + sqrdmulh v19.8H, v12.8H, v1.H[3] // .....................e........................................................................ + // gap // .............................................................................................. + // gap // .............................................................................................. + sqrdmulh v20.8H, v25.8H, v0.H[1] // ..............................................................*............................... + mul v9.8H, v17.8H, v0.H[6] // ..........e................................................................................... + // gap // .............................................................................................. + // gap // .............................................................................................. + mul v31.8H, v12.8H, v1.H[2] // ....................e......................................................................... + mls v14.8H, v16.8H, v7.H[0] // .........................................................................*.................... + // gap // .............................................................................................. + // gap // .............................................................................................. + mul v26.8H, v25.8H, v0.H[0] // .............................................................*................................ + srshr v16.8H, v8.8H, #11 // ....................................................e......................................... + str q24, [x0, #176] // .............................................................................................* + // gap // .............................................................................................. 
+ mls v2.8H, v23.8H, v7.H[0] // ......................................................................................*....... + add v8.8H, v27.8H, v18.8H // .............................e................................................................ + // gap // .............................................................................................. + // gap // .............................................................................................. + mls v6.8H, v3.8H, v7.H[0] // ....................................................................*......................... + mls v31.8H, v19.8H, v7.H[0] // ......................e....................................................................... + str q14, [x0, #432] // .............................................................................*................ + // gap // .............................................................................................. + str q22, [x0, #240] // ..........................................................................*................... + mls v26.8H, v20.8H, v7.H[0] // ...............................................................*.............................. + mul v3.8H, v10.8H, v0.H[2] // ..............................e............................................................... + // gap // .............................................................................................. + sub v4.8H, v4.8H, v5.8H // .............e................................................................................ + sqrdmulh v23.8H, v10.8H, v0.H[3] // ...............................e.............................................................. + str q2, [x0, #112] // ............................................................................................*. + // gap // .............................................................................................. 
+ sqrdmulh v19.8H, v17.8H, v0.H[7] // ...........e.................................................................................. + str q6, [x0, #368] // ............................................................................*................. + // gap // .............................................................................................. + sqdmulh v24.8H, v8.8H, v7.H[1] // ................................................e............................................. + str q26, [x0, #304] // ...........................................................................*.................. + mul v10.8H, v4.8H, v1.H[0] // ...............e.............................................................................. + sqrdmulh v20.8H, v4.8H, v1.H[1] // ................e............................................................................. + // gap // .............................................................................................. + + // original source code + // ldr q8, [x0, #0] // .......................e......................................................................|......................e.................................................................... + // ldr q9, [x0, #(1*(512/8))] // .e............................................................................................|e.......................................................................................... + // ldr q10, [x0, #(2*(512/8))] // .........e....................................................................................|........e.................................................................................. + // ldr q11, [x0, #(3*(512/8))] // ...........e..................................................................................|..........e................................................................................ 
+ // ldr q12, [x0, #(4*(512/8))] // ..............e...............................................................................|.............e............................................................................. + // ldr q13, [x0, #(5*(512/8))] // e.............................................................................................e........................................................................................... + // ldr q14, [x0, #(6*(512/8))] // ..................e...........................................................................|.................e......................................................................... + // ldr q15, [x0, #(7*(512/8))] // ......e.......................................................................................|.....e..................................................................................... + // sub v24.8h, v8.8h, v9.8h // ............................................................e.................................|...........................................................e............................... + // add v8.8h, v8.8h, v9.8h // ...............................................e..............................................|..............................................e............................................ + // mul v9.8h, v24.8h, v0.h[6] // .......................................................................e......................|......................................................................e.................... + // sqrdmulh v24.8h, v24.8h, v0.h[7] // ........................................................................................e.....|.......................................................................................e... 
+ // mls v9.8h, v24.8h, v7.h[0] // ....*.........................................................................................|...*....................................................................................... + // sub v24.8h, v10.8h, v11.8h // .....................................................................................e........|....................................................................................e...... + // add v10.8h, v10.8h, v11.8h // .........................................................e....................................|........................................................e.................................. + // mul v11.8h, v24.8h, v1.h[0] // ............................................................................................e.|........................................................................................... + // sqrdmulh v24.8h, v24.8h, v1.h[1] // .............................................................................................e|........................................................................................... + // mls v11.8h, v24.8h, v7.h[0] // ............*.................................................................................|...........*............................................................................... + // sub v24.8h, v12.8h, v13.8h // ..................................................e...........................................|.................................................e......................................... + // add v12.8h, v12.8h, v13.8h // ................................................e.............................................|...............................................e........................................... 
+ // mul v13.8h, v24.8h, v1.h[2] // ........................................................................e.....................|.......................................................................e................... + // sqrdmulh v24.8h, v24.8h, v1.h[3] // .....................................................................e........................|....................................................................e...................... + // mls v13.8h, v24.8h, v7.h[0] // ................................................................................e.............|...............................................................................e........... + // sub v24.8h, v14.8h, v15.8h // ...*..........................................................................................|..*........................................................................................ + // add v14.8h, v14.8h, v15.8h // .....................................................e........................................|....................................................e...................................... + // mul v15.8h, v24.8h, v1.h[4] // .......*......................................................................................|......*.................................................................................... + // sqrdmulh v24.8h, v24.8h, v1.h[5] // ........*.....................................................................................|.......*................................................................................... + // mls v15.8h, v24.8h, v7.h[0] // ................*.............................................................................|...............*........................................................................... 
+ // sub v24.8h, v8.8h, v10.8h // ...................................................................e..........................|..................................................................e........................ + // add v8.8h, v8.8h, v10.8h // ..............................................................................e...............|.............................................................................e............. + // mul v10.8h, v24.8h, v0.h[2] // ....................................................................................e.........|...................................................................................e....... + // sqrdmulh v24.8h, v24.8h, v0.h[3] // ......................................................................................e.......|.....................................................................................e..... + // mls v10.8h, v24.8h, v7.h[0] // ..*...........................................................................................|.*......................................................................................... + // sub v24.8h, v9.8h, v11.8h // ....................*.........................................................................|...................*....................................................................... + // add v9.8h, v9.8h, v11.8h // .....................*........................................................................|....................*...................................................................... + // mul v11.8h, v24.8h, v0.h[2] // ..............................*...............................................................|.............................*............................................................. 
+ // sqrdmulh v24.8h, v24.8h, v0.h[3] // .........................................*....................................................|........................................*.................................................. + // mls v11.8h, v24.8h, v7.h[0] // ..............................................*...............................................|.............................................*............................................. + // sub v24.8h, v12.8h, v14.8h // ..........*...................................................................................|.........*................................................................................. + // add v12.8h, v12.8h, v14.8h // ..........................................................e...................................|.........................................................e................................. + // mul v14.8h, v24.8h, v0.h[4] // .................*............................................................................|................*.......................................................................... + // sqrdmulh v24.8h, v24.8h, v0.h[5] // ...................*..........................................................................|..................*........................................................................ + // mls v14.8h, v24.8h, v7.h[0] // ..........................*...................................................................|.........................*................................................................. + // sub v24.8h, v13.8h, v15.8h // ............................*.................................................................|...........................*............................................................... 
+ // add v13.8h, v13.8h, v15.8h // .........................*....................................................................|........................*.................................................................. + // mul v15.8h, v24.8h, v0.h[4] // ....................................*.........................................................|...................................*....................................................... + // sqrdmulh v24.8h, v24.8h, v0.h[5] // ......................................*.......................................................|.....................................*..................................................... + // mls v15.8h, v24.8h, v7.h[0] // .............................................*................................................|............................................*.............................................. + // sqdmulh v25.8h, v8.8h, v7.h[1] // ..........................................................................................e...|.........................................................................................e. + // srshr v25.8h, v25.8h, #11 // .....*........................................................................................|....*...................................................................................... + // mls v8.8h, v25.8h, v7.h[0] // .............*................................................................................|............*.............................................................................. + // sqdmulh v25.8h, v12.8h, v7.h[1] // ................................................................e.............................|...............................................................e........................... 
+ // srshr v25.8h, v25.8h, #11 // ...........................................................................e..................|..........................................................................e................ + // mls v12.8h, v25.8h, v7.h[0] // ...............*..............................................................................|..............*............................................................................ + // sub v24.8h, v8.8h, v12.8h // ........................*.....................................................................|.......................*................................................................... + // add v8.8h, v8.8h, v12.8h // ......................*.......................................................................|.....................*..................................................................... + // mul v12.8h, v24.8h, v0.h[0] // ...............................*..............................................................|..............................*............................................................ + // sqrdmulh v24.8h, v24.8h, v0.h[1] // ................................*.............................................................|...............................*........................................................... + // mls v12.8h, v24.8h, v7.h[0] // ........................................*.....................................................|.......................................*................................................... + // sub v24.8h, v9.8h, v13.8h // ...........................................*..................................................|..........................................*................................................ + // add v9.8h, v9.8h, v13.8h // ............................................*.................................................|...........................................*............................................... 
+ // mul v13.8h, v24.8h, v0.h[0] // ..........................................................................*...................|.........................................................................*................. + // sqrdmulh v24.8h, v24.8h, v0.h[1] // ......................................................................*.......................|.....................................................................*..................... + // mls v13.8h, v24.8h, v7.h[0] // ...................................................................................*..........|..................................................................................*........ + // sub v24.8h, v10.8h, v14.8h // ..................................*...........................................................|.................................*......................................................... + // add v10.8h, v10.8h, v14.8h // .................................*............................................................|................................*.......................................................... + // mul v14.8h, v24.8h, v0.h[0] // ..........................................*...................................................|.........................................*................................................. + // sqrdmulh v24.8h, v24.8h, v0.h[1] // .......................................*......................................................|......................................*.................................................... + // mls v14.8h, v24.8h, v7.h[0] // ...............................................................................*..............|..............................................................................*............ 
+ // sub v24.8h, v11.8h, v15.8h // ....................................................*.........................................|...................................................*....................................... + // add v11.8h, v11.8h, v15.8h // .......................................................*......................................|......................................................*.................................... + // mul v15.8h, v24.8h, v0.h[0] // ...............................................................*..............................|..............................................................*............................ + // sqrdmulh v24.8h, v24.8h, v0.h[1] // ..................................................................*...........................|.................................................................*......................... + // mls v15.8h, v24.8h, v7.h[0] // .........................................................................*....................|........................................................................*.................. + // str q12, [x0, #(4*(512/8))] // ..................................................................................*...........|.................................................................................*......... + // str q13, [x0, #(5*(512/8))] // ...........................................................................................*..|..........................................................................................* + // str q14, [x0, #(6*(512/8))] // .........................................................................................*....|........................................................................................*.. 
+ // str q15, [x0, #(7*(512/8))] // .................................................................................*............|................................................................................*.......... + // mul v12.8h, v8.8h, v29.8h // .............................*................................................................|............................*.............................................................. + // sqrdmulh v8.8h, v8.8h, v30.8h // ...........................*..................................................................|..........................*................................................................ + // mls v12.8h, v8.8h, v7.h[0] // ...................................*..........................................................|..................................*........................................................ + // mul v13.8h, v9.8h, v29.8h // ...................................................*..........................................|..................................................*........................................ + // sqrdmulh v9.8h, v9.8h, v30.8h // .................................................*............................................|................................................*.......................................... + // mls v13.8h, v9.8h, v7.h[0] // ........................................................*.....................................|.......................................................*................................... + // mul v14.8h, v10.8h, v29.8h // ......................................................*.......................................|.....................................................*..................................... 
+ // sqrdmulh v10.8h, v10.8h, v30.8h // .....................................*........................................................|....................................*...................................................... + // mls v14.8h, v10.8h, v7.h[0] // .............................................................................*................|............................................................................*.............. + // mul v15.8h, v11.8h, v29.8h // .............................................................*................................|............................................................*.............................. + // sqrdmulh v11.8h, v11.8h, v30.8h // ...........................................................*..................................|..........................................................*................................ + // mls v15.8h, v11.8h, v7.h[0] // ....................................................................*.........................|...................................................................*....................... + // str q12, [x0], #(16) // .................................................................*............................|................................................................*.......................... + // str q13, [x0, #(-16 + 1*(512/8))] // ..............................................................*...............................|.............................................................*............................. + // str q14, [x0, #(-16 + 2*(512/8))] // .......................................................................................*......|......................................................................................*.... 
+ // str q15, [x0, #(-16 + 3*(512/8))] // ............................................................................*.................|...........................................................................*............... + + sub count, count, #1 + cbnz count, layer123_start + mls v3.8H, v23.8H, v7.H[0] // *............................................................... + sub v2.8H, v11.8H, v15.8H // .*.............................................................. + // gap // ................................................................ + // gap // ................................................................ + mls v9.8H, v19.8H, v7.H[0] // ..*............................................................. + srshr v23.8H, v24.8H, #11 // ...*............................................................ + // gap // ................................................................ + // gap // ................................................................ + mul v26.8H, v2.8H, v1.H[4] // ....*........................................................... + sqrdmulh v2.8H, v2.8H, v1.H[5] // .....*.......................................................... + // gap // ................................................................ + // gap // ................................................................ + sub v21.8H, v21.8H, v28.8H // ......*......................................................... + mls v10.8H, v20.8H, v7.H[0] // .......*........................................................ + // gap // ................................................................ + // gap // ................................................................ + mls v8.8H, v23.8H, v7.H[0] // ........*....................................................... + mls v13.8H, v16.8H, v7.H[0] // .........*...................................................... + // gap // ................................................................ 
+ // gap // ................................................................ + mls v26.8H, v2.8H, v7.H[0] // ..........*..................................................... + mul v2.8H, v21.8H, v0.H[4] // ...........*.................................................... + // gap // ................................................................ + // gap // ................................................................ + sqrdmulh v16.8H, v21.8H, v0.H[5] // ............*................................................... + sub v23.8H, v9.8H, v10.8H // .............*.................................................. + // gap // ................................................................ + // gap // ................................................................ + add v21.8H, v9.8H, v10.8H // ..............*................................................. + add v20.8H, v8.8H, v13.8H // ...............*................................................ + // gap // ................................................................ + // gap // ................................................................ + sub v17.8H, v31.8H, v26.8H // ....................*........................................... + add v26.8H, v31.8H, v26.8H // .................*.............................................. + // gap // ................................................................ + // gap // ................................................................ + mls v2.8H, v16.8H, v7.H[0] // ..................*............................................. + sub v16.8H, v8.8H, v13.8H // ................*............................................... + // gap // ................................................................ + // gap // ................................................................ + sqrdmulh v4.8H, v20.8H, v30.8H // ...................*............................................ 
+ mul v20.8H, v20.8H, v29.8H // .....................*.......................................... + // gap // ................................................................ + // gap // ................................................................ + mul v25.8H, v16.8H, v0.H[0] // .......................*........................................ + mul v11.8H, v23.8H, v0.H[2] // ......................*......................................... + // gap // ................................................................ + // gap // ................................................................ + sub v13.8H, v3.8H, v2.8H // ..........................*..................................... + add v2.8H, v3.8H, v2.8H // .........................*...................................... + // gap // ................................................................ + // gap // ................................................................ + sqrdmulh v16.8H, v16.8H, v0.H[1] // ........................*....................................... + mls v20.8H, v4.8H, v7.H[0] // ...........................*.................................... + // gap // ................................................................ + // gap // ................................................................ + mul v4.8H, v17.8H, v0.H[4] // ............................*................................... + sqrdmulh v3.8H, v2.8H, v30.8H // .............................*.................................. + // gap // ................................................................ + // gap // ................................................................ + sqrdmulh v17.8H, v17.8H, v0.H[5] // ..............................*................................. + sqrdmulh v10.8H, v13.8H, v0.H[1] // ...............................*................................ + // gap // ................................................................ 
+ // gap // ................................................................ + sqrdmulh v23.8H, v23.8H, v0.H[3] // .................................*.............................. + mls v25.8H, v16.8H, v7.H[0] // ................................*............................... + str q20, [x0], #(16) // .................................................*.............. + // gap // ................................................................ + mul v16.8H, v13.8H, v0.H[0] // ..................................*............................. + sub v20.8H, v21.8H, v26.8H // ...................................*............................ + // gap // ................................................................ + // gap // ................................................................ + add v21.8H, v21.8H, v26.8H // ....................................*........................... + mls v4.8H, v17.8H, v7.H[0] // .....................................*.......................... + // gap // ................................................................ + // gap // ................................................................ + mls v11.8H, v23.8H, v7.H[0] // ......................................*......................... + mul v2.8H, v2.8H, v29.8H // ..........................................*..................... + str q25, [x0, #240] // ...........................................................*.... + // gap // ................................................................ + sqrdmulh v23.8H, v21.8H, v30.8H // .......................................*........................ + mul v21.8H, v21.8H, v29.8H // ........................................*....................... + // gap // ................................................................ + // gap // ................................................................ + sqrdmulh v26.8H, v20.8H, v0.H[1] // ....................................................*........... 
+ mul v20.8H, v20.8H, v0.H[0] // ......................................................*......... + // gap // ................................................................ + // gap // ................................................................ + add v17.8H, v11.8H, v4.8H // ...........................................*.................... + sub v4.8H, v11.8H, v4.8H // .........................................*...................... + // gap // ................................................................ + // gap // ................................................................ + mls v2.8H, v3.8H, v7.H[0] // ........................................................*....... + mls v21.8H, v23.8H, v7.H[0] // ............................................*................... + // gap // ................................................................ + // gap // ................................................................ + sqrdmulh v23.8H, v17.8H, v30.8H // .............................................*.................. + mul v17.8H, v17.8H, v29.8H // ..............................................*................. + // gap // ................................................................ + // gap // ................................................................ + mul v25.8H, v4.8H, v0.H[0] // ................................................*............... + sqrdmulh v4.8H, v4.8H, v0.H[1] // ..................................................*............. + // gap // ................................................................ + // gap // ................................................................ + mls v16.8H, v10.8H, v7.H[0] // .........................................................*...... + mls v20.8H, v26.8H, v7.H[0] // ............................................................*... + str q21, [x0, #48] // ...............................................*................ 
+ // gap // ................................................................ + str q2, [x0, #112] // .............................................................*.. + mls v17.8H, v23.8H, v7.H[0] // ...................................................*............ + // gap // ................................................................ + // gap // ................................................................ + mls v25.8H, v4.8H, v7.H[0] // .....................................................*.......... + // gap // ................................................................ + // gap // ................................................................ + // gap // ................................................................ + str q16, [x0, #368] // ..............................................................*. + // gap // ................................................................ + // gap // ................................................................ + // gap // ................................................................ + str q17, [x0, #176] // .......................................................*........ + // gap // ................................................................ + // gap // ................................................................ + // gap // ................................................................ + str q25, [x0, #432] // ..........................................................*..... + // gap // ................................................................ + // gap // ................................................................ + // gap // ................................................................ + str q20, [x0, #304] // ...............................................................* + // gap // ................................................................ + // gap // ................................................................ 
+ // gap // ................................................................ + + // original source code + // mls v3.8H, v23.8H, v7.H[0] // *............................................................... + // sub v5.8H, v11.8H, v15.8H // .*.............................................................. + // mls v9.8H, v19.8H, v7.H[0] // ..*............................................................. + // srshr v26.8H, v24.8H, #11 // ...*............................................................ + // mul v6.8H, v5.8H, v1.H[4] // ....*........................................................... + // sqrdmulh v2.8H, v5.8H, v1.H[5] // .....*.......................................................... + // sub v24.8H, v21.8H, v28.8H // ......*......................................................... + // mls v10.8H, v20.8H, v7.H[0] // .......*........................................................ + // mls v8.8H, v26.8H, v7.H[0] // ........*....................................................... + // mls v13.8H, v16.8H, v7.H[0] // .........*...................................................... + // mls v6.8H, v2.8H, v7.H[0] // ..........*..................................................... + // mul v16.8H, v24.8H, v0.H[4] // ...........*.................................................... + // sqrdmulh v19.8H, v24.8H, v0.H[5] // ............*................................................... + // sub v28.8H, v9.8H, v10.8H // .............*.................................................. + // add v20.8H, v9.8H, v10.8H // ..............*................................................. + // add v18.8H, v8.8H, v13.8H // ...............*................................................ + // sub v2.8H, v8.8H, v13.8H // ...................*............................................ + // add v26.8H, v31.8H, v6.8H // .................*.............................................. 
+ // mls v16.8H, v19.8H, v7.H[0] // ..................*............................................. + // sqrdmulh v13.8H, v18.8H, v30.8H // ....................*........................................... + // sub v8.8H, v31.8H, v6.8H // ................*............................................... + // mul v31.8H, v18.8H, v29.8H // .....................*.......................................... + // mul v24.8H, v28.8H, v0.H[2] // .......................*........................................ + // mul v22.8H, v2.8H, v0.H[0] // ......................*......................................... + // sqrdmulh v27.8H, v2.8H, v0.H[1] // ..........................*..................................... + // add v2.8H, v3.8H, v16.8H // .........................*...................................... + // sub v25.8H, v3.8H, v16.8H // ........................*....................................... + // mls v31.8H, v13.8H, v7.H[0] // ...........................*.................................... + // mul v13.8H, v8.8H, v0.H[4] // ............................*................................... + // sqrdmulh v23.8H, v2.8H, v30.8H // .............................*.................................. + // sqrdmulh v8.8H, v8.8H, v0.H[5] // ..............................*................................. + // sqrdmulh v3.8H, v25.8H, v0.H[1] // ...............................*................................ + // mls v22.8H, v27.8H, v7.H[0] // .................................*.............................. + // sqrdmulh v19.8H, v28.8H, v0.H[3] // ................................*............................... + // mul v6.8H, v25.8H, v0.H[0] // ...................................*............................ + // sub v25.8H, v20.8H, v26.8H // ....................................*........................... + // add v26.8H, v20.8H, v26.8H // .....................................*.......................... 
+ // mls v13.8H, v8.8H, v7.H[0] // ......................................*......................... + // mls v24.8H, v19.8H, v7.H[0] // .......................................*........................ + // sqrdmulh v16.8H, v26.8H, v30.8H // ..........................................*..................... + // mul v8.8H, v26.8H, v29.8H // ...........................................*.................... + // sub v19.8H, v24.8H, v13.8H // ...............................................*................ + // mul v2.8H, v2.8H, v29.8H // ........................................*....................... + // add v24.8H, v24.8H, v13.8H // ..............................................*................. + // mls v8.8H, v16.8H, v7.H[0] // .................................................*.............. + // sqrdmulh v20.8H, v24.8H, v30.8H // ..................................................*............. + // mul v24.8H, v24.8H, v29.8H // ...................................................*............ + // str q8, [x0, #64] // ........................................................*....... + // mul v14.8H, v19.8H, v0.H[0] // ....................................................*........... + // str q31, [x0], #(16) // ..................................*............................. + // sqrdmulh v16.8H, v19.8H, v0.H[1] // .....................................................*.......... + // mls v24.8H, v20.8H, v7.H[0] // ..........................................................*..... + // sqrdmulh v20.8H, v25.8H, v0.H[1] // ............................................*................... + // mls v14.8H, v16.8H, v7.H[0] // ...........................................................*.... + // mul v26.8H, v25.8H, v0.H[0] // .............................................*.................. + // str q24, [x0, #176] // .............................................................*.. 
+ // mls v2.8H, v23.8H, v7.H[0] // ................................................*............... + // mls v6.8H, v3.8H, v7.H[0] // ......................................................*......... + // str q14, [x0, #432] // ..............................................................*. + // str q22, [x0, #240] // .........................................*...................... + // mls v26.8H, v20.8H, v7.H[0] // .......................................................*........ + // str q2, [x0, #112] // .........................................................*...... + // str q6, [x0, #368] // ............................................................*... + // str q26, [x0, #304] // ...............................................................* + + + pop_stack + ret \ No newline at end of file diff --git a/tests/ntt_kyber/manual/intt_kyber_123_4567_opt_a55.s b/tests/ntt_kyber/manual/intt_kyber_123_4567_opt_a55.s new file mode 100644 index 0000000..3552189 --- /dev/null +++ b/tests/ntt_kyber/manual/intt_kyber_123_4567_opt_a55.s @@ -0,0 +1,1516 @@ +/// +/// Copyright (c) 2022 Arm Limited +/// Copyright (c) 2022 Hanno Becker +/// Copyright (c) 2023 Amin Abdulrahman, Matthias Kannwischer +/// SPDX-License-Identifier: MIT +/// +/// Permission is hereby granted, free of charge, to any person obtaining a copy +/// of this software and associated documentation files (the "Software"), to deal +/// in the Software without restriction, including without limitation the rights +/// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +/// copies of the Software, and to permit persons to whom the Software is +/// furnished to do so, subject to the following conditions: +/// +/// The above copyright notice and this permission notice shall be included in all +/// copies or substantial portions of the Software. 
+///
+/// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+/// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+/// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+/// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+/// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+/// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+/// SOFTWARE.
+///
+
+// Needed to provide ASM_LOAD directive
+// NOTE(review): the header operand of the #include below is missing in this
+// copy (an angle-bracket name was probably stripped). Restore the header that
+// defines ASM_LOAD before assembling -- TODO confirm which header that is.
+#include
+
+// NOTE
+// We use a lot of trivial macros to simplify the parsing burden for Slothy
+// The macros are not unfolded by Slothy and thus interpreted as instructions,
+// which are easier to parse due to e.g. the lack of size specifiers and simpler
+// syntax for pre and post increment for loads and stores.
+
+// Eventually, NeLight should include a proper parser for AArch64,
+// but for initial investigations, the below is enough.
+
+// ---------------------------------------------------------------------------
+// Vector load/store wrappers.
+// `vec` is the name of a vector alias; the macro prepends `qform_` to reach the
+// matching q-register alias (the qform_* aliases are set up with .req inside
+// the function).
+// ---------------------------------------------------------------------------
+
+// ldr_vo: load qform_<vec> from [base + #offset]; base is not modified.
+.macro ldr_vo vec, base, offset
+        ldr qform_\vec, [\base, #\offset]
+.endm
+
+// ldr_vi: load qform_<vec> from [base], then advance base by #inc (post-index).
+.macro ldr_vi vec, base, inc
+        ldr qform_\vec, [\base], #\inc
+.endm
+
+// str_vo: store qform_<vec> to [base + #offset]; base is not modified.
+.macro str_vo vec, base, offset
+        str qform_\vec, [\base, #\offset]
+.endm
+// str_vi: store qform_<vec> to [base], then advance base by #inc (post-index).
+.macro str_vi vec, base, inc
+        str qform_\vec, [\base], #\inc
+.endm
+
+// ---------------------------------------------------------------------------
+// Thin arithmetic wrappers on eight 16-bit lanes (.8h).
+// The "q"-suffixed variants take the second multiplicand from a single lane
+// b.h[i]; vqrdmulh takes a full vector b.
+// ---------------------------------------------------------------------------
+
+// vqrdmulh: d = sqrdmulh(a, b), vector by vector.
+.macro vqrdmulh d,a,b
+        sqrdmulh \d\().8h, \a\().8h, \b\().8h
+.endm
+// vmlsq: d = d - a * b.h[i].
+.macro vmlsq d,a,b,i
+        mls \d\().8h, \a\().8h, \b\().h[\i]
+.endm
+// vqrdmulhq: d = sqrdmulh(a, b.h[i]).
+.macro vqrdmulhq d,a,b,i
+        sqrdmulh \d\().8h, \a\().8h, \b\().h[\i]
+.endm
+// vqdmulhq: d = sqdmulh(a, b.h[i]) (non-rounding variant).
+.macro vqdmulhq d,a,b,i
+        sqdmulh \d\().8h, \a\().8h, \b\().h[\i]
+.endm
+// vmulq: d = a * b.h[i] (low 16 bits of each product).
+.macro vmulq d,a,b,i
+        mul \d\().8h, \a\().8h, \b\().h[\i]
+.endm
+
+// mulmodq: three-instruction multiply with correction, constant taken by lane.
+//   dst = src * const.h[idx0]
+//   src = sqrdmulh(src, const.h[idx1])      // NOTE: src is overwritten
+//   dst = dst - src * consts.h[0]
+// `consts` is a fixed register alias, not a macro argument.  Judging by the
+// macro name this is a modular multiplication with const.h[idx1] being the
+// precomputed companion of const.h[idx0] -- TODO confirm against the twiddle
+// table layout.
+.macro mulmodq dst, src, const, idx0, idx1
+        vmulq \dst, \src, \const, \idx0
+        vqrdmulhq \src, \src, \const, \idx1
+        vmlsq \dst, \src, consts, 0
+.endm
+
+// mulmod: same sequence as mulmodq, but both multiplicands are full vectors
+// (`const` and its companion `const_twisted`).  src is overwritten.
+.macro mulmod dst, src, const, const_twisted
+        mul \dst\().8h, \src\().8h, \const\().8h
+        vqrdmulh \src, \src, \const_twisted
+        vmlsq \dst, \src, consts, 0
+.endm
+
+// gs_butterfly: butterfly with the multiplication applied to the difference
+// (the "gs" prefix suggests Gentleman-Sande):
+//   tmp = a - b
+//   a   = a + b
+//   b   = mulmodq(tmp, root.h[idx0], root.h[idx1])
+// Clobbers the `tmp` alias.
+.macro gs_butterfly a, b, root, idx0, idx1
+        sub tmp.8h, \a\().8h, \b\().8h
+        add \a\().8h, \a\().8h, \b\().8h
+        mulmodq \b, tmp, \root, \idx0, \idx1
+.endm
+
+// mulmod_v: instruction-for-instruction the same body as mulmod above.
+.macro mulmod_v dst, src, const, const_twisted
+        mul \dst\().8h, \src\().8h, \const\().8h
+        vqrdmulh \src, \src, \const_twisted
+        vmlsq \dst, \src, consts, 0
+.endm
+
+// gs_butterfly_v: like gs_butterfly, but the root and its companion are full
+// vectors.  It expands mulmod (whose body matches mulmod_v).  Clobbers `tmp`.
+.macro gs_butterfly_v a, b, root, root_twisted
+        sub tmp.8h, \a\().8h, \b\().8h
+        add \a\().8h, \a\().8h, \b\().8h
+        mulmod \b, tmp, \root, \root_twisted
+.endm
+
+// mul_ninv: dstN = mulmod(srcN, ninv, ninv_tw) for N = 0..3; every srcN is
+// overwritten by mulmod.
+// NOTE(review): `ninv` / `ninv_tw` are not aliased in the part of the file
+// shown here; presumably they are .req aliases for registers loaded from
+// ninv_addr / ninv_tw_addr -- confirm before using this macro.
+.macro mul_ninv dst0, dst1, dst2, dst3, src0, src1, src2, src3
+        mulmod \dst0, \src0, ninv, ninv_tw
+        mulmod \dst1, \src1, ninv, ninv_tw
+        mulmod \dst2, \src2, ninv, ninv_tw
+        mulmod \dst3, \src3, ninv, ninv_tw
+.endm
+
+// barrett_reduce: in-place reduction of `a`:
+//   t0 = sqdmulh(a, consts.h[1])
+//   t0 = srshr(t0, #11)                     // rounding shift right
+//   a  = a - t0 * consts.h[0]
+// Clobbers the `t0` alias.
+.macro barrett_reduce a
+        vqdmulhq t0, \a, consts, 1
+        srshr t0.8h, t0.8h, #11
+        vmlsq \a, t0, consts, 0
+.endm
+
+// load_roots_123: load root0 from [r_ptr0] and advance r_ptr0 by 32 bytes,
+// then load root1 from the second 16 bytes of that pair (offset -16 relative
+// to the advanced pointer).
+.macro load_roots_123
+        ldr_vi root0, r_ptr0, 32
+        ldr_vo root1, r_ptr0, -16
+.endm
+
+// load_next_roots_45: load root0 from [r_ptr0] and advance r_ptr0 by 16 bytes.
+.macro load_next_roots_45
+        ldr_vi root0, r_ptr0, 16
+.endm
+
+// load_next_roots_67: load six consecutive 16-byte vectors from r_ptr1 in the
+// order root0, root0_tw, root1, root1_tw, root2, root2_tw.  The first load
+// post-increments r_ptr1 by 6*16 bytes, so the remaining five use negative
+// offsets relative to the already advanced pointer.
+.macro load_next_roots_67
+        ldr_vi root0, r_ptr1, (6*16)
+        ldr_vo root0_tw, r_ptr1, (-6*16 + 1*16)
+        ldr_vo root1, r_ptr1, (-6*16 + 2*16)
+        ldr_vo root1_tw, r_ptr1, (-6*16 + 3*16)
+        ldr_vo root2, r_ptr1, (-6*16 + 4*16)
+        ldr_vo root2_tw, r_ptr1, (-6*16 + 5*16)
+.endm
+
+// transpose4: in-place 4x4 transpose of the 32-bit lanes held in the four
+// registers <data>0 .. <data>3.  Stage one interleaves 32-bit lanes into
+// t0..t3, stage two interleaves 64-bit halves back into <data>0..3.
+// Clobbers t0, t1, t2, t3.
+.macro transpose4 data
+        trn1 t0.4s, \data\()0.4s, \data\()1.4s
+        trn2 t1.4s, \data\()0.4s, \data\()1.4s
+        trn1 t2.4s, \data\()2.4s, \data\()3.4s
+        trn2 t3.4s, \data\()2.4s, \data\()3.4s
+
+        trn2 \data\()2.2d, t0.2d, t2.2d
+        trn2 \data\()3.2d, t1.2d, t3.2d
+        trn1 \data\()0.2d, t0.2d, t2.2d
+        trn1 \data\()1.2d, t1.2d, t3.2d
+.endm
+
+// transpose_single: only the 32-bit interleave stage of transpose4, reading
+// <data_in>0..3 and writing <data_out>0..3 (no 64-bit stage, no scratch use).
+.macro transpose_single data_out, data_in
+        trn1 \data_out\()0.4s, \data_in\()0.4s, \data_in\()1.4s
+        trn2 \data_out\()1.4s, \data_in\()0.4s, \data_in\()1.4s
+        trn1 \data_out\()2.4s, \data_in\()2.4s, \data_in\()3.4s
+        trn2 \data_out\()3.4s, \data_in\()2.4s, \data_in\()3.4s
+.endm
+
+// ---------------------------------------------------------------------------
+// Prologue / epilogue helpers.
+// ---------------------------------------------------------------------------
+
+// save_gprs: reserve 6*16 bytes on the stack and spill x19..x29 (the AAPCS64
+// callee-saved general-purpose registers).  Each pair is stored exactly once;
+// the layout must stay in sync with restore_gprs.
+.macro save_gprs // slothy:no-unfold
+        sub sp, sp, #(16*6)
+        stp x19, x20, [sp, #16*0]
+        stp x21, x22, [sp, #16*1]
+        stp x23, x24, [sp, #16*2]
+        stp x25, x26, [sp, #16*3]
+        stp x27, x28, [sp, #16*4]
+        str x29, [sp, #16*5]
+.endm
+
+// restore_gprs: reload x19..x29 from the slots written by save_gprs and
+// release the 6*16-byte area.
+.macro restore_gprs // slothy:no-unfold
+        ldp x19, x20, [sp, #16*0]
+        ldp x21, x22, [sp, #16*1]
+        ldp x23, x24, [sp, #16*2]
+        ldp x25, x26, [sp, #16*3]
+        ldp x27, x28, [sp, #16*4]
+        ldr x29, [sp, #16*5]
+        add sp, sp, #(16*6)
+.endm
+
+// save_vregs: reserve 4*16 bytes and spill d8..d15 (the low 64 bits of
+// v8..v15, which is the part AAPCS64 requires a callee to preserve).
+.macro save_vregs // slothy:no-unfold
+        sub sp, sp, #(16*4)
+        stp d8, d9, [sp, #16*0]
+        stp d10, d11, [sp, #16*1]
+        stp d12, d13, [sp, #16*2]
+        stp d14, d15, [sp, #16*3]
+.endm
+
+// restore_vregs: reload d8..d15 and release the 4*16-byte area.
+.macro restore_vregs // slothy:no-unfold
+        ldp d8, d9, [sp, #16*0]
+        ldp d10, d11, [sp, #16*1]
+        ldp d12, d13, [sp, #16*2]
+        ldp d14, d15, [sp, #16*3]
+        add sp, sp, #(16*4)
+.endm
+
+// Scratch area that push_stack reserves below the saved registers:
+// STACK_SIZE bytes in total, STACK0 is the byte offset of its first slot.
+#define STACK_SIZE 16
+#define STACK0 0
+
+// restore: load register `a` from the stack slot at byte offset `loc`.
+.macro restore a, loc // slothy:no-unfold
+        ldr \a, [sp, #\loc\()]
+.endm
+// save: store register `a` to the stack slot at byte offset `loc`
+// (note the argument order is the reverse of `restore`).
+.macro save loc, a // slothy:no-unfold
+        str \a, [sp, #\loc\()]
+.endm
+// push_stack: full prologue -- spill GPRs, spill vector registers, then
+// reserve the STACK_SIZE-byte scratch area.
+.macro push_stack // slothy:no-unfold
+        save_gprs
+        save_vregs
+        sub sp, sp, #STACK_SIZE
+.endm
+
+// pop_stack: full epilogue -- exact mirror of push_stack.
+.macro pop_stack // slothy:no-unfold
+        add sp, sp, #STACK_SIZE
+        restore_vregs
+        restore_gprs
+.endm
+
+// Twiddle-factor table, pulled in from a separate file.
+.data
+.p2align 4
+roots:
+#include "intt_kyber_123_45_67_twiddles.s"
+.text
+
+        .global intt_kyber_123_4567_opt_a55
+        .global _intt_kyber_123_4567_opt_a55
+
+// Constant vectors, eight 16-bit entries each.
+// const_addr: entry 0 = 3329 (presumably the Kyber modulus), entry 1 = 20159,
+// remaining entries zero.  The macros above read these as consts.h[0] and
+// consts.h[1] once the function has loaded this table into `consts`.
+.p2align 4
+const_addr: .short 3329
+        .short 20159
+        .short 0
+        .short 0
+        .short 0
+        .short 0
+        .short 0
+        .short 0
+ninv_addr: .short 512
+        .short 512
+        .short 512
+        .short 512
+        .short 512
+        .short 512
+        .short 512
+        .short 512
+ninv_tw_addr: .short 5040
+        .short 5040
+        .short 5040
+        .short 5040
+        .short 5040
+        .short 5040
+        .short 5040
+        .short 5040
+
+intt_kyber_123_4567_opt_a55:
+_intt_kyber_123_4567_opt_a55:
+        push_stack
+
+        in .req x0
+        inp .req x1
+        count .req x2
+        r_ptr0 .req x3
+        r_ptr1 .req x4
+        xtmp .req x5
+
+        qform_v0 .req q0
+        qform_v1 .req q1
+        qform_v2 .req q2
+        qform_v3 .req q3
+        qform_v4 .req q4
+        qform_v5 .req q5
+        qform_v6 .req q6
+        qform_v7 .req q7
+        qform_v8 .req q8
+        qform_v9 .req q9
+        qform_v10 .req q10
+
qform_v11 .req q11 + qform_v12 .req q12 + qform_v13 .req q13 + qform_v14 .req q14 + qform_v15 .req q15 + qform_v16 .req q16 + qform_v17 .req q17 + qform_v18 .req q18 + qform_v19 .req q19 + qform_v20 .req q20 + qform_v21 .req q21 + qform_v22 .req q22 + qform_v23 .req q23 + qform_v24 .req q24 + qform_v25 .req q25 + qform_v26 .req q26 + qform_v27 .req q27 + qform_v28 .req q28 + qform_v29 .req q29 + qform_v30 .req q30 + qform_v31 .req q31 + + data0 .req v8 + data1 .req v9 + data2 .req v10 + data3 .req v11 + data4 .req v12 + data5 .req v13 + data6 .req v14 + data7 .req v15 + + x_00 .req x10 + x_01 .req x11 + x_10 .req x12 + x_11 .req x13 + x_20 .req x14 + x_21 .req x15 + x_30 .req x16 + x_31 .req x17 + + xt_00 .req x_00 + xt_01 .req x_20 + xt_10 .req x_10 + xt_11 .req x_30 + xt_20 .req x_01 + xt_21 .req x_21 + xt_30 .req x_11 + xt_31 .req x_31 + + qform_data0 .req q8 + qform_data1 .req q9 + qform_data2 .req q10 + qform_data3 .req q11 + qform_data4 .req q12 + qform_data5 .req q13 + qform_data6 .req q14 + qform_data7 .req q15 + + root0 .req v0 + root1 .req v1 + root2 .req v2 + root0_tw .req v4 + root1_tw .req v5 + root2_tw .req v6 + + consts .req v7 + qform_consts .req q7 + + qform_root0 .req q0 + qform_root1 .req q1 + qform_root2 .req q2 + qform_root0_tw .req q4 + qform_root1_tw .req q5 + qform_root2_tw .req q6 + + tmp .req v24 + t0 .req v25 + t1 .req v26 + t2 .req v27 + t3 .req v28 + + ASM_LOAD(r_ptr0, roots_l34) + ASM_LOAD(r_ptr1, roots_l56) + + ASM_LOAD(xtmp, const_addr) + ld1 {consts.8h}, [xtmp] + + save STACK0, in + + mov inp, in + mov count, #8 + + .p2align 2 + ldr q8, [x1, #0] // *.......... + // gap // ........... + // gap // ........... + // gap // ........... + ldr q26, [x1, #16] // .*......... + // gap // ........... + // gap // ........... + // gap // ........... + ldr q28, [x1, #32] // ..*........ + // gap // ........... + // gap // ........... + // gap // ........... + ldr q18, [x1, #48] // ...*....... + // gap // ........... + // gap // ........... 
+ // gap // ........... + ldr q5, [x4], #(6*16) // .....*..... + // gap // ........... + // gap // ........... + // gap // ........... + trn1 v19.4S, v28.4S, v18.4S // ....*...... + // gap // ........... + ldr q9, [x4, #-80] // ......*.... + // gap // ........... + // gap // ........... + // gap // ........... + ldr q17, [x4, #-64] // .......*... + // gap // ........... + // gap // ........... + // gap // ........... + ldr q4, [x4, #-48] // ........*.. + // gap // ........... + // gap // ........... + // gap // ........... + ldr q21, [x4, #-32] // .........*. + // gap // ........... + // gap // ........... + // gap // ........... + ldr q1, [x4, #-16] // ..........* + // gap // ........... + + // original source code + // ldr q8, [x1, #0] // *.......... + // ldr q26, [x1, #16] // .*......... + // ldr q28, [x1, #32] // ..*........ + // ldr q18, [x1, #48] // ...*....... + // trn1 v19.4S, v28.4S, v18.4S // .....*..... + // ldr q5, [x4], #(6*16) // ....*...... + // ldr q9, [x4, #-80] // ......*.... + // ldr q17, [x4, #-64] // .......*... + // ldr q4, [x4, #-48] // ........*.. + // ldr q21, [x4, #-32] // .........*. + // ldr q1, [x4, #-16] // ..........* + + sub count, count, #1 +layer4567_start: + trn1 v23.4S, v8.4S, v26.4S // ....*........................................................................ + // gap // ............................................................................. + trn2 v8.4S, v8.4S, v26.4S // .....*....................................................................... + // gap // ............................................................................. + trn2 v12.4S, v28.4S, v18.4S // .......*..................................................................... + // gap // ............................................................................. + trn2 v18.2D, v23.2D, v19.2D // ........*.................................................................... 
+ // gap // ............................................................................. + trn1 v23.2D, v23.2D, v19.2D // ..........*.................................................................. + // gap // ............................................................................. + trn2 v26.2D, v8.2D, v12.2D // .........*................................................................... + // gap // ............................................................................. + trn1 v8.2D, v8.2D, v12.2D // ...........*................................................................. + // gap // ............................................................................. + sub v12.8H, v18.8H, v26.8H // .......................*..................................................... + // gap // ............................................................................. + add v18.8H, v18.8H, v26.8H // ........................*.................................................... + // gap // ............................................................................. + sub v26.8H, v23.8H, v8.8H // ..................*.......................................................... + // gap // ............................................................................. + add v23.8H, v23.8H, v8.8H // ...................*......................................................... + // gap // ............................................................................. + mul v8.8H, v12.8H, v21.8H // .........................*................................................... + // gap // ............................................................................. + mul v17.8H, v26.8H, v17.8H // ....................*........................................................ + // gap // ............................................................................. 
+ sqrdmulh v26.8H, v26.8H, v4.8H // .....................*....................................................... + // gap // ............................................................................. + sqrdmulh v12.8H, v12.8H, v1.8H // ..........................*.................................................. + // gap // ............................................................................. + sub v28.8H, v23.8H, v18.8H // ............................*................................................ + // gap // ............................................................................. + add v23.8H, v23.8H, v18.8H // .............................*............................................... + // gap // ............................................................................. + mls v17.8H, v26.8H, v7.H[0] // ......................*...................................................... + // gap // ............................................................................. + mls v8.8H, v12.8H, v7.H[0] // ...........................*................................................. + // gap // ............................................................................. + mul v12.8H, v28.8H, v5.8H // ..............................*.............................................. + // gap // ............................................................................. + sqrdmulh v18.8H, v28.8H, v9.8H // ...............................*............................................. + // gap // ............................................................................. + ldr q4, [x3], #16 // ..............................................*.............................. + // gap // ............................................................................. + // gap // ............................................................................. + // gap // ............................................................................. 
+ sub v26.8H, v17.8H, v8.8H // .................................*........................................... + // gap // ............................................................................. + mls v12.8H, v18.8H, v7.H[0] // ................................*............................................ + // gap // ............................................................................. + add v8.8H, v17.8H, v8.8H // ..................................*.......................................... + // gap // ............................................................................. + mul v18.8H, v26.8H, v5.8H // ...................................*......................................... + // gap // ............................................................................. + sqrdmulh v26.8H, v26.8H, v9.8H // ....................................*........................................ + // gap // ............................................................................. + trn1 v17.4S, v23.4S, v8.4S // ......................................*...................................... + // gap // ............................................................................. + trn2 v23.4S, v23.4S, v8.4S // .......................................*..................................... + // gap // ............................................................................. + ldr q8, [x1, #64] // e............................................................................ + // gap // ............................................................................. + // gap // ............................................................................. + // gap // ............................................................................. + mls v18.8H, v26.8H, v7.H[0] // .....................................*....................................... + // gap // ............................................................................. 
+ ldr q26, [x1, #80] // .e........................................................................... + // gap // ............................................................................. + // gap // ............................................................................. + // gap // ............................................................................. + ldr q28, [x1, #96] // ..e.......................................................................... + // gap // ............................................................................. + // gap // ............................................................................. + // gap // ............................................................................. + trn1 v19.4S, v12.4S, v18.4S // ........................................*.................................... + // gap // ............................................................................. + trn2 v12.4S, v12.4S, v18.4S // .........................................*................................... + // gap // ............................................................................. + ldr q18, [x1, #112] // ...e......................................................................... + // gap // ............................................................................. + // gap // ............................................................................. + // gap // ............................................................................. + trn2 v5.2D, v17.2D, v19.2D // ..........................................*.................................. + // gap // ............................................................................. + trn2 v9.2D, v23.2D, v12.2D // ...........................................*................................. + // gap // ............................................................................. 
+ trn1 v17.2D, v17.2D, v19.2D // ............................................*................................ + // gap // ............................................................................. + trn1 v23.2D, v23.2D, v12.2D // .............................................*............................... + // gap // ............................................................................. + sub v12.8H, v5.8H, v9.8H // ....................................................*........................ + // gap // ............................................................................. + sub v19.8H, v17.8H, v23.8H // ...............................................*............................. + // gap // ............................................................................. + add v23.8H, v17.8H, v23.8H // ................................................*............................ + // gap // ............................................................................. + add v17.8H, v5.8H, v9.8H // .....................................................*....................... + // gap // ............................................................................. + mul v5.8H, v19.8H, v4.H[2] // .................................................*........................... + // gap // ............................................................................. + sqrdmulh v19.8H, v19.8H, v4.H[3] // ..................................................*.......................... + // gap // ............................................................................. + mul v9.8H, v12.8H, v4.H[4] // ......................................................*...................... + // gap // ............................................................................. + sqdmulh v21.8H, v23.8H, v7.H[1] // .........................................................*................... 
+ // gap // ............................................................................. + sqdmulh v1.8H, v17.8H, v7.H[1] // ............................................................*................ + // gap // ............................................................................. + sqrdmulh v12.8H, v12.8H, v4.H[5] // .......................................................*..................... + // gap // ............................................................................. + mls v5.8H, v19.8H, v7.H[0] // ...................................................*......................... + // gap // ............................................................................. + srshr v19.8H, v21.8H, #11 // ..........................................................*.................. + // gap // ............................................................................. + srshr v21.8H, v1.8H, #11 // .............................................................*............... + // gap // ............................................................................. + mls v9.8H, v12.8H, v7.H[0] // ........................................................*.................... + // gap // ............................................................................. + mls v23.8H, v19.8H, v7.H[0] // ...........................................................*................. + // gap // ............................................................................. + mls v17.8H, v21.8H, v7.H[0] // ..............................................................*.............. + // gap // ............................................................................. + trn1 v19.4S, v28.4S, v18.4S // ......e...................................................................... + // gap // ............................................................................. 
+ sub v12.8H, v5.8H, v9.8H // ....................................................................*........ + // gap // ............................................................................. + add v5.8H, v5.8H, v9.8H // .....................................................................*....... + // gap // ............................................................................. + sub v9.8H, v23.8H, v17.8H // ...............................................................*............. + // gap // ............................................................................. + mul v21.8H, v12.8H, v4.H[0] // ......................................................................*...... + // gap // ............................................................................. + sqrdmulh v12.8H, v12.8H, v4.H[1] // .......................................................................*..... + // gap // ............................................................................. + mul v1.8H, v9.8H, v4.H[0] // .................................................................*........... + // gap // ............................................................................. + sqrdmulh v4.8H, v9.8H, v4.H[1] // ..................................................................*.......... + // gap // ............................................................................. + add v23.8H, v23.8H, v17.8H // ................................................................*............ + // gap // ............................................................................. + mls v21.8H, v12.8H, v7.H[0] // ........................................................................*.... + // gap // ............................................................................. + str q5, [x1, #16] // ..........................................................................*.. + // gap // ............................................................................. 
+ mls v1.8H, v4.8H, v7.H[0] // ...................................................................*......... + // gap // ............................................................................. + str q23, [x1], #(64) // .........................................................................*... + // gap // ............................................................................. + ldr q5, [x4], #(6*16) // ............e................................................................ + // gap // ............................................................................. + // gap // ............................................................................. + // gap // ............................................................................. + str q1, [x1, #-32] // ...........................................................................*. + // gap // ............................................................................. + ldr q9, [x4, #-80] // .............e............................................................... + // gap // ............................................................................. + // gap // ............................................................................. + // gap // ............................................................................. + str q21, [x1, #-16] // ............................................................................* + // gap // ............................................................................. + ldr q17, [x4, #-64] // ..............e.............................................................. + // gap // ............................................................................. + // gap // ............................................................................. + // gap // ............................................................................. + ldr q4, [x4, #-48] // ...............e............................................................. 
+ // gap // ............................................................................. + // gap // ............................................................................. + // gap // ............................................................................. + ldr q21, [x4, #-32] // ................e............................................................ + // gap // ............................................................................. + // gap // ............................................................................. + // gap // ............................................................................. + ldr q1, [x4, #-16] // .................e........................................................... + // gap // ............................................................................. + // gap // ............................................................................. + // gap // ............................................................................. + + // original source code + // ldr q8, [x1, #(16*0)] // e...............................................|............................e........................................... + // ldr q9, [x1, #(16*1)] // ..e.............................................|..............................e......................................... + // ldr q10, [x1, #(16*2)] // ...e............................................|...............................e........................................ + // ldr q11, [x1, #(16*3)] // ......e.........................................|..................................e..................................... + // trn1 v25.4s, v8.4s, v9.4s // ................................................*........................................................................ + // trn2 v26.4s, v8.4s, v9.4s // ................................................|*....................................................................... 
+ // trn1 v27.4s, v10.4s, v11.4s // ...........................e....................|.......................................................e................ + // trn2 v28.4s, v10.4s, v11.4s // ................................................|.*...................................................................... + // trn2 v10.2d, v25.2d, v27.2d // ................................................|..*..................................................................... + // trn2 v11.2d, v26.2d, v28.2d // ................................................|....*................................................................... + // trn1 v8.2d, v25.2d, v27.2d // ................................................|...*.................................................................... + // trn1 v9.2d, v26.2d, v28.2d // ................................................|.....*.................................................................. + // ldr q0, [x4], #(6*16) // ........................................e.......|....................................................................e... + // ldr q4, [x4, #(-6*16 + 1*16)] // ..........................................e.....|......................................................................e. + // ldr q1, [x4, #(-6*16 + 2*16)] // ............................................e...|........................................................................ + // ldr q5, [x4, #(-6*16 + 3*16)] // .............................................e..|........................................................................ + // ldr q2, [x4, #(-6*16 + 4*16)] // ..............................................e.|........................................................................ + // ldr q6, [x4, #(-6*16 + 5*16)] // ...............................................e|........................................................................ 
+ // sub v24.8h, v8.8h, v9.8h // ................................................|........*............................................................... + // add v8.8h, v8.8h, v9.8h // ................................................|.........*.............................................................. + // mul v9.8h, v24.8h, v1.8h // ................................................|...........*............................................................ + // sqrdmulh v24.8h, v24.8h, v5.8h // ................................................|............*........................................................... + // mls v9.8h, v24.8h, v7.h[0] // ................................................|................*....................................................... + // sub v24.8h, v10.8h, v11.8h // ................................................|......*................................................................. + // add v10.8h, v10.8h, v11.8h // ................................................|.......*................................................................ + // mul v11.8h, v24.8h, v2.8h // ................................................|..........*............................................................. + // sqrdmulh v24.8h, v24.8h, v6.8h // ................................................|.............*.......................................................... + // mls v11.8h, v24.8h, v7.h[0] // ................................................|.................*...................................................... + // sub v24.8h, v8.8h, v10.8h // ................................................|..............*......................................................... + // add v8.8h, v8.8h, v10.8h // ................................................|...............*........................................................ 
+ // mul v10.8h, v24.8h, v0.8h // ................................................|..................*..................................................... + // sqrdmulh v24.8h, v24.8h, v4.8h // ................................................|...................*.................................................... + // mls v10.8h, v24.8h, v7.h[0] // ................................................|......................*................................................. + // sub v24.8h, v9.8h, v11.8h // ................................................|.....................*.................................................. + // add v9.8h, v9.8h, v11.8h // ................................................|.......................*................................................ + // mul v11.8h, v24.8h, v0.8h // ................................................|........................*............................................... + // sqrdmulh v24.8h, v24.8h, v4.8h // ................................................|.........................*.............................................. + // mls v11.8h, v24.8h, v7.h[0] // .*..............................................|.............................*.......................................... + // trn1 v25.4s, v8.4s, v9.4s // ................................................|..........................*............................................. + // trn2 v26.4s, v8.4s, v9.4s // ................................................|...........................*............................................ + // trn1 v27.4s, v10.4s, v11.4s // ....*...........................................|................................*....................................... + // trn2 v28.4s, v10.4s, v11.4s // .....*..........................................|.................................*...................................... 
+ // trn2 v10.2d, v25.2d, v27.2d // .......*........................................|...................................*.................................... + // trn2 v11.2d, v26.2d, v28.2d // ........*.......................................|....................................*................................... + // trn1 v8.2d, v25.2d, v27.2d // .........*......................................|.....................................*.................................. + // trn1 v9.2d, v26.2d, v28.2d // ..........*.....................................|......................................*................................. + // ldr q0, [x3], #16 // ................................................|....................*................................................... + // sub v24.8h, v8.8h, v9.8h // ............*...................................|........................................*............................... + // add v8.8h, v8.8h, v9.8h // .............*..................................|.........................................*.............................. + // mul v9.8h, v24.8h, v0.h[2] // ...............*................................|...........................................*............................ + // sqrdmulh v24.8h, v24.8h, v0.h[3] // ................*...............................|............................................*........................... + // mls v9.8h, v24.8h, v7.h[0] // .....................*..........................|.................................................*...................... + // sub v24.8h, v10.8h, v11.8h // ...........*....................................|.......................................*................................ + // add v10.8h, v10.8h, v11.8h // ..............*.................................|..........................................*............................. 
+ // mul v11.8h, v24.8h, v0.h[4] // .................*..............................|.............................................*.......................... + // sqrdmulh v24.8h, v24.8h, v0.h[5] // ....................*...........................|................................................*....................... + // mls v11.8h, v24.8h, v7.h[0] // ........................*.......................|....................................................*................... + // sqdmulh v25.8h, v8.8h, v7.h[1] // ..................*.............................|..............................................*......................... + // srshr v25.8h, v25.8h, #11 // ......................*.........................|..................................................*..................... + // mls v8.8h, v25.8h, v7.h[0] // .........................*......................|.....................................................*.................. + // sqdmulh v25.8h, v10.8h, v7.h[1] // ...................*............................|...............................................*........................ + // srshr v25.8h, v25.8h, #11 // .......................*........................|...................................................*.................... + // mls v10.8h, v25.8h, v7.h[0] // ..........................*.....................|......................................................*................. + // sub v24.8h, v8.8h, v10.8h // ..............................*.................|..........................................................*............. + // add v8.8h, v8.8h, v10.8h // ...................................*............|...............................................................*........ + // mul v10.8h, v24.8h, v0.h[0] // .................................*..............|.............................................................*.......... 
+ // sqrdmulh v24.8h, v24.8h, v0.h[1] // ..................................*.............|..............................................................*......... + // mls v10.8h, v24.8h, v7.h[0] // ......................................*.........|..................................................................*..... + // sub v24.8h, v9.8h, v11.8h // ............................*...................|........................................................*............... + // add v9.8h, v9.8h, v11.8h // .............................*..................|.........................................................*.............. + // mul v11.8h, v24.8h, v0.h[0] // ...............................*................|...........................................................*............ + // sqrdmulh v24.8h, v24.8h, v0.h[1] // ................................*...............|............................................................*........... + // mls v11.8h, v24.8h, v7.h[0] // ....................................*...........|................................................................*....... + // str q8, [x1], #(64) // .......................................*........|...................................................................*.... + // str q9, [x1, #(-64 + 16*1)] // .....................................*..........|.................................................................*...... + // str q10, [x1, #(-64 + 16*2)] // .........................................*......|.....................................................................*.. + // str q11, [x1, #(-64 + 16*3)] // ...........................................*....|.......................................................................* + + sub count, count, #1 + cbnz count, layer4567_start + trn1 v3.4S, v8.4S, v26.4S // *................................................................. + // gap // .................................................................. 
+ trn2 v26.4S, v8.4S, v26.4S // .*................................................................ + // gap // .................................................................. + trn2 v8.4S, v28.4S, v18.4S // ..*............................................................... + // gap // .................................................................. + trn2 v29.2D, v3.2D, v19.2D // ...*.............................................................. + // gap // .................................................................. + trn1 v18.2D, v3.2D, v19.2D // ....*............................................................. + // gap // .................................................................. + trn2 v23.2D, v26.2D, v8.2D // .....*............................................................ + // gap // .................................................................. + trn1 v28.2D, v26.2D, v8.2D // ......*........................................................... + // gap // .................................................................. + sub v8.8H, v29.8H, v23.8H // .......*.......................................................... + // gap // .................................................................. + add v19.8H, v29.8H, v23.8H // ........*......................................................... + // gap // .................................................................. + sub v12.8H, v18.8H, v28.8H // .........*........................................................ + // gap // .................................................................. + sqrdmulh v23.8H, v8.8H, v1.8H // ..............*................................................... + // gap // .................................................................. + mul v21.8H, v8.8H, v21.8H // ...........*...................................................... + // gap // .................................................................. 
+ mul v26.8H, v12.8H, v17.8H // ............*..................................................... + // gap // .................................................................. + sqrdmulh v12.8H, v12.8H, v4.8H // .............*.................................................... + // gap // .................................................................. + add v17.8H, v18.8H, v28.8H // ..........*....................................................... + // gap // .................................................................. + mls v21.8H, v23.8H, v7.H[0] // ..................*............................................... + // gap // .................................................................. + // gap // .................................................................. + // gap // .................................................................. + mls v26.8H, v12.8H, v7.H[0] // .................*................................................ + // gap // .................................................................. + sub v8.8H, v17.8H, v19.8H // ...............*.................................................. + // gap // .................................................................. + ldr q1, [x3], #16 // .....................*............................................ + // gap // .................................................................. + // gap // .................................................................. + // gap // .................................................................. + sub v23.8H, v26.8H, v21.8H // ......................*........................................... + // gap // .................................................................. + sqrdmulh v18.8H, v8.8H, v9.8H // ....................*............................................. + // gap // .................................................................. 
+ mul v28.8H, v8.8H, v5.8H // ...................*.............................................. + // gap // .................................................................. + mul v8.8H, v23.8H, v5.8H // .........................*........................................ + // gap // .................................................................. + sqrdmulh v23.8H, v23.8H, v9.8H // ..........................*....................................... + // gap // .................................................................. + add v12.8H, v26.8H, v21.8H // ........................*......................................... + // gap // .................................................................. + add v9.8H, v17.8H, v19.8H // ................*................................................. + // gap // .................................................................. + mls v28.8H, v18.8H, v7.H[0] // .......................*.......................................... + // gap // .................................................................. + mls v8.8H, v23.8H, v7.H[0] // .............................*.................................... + // gap // .................................................................. + trn1 v17.4S, v9.4S, v12.4S // ...........................*...................................... + // gap // .................................................................. + trn2 v26.4S, v9.4S, v12.4S // ............................*..................................... + // gap // .................................................................. + // gap // .................................................................. + // gap // .................................................................. + trn2 v12.4S, v28.4S, v8.4S // ...............................*.................................. + // gap // .................................................................. 
+ trn1 v23.4S, v28.4S, v8.4S // ..............................*................................... + // gap // .................................................................. + // gap // .................................................................. + // gap // .................................................................. + trn2 v8.2D, v26.2D, v12.2D // .................................*................................ + // gap // .................................................................. + trn1 v12.2D, v26.2D, v12.2D // ...................................*.............................. + // gap // .................................................................. + trn1 v29.2D, v17.2D, v23.2D // ..................................*............................... + // gap // .................................................................. + trn2 v18.2D, v17.2D, v23.2D // ................................*................................. + // gap // .................................................................. + sub v17.8H, v29.8H, v12.8H // .....................................*............................ + // gap // .................................................................. + sub v23.8H, v18.8H, v8.8H // ....................................*............................. + // gap // .................................................................. + add v28.8H, v18.8H, v8.8H // .......................................*.......................... + // gap // .................................................................. + sqrdmulh v18.8H, v17.8H, v1.H[3] // .........................................*........................ + // gap // .................................................................. + add v4.8H, v29.8H, v12.8H // ......................................*........................... + // gap // .................................................................. 
+ mul v26.8H, v23.8H, v1.H[4] // ..........................................*....................... + // gap // .................................................................. + sqrdmulh v23.8H, v23.8H, v1.H[5] // .............................................*.................... + // gap // .................................................................. + sqdmulh v12.8H, v4.8H, v7.H[1] // ...........................................*...................... + // gap // .................................................................. + sqdmulh v8.8H, v28.8H, v7.H[1] // ............................................*..................... + // gap // .................................................................. + mul v21.8H, v17.8H, v1.H[2] // ........................................*......................... + // gap // .................................................................. + mls v26.8H, v23.8H, v7.H[0] // .................................................*................ + // gap // .................................................................. + srshr v12.8H, v12.8H, #11 // ...............................................*.................. + // gap // .................................................................. + srshr v23.8H, v8.8H, #11 // ................................................*................. + // gap // .................................................................. + mls v21.8H, v18.8H, v7.H[0] // ..............................................*................... + // gap // .................................................................. + mls v4.8H, v12.8H, v7.H[0] // ..................................................*............... + // gap // .................................................................. + mls v28.8H, v23.8H, v7.H[0] // ...................................................*.............. + // gap // .................................................................. 
+ // gap // .................................................................. + // gap // .................................................................. + sub v18.8H, v21.8H, v26.8H // ....................................................*............. + // gap // .................................................................. + add v17.8H, v21.8H, v26.8H // .....................................................*............ + // gap // .................................................................. + sub v12.8H, v4.8H, v28.8H // ......................................................*........... + // gap // .................................................................. + mul v26.8H, v18.8H, v1.H[0] // .......................................................*.......... + // gap // .................................................................. + sqrdmulh v18.8H, v18.8H, v1.H[1] // ........................................................*......... + // gap // .................................................................. + sqrdmulh v8.8H, v12.8H, v1.H[1] // ..........................................................*....... + // gap // .................................................................. + add v23.8H, v4.8H, v28.8H // ...........................................................*...... + // gap // .................................................................. + mul v12.8H, v12.8H, v1.H[0] // .........................................................*........ + // gap // .................................................................. + str q17, [x1, #16] // .............................................................*.... + // gap // .................................................................. + mls v26.8H, v18.8H, v7.H[0] // ............................................................*..... + // gap // .................................................................. 
+ str q23, [x1], #(64) // ...............................................................*.. + // gap // .................................................................. + mls v12.8H, v8.8H, v7.H[0] // ..............................................................*... + // gap // .................................................................. + // gap // .................................................................. + // gap // .................................................................. + str q26, [x1, #-16] // .................................................................* + // gap // .................................................................. + // gap // .................................................................. + // gap // .................................................................. + str q12, [x1, #-32] // ................................................................*. + // gap // .................................................................. + + // original source code + // trn1 v23.4S, v8.4S, v26.4S // *................................................................. + // trn2 v8.4S, v8.4S, v26.4S // .*................................................................ + // trn2 v12.4S, v28.4S, v18.4S // ..*............................................................... + // trn2 v18.2D, v23.2D, v19.2D // ...*.............................................................. + // trn1 v23.2D, v23.2D, v19.2D // ....*............................................................. + // trn2 v26.2D, v8.2D, v12.2D // .....*............................................................ + // trn1 v8.2D, v8.2D, v12.2D // ......*........................................................... + // sub v12.8H, v18.8H, v26.8H // .......*.......................................................... + // add v18.8H, v18.8H, v26.8H // ........*......................................................... 
+ // sub v26.8H, v23.8H, v8.8H // .........*........................................................ + // add v23.8H, v23.8H, v8.8H // ..............*................................................... + // mul v8.8H, v12.8H, v21.8H // ...........*...................................................... + // mul v17.8H, v26.8H, v17.8H // ............*..................................................... + // sqrdmulh v26.8H, v26.8H, v4.8H // .............*.................................................... + // sqrdmulh v12.8H, v12.8H, v1.8H // ..........*....................................................... + // sub v28.8H, v23.8H, v18.8H // .................*................................................ + // add v23.8H, v23.8H, v18.8H // .........................*........................................ + // mls v17.8H, v26.8H, v7.H[0] // ................*................................................. + // mls v8.8H, v12.8H, v7.H[0] // ...............*.................................................. + // mul v12.8H, v28.8H, v5.8H // .....................*............................................ + // sqrdmulh v18.8H, v28.8H, v9.8H // ....................*............................................. + // ldr q4, [x3], #16 // ..................*............................................... + // sub v26.8H, v17.8H, v8.8H // ...................*.............................................. + // mls v12.8H, v18.8H, v7.H[0] // ..........................*....................................... + // add v8.8H, v17.8H, v8.8H // ........................*......................................... + // mul v18.8H, v26.8H, v5.8H // ......................*........................................... + // sqrdmulh v26.8H, v26.8H, v9.8H // .......................*.......................................... + // trn1 v17.4S, v23.4S, v8.4S // ............................*..................................... 
+ // trn2 v23.4S, v23.4S, v8.4S // .............................*.................................... + // mls v18.8H, v26.8H, v7.H[0] // ...........................*...................................... + // trn1 v19.4S, v12.4S, v18.4S // ...............................*.................................. + // trn2 v12.4S, v12.4S, v18.4S // ..............................*................................... + // trn2 v5.2D, v17.2D, v19.2D // ...................................*.............................. + // trn2 v9.2D, v23.2D, v12.2D // ................................*................................. + // trn1 v17.2D, v17.2D, v19.2D // ..................................*............................... + // trn1 v23.2D, v23.2D, v12.2D // .................................*................................ + // sub v12.8H, v5.8H, v9.8H // .....................................*............................ + // sub v19.8H, v17.8H, v23.8H // ....................................*............................. + // add v23.8H, v17.8H, v23.8H // ........................................*......................... + // add v17.8H, v5.8H, v9.8H // ......................................*........................... + // mul v5.8H, v19.8H, v4.H[2] // .............................................*.................... + // sqrdmulh v19.8H, v19.8H, v4.H[3] // .......................................*.......................... + // mul v9.8H, v12.8H, v4.H[4] // .........................................*........................ + // sqdmulh v21.8H, v23.8H, v7.H[1] // ...........................................*...................... + // sqdmulh v1.8H, v17.8H, v7.H[1] // ............................................*..................... + // sqrdmulh v12.8H, v12.8H, v4.H[5] // ..........................................*....................... + // mls v5.8H, v19.8H, v7.H[0] // .................................................*................ 
+ // srshr v19.8H, v21.8H, #11 // ...............................................*.................. + // srshr v21.8H, v1.8H, #11 // ................................................*................. + // mls v9.8H, v12.8H, v7.H[0] // ..............................................*................... + // mls v23.8H, v19.8H, v7.H[0] // ..................................................*............... + // mls v17.8H, v21.8H, v7.H[0] // ...................................................*.............. + // sub v12.8H, v5.8H, v9.8H // ....................................................*............. + // add v5.8H, v5.8H, v9.8H // .....................................................*............ + // sub v9.8H, v23.8H, v17.8H // ......................................................*........... + // mul v21.8H, v12.8H, v4.H[0] // .......................................................*.......... + // sqrdmulh v12.8H, v12.8H, v4.H[1] // ........................................................*......... + // mul v1.8H, v9.8H, v4.H[0] // ...........................................................*...... + // sqrdmulh v4.8H, v9.8H, v4.H[1] // .........................................................*........ + // add v23.8H, v23.8H, v17.8H // ..........................................................*....... + // mls v21.8H, v12.8H, v7.H[0] // .............................................................*.... + // str q5, [x1, #16] // ............................................................*..... + // mls v1.8H, v4.8H, v7.H[0] // ...............................................................*.. + // str q23, [x1], #(64) // ..............................................................*... + // str q1, [x1, #-32] // .................................................................* + // str q21, [x1, #-16] // ................................................................*. 
+ + + // --------------------------------------------------------------------- + + ninv .req v29 // alias for v29; the loop below names v29 directly in its four mul ..., v29.8H output multiplies + ninv_tw .req v30 // alias for v30; used in the loop below as sqrdmulh ..., v30.8H, each followed by mls ..., v7.H[0] + + ASM_LOAD(xtmp, ninv_addr) + ld1r {ninv.8h}, [xtmp] // ld1r: load one 16-bit element from [xtmp] and replicate it to all 8 lanes. NOTE(review): lanes here are 16-bit (.8h/.8H) while the C prototypes for these routines take int32_t pointers -- confirm the element width is intended + ASM_LOAD(xtmp, ninv_tw_addr) + ld1r {ninv_tw.8h}, [xtmp] // same broadcast for the companion constant held in v30 + + mov count, #4 // loop counter; decremented once more just before the loop label + ASM_LOAD(r_ptr0, roots_l012) + load_roots_123 + + .p2align 2 // preamble: the seven loads that follow use the same registers and offsets as the loads marked e at the end of the loop body + + ldr q17, [x0, #64] // *...... + // gap // ....... + // gap // ....... + // gap // ....... + ldr q12, [x0, #128] // .*..... + // gap // ....... + // gap // ....... + // gap // ....... + ldr q18, [x0, #192] // ..*.... + // gap // ....... + // gap // ....... + // gap // ....... + ldr q28, [x0, #256] // ...*... + // gap // ....... + // gap // ....... + // gap // ....... + ldr q4, [x0, #320] // ....*.. + // gap // ....... + // gap // ....... + // gap // ....... + ldr q9, [x0, #384] // .....*. + // gap // ....... + // gap // ....... + // gap // ....... + ldr q21, [x0, #448] // ......* + // gap // ....... + + // original source code + // ldr q17, [x0, #64] // *...... + // ldr q12, [x0, #128] // .*..... + // ldr q18, [x0, #192] // ..*.... + // ldr q28, [x0, #256] // ...*... + // ldr q4, [x0, #320] // ....*.. + // ldr q9, [x0, #384] // .....*. + // ldr q21, [x0, #448] // ......* + + sub count, count, #1 // NOTE(review): presumably accounts for the iteration whose loads were hoisted into the preamble above -- confirm against the code after the loop +layer123_start: + ldr q8, [x0, #0] // *............................................................................................. + // gap // .............................................................................................. + // gap // .............................................................................................. + // gap // .............................................................................................. + sub v23.8H, v12.8H, v18.8H // .............*................................................................................ + // gap // ..............................................................................................
+ add v12.8H, v12.8H, v18.8H // ..............*............................................................................... + // gap // .............................................................................................. + sub v18.8H, v8.8H, v17.8H // ........*..................................................................................... + // gap // .............................................................................................. + add v8.8H, v8.8H, v17.8H // .........*.................................................................................... + // gap // .............................................................................................. + mul v26.8H, v23.8H, v1.H[0] // ...............*.............................................................................. + // gap // .............................................................................................. + sqrdmulh v23.8H, v23.8H, v1.H[1] // ................*............................................................................. + // gap // .............................................................................................. + sub v17.8H, v8.8H, v12.8H // ............................*................................................................. + // gap // .............................................................................................. + add v8.8H, v8.8H, v12.8H // .............................*................................................................ + // gap // .............................................................................................. + mul v12.8H, v18.8H, v0.H[6] // ..........*................................................................................... + // gap // .............................................................................................. 
+ sqrdmulh v18.8H, v18.8H, v0.H[7] // ...........*.................................................................................. + // gap // .............................................................................................. + mls v26.8H, v23.8H, v7.H[0] // .................*............................................................................ + // gap // .............................................................................................. + sub v23.8H, v28.8H, v4.8H // ..................*........................................................................... + // gap // .............................................................................................. + add v28.8H, v28.8H, v4.8H // ...................*.......................................................................... + // gap // .............................................................................................. + mls v12.8H, v18.8H, v7.H[0] // ............*................................................................................. + // gap // .............................................................................................. + mul v18.8H, v23.8H, v1.H[2] // ....................*......................................................................... + // gap // .............................................................................................. + mul v4.8H, v17.8H, v0.H[2] // ..............................*............................................................... + // gap // .............................................................................................. + sqrdmulh v17.8H, v17.8H, v0.H[3] // ...............................*.............................................................. + // gap // .............................................................................................. 
+ sqdmulh v19.8H, v8.8H, v7.H[1] // ................................................*............................................. + // gap // .............................................................................................. + sqrdmulh v23.8H, v23.8H, v1.H[3] // .....................*........................................................................ + // gap // .............................................................................................. + sub v5.8H, v9.8H, v21.8H // .......................*...................................................................... + // gap // .............................................................................................. + add v9.8H, v9.8H, v21.8H // ........................*..................................................................... + // gap // .............................................................................................. + srshr v19.8H, v19.8H, #11 // .................................................*............................................ + // gap // .............................................................................................. + mls v18.8H, v23.8H, v7.H[0] // ......................*....................................................................... + // gap // .............................................................................................. + sub v23.8H, v12.8H, v26.8H // .................................*............................................................ + // gap // .............................................................................................. + mls v8.8H, v19.8H, v7.H[0] // ..................................................*........................................... + // gap // .............................................................................................. 
+ add v12.8H, v12.8H, v26.8H // ..................................*........................................................... + // gap // .............................................................................................. + mul v26.8H, v5.8H, v1.H[4] // .........................*.................................................................... + // gap // .............................................................................................. + sub v19.8H, v28.8H, v9.8H // ......................................*....................................................... + // gap // .............................................................................................. + add v28.8H, v28.8H, v9.8H // .......................................*...................................................... + // gap // .............................................................................................. + sqrdmulh v5.8H, v5.8H, v1.H[5] // ..........................*................................................................... + // gap // .............................................................................................. + mls v4.8H, v17.8H, v7.H[0] // ................................*............................................................. + // gap // .............................................................................................. + mul v17.8H, v23.8H, v0.H[2] // ...................................*.......................................................... + // gap // .............................................................................................. + sqrdmulh v23.8H, v23.8H, v0.H[3] // ....................................*......................................................... + // gap // .............................................................................................. 
+ mls v26.8H, v5.8H, v7.H[0] // ...........................*.................................................................. + // gap // .............................................................................................. + mul v5.8H, v19.8H, v0.H[4] // ........................................*..................................................... + // gap // .............................................................................................. + sqrdmulh v19.8H, v19.8H, v0.H[5] // .........................................*.................................................... + // gap // .............................................................................................. + mls v17.8H, v23.8H, v7.H[0] // .....................................*........................................................ + // gap // .............................................................................................. + sub v23.8H, v18.8H, v26.8H // ...........................................*.................................................. + // gap // .............................................................................................. + add v18.8H, v18.8H, v26.8H // ............................................*................................................. + // gap // .............................................................................................. + mls v5.8H, v19.8H, v7.H[0] // ..........................................*................................................... + // gap // .............................................................................................. + mul v26.8H, v23.8H, v0.H[4] // .............................................*................................................ + // gap // .............................................................................................. 
+ sqrdmulh v23.8H, v23.8H, v0.H[5] // ..............................................*............................................... + // gap // .............................................................................................. + sqdmulh v19.8H, v28.8H, v7.H[1] // ...................................................*.......................................... + // gap // .............................................................................................. + sub v9.8H, v12.8H, v18.8H // ...........................................................*.................................. + // gap // .............................................................................................. + add v12.8H, v12.8H, v18.8H // ............................................................*................................. + // gap // .............................................................................................. + mls v26.8H, v23.8H, v7.H[0] // ...............................................*.............................................. + // gap // .............................................................................................. + srshr v23.8H, v19.8H, #11 // ....................................................*......................................... + // gap // .............................................................................................. + mul v18.8H, v9.8H, v0.H[0] // .............................................................*................................ + // gap // .............................................................................................. + sqrdmulh v19.8H, v9.8H, v0.H[1] // ..............................................................*............................... + // gap // .............................................................................................. 
+ mls v28.8H, v23.8H, v7.H[0] // .....................................................*........................................ + // gap // .............................................................................................. + sub v23.8H, v4.8H, v5.8H // ................................................................*............................. + // gap // .............................................................................................. + add v4.8H, v4.8H, v5.8H // .................................................................*............................ + // gap // .............................................................................................. + mls v18.8H, v19.8H, v7.H[0] // ...............................................................*.............................. + // gap // .............................................................................................. + sub v19.8H, v8.8H, v28.8H // ......................................................*....................................... + // gap // .............................................................................................. + add v8.8H, v8.8H, v28.8H // .......................................................*...................................... + // gap // .............................................................................................. + mul v28.8H, v23.8H, v0.H[0] // ..................................................................*........................... + // gap // .............................................................................................. + mul v5.8H, v19.8H, v0.H[0] // ........................................................*..................................... + // gap // .............................................................................................. + sqrdmulh v19.8H, v19.8H, v0.H[1] // .........................................................*.................................... 
+ // gap // .............................................................................................. + sqrdmulh v23.8H, v23.8H, v0.H[1] // ...................................................................*.......................... + // gap // .............................................................................................. + sub v9.8H, v17.8H, v26.8H // .....................................................................*........................ + // gap // .............................................................................................. + add v26.8H, v17.8H, v26.8H // ......................................................................*....................... + // gap // .............................................................................................. + mls v5.8H, v19.8H, v7.H[0] // ..........................................................*................................... + // gap // .............................................................................................. + mls v28.8H, v23.8H, v7.H[0] // ....................................................................*......................... + // gap // .............................................................................................. + mul v23.8H, v9.8H, v0.H[0] // .......................................................................*...................... + // gap // .............................................................................................. + sqrdmulh v17.8H, v9.8H, v0.H[1] // ........................................................................*..................... + // gap // .............................................................................................. + str q5, [x0, #256] // ..........................................................................*................... + // gap // .............................................................................................. 
+ mul v19.8H, v8.8H, v29.8H // ..............................................................................*............... + // gap // .............................................................................................. + str q18, [x0, #320] // ...........................................................................*.................. + // gap // .............................................................................................. + mls v23.8H, v17.8H, v7.H[0] // .........................................................................*.................... + // gap // .............................................................................................. + str q28, [x0, #384] // ............................................................................*................. + // gap // .............................................................................................. + sqrdmulh v8.8H, v8.8H, v30.8H // ...............................................................................*.............. + // gap // .............................................................................................. + mul v18.8H, v12.8H, v29.8H // .................................................................................*............ + // gap // .............................................................................................. + str q23, [x0, #448] // .............................................................................*................ + // gap // .............................................................................................. + sqrdmulh v23.8H, v12.8H, v30.8H // ..................................................................................*........... + // gap // .............................................................................................. + mls v19.8H, v8.8H, v7.H[0] // ................................................................................*............. 
+ // gap // .............................................................................................. + mul v8.8H, v4.8H, v29.8H // ....................................................................................*......... + // gap // .............................................................................................. + sqrdmulh v12.8H, v4.8H, v30.8H // .....................................................................................*........ + // gap // .............................................................................................. + mls v18.8H, v23.8H, v7.H[0] // ...................................................................................*.......... + // gap // .............................................................................................. + mul v23.8H, v26.8H, v29.8H // .......................................................................................*...... + // gap // .............................................................................................. + sqrdmulh v26.8H, v26.8H, v30.8H // ........................................................................................*..... + // gap // .............................................................................................. + mls v8.8H, v12.8H, v7.H[0] // ......................................................................................*....... + // gap // .............................................................................................. + str q19, [x0], #(16) // ..........................................................................................*... + // gap // .............................................................................................. + ldr q17, [x0, #64] // .e............................................................................................ + // gap // .............................................................................................. 
+ // gap // .............................................................................................. + // gap // .............................................................................................. + mls v23.8H, v26.8H, v7.H[0] // .........................................................................................*.... + // gap // .............................................................................................. + str q18, [x0, #48] // ...........................................................................................*.. + // gap // .............................................................................................. + ldr q12, [x0, #128] // ..e........................................................................................... + // gap // .............................................................................................. + // gap // .............................................................................................. + // gap // .............................................................................................. + str q8, [x0, #112] // ............................................................................................*. + // gap // .............................................................................................. + ldr q18, [x0, #192] // ...e.......................................................................................... + // gap // .............................................................................................. + // gap // .............................................................................................. + // gap // .............................................................................................. 
+ str q23, [x0, #176] // .............................................................................................* + // gap // .............................................................................................. + ldr q28, [x0, #256] // ....e......................................................................................... + // gap // .............................................................................................. + // gap // .............................................................................................. + // gap // .............................................................................................. + ldr q4, [x0, #320] // .....e........................................................................................ + // gap // .............................................................................................. + // gap // .............................................................................................. + // gap // .............................................................................................. + ldr q9, [x0, #384] // ......e....................................................................................... + // gap // .............................................................................................. + // gap // .............................................................................................. + // gap // .............................................................................................. + ldr q21, [x0, #448] // .......e...................................................................................... + // gap // .............................................................................................. + // gap // .............................................................................................. + // gap // .............................................................................................. 
+ + // original source code + // ldr q8, [x0, #0] // ...........*......................................................................................... + // ldr q9, [x0, #(1*(512/8))] // e..........|..................................................................................e...... + // ldr q10, [x0, #(2*(512/8))] // ...e.......|.....................................................................................e... + // ldr q11, [x0, #(3*(512/8))] // .....e.....|.......................................................................................e. + // ldr q12, [x0, #(4*(512/8))] // .......e...|......................................................................................... + // ldr q13, [x0, #(5*(512/8))] // ........e..|......................................................................................... + // ldr q14, [x0, #(6*(512/8))] // .........e.|......................................................................................... + // ldr q15, [x0, #(7*(512/8))] // ..........e|......................................................................................... + // sub v24.8h, v8.8h, v9.8h // ...........|..*...................................................................................... + // add v8.8h, v8.8h, v9.8h // ...........|...*..................................................................................... + // mul v9.8h, v24.8h, v0.h[6] // ...........|........*................................................................................ + // sqrdmulh v24.8h, v24.8h, v0.h[7] // ...........|.........*............................................................................... + // mls v9.8h, v24.8h, v7.h[0] // ...........|.............*........................................................................... + // sub v24.8h, v10.8h, v11.8h // ...........|*........................................................................................ 
+ // add v10.8h, v10.8h, v11.8h // ...........|.*....................................................................................... + // mul v11.8h, v24.8h, v1.h[0] // ...........|....*.................................................................................... + // sqrdmulh v24.8h, v24.8h, v1.h[1] // ...........|.....*................................................................................... + // mls v11.8h, v24.8h, v7.h[0] // ...........|..........*.............................................................................. + // sub v24.8h, v12.8h, v13.8h // ...........|...........*............................................................................. + // add v12.8h, v12.8h, v13.8h // ...........|............*............................................................................ + // mul v13.8h, v24.8h, v1.h[2] // ...........|..............*.......................................................................... + // sqrdmulh v24.8h, v24.8h, v1.h[3] // ...........|..................*...................................................................... + // mls v13.8h, v24.8h, v7.h[0] // ...........|......................*.................................................................. + // sub v24.8h, v14.8h, v15.8h // ...........|...................*..................................................................... + // add v14.8h, v14.8h, v15.8h // ...........|....................*.................................................................... + // mul v15.8h, v24.8h, v1.h[4] // ...........|..........................*.............................................................. + // sqrdmulh v24.8h, v24.8h, v1.h[5] // ...........|.............................*........................................................... + // mls v15.8h, v24.8h, v7.h[0] // ...........|.................................*....................................................... 
+ // sub v24.8h, v8.8h, v10.8h // ...........|......*.................................................................................. + // add v8.8h, v8.8h, v10.8h // ...........|.......*................................................................................. + // mul v10.8h, v24.8h, v0.h[2] // ...........|...............*......................................................................... + // sqrdmulh v24.8h, v24.8h, v0.h[3] // ...........|................*........................................................................ + // mls v10.8h, v24.8h, v7.h[0] // ...........|..............................*.......................................................... + // sub v24.8h, v9.8h, v11.8h // ...........|.......................*................................................................. + // add v9.8h, v9.8h, v11.8h // ...........|.........................*............................................................... + // mul v11.8h, v24.8h, v0.h[2] // ...........|...............................*......................................................... + // sqrdmulh v24.8h, v24.8h, v0.h[3] // ...........|................................*........................................................ + // mls v11.8h, v24.8h, v7.h[0] // ...........|....................................*.................................................... + // sub v24.8h, v12.8h, v14.8h // ...........|...........................*............................................................. + // add v12.8h, v12.8h, v14.8h // ...........|............................*............................................................ + // mul v14.8h, v24.8h, v0.h[4] // ...........|..................................*...................................................... + // sqrdmulh v24.8h, v24.8h, v0.h[5] // ...........|...................................*..................................................... 
+ // mls v14.8h, v24.8h, v7.h[0] // ...........|.......................................*................................................. + // sub v24.8h, v13.8h, v15.8h // ...........|.....................................*................................................... + // add v13.8h, v13.8h, v15.8h // ...........|......................................*.................................................. + // mul v15.8h, v24.8h, v0.h[4] // ...........|........................................*................................................ + // sqrdmulh v24.8h, v24.8h, v0.h[5] // ...........|.........................................*............................................... + // mls v15.8h, v24.8h, v7.h[0] // ...........|.............................................*........................................... + // sqdmulh v25.8h, v8.8h, v7.h[1] // ...........|.................*....................................................................... + // srshr v25.8h, v25.8h, #11 // ...........|.....................*................................................................... + // mls v8.8h, v25.8h, v7.h[0] // ...........|........................*................................................................ + // sqdmulh v25.8h, v12.8h, v7.h[1] // ...........|..........................................*.............................................. + // srshr v25.8h, v25.8h, #11 // ...........|..............................................*.......................................... + // mls v12.8h, v25.8h, v7.h[0] // ...........|.................................................*....................................... + // sub v24.8h, v8.8h, v12.8h // ...........|.....................................................*................................... + // add v8.8h, v8.8h, v12.8h // ...........|......................................................*.................................. 
+ // mul v12.8h, v24.8h, v0.h[0] // ...........|........................................................*................................ + // sqrdmulh v24.8h, v24.8h, v0.h[1] // ...........|.........................................................*............................... + // mls v12.8h, v24.8h, v7.h[0] // ...........|.............................................................*........................... + // sub v24.8h, v9.8h, v13.8h // ...........|...........................................*............................................. + // add v9.8h, v9.8h, v13.8h // ...........|............................................*............................................ + // mul v13.8h, v24.8h, v0.h[0] // ...........|...............................................*......................................... + // sqrdmulh v24.8h, v24.8h, v0.h[1] // ...........|................................................*........................................ + // mls v13.8h, v24.8h, v7.h[0] // ...........|....................................................*.................................... + // sub v24.8h, v10.8h, v14.8h // ...........|..................................................*...................................... + // add v10.8h, v10.8h, v14.8h // ...........|...................................................*..................................... + // mul v14.8h, v24.8h, v0.h[0] // ...........|.......................................................*................................. + // sqrdmulh v24.8h, v24.8h, v0.h[1] // ...........|..........................................................*.............................. + // mls v14.8h, v24.8h, v7.h[0] // ...........|..............................................................*.......................... + // sub v24.8h, v11.8h, v15.8h // ...........|...........................................................*............................. 
+ // add v11.8h, v11.8h, v15.8h // ...........|............................................................*............................ + // mul v15.8h, v24.8h, v0.h[0] // ...........|...............................................................*......................... + // sqrdmulh v24.8h, v24.8h, v0.h[1] // ...........|................................................................*........................ + // mls v15.8h, v24.8h, v7.h[0] // ...........|....................................................................*.................... + // str q12, [x0, #(4*(512/8))] // ...........|.................................................................*....................... + // str q13, [x0, #(5*(512/8))] // ...........|...................................................................*..................... + // str q14, [x0, #(6*(512/8))] // ...........|.....................................................................*................... + // str q15, [x0, #(7*(512/8))] // ...........|........................................................................*................ + // mul v12.8h, v8.8h, v29.8h // ...........|..................................................................*...................... + // sqrdmulh v8.8h, v8.8h, v30.8h // ...........|......................................................................*.................. + // mls v12.8h, v8.8h, v7.h[0] // ...........|..........................................................................*.............. + // mul v13.8h, v9.8h, v29.8h // ...........|.......................................................................*................. + // sqrdmulh v9.8h, v9.8h, v30.8h // ...........|.........................................................................*............... + // mls v13.8h, v9.8h, v7.h[0] // ...........|.............................................................................*........... 
+ // mul v14.8h, v10.8h, v29.8h // ...........|...........................................................................*............. + // sqrdmulh v10.8h, v10.8h, v30.8h // ...........|............................................................................*............ + // mls v14.8h, v10.8h, v7.h[0] // ...........|................................................................................*........ + // mul v15.8h, v11.8h, v29.8h // ...........|..............................................................................*.......... + // sqrdmulh v11.8h, v11.8h, v30.8h // ...........|...............................................................................*......... + // mls v15.8h, v11.8h, v7.h[0] // .*.........|...................................................................................*..... + // str q12, [x0], #(16) // ...........|.................................................................................*....... + // str q13, [x0, #(-16 + 1*(512/8))] // ..*........|....................................................................................*.... + // str q14, [x0, #(-16 + 2*(512/8))] // ....*......|......................................................................................*.. + // str q15, [x0, #(-16 + 3*(512/8))] // ......*....|........................................................................................* + + sub count, count, #1 + cbnz count, layer123_start + ldr q16, [x0, #0] // *...................................................................................... + // gap // ....................................................................................... + // gap // ....................................................................................... + // gap // ....................................................................................... + add v5.8H, v9.8H, v21.8H // .....................*................................................................. 
+ // gap // ....................................................................................... + add v31.8H, v12.8H, v18.8H // ..*.................................................................................... + // gap // ....................................................................................... + add v27.8H, v16.8H, v17.8H // ....*.................................................................................. + // gap // ....................................................................................... + sub v13.8H, v16.8H, v17.8H // ...*................................................................................... + // gap // ....................................................................................... + add v6.8H, v28.8H, v4.8H // .............*......................................................................... + // gap // ....................................................................................... + sub v16.8H, v27.8H, v31.8H // .......*............................................................................... + // gap // ....................................................................................... + sub v15.8H, v12.8H, v18.8H // .*..................................................................................... + // gap // ....................................................................................... + sub v17.8H, v6.8H, v5.8H // ............................*.......................................................... + // gap // ....................................................................................... + mul v8.8H, v16.8H, v0.H[2] // ................*...................................................................... + // gap // ....................................................................................... + sqrdmulh v23.8H, v16.8H, v0.H[3] // .................*..................................................................... 
+ // gap // ....................................................................................... + mul v16.8H, v17.8H, v0.H[4] // ...................................*................................................... + // gap // ....................................................................................... + sqrdmulh v3.8H, v17.8H, v0.H[5] // ....................................*.................................................. + // gap // ....................................................................................... + mul v17.8H, v15.8H, v1.H[0] // .....*................................................................................. + // gap // ....................................................................................... + sqrdmulh v14.8H, v15.8H, v1.H[1] // ......*................................................................................ + // gap // ....................................................................................... + mls v8.8H, v23.8H, v7.H[0] // ...............................*....................................................... + // gap // ....................................................................................... + mls v16.8H, v3.8H, v7.H[0] // ........................................*.............................................. + // gap // ....................................................................................... + sub v26.8H, v28.8H, v4.8H // ............*.......................................................................... + // gap // ....................................................................................... + mls v17.8H, v14.8H, v7.H[0] // ...........*........................................................................... + // gap // ....................................................................................... + mul v28.8H, v13.8H, v0.H[6] // .........*............................................................................. 
+ // gap // ....................................................................................... + sub v23.8H, v8.8H, v16.8H // ...................................................*................................... + // gap // ....................................................................................... + mul v4.8H, v26.8H, v1.H[2] // ...............*....................................................................... + // gap // ....................................................................................... + sqrdmulh v12.8H, v26.8H, v1.H[3] // ...................*................................................................... + // gap // ....................................................................................... + mul v18.8H, v23.8H, v0.H[0] // ........................................................*.............................. + // gap // ....................................................................................... + sqrdmulh v23.8H, v23.8H, v0.H[1] // ...........................................................*........................... + // gap // ....................................................................................... + sqrdmulh v26.8H, v13.8H, v0.H[7] // ..........*............................................................................ + // gap // ....................................................................................... + mls v4.8H, v12.8H, v7.H[0] // .......................*............................................................... + // gap // ....................................................................................... + sub v19.8H, v9.8H, v21.8H // ....................*.................................................................. + // gap // ....................................................................................... + mls v18.8H, v23.8H, v7.H[0] // ...............................................................*....................... 
+ // gap // ....................................................................................... + mls v28.8H, v26.8H, v7.H[0] // ..............*........................................................................ + // gap // ....................................................................................... + sqrdmulh v12.8H, v19.8H, v1.H[5] // ..............................*........................................................ + // gap // ....................................................................................... + mul v19.8H, v19.8H, v1.H[4] // ...........................*........................................................... + // gap // ....................................................................................... + str q18, [x0, #384] // ......................................................................*................ + // gap // ....................................................................................... + sub v26.8H, v28.8H, v17.8H // ........................*.............................................................. + // gap // ....................................................................................... + add v5.8H, v6.8H, v5.8H // .............................*......................................................... + // gap // ....................................................................................... + mls v19.8H, v12.8H, v7.H[0] // ..................................*.................................................... + // gap // ....................................................................................... + sqrdmulh v6.8H, v26.8H, v0.H[3] // .................................*..................................................... + // gap // ....................................................................................... + sqdmulh v21.8H, v5.8H, v7.H[1] // ...........................................*........................................... 
+ // gap // ....................................................................................... + add v2.8H, v27.8H, v31.8H // ........*.............................................................................. + // gap // ....................................................................................... + sub v23.8H, v4.8H, v19.8H // ......................................*................................................ + // gap // ....................................................................................... + mul v3.8H, v26.8H, v0.H[2] // ................................*...................................................... + // gap // ....................................................................................... + sqdmulh v26.8H, v2.8H, v7.H[1] // ..................*.................................................................... + // gap // ....................................................................................... + sqrdmulh v22.8H, v23.8H, v0.H[5] // ..........................................*............................................ + // gap // ....................................................................................... + mul v14.8H, v23.8H, v0.H[4] // .........................................*............................................. + // gap // ....................................................................................... + srshr v18.8H, v21.8H, #11 // ...............................................*....................................... + // gap // ....................................................................................... + srshr v12.8H, v26.8H, #11 // ......................*................................................................ + // gap // ....................................................................................... + mls v3.8H, v6.8H, v7.H[0] // .....................................*................................................. 
+ // gap // ....................................................................................... + mls v14.8H, v22.8H, v7.H[0] // ..............................................*........................................ + // gap // ....................................................................................... + mls v5.8H, v18.8H, v7.H[0] // ..................................................*.................................... + // gap // ....................................................................................... + mls v2.8H, v12.8H, v7.H[0] // .........................*............................................................. + // gap // ....................................................................................... + add v6.8H, v28.8H, v17.8H // ..........................*............................................................ + // gap // ....................................................................................... + sub v28.8H, v3.8H, v14.8H // ............................................................*.......................... + // gap // ....................................................................................... + add v9.8H, v4.8H, v19.8H // .......................................*............................................... + // gap // ....................................................................................... + add v12.8H, v2.8H, v5.8H // .......................................................*............................... + // gap // ....................................................................................... + sqrdmulh v23.8H, v28.8H, v0.H[1] // .................................................................*..................... + // gap // ....................................................................................... + mul v28.8H, v28.8H, v0.H[0] // ................................................................*...................... 
+ // gap // ....................................................................................... + sqrdmulh v17.8H, v12.8H, v30.8H // .......................................................................*............... + // gap // ....................................................................................... + sub v5.8H, v2.8H, v5.8H // ......................................................*................................ + // gap // ....................................................................................... + mul v18.8H, v12.8H, v29.8H // ...................................................................*................... + // gap // ....................................................................................... + mls v28.8H, v23.8H, v7.H[0] // .....................................................................*................. + // gap // ....................................................................................... + sqrdmulh v4.8H, v5.8H, v0.H[1] // ..........................................................*............................ + // gap // ....................................................................................... + mul v19.8H, v5.8H, v0.H[0] // .........................................................*............................. + // gap // ....................................................................................... + add v26.8H, v6.8H, v9.8H // .............................................*......................................... + // gap // ....................................................................................... + str q28, [x0, #448] // .........................................................................*............. + // gap // ....................................................................................... + add v28.8H, v8.8H, v16.8H // ....................................................*.................................. 
+ // gap // ....................................................................................... + sqrdmulh v8.8H, v26.8H, v30.8H // ..........................................................................*............ + // gap // ....................................................................................... + mul v12.8H, v26.8H, v29.8H // ........................................................................*.............. + // gap // ....................................................................................... + mul v26.8H, v28.8H, v29.8H // ............................................................................*.......... + // gap // ....................................................................................... + sqrdmulh v23.8H, v28.8H, v30.8H // .............................................................................*......... + // gap // ....................................................................................... + sub v11.8H, v6.8H, v9.8H // ............................................*.......................................... + // gap // ....................................................................................... + mls v12.8H, v8.8H, v7.H[0] // ..............................................................................*........ + // gap // ....................................................................................... + add v3.8H, v3.8H, v14.8H // .............................................................*......................... + // gap // ....................................................................................... + mls v26.8H, v23.8H, v7.H[0] // .................................................................................*..... + // gap // ....................................................................................... + mul v28.8H, v11.8H, v0.H[0] // ................................................*...................................... 
+ // gap // ....................................................................................... + str q12, [x0, #64] // ....................................................................................*.. + // gap // ....................................................................................... + sqrdmulh v23.8H, v11.8H, v0.H[1] // .................................................*..................................... + // gap // ....................................................................................... + str q26, [x0, #128] // .....................................................................................*. + // gap // ....................................................................................... + mls v19.8H, v4.8H, v7.H[0] // ..............................................................*........................ + // gap // ....................................................................................... + mul v12.8H, v3.8H, v29.8H // ...............................................................................*....... + // gap // ....................................................................................... + mls v28.8H, v23.8H, v7.H[0] // .....................................................*................................. + // gap // ....................................................................................... + sqrdmulh v8.8H, v3.8H, v30.8H // ................................................................................*...... + // gap // ....................................................................................... + str q19, [x0, #256] // ..................................................................*.................... + // gap // ....................................................................................... + mls v18.8H, v17.8H, v7.H[0] // ...........................................................................*........... 
+ // gap // ....................................................................................... + str q28, [x0, #320] // ....................................................................*.................. + // gap // ....................................................................................... + mls v12.8H, v8.8H, v7.H[0] // ...................................................................................*... + // gap // ....................................................................................... + // gap // ....................................................................................... + // gap // ....................................................................................... + str q18, [x0], #(16) // ..................................................................................*.... + // gap // ....................................................................................... + // gap // ....................................................................................... + // gap // ....................................................................................... + str q12, [x0, #176] // ......................................................................................* + // gap // ....................................................................................... + + // original source code + // ldr q8, [x0, #0] // *...................................................................................... + // sub v23.8H, v12.8H, v18.8H // .......*............................................................................... + // add v12.8H, v12.8H, v18.8H // ..*.................................................................................... + // sub v18.8H, v8.8H, v17.8H // ....*.................................................................................. + // add v8.8H, v8.8H, v17.8H // ...*................................................................................... 
+ // mul v26.8H, v23.8H, v1.H[0] // .............*......................................................................... + // sqrdmulh v23.8H, v23.8H, v1.H[1] // ..............*........................................................................ + // sub v17.8H, v8.8H, v12.8H // ......*................................................................................ + // add v8.8H, v8.8H, v12.8H // ......................................*................................................ + // mul v12.8H, v18.8H, v0.H[6] // ...................*................................................................... + // sqrdmulh v18.8H, v18.8H, v0.H[7] // .........................*............................................................. + // mls v26.8H, v23.8H, v7.H[0] // ..................*.................................................................... + // sub v23.8H, v28.8H, v4.8H // .................*..................................................................... + // add v28.8H, v28.8H, v4.8H // .....*................................................................................. + // mls v12.8H, v18.8H, v7.H[0] // .............................*......................................................... + // mul v18.8H, v23.8H, v1.H[2] // .....................*................................................................. + // mul v4.8H, v17.8H, v0.H[2] // .........*............................................................................. + // sqrdmulh v17.8H, v17.8H, v0.H[3] // ..........*............................................................................ + // sqdmulh v19.8H, v8.8H, v7.H[1] // .........................................*............................................. + // sqrdmulh v23.8H, v23.8H, v1.H[3] // ......................*................................................................ + // sub v5.8H, v9.8H, v21.8H // ...........................*........................................................... 
+ // add v9.8H, v9.8H, v21.8H // .*..................................................................................... + // srshr v19.8H, v19.8H, #11 // .............................................*......................................... + // mls v18.8H, v23.8H, v7.H[0] // ..........................*............................................................ + // sub v23.8H, v12.8H, v26.8H // .................................*..................................................... + // mls v8.8H, v19.8H, v7.H[0] // .................................................*..................................... + // add v12.8H, v12.8H, v26.8H // ..................................................*.................................... + // mul v26.8H, v5.8H, v1.H[4] // ...............................*....................................................... + // sub v19.8H, v28.8H, v9.8H // ........*.............................................................................. + // add v28.8H, v28.8H, v9.8H // ..................................*.................................................... + // sqrdmulh v5.8H, v5.8H, v1.H[5] // ..............................*........................................................ + // mls v4.8H, v17.8H, v7.H[0] // ...............*....................................................................... + // mul v17.8H, v23.8H, v0.H[2] // ........................................*.............................................. + // sqrdmulh v23.8H, v23.8H, v0.H[3] // ....................................*.................................................. + // mls v26.8H, v5.8H, v7.H[0] // ...................................*................................................... + // mul v5.8H, v19.8H, v0.H[4] // ...........*........................................................................... + // sqrdmulh v19.8H, v19.8H, v0.H[5] // ............*.......................................................................... 
+ // mls v17.8H, v23.8H, v7.H[0] // ..............................................*........................................ + // sub v23.8H, v18.8H, v26.8H // .......................................*............................................... + // add v18.8H, v18.8H, v26.8H // ....................................................*.................................. + // mls v5.8H, v19.8H, v7.H[0] // ................*...................................................................... + // mul v26.8H, v23.8H, v0.H[4] // ...........................................*........................................... + // sqrdmulh v23.8H, v23.8H, v0.H[5] // ..........................................*............................................ + // sqdmulh v19.8H, v28.8H, v7.H[1] // .....................................*................................................. + // sub v9.8H, v12.8H, v18.8H // .....................................................................*................. + // add v12.8H, v12.8H, v18.8H // ..............................................................*........................ + // mls v26.8H, v23.8H, v7.H[0] // ...............................................*....................................... + // srshr v23.8H, v19.8H, #11 // ............................................*.......................................... + // mul v18.8H, v9.8H, v0.H[0] // .........................................................................*............. + // sqrdmulh v19.8H, v9.8H, v0.H[1] // ...........................................................................*........... + // mls v28.8H, v23.8H, v7.H[0] // ................................................*...................................... + // sub v23.8H, v4.8H, v5.8H // ....................*.................................................................. + // add v4.8H, v4.8H, v5.8H // ................................................................*...................... 
+ // mls v18.8H, v19.8H, v7.H[0] // ...............................................................................*....... + // sub v19.8H, v8.8H, v28.8H // .........................................................*............................. + // add v8.8H, v8.8H, v28.8H // .....................................................*................................. + // mul v28.8H, v23.8H, v0.H[0] // .......................*............................................................... + // mul v5.8H, v19.8H, v0.H[0] // .............................................................*......................... + // sqrdmulh v19.8H, v19.8H, v0.H[1] // ............................................................*.......................... + // sqrdmulh v23.8H, v23.8H, v0.H[1] // ........................*.............................................................. + // sub v9.8H, v17.8H, v26.8H // ...................................................*................................... + // add v26.8H, v17.8H, v26.8H // .......................................................................*............... + // mls v5.8H, v19.8H, v7.H[0] // .............................................................................*......... + // mls v28.8H, v23.8H, v7.H[0] // ............................*.......................................................... + // mul v23.8H, v9.8H, v0.H[0] // .......................................................*............................... + // sqrdmulh v17.8H, v9.8H, v0.H[1] // ......................................................*................................ + // str q5, [x0, #256] // .................................................................................*..... + // mul v19.8H, v8.8H, v29.8H // ..........................................................*............................ + // str q18, [x0, #320] // ...................................................................................*... 
+ // mls v23.8H, v17.8H, v7.H[0] // ...........................................................*........................... + // str q28, [x0, #384] // ................................*...................................................... + // sqrdmulh v8.8H, v8.8H, v30.8H // ........................................................*.............................. + // mul v18.8H, v12.8H, v29.8H // ..................................................................*.................... + // str q23, [x0, #448] // ...............................................................*....................... + // sqrdmulh v23.8H, v12.8H, v30.8H // .................................................................*..................... + // mls v19.8H, v8.8H, v7.H[0] // ..................................................................................*.... + // mul v8.8H, v4.8H, v29.8H // ...................................................................*................... + // sqrdmulh v12.8H, v4.8H, v30.8H // ....................................................................*.................. + // mls v18.8H, v23.8H, v7.H[0] // ......................................................................*................ + // mul v23.8H, v26.8H, v29.8H // ..............................................................................*........ + // sqrdmulh v26.8H, v26.8H, v30.8H // ................................................................................*...... + // mls v8.8H, v12.8H, v7.H[0] // ........................................................................*.............. + // str q19, [x0], #(16) // .....................................................................................*. + // mls v23.8H, v26.8H, v7.H[0] // ....................................................................................*.. + // str q18, [x0, #48] // ..........................................................................*............ 
+ // str q8, [x0, #112] // ............................................................................*.......... + // str q23, [x0, #176] // ......................................................................................* + + + pop_stack + ret \ No newline at end of file diff --git a/tests/ntt_kyber/manual/intt_kyber_123_4567_opt_a72.s b/tests/ntt_kyber/manual/intt_kyber_123_4567_opt_a72.s new file mode 100644 index 0000000..67eb850 --- /dev/null +++ b/tests/ntt_kyber/manual/intt_kyber_123_4567_opt_a72.s @@ -0,0 +1,1845 @@ +/// +/// Copyright (c) 2022 Arm Limited +/// Copyright (c) 2022 Hanno Becker +/// Copyright (c) 2023 Amin Abdulrahman, Matthias Kannwischer +/// SPDX-License-Identifier: MIT +/// +/// Permission is hereby granted, free of charge, to any person obtaining a copy +/// of this software and associated documentation files (the "Software"), to deal +/// in the Software without restriction, including without limitation the rights +/// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +/// copies of the Software, and to permit persons to whom the Software is +/// furnished to do so, subject to the following conditions: +/// +/// The above copyright notice and this permission notice shall be included in all +/// copies or substantial portions of the Software. +/// +/// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +/// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +/// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +/// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +/// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +/// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +/// SOFTWARE. 
+///
+
+// Needed to provide ASM_LOAD directive
+#include <hal_env.h>
+
+// NOTE
+// We use a lot of trivial macros to simplify the parsing burden for Slothy
+// The macros are not unfolded by Slothy and thus interpreted as instructions,
+// which are easier to parse due to e.g. the lack of size specifiers and simpler
+// syntax for pre and post increment for loads and stores.
+
+// Eventually, NeLight should include a proper parser for AArch64,
+// but for initial investigations, the below is enough.
+
+.macro ldr_vo vec, base, offset
+        ldr qform_\vec, [\base, #\offset] // load the 128-bit Q alias of vec from base+offset; base is unchanged
+.endm
+
+.macro ldr_vi vec, base, inc
+        ldr qform_\vec, [\base], #\inc // load the Q alias of vec from base, then post-increment base by inc
+.endm
+
+.macro str_vo vec, base, offset
+        str qform_\vec, [\base, #\offset] // store the Q alias of vec to base+offset; base is unchanged
+.endm
+.macro str_vi vec, base, inc
+        str qform_\vec, [\base], #\inc // store the Q alias of vec to base, then post-increment base by inc
+.endm
+
+.macro vqrdmulh d,a,b
+        sqrdmulh \d\().8h, \a\().8h, \b\().8h // d = sqrdmulh(a, b) on 8 x 16-bit lanes
+.endm
+.macro vmlsq d,a,b,i
+        mls \d\().8h, \a\().8h, \b\().h[\i] // d -= a * (lane i of b)
+.endm
+.macro vqrdmulhq d,a,b,i
+        sqrdmulh \d\().8h, \a\().8h, \b\().h[\i] // d = sqrdmulh(a, lane i of b)
+.endm
+.macro vqdmulhq d,a,b,i
+        sqdmulh \d\().8h, \a\().8h, \b\().h[\i] // d = sqdmulh(a, lane i of b)
+.endm
+.macro vmulq d,a,b,i
+        mul \d\().8h, \a\().8h, \b\().h[\i] // d = a * (lane i of b), low 16 bits per lane
+.endm
+
+.macro mulmodq dst, src, const, idx0, idx1
+        vmulq \dst, \src, \const, \idx0 // dst = src * const[idx0]
+        vqrdmulhq \src, \src, \const, \idx1 // src is overwritten with sqrdmulh(src, const[idx1])
+        vmlsq \dst, \src, consts, 0 // dst -= src * consts[0]; consts is a fixed register alias, not a macro argument
+.endm
+
+.macro mulmod dst, src, const, const_twisted
+        mul \dst\().8h, \src\().8h, \const\().8h // dst = src * const, lane by lane
+        vqrdmulh \src, \src, \const_twisted // src is overwritten with sqrdmulh(src, const_twisted)
+        vmlsq \dst, \src, consts, 0 // dst -= src * consts[0]
+.endm
+
+.macro gs_butterfly a, b, root, idx0, idx1
+        sub tmp.8h, \a\().8h, \b\().8h // tmp = a - b
+        add \a\().8h, \a\().8h, \b\().8h // a = a + b
+        mulmodq \b, tmp, \root, \idx0, \idx1 // b = mulmodq of tmp with root lanes idx0/idx1; tmp is clobbered
+.endm
+
+.macro mulmod_v dst, src, const, const_twisted
+        mul \dst\().8h, \src\().8h, \const\().8h // same three-instruction body as mulmod
+        vqrdmulh \src, \src, \const_twisted
+        vmlsq \dst, \src, consts, 0
+.endm
+
+.macro gs_butterfly_v a, b, root, root_twisted
+        sub tmp.8h, \a\().8h, \b\().8h // tmp = a - b
+        add \a\().8h, \a\().8h, \b\().8h // a = a + b
+        mulmod \b, tmp, \root, \root_twisted // expands mulmod, whose body is identical to mulmod_v; tmp is clobbered
+.endm
+
+.macro mul_ninv dst0, dst1, dst2, dst3, src0, src1, src2, src3
+        mulmod \dst0, \src0, ninv, ninv_tw // each dstN = mulmod(srcN, ninv, ninv_tw); srcN is clobbered
+        mulmod \dst1, \src1, ninv, ninv_tw
+        mulmod \dst2, \src2, ninv, ninv_tw
+        mulmod \dst3, \src3, ninv, ninv_tw
+.endm
+
+.macro barrett_reduce a
+        vqdmulhq t0, \a, consts, 1 // t0 = sqdmulh(a, consts[1])
+        srshr t0.8h, t0.8h, #11 // t0 = rounding arithmetic shift right of t0 by 11
+        vmlsq \a, t0, consts, 0 // a -= t0 * consts[0]
+.endm
+
+.macro load_roots_123
+        ldr_vi root0, r_ptr0, 32 // root0 = 16 bytes at r_ptr0, then r_ptr0 += 32
+        ldr_vo root1, r_ptr0, -16 // root1 = the 16 bytes that followed root0 (offset is relative to the advanced pointer)
+.endm
+
+.macro load_next_roots_45
+        ldr_vi root0, r_ptr0, 16 // root0 = 16 bytes at r_ptr0, then r_ptr0 += 16
+.endm
+
+.macro load_next_roots_67
+        ldr_vi root0, r_ptr1, (6*16) // root0 = 16 bytes at r_ptr1, then r_ptr1 += 96
+        ldr_vo root0_tw, r_ptr1, (-6*16 + 1*16) // remaining loads address the same 96-byte group via negative offsets
+        ldr_vo root1, r_ptr1, (-6*16 + 2*16)
+        ldr_vo root1_tw, r_ptr1, (-6*16 + 3*16)
+        ldr_vo root2, r_ptr1, (-6*16 + 4*16)
+        ldr_vo root2_tw, r_ptr1, (-6*16 + 5*16)
+.endm
+
+.macro transpose4 data
+        trn1 t0.4s, \data\()0.4s, \data\()1.4s // stage 1: interleave 32-bit lanes of data0/data1 and data2/data3 into t0-t3
+        trn2 t1.4s, \data\()0.4s, \data\()1.4s
+        trn1 t2.4s, \data\()2.4s, \data\()3.4s
+        trn2 t3.4s, \data\()2.4s, \data\()3.4s
+
+        trn2 \data\()2.2d, t0.2d, t2.2d // stage 2: interleave 64-bit halves, completing a 4x4 transpose of 32-bit elements
+        trn2 \data\()3.2d, t1.2d, t3.2d
+        trn1 \data\()0.2d, t0.2d, t2.2d
+        trn1 \data\()1.2d, t1.2d, t3.2d
+.endm
+
+.macro transpose_single data_out, data_in
+        trn1 \data_out\()0.4s, \data_in\()0.4s, \data_in\()1.4s // 32-bit interleave stage only; results go to data_out0-3
+        trn2 \data_out\()1.4s, \data_in\()0.4s, \data_in\()1.4s
+        trn1 \data_out\()2.4s, \data_in\()2.4s, \data_in\()3.4s
+        trn2 \data_out\()3.4s, \data_in\()2.4s, \data_in\()3.4s
+.endm
+
+.macro save_gprs // slothy:no-unfold
+        sub sp, sp, #(16*6) // reserve 96 bytes for x19-x29
+        stp x19, x20, [sp, #16*0]
+        stp x19, x20, [sp, #16*0] // NOTE(review): exact repeat of the store above; redundant but harmless
+        stp x21, x22, [sp, #16*1]
+        stp x23, x24, [sp, #16*2]
+        stp x25, x26, [sp, #16*3]
+        stp x27, x28, [sp, #16*4]
+        str x29, [sp, #16*5]
+.endm
+
+.macro restore_gprs // slothy:no-unfold
+        ldp x19, x20, [sp, #16*0] // reload x19-x29 from the slots written by save_gprs
+        ldp x21, x22, [sp, #16*1]
+        ldp x23, x24, [sp, #16*2]
+        ldp x25, x26, [sp, #16*3]
+        ldp x27, x28, [sp, #16*4]
+        ldr x29, [sp, #16*5]
+        add sp, sp, #(16*6) // release the 96 bytes reserved by save_gprs
+.endm
+
+.macro save_vregs // slothy:no-unfold
+        sub sp, sp, #(16*4) // reserve 64 bytes for d8-d15 (low 64 bits of v8-v15)
+        stp d8, d9, [sp, #16*0]
+        stp d10, d11, [sp, #16*1]
+        stp d12, d13, [sp, #16*2]
+        stp d14, d15, [sp, #16*3]
+.endm
+
+.macro restore_vregs // slothy:no-unfold
+        ldp d8, d9, [sp, #16*0] // reload d8-d15 from the slots written by save_vregs
+        ldp d10, d11, [sp, #16*1]
+        ldp d12, d13, [sp, #16*2]
+        ldp d14, d15, [sp, #16*3]
+        add sp, sp, #(16*4) // release the 64 bytes reserved by save_vregs
+.endm
+
+#define STACK_SIZE 16
+#define STACK0 0
+
+.macro restore a, loc // slothy:no-unfold
+        ldr \a, [sp, #\loc\()] // a = value stored at sp + loc
+.endm
+.macro save loc, a // slothy:no-unfold
+        str \a, [sp, #\loc\()] // store a at sp + loc
+.endm
+.macro push_stack // slothy:no-unfold
+        save_gprs
+        save_vregs
+        sub sp, sp, #STACK_SIZE // extra scratch area, addressed through save/restore with STACK0
+.endm
+
+.macro pop_stack // slothy:no-unfold
+        add sp, sp, #STACK_SIZE // undo push_stack in reverse order
+        restore_vregs
+        restore_gprs
+.endm
+
+.data
+.p2align 4
+roots:
+#include "intt_kyber_123_45_67_twiddles.s"
+.text
+
+        .global intt_kyber_123_4567_opt_a72
+        .global _intt_kyber_123_4567_opt_a72
+
+.p2align 4
+const_addr: .short 3329 // first of eight 16-bit entries; the routine loads this table into consts
+        .short 20159 // second entry, i.e. lane 1 of consts as read by barrett_reduce
+        .short 0
+        .short 0
+        .short 0
+        .short 0
+        .short 0
+        .short 0
+ninv_addr: .short 512
+        .short 512
+        .short 512
+        .short 512
+        .short 512
+        .short 512
+        .short 512
+        .short 512
+ninv_tw_addr: .short 5040
+        .short 5040
+        .short 5040
+        .short 5040
+        .short 5040
+        .short 5040
+        .short 5040
+        .short 5040
+
+intt_kyber_123_4567_opt_a72:
+_intt_kyber_123_4567_opt_a72:
+        push_stack // saves x19-x29 and d8-d15, then reserves STACK_SIZE bytes
+
+        in .req x0
+        inp .req x1
+        count .req x2
+        r_ptr0 .req x3
+        r_ptr1 .req x4
+        xtmp .req x5
+
+        qform_v0 .req q0
+        qform_v1 .req q1
+        qform_v2 .req q2
+        qform_v3 .req q3
+        qform_v4 .req q4
+        qform_v5 .req q5
+        qform_v6 .req q6
+        qform_v7 .req q7
+        qform_v8 .req q8
+        qform_v9 .req q9
+        qform_v10 .req q10
+        qform_v11 .req q11
+        qform_v12 .req q12
+        qform_v13 .req q13
+        qform_v14 .req q14
+        qform_v15 .req q15
+        qform_v16 .req q16
+        qform_v17 .req q17
+        qform_v18 .req q18
+        qform_v19 .req q19
+        qform_v20 .req q20
+        qform_v21 .req q21
+        qform_v22 .req q22
+        qform_v23 .req q23
+        qform_v24 .req q24
+        qform_v25 .req q25
+        qform_v26 .req q26
+        qform_v27 .req q27
+        qform_v28 .req q28
+        qform_v29 .req q29
+        qform_v30 .req q30
+        qform_v31 .req q31
+
+        data0 .req v8
+        data1 .req v9
+        data2 .req v10
+        data3 .req v11
+ data4 .req v12 + data5 .req v13 + data6 .req v14 + data7 .req v15 + + x_00 .req x10 + x_01 .req x11 + x_10 .req x12 + x_11 .req x13 + x_20 .req x14 + x_21 .req x15 + x_30 .req x16 + x_31 .req x17 + + xt_00 .req x_00 + xt_01 .req x_20 + xt_10 .req x_10 + xt_11 .req x_30 + xt_20 .req x_01 + xt_21 .req x_21 + xt_30 .req x_11 + xt_31 .req x_31 + + qform_data0 .req q8 + qform_data1 .req q9 + qform_data2 .req q10 + qform_data3 .req q11 + qform_data4 .req q12 + qform_data5 .req q13 + qform_data6 .req q14 + qform_data7 .req q15 + + root0 .req v0 + root1 .req v1 + root2 .req v2 + root0_tw .req v4 + root1_tw .req v5 + root2_tw .req v6 + + consts .req v7 + qform_consts .req q7 + + qform_root0 .req q0 + qform_root1 .req q1 + qform_root2 .req q2 + qform_root0_tw .req q4 + qform_root1_tw .req q5 + qform_root2_tw .req q6 + + tmp .req v24 + t0 .req v25 + t1 .req v26 + t2 .req v27 + t3 .req v28 + + ASM_LOAD(r_ptr0, roots_l34) + ASM_LOAD(r_ptr1, roots_l56) + + ASM_LOAD(xtmp, const_addr) + ld1 {consts.8h}, [xtmp] + + save STACK0, in + + mov inp, in + mov count, #8 + + .p2align 2 + ldr q0, [x1, #16] // *................................................. + // gap // .................................................. + ldr q29, [x1, #0] // .*................................................ + ldr q26, [x1, #32] // .....*............................................ + ldr q2, [x1, #48] // ....*............................................. + // gap // .................................................. + ldr q13, [x4], #(6*16) // ......*........................................... + ldr q16, [x4, #-64] // ......................*........................... + // gap // .................................................. + ldr q23, [x4, #-80] // ........*......................................... + // gap // .................................................. + // gap // .................................................. 
+ trn2 v1.4S, v29.4S, v0.4S // .......*.......................................... + trn1 v0.4S, v29.4S, v0.4S // .........*........................................ + ldr q18, [x4, #-32] // ................*................................. + trn1 v8.4S, v26.4S, v2.4S // ...........*...................................... + trn2 v3.4S, v26.4S, v2.4S // ..........*....................................... + ldr q21, [x4, #-48] // ..*............................................... + ldr q24, [x4, #-16] // ...*.............................................. + // gap // .................................................. + // gap // .................................................. + ldr q4, [x3], #16 // ............................................*..... + // gap // .................................................. + // gap // .................................................. + trn2 v29.2D, v0.2D, v8.2D // .............*.................................... + trn2 v26.2D, v1.2D, v3.2D // ............*..................................... + // gap // .................................................. + trn1 v11.2D, v0.2D, v8.2D // ...............*.................................. + // gap // .................................................. + // gap // .................................................. + trn1 v30.2D, v1.2D, v3.2D // ..............*................................... + // gap // .................................................. + // gap // .................................................. + sub v2.8H, v29.8H, v26.8H // .................*................................ + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. 
+ sub v20.8H, v11.8H, v30.8H // ...................*.............................. + // gap // .................................................. + // gap // .................................................. + mul v0.8H, v2.8H, v18.8H // ....................*............................. + add v31.8H, v11.8H, v30.8H // .........................*........................ + // gap // .................................................. + add v18.8H, v29.8H, v26.8H // ..................*............................... + // gap // .................................................. + // gap // .................................................. + sqrdmulh v5.8H, v20.8H, v21.8H // .....................*............................ + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + add v8.8H, v31.8H, v18.8H // ............................*..................... + sqrdmulh v30.8H, v2.8H, v24.8H // .......................*.......................... + // gap // .................................................. + sub v15.8H, v31.8H, v18.8H // ...........................*...................... + // gap // .................................................. + // gap // .................................................. + mul v17.8H, v20.8H, v16.8H // ........................*......................... + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + mls v17.8H, v5.8H, v7.H[0] // ..........................*....................... 
+ // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + mls v0.8H, v30.8H, v7.H[0] // .............................*.................... + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + mul v31.8H, v15.8H, v13.8H // .................................*................ + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + add v24.8H, v17.8H, v0.8H // ................................*................. + sub v21.8H, v17.8H, v0.8H // ...............................*.................. + // gap // .................................................. + sqrdmulh v0.8H, v15.8H, v23.8H // ..............................*................... + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. 
+ trn2 v9.4S, v8.4S, v24.4S // .......................................*.......... + sqrdmulh v16.8H, v21.8H, v23.8H // ..................................*............... + // gap // .................................................. + trn1 v20.4S, v8.4S, v24.4S // ....................................*............. + // gap // .................................................. + // gap // .................................................. + mul v26.8H, v21.8H, v13.8H // .....................................*............ + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + mls v31.8H, v0.8H, v7.H[0] // ...................................*.............. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + mls v26.8H, v16.8H, v7.H[0] // ......................................*........... + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. 
+ // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + trn1 v23.4S, v31.4S, v26.4S // ........................................*......... + trn2 v27.4S, v31.4S, v26.4S // .........................................*........ + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + trn1 v26.2D, v20.2D, v23.2D // ..........................................*....... + // gap // .................................................. + trn1 v28.2D, v9.2D, v27.2D // ...........................................*...... + trn2 v3.2D, v9.2D, v27.2D // ..............................................*... + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + sub v2.8H, v26.8H, v28.8H // .............................................*.... + add v24.8H, v26.8H, v28.8H // ...............................................*.. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. 
+ // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + sqrdmulh v26.8H, v2.8H, v4.H[3] // ................................................*. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + mul v17.8H, v2.8H, v4.H[2] // .................................................* + // gap // .................................................. + // gap // .................................................. + + // original source code + // ldr q12, [x1, #16] // *................................................. + // ldr q28, [x1, #0] // .*................................................ + // ldr q30, [x4, #48] // ............*..................................... + // ldr q6, [x4, #80] // .............*.................................... + // ldr q21, [x1, #48] // ...*.............................................. + // ldr q22, [x1, #32] // ..*............................................... + // ldr q9, [x4], #(6*16) // ....*............................................. + // trn2 v19.4S, v28.4S, v12.4S // .......*.......................................... + // ldr q10, [x4, #-80] // ......*........................................... + // trn1 v13.4S, v28.4S, v12.4S // ........*......................................... + // trn2 v23.4S, v22.4S, v21.4S // ...........*...................................... + // trn1 v20.4S, v22.4S, v21.4S // ..........*....................................... + // trn2 v21.2D, v19.2D, v23.2D // ................*................................. + // trn2 v16.2D, v13.2D, v20.2D // ...............*.................................. + // trn1 v1.2D, v19.2D, v23.2D // ..................*............................... 
+ // trn1 v11.2D, v13.2D, v20.2D // .................*................................ + // ldr q23, [x4, #-32] // .........*........................................ + // sub v2.8H, v16.8H, v21.8H // ...................*.............................. + // add v16.8H, v16.8H, v21.8H // .......................*.......................... + // sub v21.8H, v11.8H, v1.8H // ....................*............................. + // mul v26.8H, v2.8H, v23.8H // .....................*............................ + // sqrdmulh v23.8H, v21.8H, v30.8H // ........................*......................... + // ldr q25, [x4, #-64] // .....*............................................ + // sqrdmulh v2.8H, v2.8H, v6.8H // ..........................*....................... + // mul v3.8H, v21.8H, v25.8H // ............................*..................... + // add v21.8H, v11.8H, v1.8H // ......................*........................... + // mls v3.8H, v23.8H, v7.H[0] // .............................*.................... + // sub v6.8H, v21.8H, v16.8H // ...........................*...................... + // add v1.8H, v21.8H, v16.8H // .........................*........................ + // mls v26.8H, v2.8H, v7.H[0] // ..............................*................... + // sqrdmulh v23.8H, v6.8H, v10.8H // ..................................*............... + // sub v16.8H, v3.8H, v26.8H // .................................*................ + // add v0.8H, v3.8H, v26.8H // ................................*................. + // mul v21.8H, v6.8H, v9.8H // ...............................*.................. + // sqrdmulh v2.8H, v16.8H, v10.8H // ....................................*............. + // mls v21.8H, v23.8H, v7.H[0] // .......................................*.......... + // trn1 v20.4S, v1.4S, v0.4S // .....................................*............ + // mul v16.8H, v16.8H, v9.8H // ......................................*........... 
+ // mls v16.8H, v2.8H, v7.H[0] // ........................................*......... + // trn2 v2.4S, v1.4S, v0.4S // ...................................*.............. + // trn1 v23.4S, v21.4S, v16.4S // .........................................*........ + // trn2 v8.4S, v21.4S, v16.4S // ..........................................*....... + // trn1 v16.2D, v20.2D, v23.2D // ...........................................*...... + // trn1 v0.2D, v2.2D, v8.2D // ............................................*..... + // ldr q4, [x3], #16 // ..............*................................... + // sub v5.8H, v16.8H, v0.8H // ..............................................*... + // trn2 v3.2D, v2.2D, v8.2D // .............................................*.... + // add v24.8H, v16.8H, v0.8H // ...............................................*.. + // sqrdmulh v26.8H, v5.8H, v4.H[3] // ................................................*. + // mul v17.8H, v5.8H, v4.H[2] // .................................................* + + sub count, count, #1 +layer4567_start: + trn2 v31.2D, v20.2D, v23.2D // ..........................................*.................................. + ldr q12, [x1, #80] // .e........................................................................... + ldr q28, [x1, #64] // e............................................................................ + ldr q30, [x4, #48] // ...............e............................................................. + sqdmulh v2.8H, v24.8H, v7.H[1] // .........................................................*................... + ldr q6, [x4, #80] // .................e........................................................... + ldr q21, [x1, #112] // ...e......................................................................... + // gap // ............................................................................. + ldr q22, [x1, #96] // ..e.......................................................................... 
+ ldr q9, [x4], #(6*16) // ............e................................................................ + mls v17.8H, v26.8H, v7.H[0] // ...................................................*......................... + add v15.8H, v31.8H, v3.8H // .....................................................*....................... + trn2 v19.4S, v28.4S, v12.4S // .....e....................................................................... + ldr q10, [x4, #-80] // .............e............................................................... + // gap // ............................................................................. + trn1 v13.4S, v28.4S, v12.4S // ....e........................................................................ + sub v0.8H, v31.8H, v3.8H // ....................................................*........................ + // gap // ............................................................................. + trn2 v23.4S, v22.4S, v21.4S // .......e..................................................................... + sqdmulh v14.8H, v15.8H, v7.H[1] // ............................................................*................ + // gap // ............................................................................. + trn1 v20.4S, v22.4S, v21.4S // ......e...................................................................... + // gap // ............................................................................. + // gap // ............................................................................. + sqrdmulh v8.8H, v0.8H, v4.H[5] // .......................................................*..................... + srshr v2.8H, v2.8H, #11 // ..........................................................*.................. + // gap // ............................................................................. + trn2 v21.2D, v19.2D, v23.2D // .........e................................................................... 
+ // gap // ............................................................................. + // gap // ............................................................................. + trn2 v16.2D, v13.2D, v20.2D // ........e.................................................................... + mul v0.8H, v0.8H, v4.H[4] // ......................................................*...................... + // gap // ............................................................................. + trn1 v1.2D, v19.2D, v23.2D // ...........e................................................................. + // gap // ............................................................................. + // gap // ............................................................................. + mls v24.8H, v2.8H, v7.H[0] // ...........................................................*................. + trn1 v11.2D, v13.2D, v20.2D // ..........e.................................................................. + ldr q23, [x4, #-32] // ................e............................................................ + sub v2.8H, v16.8H, v21.8H // .......................e..................................................... + // gap // ............................................................................. + // gap // ............................................................................. + add v16.8H, v16.8H, v21.8H // ........................e.................................................... + mls v0.8H, v8.8H, v7.H[0] // ........................................................*.................... + // gap // ............................................................................. + sub v21.8H, v11.8H, v1.8H // ..................e.......................................................... + // gap // ............................................................................. + // gap // ............................................................................. 
+ mul v26.8H, v2.8H, v23.8H // .........................e................................................... + // gap // ............................................................................. + // gap // ............................................................................. + // gap // ............................................................................. + // gap // ............................................................................. + // gap // ............................................................................. + sqrdmulh v23.8H, v21.8H, v30.8H // .....................e....................................................... + // gap // ............................................................................. + ldr q25, [x4, #-64] // ..............e.............................................................. + // gap // ............................................................................. + // gap // ............................................................................. + // gap // ............................................................................. + sqrdmulh v2.8H, v2.8H, v6.8H // ..........................e.................................................. + // gap // ............................................................................. + // gap // ............................................................................. + // gap // ............................................................................. + // gap // ............................................................................. + // gap // ............................................................................. + mul v3.8H, v21.8H, v25.8H // ....................e........................................................ + add v21.8H, v11.8H, v1.8H // ...................e......................................................... 
+ // gap // ............................................................................. + sub v11.8H, v17.8H, v0.8H // ....................................................................*........ + // gap // ............................................................................. + // gap // ............................................................................. + mls v3.8H, v23.8H, v7.H[0] // ......................e...................................................... + add v0.8H, v17.8H, v0.8H // .....................................................................*....... + // gap // ............................................................................. + sub v6.8H, v21.8H, v16.8H // ............................e................................................ + // gap // ............................................................................. + // gap // ............................................................................. + add v1.8H, v21.8H, v16.8H // .............................e............................................... + mls v26.8H, v2.8H, v7.H[0] // ...........................e................................................. + // gap // ............................................................................. + str q0, [x1, #16] // ..........................................................................*.. + srshr v0.8H, v14.8H, #11 // .............................................................*............... + // gap // ............................................................................. + sqrdmulh v23.8H, v6.8H, v10.8H // ...............................e............................................. + // gap // ............................................................................. + // gap // ............................................................................. + // gap // ............................................................................. 
+ // gap // ............................................................................. + // gap // ............................................................................. + mls v15.8H, v0.8H, v7.H[0] // ..............................................................*.............. + // gap // ............................................................................. + // gap // ............................................................................. + sub v16.8H, v3.8H, v26.8H // .................................e........................................... + // gap // ............................................................................. + // gap // ............................................................................. + add v0.8H, v3.8H, v26.8H // ..................................e.......................................... + // gap // ............................................................................. + mul v21.8H, v6.8H, v9.8H // ..............................e.............................................. + // gap // ............................................................................. + // gap // ............................................................................. + // gap // ............................................................................. + sqrdmulh v2.8H, v16.8H, v10.8H // ....................................e........................................ + // gap // ............................................................................. + // gap // ............................................................................. + sub v14.8H, v24.8H, v15.8H // ...............................................................*............. + // gap // ............................................................................. + // gap // ............................................................................. 
+ add v8.8H, v24.8H, v15.8H // ................................................................*............ + mls v21.8H, v23.8H, v7.H[0] // ................................e............................................ + // gap // ............................................................................. + trn1 v20.4S, v1.4S, v0.4S // ......................................e...................................... + // gap // ............................................................................. + // gap // ............................................................................. + mul v16.8H, v16.8H, v9.8H // ...................................e......................................... + // gap // ............................................................................. + // gap // ............................................................................. + str q8, [x1], #(64) // .........................................................................*... + // gap // ............................................................................. + // gap // ............................................................................. + mls v16.8H, v2.8H, v7.H[0] // .....................................e....................................... + trn2 v2.4S, v1.4S, v0.4S // .......................................e..................................... + // gap // ............................................................................. + // gap // ............................................................................. + // gap // ............................................................................. + // gap // ............................................................................. + sqrdmulh v0.8H, v11.8H, v4.H[1] // .......................................................................*..... + // gap // ............................................................................. 
+ // gap // ............................................................................. + // gap // ............................................................................. + // gap // ............................................................................. + // gap // ............................................................................. + // gap // ............................................................................. + // gap // ............................................................................. + sqrdmulh v3.8H, v14.8H, v4.H[1] // ..................................................................*.......... + trn1 v23.4S, v21.4S, v16.4S // ........................................e.................................... + // gap // ............................................................................. + // gap // ............................................................................. + mul v1.8H, v11.8H, v4.H[0] // ......................................................................*...... + trn2 v8.4S, v21.4S, v16.4S // .........................................e................................... + // gap // ............................................................................. + // gap // ............................................................................. + // gap // ............................................................................. + // gap // ............................................................................. + mls v1.8H, v0.8H, v7.H[0] // ........................................................................*.... + // gap // ............................................................................. + trn1 v16.2D, v20.2D, v23.2D // ............................................e................................ + trn1 v0.2D, v2.2D, v8.2D // .............................................e............................... 
+ // gap // ............................................................................. + // gap // ............................................................................. + mul v9.8H, v14.8H, v4.H[0] // .................................................................*........... + ldr q4, [x3], #16 // ..............................................e.............................. + // gap // ............................................................................. + // gap // ............................................................................. + // gap // ............................................................................. + // gap // ............................................................................. + mls v9.8H, v3.8H, v7.H[0] // ...................................................................*......... + // gap // ............................................................................. + sub v5.8H, v16.8H, v0.8H // ...............................................e............................. + trn2 v3.2D, v2.2D, v8.2D // ...........................................e................................. + str q1, [x1, #-16] // ............................................................................* + // gap // ............................................................................. + add v24.8H, v16.8H, v0.8H // ................................................e............................ + // gap // ............................................................................. + // gap // ............................................................................. + sqrdmulh v26.8H, v5.8H, v4.H[3] // ..................................................e.......................... + // gap // ............................................................................. + // gap // ............................................................................. 
+ // gap // ............................................................................. + // gap // ............................................................................. + // gap // ............................................................................. + str q9, [x1, #-32] // ...........................................................................*. + mul v17.8H, v5.8H, v4.H[2] // .................................................e........................... + // gap // ............................................................................. + + // original source code + // ldr q8, [x1, #(16*0)] // .e..........................................................................|.e......................................................................... + // ldr q9, [x1, #(16*1)] // e...........................................................................|e.......................................................................... + // ldr q10, [x1, #(16*2)] // ......e.....................................................................|......e.................................................................... + // ldr q11, [x1, #(16*3)] // .....e......................................................................|.....e..................................................................... + // trn1 v25.4s, v8.4s, v9.4s // ............e...............................................................|............e.............................................................. + // trn2 v26.4s, v8.4s, v9.4s // ..........e.................................................................|..........e................................................................ + // trn1 v27.4s, v10.4s, v11.4s // ................e...........................................................|................e.......................................................... 
+ // trn2 v28.4s, v10.4s, v11.4s // ..............e.............................................................|..............e............................................................ + // trn2 v10.2d, v25.2d, v27.2d // ....................e.......................................................|....................e...................................................... + // trn2 v11.2d, v26.2d, v28.2d // ...................e........................................................|...................e....................................................... + // trn1 v8.2d, v25.2d, v27.2d // ........................e...................................................|........................e.................................................. + // trn1 v9.2d, v26.2d, v28.2d // ......................e.....................................................|......................e.................................................... + // ldr q0, [x4], #(6*16) // .......e....................................................................|.......e................................................................... + // ldr q4, [x4, #(-6*16 + 1*16)] // ...........e................................................................|...........e............................................................... + // ldr q1, [x4, #(-6*16 + 2*16)] // ................................e...........................................|................................e.......................................... + // ldr q5, [x4, #(-6*16 + 3*16)] // ..e.........................................................................|..e........................................................................ + // ldr q2, [x4, #(-6*16 + 4*16)] // .........................e..................................................|.........................e................................................. 
+ // ldr q6, [x4, #(-6*16 + 5*16)] // ....e.......................................................................|....e...................................................................... + // sub v24.8h, v8.8h, v9.8h // .............................e..............................................|.............................e............................................. + // add v8.8h, v8.8h, v9.8h // ...................................e........................................|...................................e....................................... + // mul v9.8h, v24.8h, v1.8h // ..................................e.........................................|..................................e........................................ + // sqrdmulh v24.8h, v24.8h, v5.8h // ...............................e............................................|...............................e........................................... + // mls v9.8h, v24.8h, v7.h[0] // .....................................e......................................|.....................................e..................................... + // sub v24.8h, v10.8h, v11.8h // ..........................e.................................................|..........................e................................................ + // add v10.8h, v10.8h, v11.8h // ...........................e................................................|...........................e............................................... + // mul v11.8h, v24.8h, v2.8h // ..............................e.............................................|..............................e............................................ + // sqrdmulh v24.8h, v24.8h, v6.8h // .................................e..........................................|.................................e......................................... 
+ // mls v11.8h, v24.8h, v7.h[0] // .........................................e..................................|.........................................e................................. + // sub v24.8h, v8.8h, v10.8h // .......................................e....................................|.......................................e................................... + // add v8.8h, v8.8h, v10.8h // ........................................e...................................|........................................e.................................. + // mul v10.8h, v24.8h, v0.8h // ................................................e...........................|................................................e.......................... + // sqrdmulh v24.8h, v24.8h, v4.8h // ............................................e...............................|............................................e.............................. + // mls v10.8h, v24.8h, v7.h[0] // ....................................................e.......................|....................................................e...................... + // sub v24.8h, v9.8h, v11.8h // ..............................................e.............................|..............................................e............................ + // add v9.8h, v9.8h, v11.8h // ...............................................e............................|...............................................e........................... + // mul v11.8h, v24.8h, v0.8h // ......................................................e.....................|......................................................e.................... + // sqrdmulh v24.8h, v24.8h, v4.8h // .................................................e..........................|.................................................e......................... 
+ // mls v11.8h, v24.8h, v7.h[0] // ........................................................e...................|........................................................e.................. + // trn1 v25.4s, v8.4s, v9.4s // .....................................................e......................|.....................................................e..................... + // trn2 v26.4s, v8.4s, v9.4s // .........................................................e..................|.........................................................e................. + // trn1 v27.4s, v10.4s, v11.4s // ............................................................e...............|............................................................e.............. + // trn2 v28.4s, v10.4s, v11.4s // ..............................................................e.............|..............................................................e............ + // trn2 v10.2d, v25.2d, v27.2d // ............................................................................*........................................................................... + // trn2 v11.2d, v26.2d, v28.2d // ......................................................................e.....|......................................................................e.... + // trn1 v8.2d, v25.2d, v27.2d // ................................................................e...........|................................................................e.......... + // trn1 v9.2d, v26.2d, v28.2d // .................................................................e..........|.................................................................e......... + // ldr q0, [x3], #16 // ...................................................................e........|...................................................................e....... 
+ // sub v24.8h, v8.8h, v9.8h // .....................................................................e......|.....................................................................e..... + // add v8.8h, v8.8h, v9.8h // ........................................................................e...|........................................................................e.. + // mul v9.8h, v24.8h, v0.h[2] // ...........................................................................e|........................................................................... + // sqrdmulh v24.8h, v24.8h, v0.h[3] // .........................................................................e..|.........................................................................e. + // mls v9.8h, v24.8h, v7.h[0] // ........*...................................................................|........*.................................................................. + // sub v24.8h, v10.8h, v11.8h // .............*..............................................................|.............*............................................................. + // add v10.8h, v10.8h, v11.8h // .........*..................................................................|.........*................................................................. + // mul v11.8h, v24.8h, v0.h[4] // .....................*......................................................|.....................*..................................................... + // sqrdmulh v24.8h, v24.8h, v0.h[5] // .................*..........................................................|.................*......................................................... + // mls v11.8h, v24.8h, v7.h[0] // ............................*...............................................|............................*.............................................. 
+ // sqdmulh v25.8h, v8.8h, v7.h[1] // ...*........................................................................|...*....................................................................... + // srshr v25.8h, v25.8h, #11 // ..................*.........................................................|..................*........................................................ + // mls v8.8h, v25.8h, v7.h[0] // .......................*....................................................|.......................*................................................... + // sqdmulh v25.8h, v10.8h, v7.h[1] // ...............*............................................................|...............*........................................................... + // srshr v25.8h, v25.8h, #11 // ...........................................*................................|...........................................*............................... + // mls v10.8h, v25.8h, v7.h[0] // .............................................*..............................|.............................................*............................. + // sub v24.8h, v8.8h, v10.8h // ..................................................*.........................|..................................................*........................ + // add v8.8h, v8.8h, v10.8h // ...................................................*........................|...................................................*....................... + // mul v10.8h, v24.8h, v0.h[0] // ..................................................................*.........|..................................................................*........ + // sqrdmulh v24.8h, v24.8h, v0.h[1] // ...........................................................*................|...........................................................*............... 
+ // mls v10.8h, v24.8h, v7.h[0] // ....................................................................*.......|....................................................................*...... + // sub v24.8h, v9.8h, v11.8h // ....................................*.......................................|....................................*...................................... + // add v9.8h, v9.8h, v11.8h // ......................................*.....................................|......................................*.................................... + // mul v11.8h, v24.8h, v0.h[0] // .............................................................*..............|.............................................................*............. + // sqrdmulh v24.8h, v24.8h, v0.h[1] // ..........................................................*.................|..........................................................*................ + // mls v11.8h, v24.8h, v7.h[0] // ...............................................................*............|...............................................................*........... + // str q8, [x1], #(64) // .......................................................*....................|.......................................................*................... + // str q9, [x1, #(-64 + 16*1)] // ..........................................*.................................|..........................................*................................ + // str q10, [x1, #(-64 + 16*2)] // ..........................................................................*.|..........................................................................* + // str q11, [x1, #(-64 + 16*3)] // .......................................................................*....|.......................................................................*... + + sub count, count, #1 + cbnz count, layer4567_start + mls v17.8H, v26.8H, v7.H[0] // ..*........................ 
+ trn2 v8.2D, v20.2D, v23.2D // *.......................... + // gap // ........................... + // gap // ........................... + // gap // ........................... + // gap // ........................... + sqdmulh v26.8H, v24.8H, v7.H[1] // .*......................... + // gap // ........................... + // gap // ........................... + sub v13.8H, v8.8H, v3.8H // ....*...................... + // gap // ........................... + // gap // ........................... + add v19.8H, v8.8H, v3.8H // ...*....................... + // gap // ........................... + // gap // ........................... + // gap // ........................... + // gap // ........................... + // gap // ........................... + sqrdmulh v31.8H, v13.8H, v4.H[5] // ......*.................... + // gap // ........................... + // gap // ........................... + srshr v9.8H, v26.8H, #11 // .......*................... + // gap // ........................... + // gap // ........................... + sqdmulh v26.8H, v19.8H, v7.H[1] // .....*..................... + // gap // ........................... + // gap // ........................... + // gap // ........................... + // gap // ........................... + // gap // ........................... + mul v6.8H, v13.8H, v4.H[4] // ........*.................. + // gap // ........................... + // gap // ........................... + // gap // ........................... + // gap // ........................... + // gap // ........................... + mls v6.8H, v31.8H, v7.H[0] // ..........*................ + // gap // ........................... + // gap // ........................... + srshr v0.8H, v26.8H, #11 // ..............*............ + // gap // ........................... + // gap // ........................... + mls v24.8H, v9.8H, v7.H[0] // .........*................. + // gap // ........................... + // gap // ........................... 
+ // gap // ........................... + // gap // ........................... + // gap // ........................... + mls v19.8H, v0.8H, v7.H[0] // ...............*........... + // gap // ........................... + // gap // ........................... + sub v9.8H, v17.8H, v6.8H // ...........*............... + // gap // ........................... + // gap // ........................... + add v18.8H, v17.8H, v6.8H // ............*.............. + // gap // ........................... + // gap // ........................... + // gap // ........................... + // gap // ........................... + // gap // ........................... + mul v6.8H, v9.8H, v4.H[0] // .....................*..... + // gap // ........................... + // gap // ........................... + str q18, [x1, #16] // .............*............. + sub v12.8H, v24.8H, v19.8H // ................*.......... + // gap // ........................... + sqrdmulh v25.8H, v9.8H, v4.H[1] // ...................*....... + add v14.8H, v24.8H, v19.8H // .................*......... + // gap // ........................... + // gap // ........................... + // gap // ........................... + // gap // ........................... + sqrdmulh v8.8H, v12.8H, v4.H[1] // ....................*...... + // gap // ........................... + // gap // ........................... + str q14, [x1], #(64) // ..................*........ + // gap // ........................... + // gap // ........................... + mul v9.8H, v12.8H, v4.H[0] // .......................*... + // gap // ........................... + // gap // ........................... + // gap // ........................... + // gap // ........................... + // gap // ........................... + mls v6.8H, v25.8H, v7.H[0] // ......................*.... + // gap // ........................... + // gap // ........................... + // gap // ........................... + // gap // ........................... 
+ // gap // ........................... + mls v9.8H, v8.8H, v7.H[0] // ........................*.. + // gap // ........................... + // gap // ........................... + // gap // ........................... + // gap // ........................... + // gap // ........................... + // gap // ........................... + // gap // ........................... + // gap // ........................... + str q6, [x1, #-16] // .........................*. + // gap // ........................... + // gap // ........................... + // gap // ........................... + // gap // ........................... + // gap // ........................... + str q9, [x1, #-32] // ..........................* + // gap // ........................... + // gap // ........................... + + // original source code + // trn2 v31.2D, v20.2D, v23.2D // .*......................... + // sqdmulh v2.8H, v24.8H, v7.H[1] // ..*........................ + // mls v17.8H, v26.8H, v7.H[0] // *.......................... + // add v15.8H, v31.8H, v3.8H // ....*...................... + // sub v0.8H, v31.8H, v3.8H // ...*....................... + // sqdmulh v14.8H, v15.8H, v7.H[1] // .......*................... + // sqrdmulh v8.8H, v0.8H, v4.H[5] // .....*..................... + // srshr v2.8H, v2.8H, #11 // ......*.................... + // mul v0.8H, v0.8H, v4.H[4] // ........*.................. + // mls v24.8H, v2.8H, v7.H[0] // ...........*............... + // mls v0.8H, v8.8H, v7.H[0] // .........*................. + // sub v11.8H, v17.8H, v0.8H // .............*............. + // add v0.8H, v17.8H, v0.8H // ..............*............ + // str q0, [x1, #16] // ................*.......... + // srshr v0.8H, v14.8H, #11 // ..........*................ + // mls v15.8H, v0.8H, v7.H[0] // ............*.............. + // sub v14.8H, v24.8H, v15.8H // .................*......... + // add v8.8H, v24.8H, v15.8H // ...................*....... 
+ // str q8, [x1], #(64) // .....................*..... + // sqrdmulh v0.8H, v11.8H, v4.H[1] // ..................*........ + // sqrdmulh v3.8H, v14.8H, v4.H[1] // ....................*...... + // mul v1.8H, v11.8H, v4.H[0] // ...............*........... + // mls v1.8H, v0.8H, v7.H[0] // .......................*... + // mul v9.8H, v14.8H, v4.H[0] // ......................*.... + // mls v9.8H, v3.8H, v7.H[0] // ........................*.. + // str q1, [x1, #-16] // .........................*. + // str q9, [x1, #-32] // ..........................* + + + // --------------------------------------------------------------------- + + ninv .req v29 + ninv_tw .req v30 + + ASM_LOAD(xtmp, ninv_addr) + ld1r {ninv.8h}, [xtmp] + ASM_LOAD(xtmp, ninv_tw_addr) + ld1r {ninv_tw.8h}, [xtmp] + + mov count, #4 + ASM_LOAD(r_ptr0, roots_l012) + load_roots_123 + + .p2align 2 + + ldr q9, [x0, #448] // ...*.......... + ldr q14, [x0, #384] // .*............ + // gap // .............. + // gap // .............. + // gap // .............. + // gap // .............. + // gap // .............. + ldr q8, [x0, #64] // *............. + // gap // .............. + ldr q12, [x0, #0] // ......*....... + // gap // .............. + // gap // .............. + sub v15.8H, v14.8H, v9.8H // ....*......... + // gap // .............. + // gap // .............. + add v20.8H, v14.8H, v9.8H // .........*.... + ldr q25, [x0, #192] // .....*........ + // gap // .............. + // gap // .............. + // gap // .............. + // gap // .............. + mul v11.8H, v15.8H, v1.H[4] // ...........*.. + add v3.8H, v12.8H, v8.8H // .............* + // gap // .............. + // gap // .............. + sub v8.8H, v12.8H, v8.8H // ........*..... + // gap // .............. + sqrdmulh v14.8H, v15.8H, v1.H[5] // .......*...... + // gap // .............. + // gap // .............. + ldr q10, [x0, #256] // ..*........... + // gap // .............. + // gap // .............. + ldr q17, [x0, #128] // ..........*... 
+ sqrdmulh v19.8H, v8.8H, v0.H[7] // ............*. + // gap // .............. + + // original source code + // ldr q18, [x0, #64] // ..*........... + // ldr q27, [x0, #384] // .*............ + // ldr q10, [x0, #256] // ...........*.. + // ldr q17, [x0, #448] // *............. + // sub v12.8H, v27.8H, v17.8H // ....*......... + // ldr q25, [x0, #192] // ......*....... + // ldr q16, [x0, #0] // ...*.......... + // sqrdmulh v14.8H, v12.8H, v1.H[5] // ..........*... + // sub v8.8H, v16.8H, v18.8H // .........*.... + // add v20.8H, v27.8H, v17.8H // .....*........ + // ldr q17, [x0, #128] // ............*. + // mul v11.8H, v12.8H, v1.H[4] // .......*...... + // sqrdmulh v19.8H, v8.8H, v0.H[7] // .............* + // add v3.8H, v16.8H, v18.8H // ........*..... + + sub count, count, #1 +layer123_start: + sub v26.8H, v17.8H, v25.8H // .............*................................................................................ + // gap // .............................................................................................. + ldr q5, [x0, #320] // .....*........................................................................................ + ldr q18, [x0, #80] // .e............................................................................................ + add v4.8H, v17.8H, v25.8H // ..............*............................................................................... + mls v11.8H, v14.8H, v7.H[0] // ...........................*.................................................................. + // gap // .............................................................................................. + ldr q27, [x0, #400] // ......e....................................................................................... + // gap // .............................................................................................. + mul v17.8H, v26.8H, v1.H[0] // ...............*.............................................................................. 
+ // gap // .............................................................................................. + // gap // .............................................................................................. + sub v21.8H, v10.8H, v5.8H // ..................*........................................................................... + // gap // .............................................................................................. + // gap // .............................................................................................. + add v23.8H, v10.8H, v5.8H // ...................*.......................................................................... + mul v10.8H, v8.8H, v0.H[6] // ..........*................................................................................... + // gap // .............................................................................................. + // gap // .............................................................................................. + // gap // .............................................................................................. + // gap // .............................................................................................. + mls v10.8H, v19.8H, v7.H[0] // ............*................................................................................. + // gap // .............................................................................................. + // gap // .............................................................................................. + sub v28.8H, v23.8H, v20.8H // ......................................*....................................................... + // gap // .............................................................................................. + // gap // .............................................................................................. 
+ add v13.8H, v23.8H, v20.8H // .......................................*...................................................... + mul v16.8H, v21.8H, v1.H[2] // ....................*......................................................................... + // gap // .............................................................................................. + add v20.8H, v3.8H, v4.8H // .............................*................................................................ + // gap // .............................................................................................. + // gap // .............................................................................................. + sub v4.8H, v3.8H, v4.8H // ............................*................................................................. + sqrdmulh v12.8H, v28.8H, v0.H[5] // .........................................*.................................................... + // gap // .............................................................................................. + // gap // .............................................................................................. + // gap // .............................................................................................. + // gap // .............................................................................................. + sqrdmulh v26.8H, v26.8H, v1.H[1] // ................*............................................................................. + // gap // .............................................................................................. + // gap // .............................................................................................. + // gap // .............................................................................................. + // gap // .............................................................................................. 
+ // gap // .............................................................................................. + sqdmulh v6.8H, v20.8H, v7.H[1] // ................................................*............................................. + // gap // .............................................................................................. + // gap // .............................................................................................. + // gap // .............................................................................................. + // gap // .............................................................................................. + // gap // .............................................................................................. + sqrdmulh v21.8H, v21.8H, v1.H[3] // .....................*........................................................................ + // gap // .............................................................................................. + // gap // .............................................................................................. + // gap // .............................................................................................. + // gap // .............................................................................................. + // gap // .............................................................................................. + mls v17.8H, v26.8H, v7.H[0] // .................*............................................................................ + // gap // .............................................................................................. + // gap // .............................................................................................. + srshr v25.8H, v6.8H, #11 // .................................................*............................................ 
+ // gap // .............................................................................................. + // gap // .............................................................................................. + mul v26.8H, v4.8H, v0.H[2] // ..............................*............................................................... + // gap // .............................................................................................. + // gap // .............................................................................................. + // gap // .............................................................................................. + // gap // .............................................................................................. + // gap // .............................................................................................. + mls v16.8H, v21.8H, v7.H[0] // ......................*....................................................................... + // gap // .............................................................................................. + // gap // .............................................................................................. + sub v21.8H, v10.8H, v17.8H // .................................*............................................................ + // gap // .............................................................................................. + // gap // .............................................................................................. + mul v3.8H, v28.8H, v0.H[4] // ........................................*..................................................... + add v14.8H, v10.8H, v17.8H // ..................................*........................................................... + // gap // .............................................................................................. 
+ // gap // .............................................................................................. + // gap // .............................................................................................. + ldr q10, [x0, #272] // ....e......................................................................................... + mul v17.8H, v21.8H, v0.H[2] // ...................................*.......................................................... + // gap // .............................................................................................. + // gap // .............................................................................................. + sub v8.8H, v16.8H, v11.8H // ...........................................*.................................................. + // gap // .............................................................................................. + // gap // .............................................................................................. + sqrdmulh v22.8H, v21.8H, v0.H[3] // ....................................*......................................................... + add v16.8H, v16.8H, v11.8H // ............................................*................................................. + // gap // .............................................................................................. + // gap // .............................................................................................. + // gap // .............................................................................................. + // gap // .............................................................................................. + mul v21.8H, v8.8H, v0.H[4] // .............................................*................................................ + // gap // .............................................................................................. 
+ // gap // .............................................................................................. + sub v19.8H, v14.8H, v16.8H // ...........................................................*.................................. + // gap // .............................................................................................. + // gap // .............................................................................................. + add v16.8H, v14.8H, v16.8H // ............................................................*................................. + sqrdmulh v31.8H, v8.8H, v0.H[5] // ..............................................*............................................... + // gap // .............................................................................................. + // gap // .............................................................................................. + // gap // .............................................................................................. + // gap // .............................................................................................. + // gap // .............................................................................................. + // gap // .............................................................................................. + sqdmulh v11.8H, v13.8H, v7.H[1] // ...................................................*.......................................... + // gap // .............................................................................................. + // gap // .............................................................................................. + // gap // .............................................................................................. + mls v20.8H, v25.8H, v7.H[0] // ..................................................*........................................... 
+ // gap // .............................................................................................. + // gap // .............................................................................................. + // gap // .............................................................................................. + // gap // .............................................................................................. + // gap // .............................................................................................. + sqrdmulh v4.8H, v4.8H, v0.H[3] // ...............................*.............................................................. + // gap // .............................................................................................. + // gap // .............................................................................................. + srshr v2.8H, v11.8H, #11 // ....................................................*......................................... + // gap // .............................................................................................. + // gap // .............................................................................................. + mls v3.8H, v12.8H, v7.H[0] // ..........................................*................................................... + // gap // .............................................................................................. + // gap // .............................................................................................. + // gap // .............................................................................................. + // gap // .............................................................................................. + // gap // .............................................................................................. 
+ mls v13.8H, v2.8H, v7.H[0] // .....................................................*........................................ + // gap // .............................................................................................. + // gap // .............................................................................................. + // gap // .............................................................................................. + // gap // .............................................................................................. + // gap // .............................................................................................. + sqrdmulh v11.8H, v19.8H, v0.H[1] // ..............................................................*............................... + // gap // .............................................................................................. + // gap // .............................................................................................. + // gap // .............................................................................................. + // gap // .............................................................................................. + // gap // .............................................................................................. + mls v26.8H, v4.8H, v7.H[0] // ................................*............................................................. + // gap // .............................................................................................. + // gap // .............................................................................................. + // gap // .............................................................................................. + // gap // .............................................................................................. + // gap // .............................................................................................. 
+ mls v21.8H, v31.8H, v7.H[0] // ...............................................*.............................................. + // gap // .............................................................................................. + // gap // .............................................................................................. + sub v2.8H, v20.8H, v13.8H // ......................................................*....................................... + // gap // .............................................................................................. + // gap // .............................................................................................. + add v23.8H, v20.8H, v13.8H // .......................................................*...................................... + mls v17.8H, v22.8H, v7.H[0] // .....................................*........................................................ + // gap // .............................................................................................. + sub v20.8H, v26.8H, v3.8H // ................................................................*............................. + // gap // .............................................................................................. + // gap // .............................................................................................. + add v26.8H, v26.8H, v3.8H // .................................................................*............................ + mul v25.8H, v19.8H, v0.H[0] // .............................................................*................................ + // gap // .............................................................................................. + // gap // .............................................................................................. + // gap // .............................................................................................. 
+ // gap // .............................................................................................. + mul v5.8H, v2.8H, v0.H[0] // ........................................................*..................................... + // gap // .............................................................................................. + // gap // .............................................................................................. + sub v9.8H, v17.8H, v21.8H // .....................................................................*........................ + // gap // .............................................................................................. + // gap // .............................................................................................. + add v21.8H, v17.8H, v21.8H // ......................................................................*....................... + // gap // .............................................................................................. + sqrdmulh v6.8H, v2.8H, v0.H[1] // .........................................................*.................................... + // gap // .............................................................................................. + ldr q17, [x0, #464] // .......e...................................................................................... + // gap // .............................................................................................. + sqrdmulh v19.8H, v16.8H, v30.8H // ..................................................................................*........... + // gap // .............................................................................................. + // gap // .............................................................................................. + // gap // .............................................................................................. 
+ // gap // .............................................................................................. + // gap // .............................................................................................. + // gap // .............................................................................................. + sqrdmulh v8.8H, v26.8H, v30.8H // .....................................................................................*........ + // gap // .............................................................................................. + sub v12.8H, v27.8H, v17.8H // .......................e...................................................................... + // gap // .............................................................................................. + // gap // .............................................................................................. + mls v5.8H, v6.8H, v7.H[0] // ..........................................................*................................... + // gap // .............................................................................................. + // gap // .............................................................................................. + // gap // .............................................................................................. + // gap // .............................................................................................. + // gap // .............................................................................................. + mls v25.8H, v11.8H, v7.H[0] // ...............................................................*.............................. + // gap // .............................................................................................. + // gap // .............................................................................................. 
+ // gap // .............................................................................................. + // gap // .............................................................................................. + // gap // .............................................................................................. + mul v4.8H, v20.8H, v0.H[0] // ..................................................................*........................... + // gap // .............................................................................................. + // gap // .............................................................................................. + str q5, [x0, #256] // ..........................................................................*................... + // gap // .............................................................................................. + // gap // .............................................................................................. + sqrdmulh v20.8H, v20.8H, v0.H[1] // ...................................................................*.......................... + // gap // .............................................................................................. + // gap // .............................................................................................. + str q25, [x0, #320] // ...........................................................................*.................. + // gap // .............................................................................................. + // gap // .............................................................................................. + ldr q25, [x0, #208] // ...e.......................................................................................... + mul v11.8H, v9.8H, v0.H[0] // .......................................................................*...................... 
+ // gap // .............................................................................................. + // gap // .............................................................................................. + // gap // .............................................................................................. + // gap // .............................................................................................. + sqrdmulh v2.8H, v9.8H, v0.H[1] // ........................................................................*..................... + // gap // .............................................................................................. + // gap // .............................................................................................. + // gap // .............................................................................................. + // gap // .............................................................................................. + // gap // .............................................................................................. + mls v4.8H, v20.8H, v7.H[0] // ....................................................................*......................... + // gap // .............................................................................................. + // gap // .............................................................................................. + // gap // .............................................................................................. + // gap // .............................................................................................. + // gap // .............................................................................................. + mul v20.8H, v23.8H, v29.8H // ..............................................................................*............... + // gap // .............................................................................................. 
+ // gap // .............................................................................................. + // gap // .............................................................................................. + // gap // .............................................................................................. + // gap // .............................................................................................. + // gap // .............................................................................................. + mls v11.8H, v2.8H, v7.H[0] // .........................................................................*.................... + // gap // .............................................................................................. + str q4, [x0, #384] // ............................................................................*................. + // gap // .............................................................................................. + // gap // .............................................................................................. + // gap // .............................................................................................. + mul v2.8H, v16.8H, v29.8H // .................................................................................*............ + // gap // .............................................................................................. + // gap // .............................................................................................. + // gap // .............................................................................................. + // gap // .............................................................................................. + mls v2.8H, v19.8H, v7.H[0] // ...................................................................................*.......... + // gap // .............................................................................................. 
+ // gap // .............................................................................................. + // gap // .............................................................................................. + str q11, [x0, #448] // .............................................................................*................ + // gap // .............................................................................................. + // gap // .............................................................................................. + sqrdmulh v9.8H, v23.8H, v30.8H // ...............................................................................*.............. + // gap // .............................................................................................. + // gap // .............................................................................................. + // gap // .............................................................................................. + // gap // .............................................................................................. + mul v23.8H, v26.8H, v29.8H // ....................................................................................*......... + // gap // .............................................................................................. + // gap // .............................................................................................. + str q2, [x0, #64] // ...........................................................................................*.. + // gap // .............................................................................................. + // gap // .............................................................................................. + sqrdmulh v16.8H, v21.8H, v30.8H // ........................................................................................*..... 
+ // gap // .............................................................................................. + // gap // .............................................................................................. + // gap // .............................................................................................. + // gap // .............................................................................................. + // gap // .............................................................................................. + mul v2.8H, v21.8H, v29.8H // .......................................................................................*...... + // gap // .............................................................................................. + // gap // .............................................................................................. + // gap // .............................................................................................. + // gap // .............................................................................................. + // gap // .............................................................................................. + mls v20.8H, v9.8H, v7.H[0] // ................................................................................*............. + // gap // .............................................................................................. + // gap // .............................................................................................. + // gap // .............................................................................................. + // gap // .............................................................................................. + // gap // .............................................................................................. + mls v2.8H, v16.8H, v7.H[0] // .........................................................................................*.... 
+ ldr q16, [x0, #16] // e............................................................................................. + // gap // .............................................................................................. + // gap // .............................................................................................. + // gap // .............................................................................................. + // gap // .............................................................................................. + mls v23.8H, v8.8H, v7.H[0] // ......................................................................................*....... + // gap // .............................................................................................. + // gap // .............................................................................................. + str q20, [x0], #(16) // ..........................................................................................*... + // gap // .............................................................................................. + // gap // .............................................................................................. + sqrdmulh v14.8H, v12.8H, v1.H[5] // ..........................e................................................................... + sub v8.8H, v16.8H, v18.8H // ........e..................................................................................... + // gap // .............................................................................................. + add v20.8H, v27.8H, v17.8H // ........................e..................................................................... + ldr q17, [x0, #128] // ..e........................................................................................... + // gap // .............................................................................................. 
+ mul v11.8H, v12.8H, v1.H[4] // .........................e.................................................................... + // gap // .............................................................................................. + // gap // .............................................................................................. + // gap // .............................................................................................. + str q2, [x0, #176] // .............................................................................................* + // gap // .............................................................................................. + sqrdmulh v19.8H, v8.8H, v0.H[7] // ...........e.................................................................................. + str q23, [x0, #112] // ............................................................................................*. + add v3.8H, v16.8H, v18.8H // .........e.................................................................................... + + // original source code + // ldr q8, [x0, #0] // ................................................................................e...........|.................................................................................e.......... + // ldr q9, [x0, #(1*(512/8))] // e...........................................................................................|.e.......................................................................................... + // ldr q10, [x0, #(2*(512/8))] // ......................................................................................e.....|.......................................................................................e.... + // ldr q11, [x0, #(3*(512/8))] // ...............................................................e............................|................................................................e........................... 
+ // ldr q12, [x0, #(4*(512/8))] // .........................e..................................................................|..........................e................................................................. + // ldr q13, [x0, #(5*(512/8))] // ............................................................................................|*........................................................................................... + // ldr q14, [x0, #(6*(512/8))] // ...e........................................................................................|....e....................................................................................... + // ldr q15, [x0, #(7*(512/8))] // .....................................................e......................................|......................................................e..................................... + // sub v24.8h, v8.8h, v9.8h // ....................................................................................e.......|.....................................................................................e...... + // add v8.8h, v8.8h, v9.8h // ...........................................................................................e|............................................................................................ + // mul v9.8h, v24.8h, v0.h[6] // .......*....................................................................................|........*................................................................................... + // sqrdmulh v24.8h, v24.8h, v0.h[7] // .........................................................................................e..|..........................................................................................e. + // mls v9.8h, v24.8h, v7.h[0] // ........*...................................................................................|.........*.................................................................................. 
+ // sub v24.8h, v10.8h, v11.8h // ............................................................................................*............................................................................................ + // add v10.8h, v10.8h, v11.8h // .*..........................................................................................|..*......................................................................................... + // mul v11.8h, v24.8h, v1.h[0] // ....*.......................................................................................|.....*...................................................................................... + // sqrdmulh v24.8h, v24.8h, v1.h[1] // ...............*............................................................................|................*........................................................................... + // mls v11.8h, v24.8h, v7.h[0] // ..................*.........................................................................|...................*........................................................................ + // sub v24.8h, v12.8h, v13.8h // .....*......................................................................................|......*..................................................................................... + // add v12.8h, v12.8h, v13.8h // ......*.....................................................................................|.......*.................................................................................... + // mul v13.8h, v24.8h, v1.h[2] // ...........*................................................................................|............*............................................................................... 
+ // sqrdmulh v24.8h, v24.8h, v1.h[3] // .................*..........................................................................|..................*......................................................................... + // mls v13.8h, v24.8h, v7.h[0] // .....................*......................................................................|......................*..................................................................... + // sub v24.8h, v14.8h, v15.8h // ........................................................e...................................|.........................................................e.................................. + // add v14.8h, v14.8h, v15.8h // .....................................................................................e......|......................................................................................e..... + // mul v15.8h, v24.8h, v1.h[4] // .......................................................................................e....|........................................................................................e... + // sqrdmulh v24.8h, v24.8h, v1.h[5] // ...................................................................................e........|....................................................................................e....... + // mls v15.8h, v24.8h, v7.h[0] // ..*.........................................................................................|...*........................................................................................ + // sub v24.8h, v8.8h, v10.8h // .............*..............................................................................|..............*............................................................................. 
+ // add v8.8h, v8.8h, v10.8h // ............*...............................................................................|.............*.............................................................................. + // mul v10.8h, v24.8h, v0.h[2] // ....................*.......................................................................|.....................*...................................................................... + // sqrdmulh v24.8h, v24.8h, v0.h[3] // ....................................*.......................................................|.....................................*...................................................... + // mls v10.8h, v24.8h, v7.h[0] // .........................................*..................................................|..........................................*................................................. + // sub v24.8h, v9.8h, v11.8h // ......................*.....................................................................|.......................*.................................................................... + // add v9.8h, v9.8h, v11.8h // ........................*...................................................................|.........................*.................................................................. + // mul v11.8h, v24.8h, v0.h[2] // ..........................*.................................................................|...........................*................................................................ + // sqrdmulh v24.8h, v24.8h, v0.h[3] // ............................*...............................................................|.............................*.............................................................. + // mls v11.8h, v24.8h, v7.h[0] // .............................................*..............................................|..............................................*............................................. 
+ // sub v24.8h, v12.8h, v14.8h // .........*..................................................................................|..........*................................................................................. + // add v12.8h, v12.8h, v14.8h // ..........*.................................................................................|...........*................................................................................ + // mul v14.8h, v24.8h, v0.h[4] // .......................*....................................................................|........................*................................................................... + // sqrdmulh v24.8h, v24.8h, v0.h[5] // ..............*.............................................................................|...............*............................................................................ + // mls v14.8h, v24.8h, v7.h[0] // ......................................*.....................................................|.......................................*.................................................... + // sub v24.8h, v13.8h, v15.8h // ...........................*................................................................|............................*............................................................... + // add v13.8h, v13.8h, v15.8h // .............................*..............................................................|..............................*............................................................. + // mul v15.8h, v24.8h, v0.h[4] // ..............................*.............................................................|...............................*............................................................ 
+ // sqrdmulh v24.8h, v24.8h, v0.h[5] // .................................*..........................................................|..................................*......................................................... + // mls v15.8h, v24.8h, v7.h[0] // ..........................................*.................................................|...........................................*................................................ + // sqdmulh v25.8h, v8.8h, v7.h[1] // ................*...........................................................................|.................*.......................................................................... + // srshr v25.8h, v25.8h, #11 // ...................*........................................................................|....................*....................................................................... + // mls v8.8h, v25.8h, v7.h[0] // ...................................*........................................................|....................................*....................................................... + // sqdmulh v25.8h, v12.8h, v7.h[1] // ..................................*.........................................................|...................................*........................................................ + // srshr v25.8h, v25.8h, #11 // .....................................*......................................................|......................................*..................................................... + // mls v12.8h, v25.8h, v7.h[0] // .......................................*....................................................|........................................*................................................... 
+ // sub v24.8h, v8.8h, v12.8h // ...........................................*................................................|............................................*............................................... + // add v8.8h, v8.8h, v12.8h // ............................................*...............................................|.............................................*.............................................. + // mul v12.8h, v24.8h, v0.h[0] // .................................................*..........................................|..................................................*......................................... + // sqrdmulh v24.8h, v24.8h, v0.h[1] // ....................................................*.......................................|.....................................................*...................................... + // mls v12.8h, v24.8h, v7.h[0] // .........................................................*..................................|..........................................................*................................. + // sub v24.8h, v9.8h, v13.8h // ...............................*............................................................|................................*........................................................... + // add v9.8h, v9.8h, v13.8h // ................................*...........................................................|.................................*.......................................................... + // mul v13.8h, v24.8h, v0.h[0] // ................................................*...........................................|.................................................*.......................................... + // sqrdmulh v24.8h, v24.8h, v0.h[1] // ........................................*...................................................|.........................................*.................................................. 
+ // mls v13.8h, v24.8h, v7.h[0] // ..........................................................*.................................|...........................................................*................................ + // sub v24.8h, v10.8h, v14.8h // ..............................................*.............................................|...............................................*............................................ + // add v10.8h, v10.8h, v14.8h // ...............................................*............................................|................................................*........................................... + // mul v14.8h, v24.8h, v0.h[0] // ...........................................................*................................|............................................................*............................... + // sqrdmulh v24.8h, v24.8h, v0.h[1] // .............................................................*..............................|..............................................................*............................. + // mls v14.8h, v24.8h, v7.h[0] // ..................................................................*.........................|...................................................................*........................ + // sub v24.8h, v11.8h, v15.8h // ..................................................*.........................................|...................................................*........................................ + // add v11.8h, v11.8h, v15.8h // ...................................................*........................................|....................................................*....................................... + // mul v15.8h, v24.8h, v0.h[0] // ................................................................*...........................|.................................................................*.......................... 
+ // sqrdmulh v24.8h, v24.8h, v0.h[1] // .................................................................*..........................|..................................................................*......................... + // mls v15.8h, v24.8h, v7.h[0] // ....................................................................*.......................|.....................................................................*...................... + // str q12, [x0, #(4*(512/8))] // ............................................................*...............................|.............................................................*.............................. + // str q13, [x0, #(5*(512/8))] // ..............................................................*.............................|...............................................................*............................ + // str q14, [x0, #(6*(512/8))] // .....................................................................*......................|......................................................................*..................... + // str q15, [x0, #(7*(512/8))] // ........................................................................*...................|.........................................................................*.................. + // mul v12.8h, v8.8h, v29.8h // ...................................................................*........................|....................................................................*....................... + // sqrdmulh v8.8h, v8.8h, v30.8h // .........................................................................*..................|..........................................................................*................. 
+ // mls v12.8h, v8.8h, v7.h[0] // ..............................................................................*.............|...............................................................................*............ + // mul v13.8h, v9.8h, v29.8h // ......................................................................*.....................|.......................................................................*.................... + // sqrdmulh v9.8h, v9.8h, v30.8h // ......................................................*.....................................|.......................................................*.................................... + // mls v13.8h, v9.8h, v7.h[0] // .......................................................................*....................|........................................................................*................... + // mul v14.8h, v10.8h, v29.8h // ..........................................................................*.................|...........................................................................*................ + // sqrdmulh v10.8h, v10.8h, v30.8h // .......................................................*....................................|........................................................*................................... + // mls v14.8h, v10.8h, v7.h[0] // .................................................................................*..........|..................................................................................*......... + // mul v15.8h, v11.8h, v29.8h // .............................................................................*..............|..............................................................................*............. 
+ // sqrdmulh v11.8h, v11.8h, v30.8h // ............................................................................*...............|.............................................................................*.............. + // mls v15.8h, v11.8h, v7.h[0] // ...............................................................................*............|................................................................................*........... + // str q12, [x0], #(16) // ..................................................................................*.........|...................................................................................*........ + // str q13, [x0, #(-16 + 1*(512/8))] // ...........................................................................*................|............................................................................*............... + // str q14, [x0, #(-16 + 2*(512/8))] // ..........................................................................................*.|...........................................................................................* + // str q15, [x0, #(-16 + 3*(512/8))] // ........................................................................................*...|.........................................................................................*.. + + sub count, count, #1 + cbnz count, layer123_start + sub v15.8H, v17.8H, v25.8H // *............................................................................... + ldr q2, [x0, #320] // .*.............................................................................. + mul v21.8H, v8.8H, v0.H[6] // .......*........................................................................ + add v31.8H, v17.8H, v25.8H // ..*............................................................................. + // gap // ................................................................................ 
+ // gap // ................................................................................ + mls v21.8H, v19.8H, v7.H[0] // ........*....................................................................... + // gap // ................................................................................ + // gap // ................................................................................ + // gap // ................................................................................ + // gap // ................................................................................ + // gap // ................................................................................ + mul v4.8H, v15.8H, v1.H[0] // ....*........................................................................... + sub v13.8H, v10.8H, v2.8H // .....*.......................................................................... + // gap // ................................................................................ + add v26.8H, v3.8H, v31.8H // ............*................................................................... + // gap // ................................................................................ + // gap // ................................................................................ + sqrdmulh v23.8H, v15.8H, v1.H[1] // ...............*................................................................ + add v2.8H, v10.8H, v2.8H // ......*......................................................................... + // gap // ................................................................................ + // gap // ................................................................................ + // gap // ................................................................................ + // gap // ................................................................................ 
+ sqdmulh v17.8H, v26.8H, v7.H[1] // ................*............................................................... + // gap // ................................................................................ + // gap // ................................................................................ + sub v25.8H, v2.8H, v20.8H // .........*...................................................................... + // gap // ................................................................................ + // gap // ................................................................................ + mul v16.8H, v13.8H, v1.H[2] // ...........*.................................................................... + add v20.8H, v2.8H, v20.8H // ..........*..................................................................... + // gap // ................................................................................ + // gap // ................................................................................ + // gap // ................................................................................ + // gap // ................................................................................ + mls v4.8H, v23.8H, v7.H[0] // ..................*............................................................. + // gap // ................................................................................ + // gap // ................................................................................ + srshr v17.8H, v17.8H, #11 // ...................*............................................................ + // gap // ................................................................................ + // gap // ................................................................................ + sqrdmulh v2.8H, v13.8H, v1.H[3] // .................*.............................................................. 
+ // gap // ................................................................................ + // gap // ................................................................................ + // gap // ................................................................................ + // gap // ................................................................................ + // gap // ................................................................................ + mls v26.8H, v17.8H, v7.H[0] // ..................................*............................................. + // gap // ................................................................................ + // gap // ................................................................................ + sub v13.8H, v21.8H, v4.8H // ......................*......................................................... + // gap // ................................................................................ + // gap // ................................................................................ + sqrdmulh v17.8H, v25.8H, v0.H[5] // ..............*................................................................. + // gap // ................................................................................ + add v21.8H, v21.8H, v4.8H // ........................*....................................................... + // gap // ................................................................................ + // gap // ................................................................................ + // gap // ................................................................................ + mul v25.8H, v25.8H, v0.H[4] // .......................*........................................................ + // gap // ................................................................................ + // gap // ................................................................................ 
+ // gap // ................................................................................ + // gap // ................................................................................ + // gap // ................................................................................ + mls v11.8H, v14.8H, v7.H[0] // ...*............................................................................ + // gap // ................................................................................ + // gap // ................................................................................ + // gap // ................................................................................ + // gap // ................................................................................ + // gap // ................................................................................ + mls v16.8H, v2.8H, v7.H[0] // .....................*.......................................................... + sub v2.8H, v3.8H, v31.8H // .............*.................................................................. + // gap // ................................................................................ + // gap // ................................................................................ + // gap // ................................................................................ + // gap // ................................................................................ + mul v3.8H, v13.8H, v0.H[2] // .........................*...................................................... + // gap // ................................................................................ + // gap // ................................................................................ + // gap // ................................................................................ + // gap // ................................................................................ 
+ // gap // ................................................................................ + sqrdmulh v13.8H, v13.8H, v0.H[3] // ...........................*.................................................... + // gap // ................................................................................ + // gap // ................................................................................ + sub v23.8H, v16.8H, v11.8H // ..........................*..................................................... + // gap // ................................................................................ + // gap // ................................................................................ + add v16.8H, v16.8H, v11.8H // ............................*................................................... + mul v11.8H, v2.8H, v0.H[2] // ....................*........................................................... + // gap // ................................................................................ + // gap // ................................................................................ + // gap // ................................................................................ + // gap // ................................................................................ + // gap // ................................................................................ + // gap // ................................................................................ + mul v10.8H, v23.8H, v0.H[4] // .............................*.................................................. + // gap // ................................................................................ + // gap // ................................................................................ + sub v4.8H, v21.8H, v16.8H // ..............................*................................................. 
+ add v16.8H, v21.8H, v16.8H // ...............................*................................................ + // gap // ................................................................................ + sqdmulh v21.8H, v20.8H, v7.H[1] // .................................*.............................................. + // gap // ................................................................................ + // gap // ................................................................................ + // gap // ................................................................................ + sqrdmulh v23.8H, v23.8H, v0.H[5] // ................................*............................................... + // gap // ................................................................................ + // gap // ................................................................................ + // gap // ................................................................................ + // gap // ................................................................................ + // gap // ................................................................................ + // gap // ................................................................................ + // gap // ................................................................................ + sqrdmulh v2.8H, v2.8H, v0.H[3] // ...................................*............................................ + srshr v21.8H, v21.8H, #11 // ....................................*........................................... + // gap // ................................................................................ + // gap // ................................................................................ + mls v25.8H, v17.8H, v7.H[0] // .....................................*.......................................... + // gap // ................................................................................ 
+ // gap // ................................................................................ + // gap // ................................................................................ + // gap // ................................................................................ + // gap // ................................................................................ + mls v20.8H, v21.8H, v7.H[0] // ......................................*......................................... + // gap // ................................................................................ + // gap // ................................................................................ + // gap // ................................................................................ + // gap // ................................................................................ + // gap // ................................................................................ + mls v11.8H, v2.8H, v7.H[0] // ........................................*....................................... + // gap // ................................................................................ + // gap // ................................................................................ + // gap // ................................................................................ + // gap // ................................................................................ + // gap // ................................................................................ + mls v10.8H, v23.8H, v7.H[0] // .........................................*...................................... + // gap // ................................................................................ + // gap // ................................................................................ + sub v21.8H, v26.8H, v20.8H // ..........................................*..................................... 
+ // gap // ................................................................................ + // gap // ................................................................................ + add v26.8H, v26.8H, v20.8H // ...........................................*.................................... + // gap // ................................................................................ + sqrdmulh v20.8H, v4.8H, v0.H[1] // .......................................*........................................ + sub v2.8H, v11.8H, v25.8H // .............................................*.................................. + // gap // ................................................................................ + // gap // ................................................................................ + mls v3.8H, v13.8H, v7.H[0] // ............................................*................................... + add v17.8H, v11.8H, v25.8H // ..............................................*................................. + // gap // ................................................................................ + // gap // ................................................................................ + // gap // ................................................................................ + // gap // ................................................................................ + mul v23.8H, v4.8H, v0.H[0] // ...............................................*................................ + // gap // ................................................................................ + // gap // ................................................................................ + // gap // ................................................................................ + // gap // ................................................................................ + // gap // ................................................................................ 
+ mul v4.8H, v21.8H, v0.H[0] // ................................................*............................... + // gap // ................................................................................ + // gap // ................................................................................ + sub v25.8H, v3.8H, v10.8H // .................................................*.............................. + // gap // ................................................................................ + // gap // ................................................................................ + add v11.8H, v3.8H, v10.8H // ..................................................*............................. + sqrdmulh v21.8H, v21.8H, v0.H[1] // ...................................................*............................ + // gap // ................................................................................ + // gap // ................................................................................ + // gap // ................................................................................ + // gap // ................................................................................ + // gap // ................................................................................ + // gap // ................................................................................ + sqrdmulh v13.8H, v16.8H, v30.8H // ....................................................*........................... + // gap // ................................................................................ + // gap // ................................................................................ + // gap // ................................................................................ + sqrdmulh v3.8H, v17.8H, v30.8H // .....................................................*.......................... + // gap // ................................................................................ 
+ // gap // ................................................................................ + // gap // ................................................................................ + // gap // ................................................................................ + // gap // ................................................................................ + mls v4.8H, v21.8H, v7.H[0] // ......................................................*......................... + // gap // ................................................................................ + // gap // ................................................................................ + // gap // ................................................................................ + // gap // ................................................................................ + // gap // ................................................................................ + mls v23.8H, v20.8H, v7.H[0] // .......................................................*........................ + // gap // ................................................................................ + // gap // ................................................................................ + // gap // ................................................................................ + // gap // ................................................................................ + // gap // ................................................................................ + // gap // ................................................................................ + // gap // ................................................................................ + mul v21.8H, v2.8H, v0.H[0] // ........................................................*....................... + str q4, [x0, #256] // .........................................................*...................... 
+ // gap // ................................................................................ + // gap // ................................................................................ + sqrdmulh v2.8H, v2.8H, v0.H[1] // ..........................................................*..................... + // gap // ................................................................................ + // gap // ................................................................................ + str q23, [x0, #320] // ...........................................................*.................... + // gap // ................................................................................ + // gap // ................................................................................ + // gap // ................................................................................ + // gap // ................................................................................ + mul v23.8H, v25.8H, v0.H[0] // ............................................................*................... + // gap // ................................................................................ + // gap // ................................................................................ + // gap // ................................................................................ + sqrdmulh v20.8H, v25.8H, v0.H[1] // .............................................................*.................. + // gap // ................................................................................ + // gap // ................................................................................ + // gap // ................................................................................ + // gap // ................................................................................ + // gap // ................................................................................ 
+ mls v21.8H, v2.8H, v7.H[0] // ..............................................................*................. + // gap // ................................................................................ + // gap // ................................................................................ + // gap // ................................................................................ + // gap // ................................................................................ + // gap // ................................................................................ + // gap // ................................................................................ + // gap // ................................................................................ + mul v2.8H, v26.8H, v29.8H // ...............................................................*................ + // gap // ................................................................................ + // gap // ................................................................................ + // gap // ................................................................................ + mls v23.8H, v20.8H, v7.H[0] // ................................................................*............... + // gap // ................................................................................ + // gap // ................................................................................ + str q21, [x0, #384] // .................................................................*.............. + // gap // ................................................................................ + // gap // ................................................................................ + // gap // ................................................................................ + // gap // ................................................................................ 
+ mul v16.8H, v16.8H, v29.8H // ..................................................................*............. + // gap // ................................................................................ + // gap // ................................................................................ + // gap // ................................................................................ + mls v16.8H, v13.8H, v7.H[0] // ...................................................................*............ + // gap // ................................................................................ + // gap // ................................................................................ + str q23, [x0, #448] // ....................................................................*........... + // gap // ................................................................................ + // gap // ................................................................................ + sqrdmulh v23.8H, v26.8H, v30.8H // .....................................................................*.......... + // gap // ................................................................................ + // gap // ................................................................................ + // gap // ................................................................................ + // gap // ................................................................................ + // gap // ................................................................................ + mul v21.8H, v17.8H, v29.8H // ......................................................................*......... + // gap // ................................................................................ + // gap // ................................................................................ + str q16, [x0, #64] // .......................................................................*........ 
+ // gap // ................................................................................ + // gap // ................................................................................ + // gap // ................................................................................ + // gap // ................................................................................ + sqrdmulh v16.8H, v11.8H, v30.8H // ........................................................................*....... + // gap // ................................................................................ + // gap // ................................................................................ + // gap // ................................................................................ + mul v26.8H, v11.8H, v29.8H // .........................................................................*...... + // gap // ................................................................................ + // gap // ................................................................................ + // gap // ................................................................................ + // gap // ................................................................................ + // gap // ................................................................................ + mls v2.8H, v23.8H, v7.H[0] // ..........................................................................*..... + // gap // ................................................................................ + // gap // ................................................................................ + // gap // ................................................................................ + // gap // ................................................................................ + // gap // ................................................................................ 
+ mls v26.8H, v16.8H, v7.H[0] // ...........................................................................*.... + // gap // ................................................................................ + // gap // ................................................................................ + // gap // ................................................................................ + // gap // ................................................................................ + // gap // ................................................................................ + mls v21.8H, v3.8H, v7.H[0] // ............................................................................*... + // gap // ................................................................................ + // gap // ................................................................................ + str q2, [x0], #(16) // .............................................................................*.. + // gap // ................................................................................ + // gap // ................................................................................ + // gap // ................................................................................ + // gap // ................................................................................ + // gap // ................................................................................ + str q26, [x0, #176] // ..............................................................................*. + // gap // ................................................................................ + // gap // ................................................................................ + // gap // ................................................................................ + // gap // ................................................................................ 
+ // gap // ................................................................................ + str q21, [x0, #112] // ...............................................................................* + // gap // ................................................................................ + // gap // ................................................................................ + + // original source code + // sub v26.8H, v17.8H, v25.8H // *............................................................................... + // ldr q5, [x0, #320] // .*.............................................................................. + // add v4.8H, v17.8H, v25.8H // ...*............................................................................ + // mls v11.8H, v14.8H, v7.H[0] // ......................*......................................................... + // mul v17.8H, v26.8H, v1.H[0] // .....*.......................................................................... + // sub v21.8H, v10.8H, v5.8H // ......*......................................................................... + // add v23.8H, v10.8H, v5.8H // .........*...................................................................... + // mul v10.8H, v8.8H, v0.H[6] // ..*............................................................................. + // mls v10.8H, v19.8H, v7.H[0] // ....*........................................................................... + // sub v28.8H, v23.8H, v20.8H // ...........*.................................................................... + // add v13.8H, v23.8H, v20.8H // .............*.................................................................. + // mul v16.8H, v21.8H, v1.H[2] // ............*................................................................... + // add v20.8H, v3.8H, v4.8H // .......*........................................................................ 
+ // sub v4.8H, v3.8H, v4.8H // ........................*....................................................... + // sqrdmulh v12.8H, v28.8H, v0.H[5] // ...................*............................................................ + // sqrdmulh v26.8H, v26.8H, v1.H[1] // ........*....................................................................... + // sqdmulh v6.8H, v20.8H, v7.H[1] // ..........*..................................................................... + // sqrdmulh v21.8H, v21.8H, v1.H[3] // ................*............................................................... + // mls v17.8H, v26.8H, v7.H[0] // ..............*................................................................. + // srshr v25.8H, v6.8H, #11 // ...............*................................................................ + // mul v26.8H, v4.8H, v0.H[2] // .............................*.................................................. + // mls v16.8H, v21.8H, v7.H[0] // .......................*........................................................ + // sub v21.8H, v10.8H, v17.8H // ..................*............................................................. + // mul v3.8H, v28.8H, v0.H[4] // .....................*.......................................................... + // add v14.8H, v10.8H, v17.8H // ....................*........................................................... + // mul v17.8H, v21.8H, v0.H[2] // .........................*...................................................... + // sub v8.8H, v16.8H, v11.8H // ...........................*.................................................... + // sqrdmulh v22.8H, v21.8H, v0.H[3] // ..........................*..................................................... + // add v16.8H, v16.8H, v11.8H // ............................*................................................... + // mul v21.8H, v8.8H, v0.H[4] // ..............................*................................................. 
+ // sub v19.8H, v14.8H, v16.8H // ...............................*................................................ + // add v16.8H, v14.8H, v16.8H // ................................*............................................... + // sqrdmulh v31.8H, v8.8H, v0.H[5] // ..................................*............................................. + // sqdmulh v11.8H, v13.8H, v7.H[1] // .................................*.............................................. + // mls v20.8H, v25.8H, v7.H[0] // .................*.............................................................. + // sqrdmulh v4.8H, v4.8H, v0.H[3] // ...................................*............................................ + // srshr v2.8H, v11.8H, #11 // ....................................*........................................... + // mls v3.8H, v12.8H, v7.H[0] // .....................................*.......................................... + // mls v13.8H, v2.8H, v7.H[0] // ......................................*......................................... + // sqrdmulh v11.8H, v19.8H, v0.H[1] // ...........................................*.................................... + // mls v26.8H, v4.8H, v7.H[0] // .......................................*........................................ + // mls v21.8H, v31.8H, v7.H[0] // ........................................*....................................... + // sub v2.8H, v20.8H, v13.8H // .........................................*...................................... + // add v23.8H, v20.8H, v13.8H // ..........................................*..................................... + // mls v17.8H, v22.8H, v7.H[0] // .............................................*.................................. + // sub v20.8H, v26.8H, v3.8H // ............................................*................................... + // add v26.8H, v26.8H, v3.8H // ..............................................*................................. 
+ // mul v25.8H, v19.8H, v0.H[0] // ...............................................*................................ + // mul v5.8H, v2.8H, v0.H[0] // ................................................*............................... + // sub v9.8H, v17.8H, v21.8H // .................................................*.............................. + // add v21.8H, v17.8H, v21.8H // ..................................................*............................. + // sqrdmulh v6.8H, v2.8H, v0.H[1] // ...................................................*............................ + // sqrdmulh v19.8H, v16.8H, v30.8H // ....................................................*........................... + // sqrdmulh v8.8H, v26.8H, v30.8H // .....................................................*.......................... + // mls v5.8H, v6.8H, v7.H[0] // ......................................................*......................... + // mls v25.8H, v11.8H, v7.H[0] // .......................................................*........................ + // mul v4.8H, v20.8H, v0.H[0] // ........................................................*....................... + // str q5, [x0, #256] // .........................................................*...................... + // sqrdmulh v20.8H, v20.8H, v0.H[1] // ..........................................................*..................... + // str q25, [x0, #320] // ...........................................................*.................... + // mul v11.8H, v9.8H, v0.H[0] // ............................................................*................... + // sqrdmulh v2.8H, v9.8H, v0.H[1] // .............................................................*.................. + // mls v4.8H, v20.8H, v7.H[0] // ..............................................................*................. + // mul v20.8H, v23.8H, v29.8H // ...............................................................*................ 
+ // mls v11.8H, v2.8H, v7.H[0] // ................................................................*............... + // str q4, [x0, #384] // .................................................................*.............. + // mul v2.8H, v16.8H, v29.8H // ..................................................................*............. + // mls v2.8H, v19.8H, v7.H[0] // ...................................................................*............ + // str q11, [x0, #448] // ....................................................................*........... + // sqrdmulh v9.8H, v23.8H, v30.8H // .....................................................................*.......... + // mul v23.8H, v26.8H, v29.8H // ......................................................................*......... + // str q2, [x0, #64] // .......................................................................*........ + // sqrdmulh v16.8H, v21.8H, v30.8H // ........................................................................*....... + // mul v2.8H, v21.8H, v29.8H // .........................................................................*...... + // mls v20.8H, v9.8H, v7.H[0] // ..........................................................................*..... + // mls v2.8H, v16.8H, v7.H[0] // ...........................................................................*.... + // mls v23.8H, v8.8H, v7.H[0] // ............................................................................*... + // str q20, [x0], #(16) // .............................................................................*.. + // str q2, [x0, #176] // ..............................................................................*. 
+ // str q23, [x0, #112] // ...............................................................................* + + + pop_stack + ret \ No newline at end of file diff --git a/tests/ntt_kyber/manual/intt_kyber_123_4567_opt_m1_firestorm.s b/tests/ntt_kyber/manual/intt_kyber_123_4567_opt_m1_firestorm.s new file mode 100644 index 0000000..e46bc5d --- /dev/null +++ b/tests/ntt_kyber/manual/intt_kyber_123_4567_opt_m1_firestorm.s @@ -0,0 +1,1728 @@ +/// +/// Copyright (c) 2022 Arm Limited +/// Copyright (c) 2022 Hanno Becker +/// Copyright (c) 2023 Amin Abdulrahman, Matthias Kannwischer +/// SPDX-License-Identifier: MIT +/// +/// Permission is hereby granted, free of charge, to any person obtaining a copy +/// of this software and associated documentation files (the "Software"), to deal +/// in the Software without restriction, including without limitation the rights +/// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +/// copies of the Software, and to permit persons to whom the Software is +/// furnished to do so, subject to the following conditions: +/// +/// The above copyright notice and this permission notice shall be included in all +/// copies or substantial portions of the Software. +/// +/// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +/// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +/// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +/// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +/// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +/// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +/// SOFTWARE. +/// + +// Needed to provide ASM_LOAD directive +#include + +// NOTE +// We use a lot of trivial macros to simplify the parsing burden for Slothy +// The macros are not unfolded by Slothy and thus interpreted as instructions, +// which are easier to parse due to e.g. 
the lack of size specifiers and simpler
+// syntax for pre and post increment for loads and stores.
+
+// Eventually, NeLight should include a proper parser for AArch64,
+// but for initial investigations, the below is enough.
+
+// Q-register load/store wrappers. The vec argument is pasted onto the qform_
+// prefix, so it must name an alias that has a matching qform_ .req definition.
+// The _vo forms use an immediate offset and leave the base register unchanged;
+// the _vi forms are post-indexed and advance the base by inc after the access.
+.macro ldr_vo vec, base, offset
+        ldr qform_\vec, [\base, #\offset]
+.endm
+
+.macro ldr_vi vec, base, inc
+        ldr qform_\vec, [\base], #\inc
+.endm
+
+.macro str_vo vec, base, offset
+        str qform_\vec, [\base, #\offset]
+.endm
+.macro str_vi vec, base, inc
+        str qform_\vec, [\base], #\inc
+.endm
+
+// Wrappers that fix the arrangement to eight 16-bit lanes (.8h).
+// vqrdmulh multiplies by a full vector b; the q-suffixed forms multiply by
+// lane i of b.
+.macro vqrdmulh d,a,b
+        sqrdmulh \d\().8h, \a\().8h, \b\().8h
+.endm
+.macro vmlsq d,a,b,i
+        mls \d\().8h, \a\().8h, \b\().h[\i]
+.endm
+.macro vqrdmulhq d,a,b,i
+        sqrdmulh \d\().8h, \a\().8h, \b\().h[\i]
+.endm
+.macro vqdmulhq d,a,b,i
+        sqdmulh \d\().8h, \a\().8h, \b\().h[\i]
+.endm
+.macro vmulq d,a,b,i
+        mul \d\().8h, \a\().8h, \b\().h[\i]
+.endm
+
+// Per 16-bit lane: dst = src * const[idx0] - sqrdmulh(src, const[idx1]) * consts[0].
+// The src register is overwritten with the sqrdmulh result.
+// consts is loaded from const_addr by the routine below, so lane 0 is 3329.
+.macro mulmodq dst, src, const, idx0, idx1
+        vmulq \dst, \src, \const, \idx0
+        vqrdmulhq \src, \src, \const, \idx1
+        vmlsq \dst, \src, consts, 0
+.endm
+
+// Same computation as mulmodq, with the multiplier and its twisted companion
+// given as full vectors instead of lanes. The src register is overwritten.
+.macro mulmod dst, src, const, const_twisted
+        mul \dst\().8h, \src\().8h, \const\().8h
+        vqrdmulh \src, \src, \const_twisted
+        vmlsq \dst, \src, consts, 0
+.endm
+
+// tmp = a - b, a = a + b, then b = mulmodq of tmp by lanes idx0/idx1 of root.
+// Clobbers tmp.
+.macro gs_butterfly a, b, root, idx0, idx1
+        sub tmp.8h, \a\().8h, \b\().8h
+        add \a\().8h, \a\().8h, \b\().8h
+        mulmodq \b, tmp, \root, \idx0, \idx1
+.endm
+
+// Instruction-for-instruction the same as mulmod.
+.macro mulmod_v dst, src, const, const_twisted
+        mul \dst\().8h, \src\().8h, \const\().8h
+        vqrdmulh \src, \src, \const_twisted
+        vmlsq \dst, \src, consts, 0
+.endm
+
+// Vector-root variant of gs_butterfly: tmp = a - b, a = a + b,
+// b = mulmod of tmp by root / root_twisted. Clobbers tmp.
+.macro gs_butterfly_v a, b, root, root_twisted
+        sub tmp.8h, \a\().8h, \b\().8h
+        add \a\().8h, \a\().8h, \b\().8h
+        mulmod \b, tmp, \root, \root_twisted
+.endm
+
+// Applies mulmod with ninv / ninv_tw to four src/dst pairs; every src register
+// is overwritten by mulmod.
+// NOTE(review): ninv and ninv_tw need .req aliases wherever this macro is
+// expanded -- confirm they are defined before use.
+.macro mul_ninv dst0, dst1, dst2, dst3, src0, src1, src2, src3
+        mulmod \dst0, \src0, ninv, ninv_tw
+        mulmod \dst1, \src1, ninv, ninv_tw
+        mulmod \dst2, \src2, ninv, ninv_tw
+        mulmod \dst3, \src3, ninv, ninv_tw
+.endm
+
+// t0 = srshr(sqdmulh(a, consts[1]), 11), then a = a - t0 * consts[0].
+// Clobbers t0. consts[1] is the second entry of const_addr (20159).
+.macro barrett_reduce a
+        vqdmulhq t0, \a, consts, 1
+        srshr t0.8h, t0.8h, #11
+        vmlsq \a, t0, consts, 0
+.endm
+
+// Loads root0 and root1 from two consecutive 16-byte slots at r_ptr0 and
+// advances r_ptr0 by 32 bytes.
+.macro load_roots_123
+        ldr_vi root0, r_ptr0, 32
+        ldr_vo root1, r_ptr0, -16
+.endm
+
+// Loads root0 from r_ptr0 and advances r_ptr0 by 16 bytes.
+.macro load_next_roots_45
+        ldr_vi root0, r_ptr0, 16
+.endm
+
+// Loads six consecutive 16-byte vectors from r_ptr1, in the order
+// root0, root0_tw, root1, root1_tw, root2, root2_tw, and advances r_ptr1 by
+// 96 bytes. The offsets are relative to the already advanced pointer.
+.macro load_next_roots_67
+        ldr_vi root0, r_ptr1, (6*16)
+        ldr_vo root0_tw, r_ptr1, (-6*16 + 1*16)
+        ldr_vo root1, r_ptr1, (-6*16 + 2*16)
+        ldr_vo root1_tw, r_ptr1, (-6*16 + 3*16)
+        ldr_vo root2, r_ptr1, (-6*16 + 4*16)
+        ldr_vo root2_tw, r_ptr1, (-6*16 + 5*16)
+.endm
+
+// In-place transpose of the 4x4 matrix of 32-bit elements held in the four
+// registers <data>0 .. <data>3. Clobbers t0-t3.
+.macro transpose4 data
+        trn1 t0.4s, \data\()0.4s, \data\()1.4s
+        trn2 t1.4s, \data\()0.4s, \data\()1.4s
+        trn1 t2.4s, \data\()2.4s, \data\()3.4s
+        trn2 t3.4s, \data\()2.4s, \data\()3.4s
+
+        trn2 \data\()2.2d, t0.2d, t2.2d
+        trn2 \data\()3.2d, t1.2d, t3.2d
+        trn1 \data\()0.2d, t0.2d, t2.2d
+        trn1 \data\()1.2d, t1.2d, t3.2d
+.endm
+
+// Only the 32-bit trn1/trn2 stage of transpose4: pairs in0/in1 into out0/out1
+// and in2/in3 into out2/out3. The 64-bit stage is left to the caller.
+.macro transpose_single data_out, data_in
+        trn1 \data_out\()0.4s, \data_in\()0.4s, \data_in\()1.4s
+        trn2 \data_out\()1.4s, \data_in\()0.4s, \data_in\()1.4s
+        trn1 \data_out\()2.4s, \data_in\()2.4s, \data_in\()3.4s
+        trn2 \data_out\()3.4s, \data_in\()2.4s, \data_in\()3.4s
+.endm
+
+// Reserves 96 bytes of stack and stores x19-x29 there; each register pair is
+// written exactly once, mirroring the loads in restore_gprs.
+.macro save_gprs // slothy:no-unfold
+        sub sp, sp, #(16*6)
+        stp x19, x20, [sp, #16*0]
+        stp x21, x22, [sp, #16*1]
+        stp x23, x24, [sp, #16*2]
+        stp x25, x26, [sp, #16*3]
+        stp x27, x28, [sp, #16*4]
+        str x29, [sp, #16*5]
+.endm
+
+// Reloads x19-x29 from the slots written by save_gprs and releases the 96 bytes.
+.macro restore_gprs // slothy:no-unfold
+        ldp x19, x20, [sp, #16*0]
+        ldp x21, x22, [sp, #16*1]
+        ldp x23, x24, [sp, #16*2]
+        ldp x25, x26, [sp, #16*3]
+        ldp x27, x28, [sp, #16*4]
+        ldr x29, [sp, #16*5]
+        add sp, sp, #(16*6)
+.endm
+
+// Reserves 64 bytes of stack and stores d8-d15, the low 64 bits of v8-v15.
+.macro save_vregs // slothy:no-unfold
+        sub sp, sp, #(16*4)
+        stp d8, d9, [sp, #16*0]
+        stp d10, d11, [sp, #16*1]
+        stp d12, d13, [sp, #16*2]
+        stp d14, d15, [sp, #16*3]
+.endm
+
+// Reloads d8-d15 from the slots written by save_vregs and releases the 64 bytes.
+.macro restore_vregs // slothy:no-unfold
+        ldp d8, d9, [sp, #16*0]
+        ldp d10, d11, [sp, #16*1]
+        ldp d12, d13, [sp, #16*2]
+        ldp d14, d15, [sp, #16*3]
+        add sp, sp, #(16*4)
+.endm
+
+#define STACK_SIZE 16
+#define STACK0 0
+
+// restore / save move one register between a and the stack slot at sp + loc.
+// push_stack saves x19-x29 and d8-d15 and then reserves STACK_SIZE scratch
+// bytes at the new sp; pop_stack undoes these steps in reverse order.
+.macro restore a, loc // slothy:no-unfold
+        ldr \a, [sp, #\loc\()]
+.endm
+.macro save loc, a // slothy:no-unfold
+        str \a, [sp, #\loc\()]
+.endm
+.macro push_stack // slothy:no-unfold
+        save_gprs
+        save_vregs
+        sub sp, sp, #STACK_SIZE
+.endm
+
+.macro pop_stack // slothy:no-unfold
+        add sp, sp, #STACK_SIZE
+        restore_vregs
+        restore_gprs
+.endm
+
+.data
+.p2align 4
+roots:
+#include "intt_kyber_123_45_67_twiddles.s"
+.text
+
+        .global intt_kyber_123_4567_opt_m1_firestorm
+        .global _intt_kyber_123_4567_opt_m1_firestorm
+
+// 16-byte constant vectors: const_addr holds 3329 and 20159 in lanes 0 and 1
+// with the remaining lanes zero; ninv_addr and ninv_tw_addr hold eight copies
+// of 512 and 5040 respectively.
+.p2align 4
+const_addr:     .short 3329
+                .short 20159
+                .short 0
+                .short 0
+                .short 0
+                .short 0
+                .short 0
+                .short 0
+ninv_addr:      .short 512
+                .short 512
+                .short 512
+                .short 512
+                .short 512
+                .short 512
+                .short 512
+                .short 512
+ninv_tw_addr:   .short 5040
+                .short 5040
+                .short 5040
+                .short 5040
+                .short 5040
+                .short 5040
+                .short 5040
+                .short 5040
+
+intt_kyber_123_4567_opt_m1_firestorm:
+_intt_kyber_123_4567_opt_m1_firestorm:
+        push_stack
+
+        in      .req x0
+        inp     .req x1
+        count   .req x2
+        r_ptr0  .req x3
+        r_ptr1  .req x4
+        xtmp    .req x5
+
+        qform_v0  .req q0
+        qform_v1  .req q1
+        qform_v2  .req q2
+        qform_v3  .req q3
+        qform_v4  .req q4
+        qform_v5  .req q5
+        qform_v6  .req q6
+        qform_v7  .req q7
+        qform_v8  .req q8
+        qform_v9  .req q9
+        qform_v10 .req q10
+        qform_v11 .req q11
+        qform_v12 .req q12
+        qform_v13 .req q13
+        qform_v14 .req q14
+        qform_v15 .req q15
+        qform_v16 .req q16
+        qform_v17 .req q17
+        qform_v18 .req q18
+        qform_v19 .req q19
+        qform_v20 .req q20
+        qform_v21 .req q21
+        qform_v22 .req q22
+        qform_v23 .req q23
+        qform_v24 .req q24
+        qform_v25 .req q25
+        qform_v26 .req q26
+        qform_v27 .req q27
+        qform_v28 .req q28
+        qform_v29 .req q29
+        qform_v30 .req q30
+        qform_v31 .req q31
+
+        data0 .req v8
+        data1 .req v9
+        data2 .req v10
+        data3 .req v11
+        data4 .req v12
+        data5 .req v13
+        data6 .req v14
+        data7 .req v15
+
+        x_00 .req x10
+        x_01 .req x11
+        x_10 .req x12
+        x_11 .req x13
+        x_20 .req x14
+        x_21 .req x15
+        x_30 .req x16
+        x_31 .req x17
+
+        xt_00 .req x_00
+        xt_01 .req
x_20 + xt_10 .req x_10 + xt_11 .req x_30 + xt_20 .req x_01 + xt_21 .req x_21 + xt_30 .req x_11 + xt_31 .req x_31 + + qform_data0 .req q8 + qform_data1 .req q9 + qform_data2 .req q10 + qform_data3 .req q11 + qform_data4 .req q12 + qform_data5 .req q13 + qform_data6 .req q14 + qform_data7 .req q15 + + root0 .req v0 + root1 .req v1 + root2 .req v2 + root0_tw .req v4 + root1_tw .req v5 + root2_tw .req v6 + + consts .req v7 + qform_consts .req q7 + + qform_root0 .req q0 + qform_root1 .req q1 + qform_root2 .req q2 + qform_root0_tw .req q4 + qform_root1_tw .req q5 + qform_root2_tw .req q6 + + tmp .req v24 + t0 .req v25 + t1 .req v26 + t2 .req v27 + t3 .req v28 + + ASM_LOAD(r_ptr0, roots_l34) + ASM_LOAD(r_ptr1, roots_l56) + + ASM_LOAD(xtmp, const_addr) + ld1 {consts.8h}, [xtmp] + + save STACK0, in + + mov inp, in + mov count, #8 + + .p2align 2 + ldr q29, [x1, #16] // ...*................................... + ldr q13, [x1, #0] // ....*.................................. + ldr q3, [x1, #32] // .*..................................... + // gap // ....................................... + // gap // ....................................... + // gap // ....................................... + // gap // ....................................... + // gap // ....................................... + ldr q8, [x1, #48] // *...................................... + ldr q20, [x4, #80] // ......*................................ + // gap // ....................................... + // gap // ....................................... + // gap // ....................................... + // gap // ....................................... + // gap // ....................................... + // gap // ....................................... + ldr q28, [x4, #16] // .........*............................. + // gap // ....................................... + // gap // ....................................... + // gap // ....................................... 
+ // gap // ....................................... + // gap // ....................................... + // gap // ....................................... + // gap // ....................................... + ldr q1, [x4, #48] // .................*..................... + // gap // ....................................... + // gap // ....................................... + // gap // ....................................... + // gap // ....................................... + // gap // ....................................... + // gap // ....................................... + // gap // ....................................... + trn1 v30.4S, v13.4S, v29.4S // ..........*............................ + trn2 v29.4S, v13.4S, v29.4S // ...........*........................... + ldr q13, [x4, #64] // .....*................................. + // gap // ....................................... + // gap // ....................................... + // gap // ....................................... + // gap // ....................................... + // gap // ....................................... + trn2 v10.4S, v3.4S, v8.4S // .......*............................... + trn1 v9.4S, v3.4S, v8.4S // ........*.............................. + // gap // ....................................... + // gap // ....................................... + // gap // ....................................... + // gap // ....................................... + // gap // ....................................... + // gap // ....................................... + // gap // ....................................... + // gap // ....................................... + // gap // ....................................... + // gap // ....................................... + // gap // ....................................... + // gap // ....................................... + // gap // ....................................... + // gap // ....................................... 
+ trn1 v26.2D, v29.2D, v10.2D // ...............*....................... + trn2 v24.2D, v30.2D, v9.2D // ............*.......................... + trn2 v21.2D, v29.2D, v10.2D // ..............*........................ + trn1 v10.2D, v30.2D, v9.2D // .............*......................... + // gap // ....................................... + // gap // ....................................... + // gap // ....................................... + ldr q29, [x4, #32] // ................*...................... + // gap // ....................................... + // gap // ....................................... + // gap // ....................................... + // gap // ....................................... + // gap // ....................................... + // gap // ....................................... + // gap // ....................................... + // gap // ....................................... + add v15.8H, v24.8H, v21.8H // ....................*.................. + sub v4.8H, v24.8H, v21.8H // ..................*.................... + ldr q24, [x4], #(6*16) // ..*.................................... + sub v2.8H, v10.8H, v26.8H // ...................*................... + add v26.8H, v10.8H, v26.8H // .....................*................. + // gap // ....................................... + // gap // ....................................... + // gap // ....................................... + // gap // ....................................... + // gap // ....................................... + // gap // ....................................... + // gap // ....................................... + // gap // ....................................... + // gap // ....................................... + // gap // ....................................... + // gap // ....................................... + sqrdmulh v31.8H, v2.8H, v1.8H // .........................*............. + mul v11.8H, v4.8H, v13.8H // ......................*................ 
+ sqrdmulh v20.8H, v4.8H, v20.8H // .......................*............... + mul v10.8H, v2.8H, v29.8H // ........................*.............. + // gap // ....................................... + // gap // ....................................... + // gap // ....................................... + // gap // ....................................... + add v0.8H, v26.8H, v15.8H // ...................................*... + sub v18.8H, v26.8H, v15.8H // ..........................*............ + // gap // ....................................... + // gap // ....................................... + // gap // ....................................... + // gap // ....................................... + // gap // ....................................... + // gap // ....................................... + // gap // ....................................... + // gap // ....................................... + // gap // ....................................... + // gap // ....................................... + // gap // ....................................... + // gap // ....................................... + // gap // ....................................... + // gap // ....................................... + mls v11.8H, v20.8H, v7.H[0] // ...........................*........... + mls v10.8H, v31.8H, v7.H[0] // ..............................*........ + mul v29.8H, v18.8H, v24.8H // ............................*.......... + sqrdmulh v3.8H, v18.8H, v28.8H // .............................*......... + // gap // ....................................... + // gap // ....................................... + // gap // ....................................... + // gap // ....................................... + // gap // ....................................... + // gap // ....................................... + // gap // ....................................... + // gap // ....................................... + // gap // ....................................... 
+ // gap // ....................................... + // gap // ....................................... + // gap // ....................................... + // gap // ....................................... + // gap // ....................................... + // gap // ....................................... + // gap // ....................................... + // gap // ....................................... + // gap // ....................................... + // gap // ....................................... + // gap // ....................................... + sub v25.8H, v10.8H, v11.8H // ................................*...... + add v11.8H, v10.8H, v11.8H // ....................................*.. + mls v29.8H, v3.8H, v7.H[0] // ...............................*....... + // gap // ....................................... + // gap // ....................................... + // gap // ....................................... + // gap // ....................................... + // gap // ....................................... + // gap // ....................................... + // gap // ....................................... + // gap // ....................................... + // gap // ....................................... + // gap // ....................................... + // gap // ....................................... + // gap // ....................................... + // gap // ....................................... + trn1 v8.4S, v0.4S, v11.4S // ......................................* + sqrdmulh v31.8H, v25.8H, v28.8H // .................................*..... + mul v26.8H, v25.8H, v24.8H // ..................................*.... + // gap // ....................................... + // gap // ....................................... + // gap // ....................................... + // gap // ....................................... + // gap // ....................................... + // gap // ....................................... 
+ // gap // ....................................... + // gap // ....................................... + // gap // ....................................... + // gap // ....................................... + // gap // ....................................... + // gap // ....................................... + // gap // ....................................... + // gap // ....................................... + // gap // ....................................... + // gap // ....................................... + // gap // ....................................... + // gap // ....................................... + // gap // ....................................... + // gap // ....................................... + // gap // ....................................... + mls v26.8H, v31.8H, v7.H[0] // .....................................*. + // gap // ....................................... + // gap // ....................................... + // gap // ....................................... + // gap // ....................................... + // gap // ....................................... + // gap // ....................................... + // gap // ....................................... + + // original source code + // ldr q23, [x1, #48] // ...*................................... + // ldr q3, [x1, #32] // ..*.................................... + // ldr q18, [x4], #(6*16) // ...................*................... + // ldr q1, [x1, #16] // *...................................... + // ldr q9, [x1, #0] // .*..................................... + // ldr q31, [x4, #-32] // .........*............................. + // ldr q5, [x4, #-16] // ....*.................................. + // trn2 v20.4S, v3.4S, v23.4S // ..........*............................ + // trn1 v30.4S, v3.4S, v23.4S // ...........*........................... + // ldr q13, [x4, #-80] // .....*................................. 
+ // trn1 v4.4S, v9.4S, v1.4S // .......*............................... + // trn2 v9.4S, v9.4S, v1.4S // ........*.............................. + // trn2 v3.2D, v4.2D, v30.2D // .............*......................... + // trn1 v24.2D, v4.2D, v30.2D // ...............*....................... + // trn2 v29.2D, v9.2D, v20.2D // ..............*........................ + // trn1 v27.2D, v9.2D, v20.2D // ............*.......................... + // ldr q4, [x4, #-64] // ................*...................... + // ldr q9, [x4, #-48] // ......*................................ + // sub v25.8H, v3.8H, v29.8H // ..................*.................... + // sub v30.8H, v24.8H, v27.8H // ....................*.................. + // add v19.8H, v3.8H, v29.8H // .................*..................... + // add v11.8H, v24.8H, v27.8H // .....................*................. + // mul v10.8H, v25.8H, v31.8H // .......................*............... + // sqrdmulh v8.8H, v25.8H, v5.8H // ........................*.............. + // mul v3.8H, v30.8H, v4.8H // .........................*............. + // sqrdmulh v4.8H, v30.8H, v9.8H // ......................*................ + // sub v5.8H, v11.8H, v19.8H // ...........................*........... + // mls v10.8H, v8.8H, v7.H[0] // ............................*.......... + // mul v29.8H, v5.8H, v18.8H // ..............................*........ + // sqrdmulh v21.8H, v5.8H, v13.8H // ...............................*....... + // mls v3.8H, v4.8H, v7.H[0] // .............................*......... + // mls v29.8H, v21.8H, v7.H[0] // ..................................*.... + // sub v4.8H, v3.8H, v10.8H // ................................*...... + // sqrdmulh v2.8H, v4.8H, v13.8H // ....................................*.. + // mul v26.8H, v4.8H, v18.8H // .....................................*. + // add v0.8H, v11.8H, v19.8H // ..........................*............ 
+        // add v11.8H, v3.8H, v10.8H // .................................*.....
+        // mls v26.8H, v2.8H, v7.H[0] // ......................................*
+        // trn1 v8.4S, v0.4S, v11.4S // ...................................*...
+
+        sub count, count, #1 // enter the loop with count reduced by one; NOTE(review): presumably because the code before and after the loop covers one iteration's worth of work - confirm
+layer4567_start: // loop head. Each pass post-increments x3 by 16, x4 by 6*16 and x1 by 64 bytes; the ldr from [x1, #64..#112] below execute before x1 is advanced, so they read the 64-byte block after the one stored by this pass
+        ldr q15, [x3], #16 // ..............................................*..............................
+        trn2 v2.4S, v0.4S, v11.4S // .......................................*.....................................
+        trn1 v19.4S, v29.4S, v26.4S // ........................................*....................................
+        ldr q23, [x1, #112] // ...e.........................................................................
+        // gap // .............................................................................
+        // gap // .............................................................................
+        ldr q3, [x1, #96] // ..e..........................................................................
+        trn2 v0.4S, v29.4S, v26.4S // .........................................*...................................
+        ldr q18, [x4], #(6*16) // ............e................................................................
+        ldr q1, [x1, #80] // .e...........................................................................
+        ldr q9, [x1, #64] // e............................................................................
+        // gap // .............................................................................
+        // gap // .............................................................................
+        // gap // .............................................................................
+        // gap // .............................................................................
+        // gap // .............................................................................
+        trn1 v21.2D, v2.2D, v0.2D // .............................................*...............................
+        trn2 v2.2D, v2.2D, v0.2D // ...........................................*.................................
+        trn2 v16.2D, v8.2D, v19.2D // ..........................................*..................................
+        trn1 v0.2D, v8.2D, v19.2D // ............................................*................................
+        ldr q31, [x4, #-32] // ................e............................................................
+        // gap // .............................................................................
+        // gap // .............................................................................
+        // gap // .............................................................................
+        ldr q5, [x4, #-16] // .................e...........................................................
+        // gap // .............................................................................
+        // gap // .............................................................................
+        // gap // .............................................................................
+        // gap // .............................................................................
+        // gap // .............................................................................
+        // gap // .............................................................................
+        // gap // .............................................................................
+        trn2 v20.4S, v3.4S, v23.4S // .......e.....................................................................
+        trn1 v30.4S, v3.4S, v23.4S // ......e......................................................................
+        add v17.8H, v0.8H, v21.8H // ................................................*............................
+        add v26.8H, v16.8H, v2.8H // .....................................................*.......................
+        ldr q13, [x4, #-80] // .............e...............................................................
+        // gap // .............................................................................
+        // gap // .............................................................................
+        // gap // .............................................................................
+        trn1 v4.4S, v9.4S, v1.4S // ....e........................................................................
+        trn2 v9.4S, v9.4S, v1.4S // .....e.......................................................................
+        sub v21.8H, v0.8H, v21.8H // ...............................................*.............................
+        // gap // .............................................................................
+        // gap // .............................................................................
+        // gap // .............................................................................
+        // gap // .............................................................................
+        // gap // .............................................................................
+        sub v14.8H, v16.8H, v2.8H // ....................................................*........................
+        sqdmulh v2.8H, v17.8H, v7.H[1] // .........................................................*...................
+        sqdmulh v0.8H, v26.8H, v7.H[1] // ............................................................*................
+        // gap // .............................................................................
+        // gap // .............................................................................
+        // gap // .............................................................................
+        // gap // .............................................................................
+        // gap // .............................................................................
+        trn2 v3.2D, v4.2D, v30.2D // ........e....................................................................
+        trn1 v24.2D, v4.2D, v30.2D // ..........e..................................................................
+        trn2 v29.2D, v9.2D, v20.2D // .........e...................................................................
+        trn1 v27.2D, v9.2D, v20.2D // ...........e.................................................................
+        ldr q4, [x4, #-64] // ..............e..............................................................
+        ldr q9, [x4, #-48] // ...............e.............................................................
+        // gap // .............................................................................
+        // gap // .............................................................................
+        sqrdmulh v16.8H, v14.8H, v15.H[5] // .......................................................*.....................
+        mul v20.8H, v21.8H, v15.H[2] // .................................................*...........................
+        sqrdmulh v21.8H, v21.8H, v15.H[3] // ..................................................*..........................
+        // gap // .............................................................................
+        // gap // .............................................................................
+        // gap // .............................................................................
+        // gap // .............................................................................
+        // gap // .............................................................................
+        srshr v2.8H, v2.8H, #11 // ..........................................................*..................
+        srshr v0.8H, v0.8H, #11 // .............................................................*...............
+        sub v25.8H, v3.8H, v29.8H // .......................e.....................................................
+        sub v30.8H, v24.8H, v27.8H // ..................e..........................................................
+        // gap // .............................................................................
+        // gap // .............................................................................
+        // gap // .............................................................................
+        // gap // .............................................................................
+        mul v23.8H, v14.8H, v15.H[4] // ......................................................*......................
+        add v19.8H, v3.8H, v29.8H // ........................e....................................................
+        add v11.8H, v24.8H, v27.8H // ...................e.........................................................
+        // gap // .............................................................................
+        // gap // .............................................................................
+        // gap // .............................................................................
+        // gap // .............................................................................
+        // gap // .............................................................................
+        mul v10.8H, v25.8H, v31.8H // .........................e...................................................
+        sqrdmulh v8.8H, v25.8H, v5.8H // ..........................e..................................................
+        mul v3.8H, v30.8H, v4.8H // ....................e........................................................
+        sqrdmulh v4.8H, v30.8H, v9.8H // .....................e.......................................................
+        // gap // .............................................................................
+        // gap // .............................................................................
+        // gap // .............................................................................
+        // gap // .............................................................................
+        sub v5.8H, v11.8H, v19.8H // ............................e................................................
+        mls v26.8H, v0.8H, v7.H[0] // ..............................................................*..............
+        mls v17.8H, v2.8H, v7.H[0] // ...........................................................*.................
+        // gap // .............................................................................
+        // gap // .............................................................................
+        // gap // .............................................................................
+        // gap // .............................................................................
+        // gap // .............................................................................
+        mls v23.8H, v16.8H, v7.H[0] // ........................................................*....................
+        mls v20.8H, v21.8H, v7.H[0] // ...................................................*.........................
+        // gap // .............................................................................
+        // gap // .............................................................................
+        // gap // .............................................................................
+        // gap // .............................................................................
+        // gap // .............................................................................
+        // gap // .............................................................................
+        mls v10.8H, v8.8H, v7.H[0] // ...........................e.................................................
+        mul v29.8H, v5.8H, v18.8H // ..............................e..............................................
+        sqrdmulh v21.8H, v5.8H, v13.8H // ...............................e.............................................
+        mls v3.8H, v4.8H, v7.H[0] // ......................e......................................................
+        // gap // .............................................................................
+        // gap // .............................................................................
+        // gap // .............................................................................
+        // gap // .............................................................................
+        add v2.8H, v17.8H, v26.8H // ................................................................*............
+        sub v16.8H, v17.8H, v26.8H // ...............................................................*.............
+        // gap // .............................................................................
+        // gap // .............................................................................
+        // gap // .............................................................................
+        // gap // .............................................................................
+        // gap // .............................................................................
+        // gap // .............................................................................
+        sub v0.8H, v20.8H, v23.8H // ....................................................................*........
+        add v9.8H, v20.8H, v23.8H // .....................................................................*.......
+        // gap // .............................................................................
+        // gap // .............................................................................
+        // gap // .............................................................................
+        // gap // .............................................................................
+        // gap // .............................................................................
+        // gap // .............................................................................
+        mls v29.8H, v21.8H, v7.H[0] // ................................e............................................
+        str q2, [x1], #(64) // .........................................................................*...
+        mul v21.8H, v16.8H, v15.H[0] // .................................................................*...........
+        sqrdmulh v16.8H, v16.8H, v15.H[1] // ..................................................................*..........
+        sub v4.8H, v3.8H, v10.8H // .................................e...........................................
+        // gap // .............................................................................
+        // gap // .............................................................................
+        // gap // .............................................................................
+        mul v31.8H, v0.8H, v15.H[0] // ......................................................................*......
+        sqrdmulh v0.8H, v0.8H, v15.H[1] // .......................................................................*.....
+        str q9, [x1, #-48] // ..........................................................................*..
+        // gap // .............................................................................
+        // gap // .............................................................................
+        // gap // .............................................................................
+        // gap // .............................................................................
+        // gap // .............................................................................
+        sqrdmulh v2.8H, v4.8H, v13.8H // ....................................e........................................
+        mul v26.8H, v4.8H, v18.8H // ...................................e.........................................
+        // gap // .............................................................................
+        // gap // .............................................................................
+        // gap // .............................................................................
+        // gap // .............................................................................
+        // gap // .............................................................................
+        // gap // .............................................................................
+        mls v21.8H, v16.8H, v7.H[0] // ...................................................................*.........
+        // gap // .............................................................................
+        // gap // .............................................................................
+        // gap // .............................................................................
+        // gap // .............................................................................
+        // gap // .............................................................................
+        // gap // .............................................................................
+        // gap // .............................................................................
+        mls v31.8H, v0.8H, v7.H[0] // ........................................................................*....
+        // gap // .............................................................................
+        // gap // .............................................................................
+        // gap // .............................................................................
+        // gap // .............................................................................
+        // gap // .............................................................................
+        // gap // .............................................................................
+        // gap // .............................................................................
+        add v0.8H, v11.8H, v19.8H // .............................e...............................................
+        add v11.8H, v3.8H, v10.8H // ..................................e..........................................
+        mls v26.8H, v2.8H, v7.H[0] // .....................................e.......................................
+        // gap // .............................................................................
+        // gap // .............................................................................
+        // gap // .............................................................................
+        // gap // .............................................................................
+        // gap // .............................................................................
+        str q21, [x1, #-32] // ...........................................................................*.
+        // gap // .............................................................................
+        // gap // .............................................................................
+        // gap // .............................................................................
+        // gap // .............................................................................
+        // gap // .............................................................................
+        // gap // .............................................................................
+        // gap // .............................................................................
+        str q31, [x1, #-16] // ............................................................................*
+        trn1 v8.4S, v0.4S, v11.4S // ......................................e......................................
+        // gap // .............................................................................
+        // gap // .............................................................................
+        // gap // .............................................................................
+        // gap // .............................................................................
+        // gap // .............................................................................
+        // gap // .............................................................................
+
+        // original source code (the loop body as written before reordering; NOTE(review): the dot columns presumably give each instruction's slot in the schedule above, with 'e' marking instructions issued early for the following iteration - confirm against the generating tool's documentation)
+        // ldr q8, [x1, #(16*0)] // .....e....................................................................|.......e...................................................................
+        // ldr q9, [x1, #(16*1)] // ....e.....................................................................|......e....................................................................
+        // ldr q10, [x1, #(16*2)] // .e........................................................................|...e.......................................................................
+        // ldr q11, [x1, #(16*3)] // e.........................................................................|..e........................................................................
+        // trn1 v25.4s, v8.4s, v9.4s // .................e........................................................|...................e.......................................................
+        // trn2 v26.4s, v8.4s, v9.4s // ..................e.......................................................|....................e......................................................
+        // trn1 v27.4s, v10.4s, v11.4s // .............e............................................................|...............e...........................................................
+        // trn2 v28.4s, v10.4s, v11.4s // ............e.............................................................|..............e............................................................
+        // trn2 v10.2d, v25.2d, v27.2d // .......................e..................................................|.........................e.................................................
+        // trn2 v11.2d, v26.2d, v28.2d // .........................e................................................|...........................e...............................................
+        // trn1 v8.2d, v25.2d, v27.2d // ........................e.................................................|..........................e................................................
+        // trn1 v9.2d, v26.2d, v28.2d // ..........................e...............................................|............................e..............................................
+        // ldr q0, [x4], #(6*16) // ...e......................................................................|.....e.....................................................................
+        // ldr q4, [x4, #(-6*16 + 1*16)] // ................e.........................................................|..................e........................................................
+        // ldr q1, [x4, #(-6*16 + 2*16)] // ...........................e..............................................|.............................e.............................................
+        // ldr q5, [x4, #(-6*16 + 3*16)] // ............................e.............................................|..............................e............................................
+        // ldr q2, [x4, #(-6*16 + 4*16)] // ..........e...............................................................|............e..............................................................
+        // ldr q6, [x4, #(-6*16 + 5*16)] // ...........e..............................................................|.............e.............................................................
+        // sub v24.8h, v8.8h, v9.8h // ...................................e......................................|.....................................e.....................................
+        // add v8.8h, v8.8h, v9.8h // ......................................e...................................|........................................e..................................
+        // mul v9.8h, v24.8h, v1.8h // .........................................e................................|...........................................e...............................
+        // sqrdmulh v24.8h, v24.8h, v5.8h // ..........................................e...............................|............................................e..............................
+        // mls v9.8h, v24.8h, v7.h[0] // ...................................................e......................|.....................................................e.....................
+        // sub v24.8h, v10.8h, v11.8h // ..................................e.......................................|....................................e......................................
+        // add v10.8h, v10.8h, v11.8h // .....................................e....................................|.......................................e...................................
+        // mul v11.8h, v24.8h, v2.8h // .......................................e..................................|.........................................e.................................
+        // sqrdmulh v24.8h, v24.8h, v6.8h // ........................................e.................................|..........................................e................................
+        // mls v11.8h, v24.8h, v7.h[0] // ................................................e.........................|..................................................e........................
+        // sub v24.8h, v8.8h, v10.8h // ...........................................e..............................|.............................................e.............................
+        // add v8.8h, v8.8h, v10.8h // ....................................................................e.....|......................................................................e....
+        // mul v10.8h, v24.8h, v0.8h // .................................................e........................|...................................................e.......................
+        // sqrdmulh v24.8h, v24.8h, v4.8h // ..................................................e.......................|....................................................e......................
+        // mls v10.8h, v24.8h, v7.h[0] // ........................................................e.................|..........................................................e................
+        // sub v24.8h, v9.8h, v11.8h // ............................................................e.............|..............................................................e............
+        // add v9.8h, v9.8h, v11.8h // .....................................................................e....|.......................................................................e...
+        // mul v11.8h, v24.8h, v0.8h // .................................................................e........|...................................................................e.......
+        // sqrdmulh v24.8h, v24.8h, v4.8h // ................................................................e.........|..................................................................e........
+        // mls v11.8h, v24.8h, v7.h[0] // ......................................................................e...|........................................................................e..
+        // trn1 v25.4s, v8.4s, v9.4s // .........................................................................e|...........................................................................
+        // trn2 v26.4s, v8.4s, v9.4s // ..........................................................................|*..........................................................................
+        // trn1 v27.4s, v10.4s, v11.4s // ..........................................................................|.*.........................................................................
+        // trn2 v28.4s, v10.4s, v11.4s // ..*.......................................................................|....*......................................................................
+        // trn2 v10.2d, v25.2d, v27.2d // ........*.................................................................|..........*................................................................
+        // trn2 v11.2d, v26.2d, v28.2d // .......*..................................................................|.........*.................................................................
+        // trn1 v8.2d, v25.2d, v27.2d // .........*................................................................|...........*...............................................................
+        // trn1 v9.2d, v26.2d, v28.2d // ......*...................................................................|........*..................................................................
+        // ldr q0, [x3], #16 // ..........................................................................*...........................................................................
+        // sub v24.8h, v8.8h, v9.8h // ...................*......................................................|.....................*.....................................................
+        // add v8.8h, v8.8h, v9.8h // ..............*...........................................................|................*..........................................................
+        // mul v9.8h, v24.8h, v0.h[2] // ..............................*...........................................|................................*..........................................
+        // sqrdmulh v24.8h, v24.8h, v0.h[3] // ...............................*..........................................|.................................*.........................................
+        // mls v9.8h, v24.8h, v7.h[0] // ...............................................*..........................|.................................................*.........................
+        // sub v24.8h, v10.8h, v11.8h // ....................*.....................................................|......................*....................................................
+        // add v10.8h, v10.8h, v11.8h // ...............*..........................................................|.................*.........................................................
+        // mul v11.8h, v24.8h, v0.h[4] // ....................................*.....................................|......................................*....................................
+        // sqrdmulh v24.8h, v24.8h, v0.h[5] // .............................*............................................|...............................*...........................................
+        // mls v11.8h, v24.8h, v7.h[0] // ..............................................*...........................|................................................*..........................
+        // sqdmulh v25.8h, v8.8h, v7.h[1] // .....................*....................................................|.......................*...................................................
+        // srshr v25.8h, v25.8h, #11 // ................................*.........................................|..................................*........................................
+        // mls v8.8h, v25.8h, v7.h[0] // .............................................*............................|...............................................*...........................
+        // sqdmulh v25.8h, v10.8h, v7.h[1] // ......................*...................................................|........................*..................................................
+        // srshr v25.8h, v25.8h, #11 // .................................*........................................|...................................*.......................................
+        // mls v10.8h, v25.8h, v7.h[0] // ............................................*.............................|..............................................*............................
+        // sub v24.8h, v8.8h, v10.8h // .....................................................*....................|.......................................................*...................
+        // add v8.8h, v8.8h, v10.8h // ....................................................*.....................|......................................................*....................
+        // mul v10.8h, v24.8h, v0.h[0] // ..........................................................*...............|............................................................*..............
+        // sqrdmulh v24.8h, v24.8h, v0.h[1] // ...........................................................*..............|.............................................................*.............
+        // mls v10.8h, v24.8h, v7.h[0] // ..................................................................*.......|....................................................................*......
+        // sub v24.8h, v9.8h, v11.8h // ......................................................*...................|........................................................*..................
+        // add v9.8h, v9.8h, v11.8h // .......................................................*..................|.........................................................*.................
+        // mul v11.8h, v24.8h, v0.h[0] // .............................................................*............|...............................................................*...........
+        // sqrdmulh v24.8h, v24.8h, v0.h[1] // ..............................................................*...........|................................................................*..........
+        // mls v11.8h, v24.8h, v7.h[0] // ...................................................................*......|.....................................................................*.....
+        // str q8, [x1], #(64) // .........................................................*................|...........................................................*...............
+        // str q9, [x1, #(-64 + 16*1)] // ...............................................................*..........|.................................................................*.........
+        // str q10, [x1, #(-64 + 16*2)] // .......................................................................*..|.........................................................................*.
+        // str q11, [x1, #(-64 + 16*3)] // ........................................................................*.|..........................................................................*
+
+        sub count, count, #1 // one loop pass completed
+        cbnz count, layer4567_start // branch back while count is non-zero; otherwise fall through to the instructions below
+        ldr q12, [x3], #16 // *.....................................
+        trn2 v22.4S, v0.4S, v11.4S // .*....................................
+        trn1 v20.4S, v29.4S, v26.4S // ..*...................................
+        trn2 v18.4S, v29.4S, v26.4S // ...*..................................
+        // gap // ......................................
+        // gap // ......................................
+        // gap // ......................................
+        // gap // ......................................
+        // gap // ......................................
+        // gap // ......................................
+        // gap // ......................................
+ // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + trn1 v14.2D, v22.2D, v18.2D // ....*................................. + trn2 v2.2D, v22.2D, v18.2D // .....*................................ + trn1 v9.2D, v8.2D, v20.2D // .......*.............................. + trn2 v1.2D, v8.2D, v20.2D // ......*............................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + add v18.8H, v9.8H, v14.8H // ........*............................. + add v28.8H, v1.8H, v2.8H // .........*............................ + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + sub v17.8H, v9.8H, v14.8H // ..........*........................... + sub v20.8H, v1.8H, v2.8H // ...........*.......................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... 
+ // gap // ...................................... + sqdmulh v5.8H, v18.8H, v7.H[1] // ............*......................... + sqdmulh v26.8H, v28.8H, v7.H[1] // .............*........................ + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + sqrdmulh v15.8H, v20.8H, v12.H[5] // ..............*....................... + mul v13.8H, v17.8H, v12.H[2] // ...............*...................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + mul v23.8H, v20.8H, v12.H[4] // ...................*.................. + sqrdmulh v20.8H, v17.8H, v12.H[3] // ................*..................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + srshr v25.8H, v5.8H, #11 // .................*.................... + srshr v2.8H, v26.8H, #11 // ..................*................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... 
+ // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + mls v23.8H, v15.8H, v7.H[0] // ......................*............... + mls v13.8H, v20.8H, v7.H[0] // .......................*.............. + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + mls v18.8H, v25.8H, v7.H[0] // .....................*................ + // gap // ...................................... + mls v28.8H, v2.8H, v7.H[0] // ....................*................. + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + sub v2.8H, v13.8H, v23.8H // ..........................*........... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... 
+ // gap // ...................................... + sub v8.8H, v18.8H, v28.8H // .........................*............ + add v20.8H, v13.8H, v23.8H // ...........................*.......... + add v14.8H, v18.8H, v28.8H // ........................*............. + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + sqrdmulh v25.8H, v2.8H, v12.H[1] // ................................*..... + mul v2.8H, v2.8H, v12.H[0] // ...............................*...... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + sqrdmulh v21.8H, v8.8H, v12.H[1] // ..............................*....... + mul v8.8H, v8.8H, v12.H[0] // .............................*........ + str q14, [x1], #(64) // ............................*......... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + str q20, [x1, #-48] // .................................*.... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + mls v2.8H, v25.8H, v7.H[0] // ...................................*.. + // gap // ...................................... 
+ // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + mls v8.8H, v21.8H, v7.H[0] // ..................................*... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + str q2, [x1, #-16] // .....................................* + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + str q8, [x1, #-32] // ....................................*. + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... 
+ + // original source code + // ldr q15, [x3], #16 // *..................................... + // trn2 v2.4S, v0.4S, v11.4S // .*.................................... + // trn1 v19.4S, v29.4S, v26.4S // ..*................................... + // trn2 v0.4S, v29.4S, v26.4S // ...*.................................. + // trn1 v21.2D, v2.2D, v0.2D // ....*................................. + // trn2 v2.2D, v2.2D, v0.2D // .....*................................ + // trn2 v16.2D, v8.2D, v19.2D // .......*.............................. + // trn1 v0.2D, v8.2D, v19.2D // ......*............................... + // add v17.8H, v0.8H, v21.8H // ........*............................. + // add v26.8H, v16.8H, v2.8H // .........*............................ + // sub v21.8H, v0.8H, v21.8H // ..........*........................... + // sub v14.8H, v16.8H, v2.8H // ...........*.......................... + // sqdmulh v2.8H, v17.8H, v7.H[1] // ............*......................... + // sqdmulh v0.8H, v26.8H, v7.H[1] // .............*........................ + // sqrdmulh v16.8H, v14.8H, v15.H[5] // ..............*....................... + // mul v20.8H, v21.8H, v15.H[2] // ...............*...................... + // sqrdmulh v21.8H, v21.8H, v15.H[3] // .................*.................... + // srshr v2.8H, v2.8H, #11 // ..................*................... + // srshr v0.8H, v0.8H, #11 // ...................*.................. + // mul v23.8H, v14.8H, v15.H[4] // ................*..................... + // mls v26.8H, v0.8H, v7.H[0] // .......................*.............. + // mls v17.8H, v2.8H, v7.H[0] // ......................*............... + // mls v23.8H, v16.8H, v7.H[0] // ....................*................. + // mls v20.8H, v21.8H, v7.H[0] // .....................*................ + // add v2.8H, v17.8H, v26.8H // ...........................*.......... + // sub v16.8H, v17.8H, v26.8H // .........................*............ 
+ // sub v0.8H, v20.8H, v23.8H // ........................*............. + // add v9.8H, v20.8H, v23.8H // ..........................*........... + // str q2, [x1], #(64) // ................................*..... + // mul v21.8H, v16.8H, v15.H[0] // ...............................*...... + // sqrdmulh v16.8H, v16.8H, v15.H[1] // ..............................*....... + // mul v31.8H, v0.8H, v15.H[0] // .............................*........ + // sqrdmulh v0.8H, v0.8H, v15.H[1] // ............................*......... + // str q9, [x1, #-48] // .................................*.... + // mls v21.8H, v16.8H, v7.H[0] // ...................................*.. + // mls v31.8H, v0.8H, v7.H[0] // ..................................*... + // str q21, [x1, #-32] // .....................................* + // str q31, [x1, #-16] // ....................................*. + + + // --------------------------------------------------------------------- + + ninv .req v29 + ninv_tw .req v30 + + ASM_LOAD(xtmp, ninv_addr) + ld1r {ninv.8h}, [xtmp] + ASM_LOAD(xtmp, ninv_tw_addr) + ld1r {ninv_tw.8h}, [xtmp] + + mov count, #4 + ASM_LOAD(r_ptr0, roots_l012) + load_roots_123 + + .p2align 2 + + ldr q16, [x0, #192] // ..*........................................................ + ldr q2, [x0, #128] // .*......................................................... + ldr q23, [x0, #448] // *.......................................................... + // gap // ........................................................... + // gap // ........................................................... + // gap // ........................................................... + // gap // ........................................................... + // gap // ........................................................... + ldr q21, [x0, #384] // ...*....................................................... + ldr q26, [x0, #320] // ....*...................................................... 
+ ldr q20, [x0, #256] // .....*..................................................... + // gap // ........................................................... + // gap // ........................................................... + // gap // ........................................................... + // gap // ........................................................... + // gap // ........................................................... + ldr q17, [x0, #64] // ......*.................................................... + ldr q4, [x0, #0] // .......*................................................... + // gap // ........................................................... + // gap // ........................................................... + // gap // ........................................................... + // gap // ........................................................... + // gap // ........................................................... + // gap // ........................................................... + // gap // ........................................................... + // gap // ........................................................... + // gap // ........................................................... + // gap // ........................................................... + // gap // ........................................................... + // gap // ........................................................... + // gap // ........................................................... + // gap // ........................................................... + sub v25.8H, v2.8H, v16.8H // ........*.................................................. + add v2.8H, v2.8H, v16.8H // ..........*................................................ + // gap // ........................................................... + // gap // ........................................................... 
+ // gap // ........................................................... + // gap // ........................................................... + // gap // ........................................................... + // gap // ........................................................... + sub v16.8H, v21.8H, v23.8H // ...........*............................................... + add v23.8H, v21.8H, v23.8H // ............*.............................................. + add v21.8H, v20.8H, v26.8H // .........*................................................. + sub v26.8H, v20.8H, v26.8H // ...............*........................................... + // gap // ........................................................... + // gap // ........................................................... + // gap // ........................................................... + // gap // ........................................................... + add v20.8H, v4.8H, v17.8H // ...................*....................................... + sub v17.8H, v4.8H, v17.8H // .......................*................................... + sqrdmulh v4.8H, v25.8H, v1.H[1] // .............*............................................. + mul v25.8H, v25.8H, v1.H[0] // ..............*............................................ + // gap // ........................................................... + // gap // ........................................................... + // gap // ........................................................... + // gap // ........................................................... + sub v13.8H, v21.8H, v23.8H // ....................*...................................... + sqrdmulh v11.8H, v16.8H, v1.H[5] // ................*.......................................... + mul v16.8H, v16.8H, v1.H[4] // .................*......................................... + add v6.8H, v21.8H, v23.8H // ..................*........................................ 
+ // gap // ........................................................... + // gap // ........................................................... + // gap // ........................................................... + // gap // ........................................................... + sub v23.8H, v20.8H, v2.8H // ..........................*................................ + add v28.8H, v20.8H, v2.8H // .........................*................................. + mul v2.8H, v26.8H, v1.H[2] // .....................*..................................... + sqrdmulh v21.8H, v26.8H, v1.H[3] // ......................*.................................... + // gap // ........................................................... + // gap // ........................................................... + // gap // ........................................................... + // gap // ........................................................... + sqrdmulh v26.8H, v17.8H, v0.H[7] // ............................*.............................. + mul v20.8H, v17.8H, v0.H[6] // .............................*............................. + sqdmulh v17.8H, v6.8H, v7.H[1] // ........................*.................................. + mul v3.8H, v13.8H, v0.H[4] // ...........................*............................... + // gap // ........................................................... + // gap // ........................................................... + // gap // ........................................................... + // gap // ........................................................... + sqrdmulh v13.8H, v13.8H, v0.H[5] // .................................*......................... + sqrdmulh v10.8H, v23.8H, v0.H[3] // ..............................*............................ + mul v23.8H, v23.8H, v0.H[2] // ...............................*........................... 
+ sqdmulh v19.8H, v28.8H, v7.H[1] // ...................................*....................... + // gap // ........................................................... + // gap // ........................................................... + // gap // ........................................................... + // gap // ........................................................... + mls v25.8H, v4.8H, v7.H[0] // .....................................*..................... + mls v16.8H, v11.8H, v7.H[0] // ....................................*...................... + mls v2.8H, v21.8H, v7.H[0] // ................................*.......................... + // gap // ........................................................... + // gap // ........................................................... + // gap // ........................................................... + // gap // ........................................................... + // gap // ........................................................... + mls v20.8H, v26.8H, v7.H[0] // ......................................*.................... + srshr v21.8H, v17.8H, #11 // ..................................*........................ + // gap // ........................................................... + // gap // ........................................................... + // gap // ........................................................... + // gap // ........................................................... + // gap // ........................................................... + // gap // ........................................................... + mls v3.8H, v13.8H, v7.H[0] // .......................................*................... + mls v23.8H, v10.8H, v7.H[0] // ........................................*.................. + srshr v26.8H, v19.8H, #11 // ..........................................*................ + // gap // ........................................................... 
+ // gap // ........................................................... + // gap // ........................................................... + // gap // ........................................................... + // gap // ........................................................... + sub v17.8H, v2.8H, v16.8H // ...........................................*............... + add v19.8H, v2.8H, v16.8H // .............................................*............. + // gap // ........................................................... + // gap // ........................................................... + // gap // ........................................................... + // gap // ........................................................... + // gap // ........................................................... + // gap // ........................................................... + mls v6.8H, v21.8H, v7.H[0] // .........................................*................. + sub v2.8H, v20.8H, v25.8H // ..............................................*............ + add v27.8H, v20.8H, v25.8H // ............................................*.............. + // gap // ........................................................... + // gap // ........................................................... + // gap // ........................................................... + // gap // ........................................................... + // gap // ........................................................... + sub v16.8H, v23.8H, v3.8H // .................................................*......... + add v23.8H, v23.8H, v3.8H // ..................................................*........ + mul v25.8H, v17.8H, v0.H[4] // ...............................................*........... + sqrdmulh v3.8H, v17.8H, v0.H[5] // ................................................*.......... + // gap // ........................................................... 
+ // gap // ........................................................... + // gap // ........................................................... + // gap // ........................................................... + mls v28.8H, v26.8H, v7.H[0] // ....................................................*...... + sqrdmulh v4.8H, v2.8H, v0.H[3] // .....................................................*..... + mul v21.8H, v2.8H, v0.H[2] // ......................................................*.... + sub v13.8H, v27.8H, v19.8H // ...................................................*....... + // gap // ........................................................... + // gap // ........................................................... + // gap // ........................................................... + // gap // ........................................................... + mul v20.8H, v16.8H, v0.H[0] // .......................................................*... + sqrdmulh v16.8H, v16.8H, v0.H[1] // ........................................................*.. + mul v2.8H, v23.8H, v29.8H // .........................................................*. + sqrdmulh v17.8H, v23.8H, v30.8H // ..........................................................* + // gap // ........................................................... + // gap // ........................................................... + // gap // ........................................................... + // gap // ........................................................... + + // original source code + // ldr q18, [x0, #448] // ..*........................................................ + // ldr q5, [x0, #128] // .*......................................................... + // ldr q31, [x0, #192] // *.......................................................... + // ldr q19, [x0, #384] // ...*....................................................... 
+ // ldr q6, [x0, #320] // ....*...................................................... + // ldr q24, [x0, #256] // .....*..................................................... + // ldr q11, [x0, #64] // ......*.................................................... + // ldr q12, [x0, #0] // .......*................................................... + // sub v8.8H, v5.8H, v31.8H // ........*.................................................. + // add v28.8H, v24.8H, v6.8H // ............*.............................................. + // add v22.8H, v5.8H, v31.8H // .........*................................................. + // sub v13.8H, v19.8H, v18.8H // ..........*................................................ + // add v23.8H, v19.8H, v18.8H // ...........*............................................... + // sqrdmulh v19.8H, v8.8H, v1.H[1] // ................*.......................................... + // mul v8.8H, v8.8H, v1.H[0] // .................*......................................... + // sub v27.8H, v24.8H, v6.8H // .............*............................................. + // sqrdmulh v20.8H, v13.8H, v1.H[5] // ...................*....................................... + // mul v5.8H, v13.8H, v1.H[4] // ....................*...................................... + // add v6.8H, v28.8H, v23.8H // .....................*..................................... + // add v24.8H, v12.8H, v11.8H // ..............*............................................ + // sub v18.8H, v28.8H, v23.8H // ..................*........................................ + // mul v9.8H, v27.8H, v1.H[2] // ........................*.................................. + // sqrdmulh v15.8H, v27.8H, v1.H[3] // .........................*................................. + // sub v11.8H, v12.8H, v11.8H // ...............*........................................... + // sqdmulh v31.8H, v6.8H, v7.H[1] // ............................*.............................. 
+ // add v28.8H, v24.8H, v22.8H // .......................*................................... + // sub v22.8H, v24.8H, v22.8H // ......................*.................................... + // mul v13.8H, v18.8H, v0.H[4] // .............................*............................. + // sqrdmulh v25.8H, v11.8H, v0.H[7] // ..........................*................................ + // mul v12.8H, v11.8H, v0.H[6] // ...........................*............................... + // sqrdmulh v10.8H, v22.8H, v0.H[3] // ...............................*........................... + // mul v24.8H, v22.8H, v0.H[2] // ................................*.......................... + // mls v9.8H, v15.8H, v7.H[0] // ....................................*...................... + // sqrdmulh v21.8H, v18.8H, v0.H[5] // ..............................*............................ + // srshr v23.8H, v31.8H, #11 // ......................................*.................... + // sqdmulh v18.8H, v28.8H, v7.H[1] // .................................*......................... + // mls v5.8H, v20.8H, v7.H[0] // ...................................*....................... + // mls v8.8H, v19.8H, v7.H[0] // ..................................*........................ + // mls v12.8H, v25.8H, v7.H[0] // .....................................*..................... + // mls v13.8H, v21.8H, v7.H[0] // .......................................*................... + // mls v24.8H, v10.8H, v7.H[0] // ........................................*.................. + // mls v6.8H, v23.8H, v7.H[0] // ............................................*.............. + // srshr v21.8H, v18.8H, #11 // .........................................*................. + // sub v20.8H, v9.8H, v5.8H // ..........................................*................ + // add v27.8H, v12.8H, v8.8H // ..............................................*............ 
+ // add v19.8H, v9.8H, v5.8H // ...........................................*............... + // sub v26.8H, v12.8H, v8.8H // .............................................*............. + // mul v25.8H, v20.8H, v0.H[4] // .................................................*......... + // sqrdmulh v3.8H, v20.8H, v0.H[5] // ..................................................*........ + // sub v2.8H, v24.8H, v13.8H // ...............................................*........... + // add v17.8H, v24.8H, v13.8H // ................................................*.......... + // sub v13.8H, v27.8H, v19.8H // ......................................................*.... + // mls v28.8H, v21.8H, v7.H[0] // ...................................................*....... + // sqrdmulh v4.8H, v26.8H, v0.H[3] // ....................................................*...... + // mul v21.8H, v26.8H, v0.H[2] // .....................................................*..... + // mul v20.8H, v2.8H, v0.H[0] // .......................................................*... + // sqrdmulh v16.8H, v2.8H, v0.H[1] // ........................................................*.. + // mul v2.8H, v17.8H, v29.8H // .........................................................*. + // sqrdmulh v17.8H, v17.8H, v30.8H // ..........................................................* + + sub count, count, #1 +layer123_start: + ldr q18, [x0, #464] // .......e...................................................................................... + // gap // .............................................................................................. + // gap // .............................................................................................. + add v26.8H, v27.8H, v19.8H // ............................................................*................................. + sqrdmulh v23.8H, v13.8H, v0.H[1] // ..............................................................*............................... 
+ ldr q5, [x0, #144] // ..e........................................................................................... + ldr q31, [x0, #208] // ...e.......................................................................................... + mul v27.8H, v13.8H, v0.H[0] // .............................................................*................................ + add v10.8H, v28.8H, v6.8H // .......................................................*...................................... + sub v13.8H, v28.8H, v6.8H // ......................................................*....................................... + ldr q19, [x0, #400] // ......e....................................................................................... + ldr q6, [x0, #336] // .....e........................................................................................ + // gap // .............................................................................................. + ldr q24, [x0, #272] // ....e......................................................................................... + mls v25.8H, v3.8H, v7.H[0] // ...............................................*.............................................. + mls v21.8H, v4.8H, v7.H[0] // .....................................*........................................................ + mul v14.8H, v26.8H, v29.8H // .................................................................................*............ + sqrdmulh v3.8H, v26.8H, v30.8H // ..................................................................................*........... + ldr q11, [x0, #80] // .e............................................................................................ + ldr q12, [x0, #16] // e............................................................................................. + // gap // .............................................................................................. 
+ // gap // .............................................................................................. + mls v20.8H, v16.8H, v7.H[0] // ....................................................................*......................... + mls v2.8H, v17.8H, v7.H[0] // ......................................................................................*....... + mul v4.8H, v10.8H, v29.8H // ..............................................................................*............... + // gap // .............................................................................................. + mul v26.8H, v13.8H, v0.H[0] // ........................................................*..................................... + // gap // .............................................................................................. + // gap // .............................................................................................. + // gap // .............................................................................................. + // gap // .............................................................................................. + mls v27.8H, v23.8H, v7.H[0] // ...............................................................*.............................. + sqrdmulh v17.8H, v10.8H, v30.8H // ...............................................................................*.............. + sub v8.8H, v5.8H, v31.8H // .............e................................................................................ + sqrdmulh v10.8H, v13.8H, v0.H[1] // .........................................................*.................................... + // gap // .............................................................................................. + // gap // .............................................................................................. + // gap // .............................................................................................. 
+ // gap // .............................................................................................. + add v16.8H, v21.8H, v25.8H // ......................................................................*....................... + str q20, [x0, #384] // ............................................................................*................. + str q2, [x0, #128] // ............................................................................................*. + add v28.8H, v24.8H, v6.8H // ...................e.......................................................................... + add v22.8H, v5.8H, v31.8H // ..............e............................................................................... + // gap // .............................................................................................. + // gap // .............................................................................................. + sub v13.8H, v19.8H, v18.8H // .......................e...................................................................... + add v23.8H, v19.8H, v18.8H // ........................e..................................................................... + sqrdmulh v19.8H, v8.8H, v1.H[1] // ................e............................................................................. + mul v8.8H, v8.8H, v1.H[0] // ...............e.............................................................................. + // gap // .............................................................................................. + str q27, [x0, #320] // ...........................................................................*.................. + // gap // .............................................................................................. + // gap // .............................................................................................. 
+ sub v27.8H, v24.8H, v6.8H // ..................e........................................................................... + sqrdmulh v2.8H, v16.8H, v30.8H // ........................................................................................*..... + sqrdmulh v20.8H, v13.8H, v1.H[5] // ..........................e................................................................... + mul v5.8H, v13.8H, v1.H[4] // .........................e.................................................................... + add v6.8H, v28.8H, v23.8H // .......................................e...................................................... + // gap // .............................................................................................. + // gap // .............................................................................................. + // gap // .............................................................................................. + // gap // .............................................................................................. + add v24.8H, v12.8H, v11.8H // .........e.................................................................................... + // gap // .............................................................................................. + sub v18.8H, v28.8H, v23.8H // ......................................e....................................................... + // gap // .............................................................................................. + // gap // .............................................................................................. + // gap // .............................................................................................. + mul v9.8H, v27.8H, v1.H[2] // ....................e......................................................................... 
+ sqrdmulh v15.8H, v27.8H, v1.H[3] // .....................e........................................................................ + sub v11.8H, v12.8H, v11.8H // ........e..................................................................................... + // gap // .............................................................................................. + // gap // .............................................................................................. + // gap // .............................................................................................. + // gap // .............................................................................................. + mls v26.8H, v10.8H, v7.H[0] // ..........................................................*................................... + sqdmulh v31.8H, v6.8H, v7.H[1] // ...................................................e.......................................... + add v28.8H, v24.8H, v22.8H // .............................e................................................................ + sub v22.8H, v24.8H, v22.8H // ............................e................................................................. + // gap // .............................................................................................. + // gap // .............................................................................................. + // gap // .............................................................................................. + // gap // .............................................................................................. + sub v27.8H, v21.8H, v25.8H // .....................................................................*........................ + mul v13.8H, v18.8H, v0.H[4] // ........................................e..................................................... 
+ sqrdmulh v25.8H, v11.8H, v0.H[7] // ...........e.................................................................................. + mul v12.8H, v11.8H, v0.H[6] // ..........e................................................................................... + // gap // .............................................................................................. + // gap // .............................................................................................. + sqrdmulh v10.8H, v22.8H, v0.H[3] // ...............................e.............................................................. + mul v24.8H, v22.8H, v0.H[2] // ..............................e............................................................... + // gap // .............................................................................................. + // gap // .............................................................................................. + mls v9.8H, v15.8H, v7.H[0] // ......................e....................................................................... + sqrdmulh v21.8H, v18.8H, v0.H[5] // .........................................e.................................................... + srshr v23.8H, v31.8H, #11 // ....................................................e......................................... + mul v16.8H, v16.8H, v29.8H // .......................................................................................*...... + // gap // .............................................................................................. + // gap // .............................................................................................. + // gap // .............................................................................................. + // gap // .............................................................................................. 
+ sqdmulh v18.8H, v28.8H, v7.H[1] // ................................................e............................................. + mls v5.8H, v20.8H, v7.H[0] // ...........................e.................................................................. + str q26, [x0, #256] // ..........................................................................*................... + sqrdmulh v26.8H, v27.8H, v0.H[1] // ........................................................................*..................... + // gap // .............................................................................................. + // gap // .............................................................................................. + // gap // .............................................................................................. + mls v8.8H, v19.8H, v7.H[0] // .................e............................................................................ + mul v11.8H, v27.8H, v0.H[0] // .......................................................................*...................... + mls v12.8H, v25.8H, v7.H[0] // ............e................................................................................. + mls v4.8H, v17.8H, v7.H[0] // ................................................................................*............. + // gap // .............................................................................................. + mls v13.8H, v21.8H, v7.H[0] // ..........................................e................................................... + mls v24.8H, v10.8H, v7.H[0] // ................................e............................................................. + // gap // .............................................................................................. + // gap // .............................................................................................. 
+ // gap // .............................................................................................. + mls v14.8H, v3.8H, v7.H[0] // ...................................................................................*.......... + mls v6.8H, v23.8H, v7.H[0] // .....................................................e........................................ + mls v16.8H, v2.8H, v7.H[0] // .........................................................................................*.... + // gap // .............................................................................................. + // gap // .............................................................................................. + // gap // .............................................................................................. + // gap // .............................................................................................. + srshr v21.8H, v18.8H, #11 // .................................................e............................................ + sub v20.8H, v9.8H, v5.8H // ...........................................e.................................................. + add v27.8H, v12.8H, v8.8H // ..................................e........................................................... + // gap // .............................................................................................. + mls v11.8H, v26.8H, v7.H[0] // .........................................................................*.................... + // gap // .............................................................................................. + // gap // .............................................................................................. + // gap // .............................................................................................. + add v19.8H, v9.8H, v5.8H // ............................................e................................................. 
+ sub v26.8H, v12.8H, v8.8H // .................................e............................................................ + mul v25.8H, v20.8H, v0.H[4] // .............................................e................................................ + sqrdmulh v3.8H, v20.8H, v0.H[5] // ..............................................e............................................... + sub v2.8H, v24.8H, v13.8H // ................................................................e............................. + // gap // .............................................................................................. + // gap // .............................................................................................. + // gap // .............................................................................................. + // gap // .............................................................................................. + add v17.8H, v24.8H, v13.8H // .................................................................e............................ + str q16, [x0, #192] // .............................................................................................* + sub v13.8H, v27.8H, v19.8H // ...........................................................e.................................. + str q4, [x0], #(16) // ..........................................................................................*... + // gap // .............................................................................................. + // gap // .............................................................................................. + mls v28.8H, v21.8H, v7.H[0] // ..................................................e........................................... + sqrdmulh v4.8H, v26.8H, v0.H[3] // ....................................e......................................................... 
+ mul v21.8H, v26.8H, v0.H[2] // ...................................e.......................................................... + mul v20.8H, v2.8H, v0.H[0] // ..................................................................e........................... + // gap // .............................................................................................. + sqrdmulh v16.8H, v2.8H, v0.H[1] // ...................................................................e.......................... + mul v2.8H, v17.8H, v29.8H // ....................................................................................e......... + sqrdmulh v17.8H, v17.8H, v30.8H // .....................................................................................e........ + // gap // .............................................................................................. + str q11, [x0, #432] // .............................................................................*................ + str q14, [x0, #48] // ...........................................................................................*.. + + // original source code + // ldr q8, [x0, #0] // ................e.............................................................................|...............e............................................................................. + // ldr q9, [x0, #(1*(512/8))] // ...............e..............................................................................|..............e.............................................................................. + // ldr q10, [x0, #(2*(512/8))] // ...e..........................................................................................|..e.......................................................................................... 
+ // ldr q11, [x0, #(3*(512/8))] // ....e.........................................................................................|...e......................................................................................... + // ldr q12, [x0, #(4*(512/8))] // ..........e...................................................................................|.........e................................................................................... + // ldr q13, [x0, #(5*(512/8))] // .........e....................................................................................|........e.................................................................................... + // ldr q14, [x0, #(6*(512/8))] // ........e.....................................................................................|.......e..................................................................................... + // ldr q15, [x0, #(7*(512/8))] // e.............................................................................................e............................................................................................. + // sub v24.8h, v8.8h, v9.8h // ............................................e.................................................|...........................................e................................................. + // add v8.8h, v8.8h, v9.8h // ........................................e.....................................................|.......................................e..................................................... + // mul v9.8h, v24.8h, v0.h[6] // ....................................................e.........................................|...................................................e......................................... 
+ // sqrdmulh v24.8h, v24.8h, v0.h[7] // ...................................................e..........................................|..................................................e.......................................... + // mls v9.8h, v24.8h, v7.h[0] // .................................................................e............................|................................................................e............................ + // sub v24.8h, v10.8h, v11.8h // .......................e......................................................................|......................e...................................................................... + // add v10.8h, v10.8h, v11.8h // .............................e................................................................|............................e................................................................ + // mul v11.8h, v24.8h, v1.h[0] // .................................e............................................................|................................e............................................................ + // sqrdmulh v24.8h, v24.8h, v1.h[1] // ................................e.............................................................|...............................e............................................................. + // mls v11.8h, v24.8h, v7.h[0] // ...............................................................e..............................|..............................................................e.............................. + // sub v24.8h, v12.8h, v13.8h // ...................................e..........................................................|..................................e.......................................................... 
+ // add v12.8h, v12.8h, v13.8h // ............................e.................................................................|...........................e................................................................. + // mul v13.8h, v24.8h, v1.h[2] // ..........................................e...................................................|.........................................e................................................... + // sqrdmulh v24.8h, v24.8h, v1.h[3] // ...........................................e..................................................|..........................................e.................................................. + // mls v13.8h, v24.8h, v7.h[0] // .......................................................e......................................|......................................................e...................................... + // sub v24.8h, v14.8h, v15.8h // ..............................e...............................................................|.............................e............................................................... + // add v14.8h, v14.8h, v15.8h // ...............................e..............................................................|..............................e.............................................................. + // mul v15.8h, v24.8h, v1.h[4] // ......................................e.......................................................|.....................................e....................................................... + // sqrdmulh v24.8h, v24.8h, v1.h[5] // .....................................e........................................................|....................................e........................................................ 
+ // mls v15.8h, v24.8h, v7.h[0] // ............................................................e.................................|...........................................................e................................. + // sub v24.8h, v8.8h, v10.8h // ................................................e.............................................|...............................................e............................................. + // add v8.8h, v8.8h, v10.8h // ...............................................e..............................................|..............................................e.............................................. + // mul v10.8h, v24.8h, v0.h[2] // ......................................................e.......................................|.....................................................e....................................... + // sqrdmulh v24.8h, v24.8h, v0.h[3] // .....................................................e........................................|....................................................e........................................ + // mls v10.8h, v24.8h, v7.h[0] // ....................................................................e.........................|...................................................................e......................... + // sub v24.8h, v9.8h, v11.8h // .............................................................................e................|............................................................................e................ + // add v9.8h, v9.8h, v11.8h // ..........................................................................e...................|.........................................................................e................... 
+ // mul v11.8h, v24.8h, v0.h[2] // .......................................................................................e......|......................................................................................e...... + // sqrdmulh v24.8h, v24.8h, v0.h[3] // ......................................................................................e.......|.....................................................................................e....... + // mls v11.8h, v24.8h, v7.h[0] // ............*.................................................................................|...........*................................................................................. + // sub v24.8h, v12.8h, v14.8h // .........................................e....................................................|........................................e.................................................... + // add v12.8h, v12.8h, v14.8h // .......................................e......................................................|......................................e...................................................... + // mul v14.8h, v24.8h, v0.h[4] // ..................................................e...........................................|.................................................e........................................... + // sqrdmulh v24.8h, v24.8h, v0.h[5] // ........................................................e.....................................|.......................................................e..................................... + // mls v14.8h, v24.8h, v7.h[0] // ...................................................................e..........................|..................................................................e.......................... 
+ // sub v24.8h, v13.8h, v15.8h // .........................................................................e....................|........................................................................e.................... + // add v13.8h, v13.8h, v15.8h // ............................................................................e.................|...........................................................................e................. + // mul v15.8h, v24.8h, v0.h[4] // ..............................................................................e...............|.............................................................................e............... + // sqrdmulh v24.8h, v24.8h, v0.h[5] // ...............................................................................e..............|..............................................................................e.............. + // mls v15.8h, v24.8h, v7.h[0] // ...........*..................................................................................|..........*.................................................................................. + // sqdmulh v25.8h, v8.8h, v7.h[1] // ...........................................................e..................................|..........................................................e.................................. + // srshr v25.8h, v25.8h, #11 // ........................................................................e.....................|.......................................................................e..................... + // mls v8.8h, v25.8h, v7.h[0] // .....................................................................................e........|....................................................................................e........ 
+ // sqdmulh v25.8h, v12.8h, v7.h[1] // ..............................................e...............................................|.............................................e............................................... + // srshr v25.8h, v25.8h, #11 // .........................................................e....................................|........................................................e.................................... + // mls v12.8h, v25.8h, v7.h[0] // ......................................................................e.......................|.....................................................................e....................... + // sub v24.8h, v8.8h, v12.8h // .......*......................................................................................|......*...................................................................................... + // add v8.8h, v8.8h, v12.8h // ......*.......................................................................................|.....*....................................................................................... + // mul v12.8h, v24.8h, v0.h[0] // ....................*.........................................................................|...................*......................................................................... + // sqrdmulh v24.8h, v24.8h, v0.h[1] // ........................*.....................................................................|.......................*..................................................................... + // mls v12.8h, v24.8h, v7.h[0] // .............................................*................................................|............................................*................................................ 
+ // sub v24.8h, v9.8h, v13.8h // ...................................................................................e..........|..................................................................................e.......... + // add v9.8h, v9.8h, v13.8h // .*............................................................................................|*............................................................................................ + // mul v13.8h, v24.8h, v0.h[0] // .....*........................................................................................|....*........................................................................................ + // sqrdmulh v24.8h, v24.8h, v0.h[1] // ..*...........................................................................................|.*........................................................................................... + // mls v13.8h, v24.8h, v7.h[0] // .....................*........................................................................|....................*........................................................................ + // sub v24.8h, v10.8h, v14.8h // ................................................................................e.............|...............................................................................e............. + // add v10.8h, v10.8h, v14.8h // .................................................................................e............|................................................................................e............ + // mul v14.8h, v24.8h, v0.h[0] // ........................................................................................e.....|.......................................................................................e..... 
+ // sqrdmulh v24.8h, v24.8h, v0.h[1] // .........................................................................................e....|........................................................................................e.... + // mls v14.8h, v24.8h, v7.h[0] // .................*............................................................................|................*............................................................................ + // sub v24.8h, v11.8h, v15.8h // .................................................*............................................|................................................*............................................ + // add v11.8h, v11.8h, v15.8h // .........................*....................................................................|........................*.................................................................... + // mul v15.8h, v24.8h, v0.h[0] // ................................................................*.............................|...............................................................*............................. + // sqrdmulh v24.8h, v24.8h, v0.h[1] // ..............................................................*...............................|.............................................................*............................... + // mls v15.8h, v24.8h, v7.h[0] // ...........................................................................*..................|..........................................................................*.................. + // str q12, [x0, #(4*(512/8))] // .............................................................*................................|............................................................*................................ 
+ // str q13, [x0, #(5*(512/8))] // ..................................*...........................................................|.................................*........................................................... + // str q14, [x0, #(6*(512/8))] // ..........................*...................................................................|.........................*................................................................... + // str q15, [x0, #(7*(512/8))] // ............................................................................................*.|...........................................................................................*. + // mul v12.8h, v8.8h, v29.8h // ...................*..........................................................................|..................*.......................................................................... + // sqrdmulh v8.8h, v8.8h, v30.8h // ......................*.......................................................................|.....................*....................................................................... + // mls v12.8h, v8.8h, v7.h[0] // ..................................................................*...........................|.................................................................*........................... + // mul v13.8h, v9.8h, v29.8h // .............*................................................................................|............*................................................................................ + // sqrdmulh v9.8h, v9.8h, v30.8h // ..............*...............................................................................|.............*............................................................................... 
+ // mls v13.8h, v9.8h, v7.h[0] // .....................................................................*........................|....................................................................*........................ + // mul v14.8h, v10.8h, v29.8h // ..........................................................................................e...|.........................................................................................e... + // sqrdmulh v10.8h, v10.8h, v30.8h // ...........................................................................................e..|..........................................................................................e.. + // mls v14.8h, v10.8h, v7.h[0] // ..................*...........................................................................|.................*........................................................................... + // mul v15.8h, v11.8h, v29.8h // ..........................................................*...................................|.........................................................*................................... + // sqrdmulh v11.8h, v11.8h, v30.8h // ....................................*.........................................................|...................................*......................................................... + // mls v15.8h, v11.8h, v7.h[0] // .......................................................................*......................|......................................................................*...................... + // str q12, [x0], #(16) // ....................................................................................*.........|...................................................................................*......... 
+ // str q13, [x0, #(-16 + 1*(512/8))] // .............................................................................................*|............................................................................................* + // str q14, [x0, #(-16 + 2*(512/8))] // ...........................*..................................................................|..........................*.................................................................. + // str q15, [x0, #(-16 + 3*(512/8))] // ..................................................................................*...........|.................................................................................*........... + + sub count, count, #1 + cbnz count, layer123_start + mls v21.8H, v4.8H, v7.H[0] // ......*............................ + mls v25.8H, v3.8H, v7.H[0] // .....*............................. + sub v31.8H, v28.8H, v6.8H // ....*.............................. + add v22.8H, v28.8H, v6.8H // ...*............................... + // gap // ................................... + // gap // ................................... + // gap // ................................... + // gap // ................................... + mul v23.8H, v13.8H, v0.H[0] // ..*................................ + sqrdmulh v8.8H, v13.8H, v0.H[1] // .*................................. + mls v2.8H, v17.8H, v7.H[0] // ..........*........................ + // gap // ................................... + // gap // ................................... + // gap // ................................... + // gap // ................................... + // gap // ................................... + sqrdmulh v11.8H, v22.8H, v30.8H // ..............*.................... + mul v9.8H, v22.8H, v29.8H // ...........*....................... + sqrdmulh v17.8H, v31.8H, v0.H[1] // ...............*................... + add v18.8H, v27.8H, v19.8H // *.................................. + // gap // ................................... 
+ // gap // ................................... + // gap // ................................... + // gap // ................................... + sub v22.8H, v21.8H, v25.8H // ......................*............ + add v21.8H, v21.8H, v25.8H // ................*.................. + mul v19.8H, v31.8H, v0.H[0] // ............*...................... + mls v20.8H, v16.8H, v7.H[0] // .........*......................... + // gap // ................................... + // gap // ................................... + // gap // ................................... + // gap // ................................... + mls v23.8H, v8.8H, v7.H[0] // .............*..................... + mul v6.8H, v18.8H, v29.8H // .......*........................... + sqrdmulh v26.8H, v18.8H, v30.8H // ........*.......................... + str q2, [x0, #128] // ..................*................ + // gap // ................................... + // gap // ................................... + // gap // ................................... + // gap // ................................... + sqrdmulh v2.8H, v21.8H, v30.8H // ....................*.............. + mul v28.8H, v21.8H, v29.8H // .......................*........... + mul v13.8H, v22.8H, v0.H[0] // ..........................*........ + sqrdmulh v27.8H, v22.8H, v0.H[1] // .........................*......... + // gap // ................................... + // gap // ................................... + // gap // ................................... + // gap // ................................... + mls v9.8H, v11.8H, v7.H[0] // ...........................*....... + mls v19.8H, v17.8H, v7.H[0] // .....................*............. + str q20, [x0, #384] // .................*................. + // gap // ................................... + // gap // ................................... + // gap // ................................... + // gap // ................................... + // gap // ................................... 
+ str q23, [x0, #320] // ...................*............... + mls v6.8H, v26.8H, v7.H[0] // ............................*...... + // gap // ................................... + // gap // ................................... + // gap // ................................... + // gap // ................................... + // gap // ................................... + // gap // ................................... + mls v28.8H, v2.8H, v7.H[0] // .............................*..... + mls v13.8H, v27.8H, v7.H[0] // ..............................*.... + // gap // ................................... + // gap // ................................... + // gap // ................................... + // gap // ................................... + // gap // ................................... + // gap // ................................... + str q9, [x0], #(16) // ................................*.. + str q19, [x0, #240] // ........................*.......... + // gap // ................................... + // gap // ................................... + // gap // ................................... + // gap // ................................... + // gap // ................................... + // gap // ................................... + str q6, [x0, #48] // ..................................* + // gap // ................................... + // gap // ................................... + // gap // ................................... + // gap // ................................... + // gap // ................................... + // gap // ................................... + // gap // ................................... + str q28, [x0, #176] // ...............................*... + str q13, [x0, #432] // .................................*. + // gap // ................................... + // gap // ................................... + // gap // ................................... + // gap // ................................... 
+ // gap // ................................... + // gap // ................................... + + // original source code + // add v26.8H, v27.8H, v19.8H // ..........*........................ + // sqrdmulh v23.8H, v13.8H, v0.H[1] // .....*............................. + // mul v27.8H, v13.8H, v0.H[0] // ....*.............................. + // add v10.8H, v28.8H, v6.8H // ...*............................... + // sub v13.8H, v28.8H, v6.8H // ..*................................ + // mls v25.8H, v3.8H, v7.H[0] // .*................................. + // mls v21.8H, v4.8H, v7.H[0] // *.................................. + // mul v14.8H, v26.8H, v29.8H // ................*.................. + // sqrdmulh v3.8H, v26.8H, v30.8H // .................*................. + // mls v20.8H, v16.8H, v7.H[0] // ..............*.................... + // mls v2.8H, v17.8H, v7.H[0] // ......*............................ + // mul v4.8H, v10.8H, v29.8H // ........*.......................... + // mul v26.8H, v13.8H, v0.H[0] // .............*..................... + // mls v27.8H, v23.8H, v7.H[0] // ...............*................... + // sqrdmulh v17.8H, v10.8H, v30.8H // .......*........................... + // sqrdmulh v10.8H, v13.8H, v0.H[1] // .........*......................... + // add v16.8H, v21.8H, v25.8H // ............*...................... + // str q20, [x0, #384] // .........................*......... + // str q2, [x0, #128] // ..................*................ + // str q27, [x0, #320] // ..........................*........ + // sqrdmulh v2.8H, v16.8H, v30.8H // ...................*............... + // mls v26.8H, v10.8H, v7.H[0] // ........................*.......... + // sub v27.8H, v21.8H, v25.8H // ...........*....................... + // mul v16.8H, v16.8H, v29.8H // ....................*.............. + // str q26, [x0, #256] // ...............................*... + // sqrdmulh v26.8H, v27.8H, v0.H[1] // ......................*............ 
+ // mul v11.8H, v27.8H, v0.H[0] // .....................*............. + // mls v4.8H, v17.8H, v7.H[0] // .......................*........... + // mls v14.8H, v3.8H, v7.H[0] // ...........................*....... + // mls v16.8H, v2.8H, v7.H[0] // ............................*...... + // mls v11.8H, v26.8H, v7.H[0] // .............................*..... + // str q16, [x0, #192] // .................................*. + // str q4, [x0], #(16) // ..............................*.... + // str q11, [x0, #432] // ..................................* + // str q14, [x0, #48] // ................................*.. + + + pop_stack + ret \ No newline at end of file diff --git a/tests/ntt_kyber/manual/intt_kyber_123_4567_opt_m1_icestorm.s b/tests/ntt_kyber/manual/intt_kyber_123_4567_opt_m1_icestorm.s new file mode 100644 index 0000000..82bdd3f --- /dev/null +++ b/tests/ntt_kyber/manual/intt_kyber_123_4567_opt_m1_icestorm.s @@ -0,0 +1,1420 @@ +/// +/// Copyright (c) 2022 Arm Limited +/// Copyright (c) 2022 Hanno Becker +/// Copyright (c) 2023 Amin Abdulrahman, Matthias Kannwischer +/// SPDX-License-Identifier: MIT +/// +/// Permission is hereby granted, free of charge, to any person obtaining a copy +/// of this software and associated documentation files (the "Software"), to deal +/// in the Software without restriction, including without limitation the rights +/// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +/// copies of the Software, and to permit persons to whom the Software is +/// furnished to do so, subject to the following conditions: +/// +/// The above copyright notice and this permission notice shall be included in all +/// copies or substantial portions of the Software. +/// +/// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +/// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +/// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +/// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +/// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +/// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +/// SOFTWARE. +/// + +// Needed to provide ASM_LOAD directive +#include <hal_env.h> + +// NOTE +// We use a lot of trivial macros to simplify the parsing burden for Slothy +// The macros are not unfolded by Slothy and thus interpreted as instructions, +// which are easier to parse due to e.g. the lack of size specifiers and simpler +// syntax for pre and post increment for loads and stores. + +// Eventually, NeLight should include a proper parser for AArch64, +// but for initial investigations, the below is enough. + +.macro ldr_vo vec, base, offset // load qform_<vec> from [base, #offset], base unchanged + ldr qform_\vec, [\base, #\offset] +.endm + +.macro ldr_vi vec, base, inc // load qform_<vec> from [base], then post-increment base by inc + ldr qform_\vec, [\base], #\inc +.endm + +.macro str_vo vec, base, offset // store qform_<vec> to [base, #offset], base unchanged + str qform_\vec, [\base, #\offset] +.endm +.macro str_vi vec, base, inc // store qform_<vec> to [base], then post-increment base by inc + str qform_\vec, [\base], #\inc +.endm + +.macro vqrdmulh d,a,b // sqrdmulh on .8h lanes, vector by vector + sqrdmulh \d\().8h, \a\().8h, \b\().8h +.endm +.macro vmlsq d,a,b,i // mls on .8h lanes, vector a by lane i of b + mls \d\().8h, \a\().8h, \b\().h[\i] +.endm +.macro vqrdmulhq d,a,b,i // sqrdmulh on .8h lanes, vector a by lane i of b + sqrdmulh \d\().8h, \a\().8h, \b\().h[\i] +.endm +.macro vqdmulhq d,a,b,i // sqdmulh on .8h lanes, vector a by lane i of b + sqdmulh \d\().8h, \a\().8h, \b\().h[\i] +.endm +.macro vmulq d,a,b,i // mul on .8h lanes, vector a by lane i of b + mul \d\().8h, \a\().8h, \b\().h[\i] +.endm + +.macro mulmodq dst, src, const, idx0, idx1 // dst = src * const lane idx0; src = sqrdmulh of src by const lane idx1; dst -= src * consts lane 0 (src is clobbered) + vmulq \dst, \src, \const, \idx0 + vqrdmulhq \src, \src, \const, \idx1 + vmlsq \dst, \src, consts, 0 +.endm + +.macro mulmod dst, src, const, const_twisted // as mulmodq, but const and const_twisted are full vectors instead of lanes (src is clobbered) + mul \dst\().8h, \src\().8h, \const\().8h + vqrdmulh \src, \src, \const_twisted + vmlsq \dst, \src, consts, 0 +.endm + +.macro gs_butterfly a, b, root, idx0, idx1 // tmp = a - b; a = sum of a and b; b = mulmodq of tmp by root lanes idx0, idx1 + sub tmp.8h, \a\().8h, \b\().8h + add \a\().8h, \a\().8h, \b\().8h + mulmodq \b, tmp, \root, \idx0, \idx1 +.endm + +.macro mulmod_v dst, src, const, const_twisted // same instruction sequence as mulmod + mul \dst\().8h, \src\().8h,
\const\().8h + vqrdmulh \src, \src, \const_twisted + vmlsq \dst, \src, consts, 0 +.endm + +.macro gs_butterfly_v a, b, root, root_twisted // tmp = a - b; a = sum of a and b; b = mulmod of tmp by root, root_twisted + sub tmp.8h, \a\().8h, \b\().8h + add \a\().8h, \a\().8h, \b\().8h + mulmod \b, tmp, \root, \root_twisted +.endm + +.macro mul_ninv dst0, dst1, dst2, dst3, src0, src1, src2, src3 // dstN = mulmod of srcN by ninv, ninv_tw for N = 0..3 (each srcN is clobbered) + mulmod \dst0, \src0, ninv, ninv_tw + mulmod \dst1, \src1, ninv, ninv_tw + mulmod \dst2, \src2, ninv, ninv_tw + mulmod \dst3, \src3, ninv, ninv_tw +.endm + +.macro barrett_reduce a // t0 = sqdmulh of a by consts lane 1, rounding shift right by 11; a -= t0 * consts lane 0 (t0 is clobbered) + vqdmulhq t0, \a, consts, 1 + srshr t0.8h, t0.8h, #11 + vmlsq \a, t0, consts, 0 +.endm + +.macro load_roots_123 // loads root0 then root1 (two consecutive 16-byte vectors) and advances r_ptr0 by 32 + ldr_vi root0, r_ptr0, 32 + ldr_vo root1, r_ptr0, -16 +.endm + +.macro load_next_roots_45 // loads root0 and advances r_ptr0 by 16 + ldr_vi root0, r_ptr0, 16 +.endm + +.macro load_next_roots_67 // loads six consecutive 16-byte vectors from r_ptr1 and advances it by 6*16 + ldr_vi root0, r_ptr1, (6*16) + ldr_vo root0_tw, r_ptr1, (-6*16 + 1*16) + ldr_vo root1, r_ptr1, (-6*16 + 2*16) + ldr_vo root1_tw, r_ptr1, (-6*16 + 3*16) + ldr_vo root2, r_ptr1, (-6*16 + 4*16) + ldr_vo root2_tw, r_ptr1, (-6*16 + 5*16) +.endm + +.macro transpose4 data // in-place trn1/trn2 shuffle of <data>0..<data>3, first on .4s then on .2d; clobbers t0-t3 + trn1 t0.4s, \data\()0.4s, \data\()1.4s + trn2 t1.4s, \data\()0.4s, \data\()1.4s + trn1 t2.4s, \data\()2.4s, \data\()3.4s + trn2 t3.4s, \data\()2.4s, \data\()3.4s + + trn2 \data\()2.2d, t0.2d, t2.2d + trn2 \data\()3.2d, t1.2d, t3.2d + trn1 \data\()0.2d, t0.2d, t2.2d + trn1 \data\()1.2d, t1.2d, t3.2d +.endm + +.macro transpose_single data_out, data_in // only the .4s trn1/trn2 step, from <data_in>0..3 into <data_out>0..3 + trn1 \data_out\()0.4s, \data_in\()0.4s, \data_in\()1.4s + trn2 \data_out\()1.4s, \data_in\()0.4s, \data_in\()1.4s + trn1 \data_out\()2.4s, \data_in\()2.4s, \data_in\()3.4s + trn2 \data_out\()3.4s, \data_in\()2.4s, \data_in\()3.4s +.endm + +.macro save_gprs // slothy:no-unfold + sub sp, sp, #(16*6) + // frame of 16*6 bytes: pairs x19-x28 at slots 16*0..16*4, x29 at slot 16*5 + stp x19, x20, [sp, #16*0] + stp x21, x22, [sp, #16*1] + stp x23, x24, [sp, #16*2] + stp x25, x26, [sp, #16*3] + stp x27, x28, [sp, #16*4] + str x29, [sp, #16*5] +.endm + +.macro restore_gprs // slothy:no-unfold + ldp x19, x20, [sp, #16*0] + ldp x21, x22, [sp, #16*1] + ldp x23,
x24, [sp, #16*2] + ldp x25, x26, [sp, #16*3] + ldp x27, x28, [sp, #16*4] + ldr x29, [sp, #16*5] + add sp, sp, #(16*6) +.endm + +.macro save_vregs // slothy:no-unfold + sub sp, sp, #(16*4) + stp d8, d9, [sp, #16*0] + stp d10, d11, [sp, #16*1] + stp d12, d13, [sp, #16*2] + stp d14, d15, [sp, #16*3] +.endm + +.macro restore_vregs // slothy:no-unfold + ldp d8, d9, [sp, #16*0] + ldp d10, d11, [sp, #16*1] + ldp d12, d13, [sp, #16*2] + ldp d14, d15, [sp, #16*3] + add sp, sp, #(16*4) +.endm + +#define STACK_SIZE 16 +#define STACK0 0 + +.macro restore a, loc // slothy:no-unfold + ldr \a, [sp, #\loc\()] +.endm +.macro save loc, a // slothy:no-unfold + str \a, [sp, #\loc\()] +.endm +.macro push_stack // slothy:no-unfold + save_gprs + save_vregs + sub sp, sp, #STACK_SIZE +.endm + +.macro pop_stack // slothy:no-unfold + add sp, sp, #STACK_SIZE + restore_vregs + restore_gprs +.endm + +.data +.p2align 4 +roots: +#include "intt_kyber_123_45_67_twiddles.s" +.text + + .global intt_kyber_123_4567_opt_m1_icestorm + .global _intt_kyber_123_4567_opt_m1_icestorm + +.p2align 4 +const_addr: .short 3329 // lane 0 of this table; the vmlsq calls in mulmod and barrett_reduce read lane 0 of consts + .short 20159 // lane 1 of this table; barrett_reduce reads lane 1 of consts in its sqdmulh + .short 0 + .short 0 + .short 0 + .short 0 + .short 0 + .short 0 +ninv_addr: .short 512 // eight identical lanes + .short 512 + .short 512 + .short 512 + .short 512 + .short 512 + .short 512 + .short 512 +ninv_tw_addr: .short 5040 // eight identical lanes + .short 5040 + .short 5040 + .short 5040 + .short 5040 + .short 5040 + .short 5040 + .short 5040 + +intt_kyber_123_4567_opt_m1_icestorm: +_intt_kyber_123_4567_opt_m1_icestorm: + push_stack + + in .req x0 + inp .req x1 + count .req x2 + r_ptr0 .req x3 + r_ptr1 .req x4 + xtmp .req x5 + + qform_v0 .req q0 // qform_vN aliases let ldr_vo, ldr_vi, str_vo, str_vi name the q-register view of vN + qform_v1 .req q1 + qform_v2 .req q2 + qform_v3 .req q3 + qform_v4 .req q4 + qform_v5 .req q5 + qform_v6 .req q6 + qform_v7 .req q7 + qform_v8 .req q8 + qform_v9 .req q9 + qform_v10 .req q10 + qform_v11 .req q11 + qform_v12 .req q12 + qform_v13 .req q13 + qform_v14 .req q14 + qform_v15 .req q15 + qform_v16 .req q16 + qform_v17 .req q17 + qform_v18 .req q18 + qform_v19 .req
q19 + qform_v20 .req q20 + qform_v21 .req q21 + qform_v22 .req q22 + qform_v23 .req q23 + qform_v24 .req q24 + qform_v25 .req q25 + qform_v26 .req q26 + qform_v27 .req q27 + qform_v28 .req q28 + qform_v29 .req q29 + qform_v30 .req q30 + qform_v31 .req q31 + + data0 .req v8 + data1 .req v9 + data2 .req v10 + data3 .req v11 + data4 .req v12 + data5 .req v13 + data6 .req v14 + data7 .req v15 + + x_00 .req x10 + x_01 .req x11 + x_10 .req x12 + x_11 .req x13 + x_20 .req x14 + x_21 .req x15 + x_30 .req x16 + x_31 .req x17 + + xt_00 .req x_00 + xt_01 .req x_20 + xt_10 .req x_10 + xt_11 .req x_30 + xt_20 .req x_01 + xt_21 .req x_21 + xt_30 .req x_11 + xt_31 .req x_31 + + qform_data0 .req q8 + qform_data1 .req q9 + qform_data2 .req q10 + qform_data3 .req q11 + qform_data4 .req q12 + qform_data5 .req q13 + qform_data6 .req q14 + qform_data7 .req q15 + + root0 .req v0 + root1 .req v1 + root2 .req v2 + root0_tw .req v4 + root1_tw .req v5 + root2_tw .req v6 + + consts .req v7 + qform_consts .req q7 + + qform_root0 .req q0 + qform_root1 .req q1 + qform_root2 .req q2 + qform_root0_tw .req q4 + qform_root1_tw .req q5 + qform_root2_tw .req q6 + + tmp .req v24 + t0 .req v25 + t1 .req v26 + t2 .req v27 + t3 .req v28 + + ASM_LOAD(r_ptr0, roots_l34) + ASM_LOAD(r_ptr1, roots_l56) + + ASM_LOAD(xtmp, const_addr) + ld1 {consts.8h}, [xtmp] + + save STACK0, in + + mov inp, in + mov count, #8 + + .p2align 2 + ldr q0, [x1, #32] // .*............................................... + ldr q2, [x1, #48] // *................................................ + // gap // ................................................. + // gap // ................................................. + ldr q16, [x1, #0] // ..*.............................................. + ldr q23, [x1, #16] // ...*............................................. + // gap // ................................................. + // gap // ................................................. 
+ ldr q21, [x4, #64] // ....*............................................ + ldr q26, [x4], #(6*16) // .....*........................................... + // gap // ................................................. + // gap // ................................................. + ldr q20, [x4, #-16] // ......*.......................................... + ldr q17, [x4, #-48] // .........*....................................... + // gap // ................................................. + // gap // ................................................. + trn1 v30.4S, v0.4S, v2.4S // .......*......................................... + trn2 v0.4S, v0.4S, v2.4S // ........*........................................ + ldr q8, [x4, #-64] // ..........*...................................... + ldr q4, [x4, #-80] // .............*................................... + trn1 v25.4S, v16.4S, v23.4S // ...........*..................................... + trn2 v16.4S, v16.4S, v23.4S // ............*.................................... + ldr q15, [x3], #16 // ......................................*.......... + // gap // ................................................. + // gap // ................................................. + // gap // ................................................. + // gap // ................................................. + // gap // ................................................. + trn2 v11.2D, v16.2D, v0.2D // ...............*................................. + trn2 v23.2D, v25.2D, v30.2D // ..............*.................................. + // gap // ................................................. + // gap // ................................................. + trn1 v30.2D, v25.2D, v30.2D // .................*............................... + trn1 v0.2D, v16.2D, v0.2D // ................*................................ + // gap // ................................................. + // gap // ................................................. 
+ sub v16.8H, v23.8H, v11.8H // ...................*............................. + add v23.8H, v23.8H, v11.8H // ..................*.............................. + // gap // ................................................. + // gap // ................................................. + sub v25.8H, v30.8H, v0.8H // .....................*........................... + add v0.8H, v30.8H, v0.8H // ....................*............................ + // gap // ................................................. + // gap // ................................................. + mul v21.8H, v16.8H, v21.8H // .......................*......................... + sqrdmulh v20.8H, v16.8H, v20.8H // ......................*.......................... + // gap // ................................................. + // gap // ................................................. + // gap // ................................................. + sqrdmulh v17.8H, v25.8H, v17.8H // ........................*........................ + // gap // ................................................. + mul v2.8H, v25.8H, v8.8H // .........................*....................... + sub v16.8H, v0.8H, v23.8H // ..........................*...................... + add v0.8H, v0.8H, v23.8H // .....................................*........... + // gap // ................................................. + // gap // ................................................. + mls v21.8H, v20.8H, v7.H[0] // ...........................*..................... + // gap // ................................................. + // gap // ................................................. + // gap // ................................................. + sqrdmulh v8.8H, v16.8H, v4.8H // ..............................*.................. + // gap // ................................................. + // gap // ................................................. 
+ mls v2.8H, v17.8H, v7.H[0] // ............................*.................... + mul v23.8H, v16.8H, v26.8H // .............................*................... + // gap // ................................................. + // gap // ................................................. + // gap // ................................................. + // gap // ................................................. + // gap // ................................................. + // gap // ................................................. + // gap // ................................................. + // gap // ................................................. + // gap // ................................................. + // gap // ................................................. + sub v16.8H, v2.8H, v21.8H // ...............................*................. + // gap // ................................................. + add v2.8H, v2.8H, v21.8H // ..................................*.............. + // gap // ................................................. + mls v23.8H, v8.8H, v7.H[0] // ...................................*............. + mul v21.8H, v16.8H, v26.8H // ................................*................ + // gap // ................................................. + // gap // ................................................. + sqrdmulh v16.8H, v16.8H, v4.8H // .................................*............... + // gap // ................................................. + // gap // ................................................. + // gap // ................................................. + // gap // ................................................. + trn2 v26.4S, v0.4S, v2.4S // .......................................*......... + trn1 v0.4S, v0.4S, v2.4S // ........................................*........ + // gap // ................................................. + // gap // ................................................. 
+ // gap // ................................................. + // gap // ................................................. + // gap // ................................................. + mls v21.8H, v16.8H, v7.H[0] // ....................................*............ + // gap // ................................................. + // gap // ................................................. + // gap // ................................................. + // gap // ................................................. + // gap // ................................................. + // gap // ................................................. + // gap // ................................................. + // gap // ................................................. + trn1 v16.4S, v23.4S, v21.4S // ..........................................*...... + // gap // ................................................. + // gap // ................................................. + trn2 v2.4S, v23.4S, v21.4S // .........................................*....... + // gap // ................................................. + // gap // ................................................. + // gap // ................................................. + // gap // ................................................. + trn1 v17.2D, v0.2D, v16.2D // ............................................*.... + // gap // ................................................. + // gap // ................................................. + trn1 v23.2D, v26.2D, v2.2D // ...........................................*..... + trn2 v21.2D, v26.2D, v2.2D // .............................................*... + trn2 v20.2D, v0.2D, v16.2D // ..............................................*.. + // gap // ................................................. + // gap // ................................................. 
+ sub v26.8H, v17.8H, v23.8H // ................................................* + // gap // ................................................. + // gap // ................................................. + add v4.8H, v17.8H, v23.8H // ...............................................*. + + // original source code + // ldr q16, [x1, #48] // .*............................................... + // ldr q1, [x1, #32] // *................................................ + // ldr q19, [x1, #0] // ..*.............................................. + // ldr q23, [x1, #16] // ...*............................................. + // ldr q29, [x4, #64] // ....*............................................ + // ldr q14, [x4], #(6*16) // .....*........................................... + // ldr q28, [x4, #-16] // ......*.......................................... + // trn1 v10.4S, v1.4S, v16.4S // ........*........................................ + // trn2 v26.4S, v1.4S, v16.4S // .........*....................................... + // ldr q1, [x4, #-48] // .......*......................................... + // ldr q3, [x4, #-64] // ..........*...................................... + // trn1 v8.4S, v19.4S, v23.4S // ............*.................................... + // trn2 v24.4S, v19.4S, v23.4S // .............*................................... + // ldr q11, [x4, #-80] // ...........*..................................... + // trn2 v25.2D, v8.2D, v10.2D // ................*................................ + // trn2 v16.2D, v24.2D, v26.2D // ...............*................................. + // trn1 v13.2D, v24.2D, v26.2D // ..................*.............................. + // trn1 v5.2D, v8.2D, v10.2D // .................*............................... + // add v2.8H, v25.8H, v16.8H // ....................*............................ + // sub v16.8H, v25.8H, v16.8H // ...................*............................. 
+ // add v19.8H, v5.8H, v13.8H // ......................*.......................... + // sub v25.8H, v5.8H, v13.8H // .....................*........................... + // sqrdmulh v26.8H, v16.8H, v28.8H // ........................*........................ + // mul v10.8H, v16.8H, v29.8H // .......................*......................... + // sqrdmulh v16.8H, v25.8H, v1.8H // .........................*....................... + // mul v13.8H, v25.8H, v3.8H // ..........................*...................... + // sub v23.8H, v19.8H, v2.8H // ...........................*..................... + // mls v10.8H, v26.8H, v7.H[0] // .............................*................... + // mls v13.8H, v16.8H, v7.H[0] // ...............................*................. + // mul v18.8H, v23.8H, v14.8H // ................................*................ + // sqrdmulh v17.8H, v23.8H, v11.8H // ..............................*.................. + // sub v26.8H, v13.8H, v10.8H // .................................*............... + // mul v23.8H, v26.8H, v14.8H // ....................................*............ + // sqrdmulh v25.8H, v26.8H, v11.8H // .....................................*........... + // add v11.8H, v13.8H, v10.8H // ..................................*.............. + // mls v18.8H, v17.8H, v7.H[0] // ...................................*............. + // mls v23.8H, v25.8H, v7.H[0] // ........................................*........ + // add v2.8H, v19.8H, v2.8H // ............................*.................... + // ldr q15, [x3], #16 // ..............*.................................. + // trn2 v26.4S, v2.4S, v11.4S // ......................................*.......... + // trn1 v30.4S, v2.4S, v11.4S // .......................................*......... + // trn2 v14.4S, v18.4S, v23.4S // ..........................................*...... + // trn1 v17.4S, v18.4S, v23.4S // .........................................*....... 
+ // trn1 v23.2D, v26.2D, v14.2D // ............................................*.... + // trn1 v2.2D, v30.2D, v17.2D // ...........................................*..... + // trn2 v21.2D, v26.2D, v14.2D // .............................................*... + // trn2 v20.2D, v30.2D, v17.2D // ..............................................*.. + // add v4.8H, v2.8H, v23.8H // ................................................* + // sub v26.8H, v2.8H, v23.8H // ...............................................*. + + sub count, count, #1 +layer4567_start: + sub v11.8H, v20.8H, v21.8H // ....................................................*........................ + add v30.8H, v20.8H, v21.8H // .....................................................*....................... + ldr q16, [x1, #112] // ...e......................................................................... + ldr q1, [x1, #96] // ..e.......................................................................... + sqrdmulh v17.8H, v26.8H, v15.H[3] // ..................................................*.......................... + ldr q19, [x1, #64] // e............................................................................ + sqdmulh v2.8H, v4.8H, v7.H[1] // .........................................................*................... + ldr q23, [x1, #80] // .e........................................................................... + mul v20.8H, v11.8H, v15.H[4] // ......................................................*...................... + mul v22.8H, v26.8H, v15.H[2] // .................................................*........................... + ldr q29, [x4, #64] // ................e............................................................ + ldr q14, [x4], #(6*16) // ............e................................................................ + sqrdmulh v21.8H, v11.8H, v15.H[5] // .......................................................*..................... 
+ ldr q28, [x4, #-16] // .................e........................................................... + sqdmulh v0.8H, v30.8H, v7.H[1] // ............................................................*................ + // gap // ............................................................................. + trn1 v10.4S, v1.4S, v16.4S // ......e...................................................................... + trn2 v26.4S, v1.4S, v16.4S // .......e..................................................................... + ldr q1, [x4, #-48] // ...............e............................................................. + // gap // ............................................................................. + ldr q3, [x4, #-64] // ..............e.............................................................. + trn1 v8.4S, v19.4S, v23.4S // ....e........................................................................ + trn2 v24.4S, v19.4S, v23.4S // .....e....................................................................... + // gap // ............................................................................. + srshr v23.8H, v2.8H, #11 // ..........................................................*.................. + srshr v0.8H, v0.8H, #11 // .............................................................*............... + ldr q11, [x4, #-80] // .............e............................................................... + // gap // ............................................................................. + // gap // ............................................................................. + // gap // ............................................................................. + trn2 v25.2D, v8.2D, v10.2D // ........e.................................................................... + trn2 v16.2D, v24.2D, v26.2D // .........e................................................................... 
+ trn1 v13.2D, v24.2D, v26.2D // ...........e................................................................. + trn1 v5.2D, v8.2D, v10.2D // ..........e.................................................................. + // gap // ............................................................................. + // gap // ............................................................................. + add v2.8H, v25.8H, v16.8H // ........................e.................................................... + sub v16.8H, v25.8H, v16.8H // .......................e..................................................... + // gap // ............................................................................. + // gap // ............................................................................. + add v19.8H, v5.8H, v13.8H // ...................e......................................................... + sub v25.8H, v5.8H, v13.8H // ..................e.......................................................... + // gap // ............................................................................. + // gap // ............................................................................. + sqrdmulh v26.8H, v16.8H, v28.8H // ..........................e.................................................. + mul v10.8H, v16.8H, v29.8H // .........................e................................................... + // gap // ............................................................................. + // gap // ............................................................................. + sqrdmulh v16.8H, v25.8H, v1.8H // .....................e....................................................... + mul v13.8H, v25.8H, v3.8H // ....................e........................................................ + // gap // ............................................................................. 
+ // gap // ............................................................................. + mls v4.8H, v23.8H, v7.H[0] // ...........................................................*................. + mls v30.8H, v0.8H, v7.H[0] // ..............................................................*.............. + // gap // ............................................................................. + // gap // ............................................................................. + sub v23.8H, v19.8H, v2.8H // ............................e................................................ + // gap // ............................................................................. + // gap // ............................................................................. + mls v10.8H, v26.8H, v7.H[0] // ...........................e................................................. + mls v13.8H, v16.8H, v7.H[0] // ......................e...................................................... + // gap // ............................................................................. + mls v22.8H, v17.8H, v7.H[0] // ...................................................*......................... + // gap // ............................................................................. + mul v18.8H, v23.8H, v14.8H // ..............................e.............................................. + // gap // ............................................................................. + // gap // ............................................................................. + add v0.8H, v4.8H, v30.8H // ................................................................*............ + sqrdmulh v17.8H, v23.8H, v11.8H // ...............................e............................................. + sub v16.8H, v4.8H, v30.8H // ...............................................................*............. 
+ // gap // ............................................................................. + // gap // ............................................................................. + str q0, [x1], #(64) // .........................................................................*... + mls v20.8H, v21.8H, v7.H[0] // ........................................................*.................... + sub v26.8H, v13.8H, v10.8H // .................................e........................................... + // gap // ............................................................................. + mul v3.8H, v16.8H, v15.H[0] // .................................................................*........... + sqrdmulh v0.8H, v16.8H, v15.H[1] // ..................................................................*.......... + // gap // ............................................................................. + // gap // ............................................................................. + mul v23.8H, v26.8H, v14.8H // ...................................e......................................... + // gap // ............................................................................. + // gap // ............................................................................. + sqrdmulh v25.8H, v26.8H, v11.8H // ....................................e........................................ + add v11.8H, v13.8H, v10.8H // ..................................e.......................................... + mls v18.8H, v17.8H, v7.H[0] // ................................e............................................ + // gap // ............................................................................. + // gap // ............................................................................. + mls v3.8H, v0.8H, v7.H[0] // ...................................................................*......... 
+ sub v31.8H, v22.8H, v20.8H // ....................................................................*........ + // gap // ............................................................................. + // gap // ............................................................................. + mls v23.8H, v25.8H, v7.H[0] // .....................................e....................................... + add v2.8H, v19.8H, v2.8H // .............................e............................................... + // gap // ............................................................................. + // gap // ............................................................................. + mul v16.8H, v31.8H, v15.H[0] // ......................................................................*...... + // gap // ............................................................................. + sqrdmulh v0.8H, v31.8H, v15.H[1] // .......................................................................*..... + ldr q15, [x3], #16 // ..............................................e.............................. + str q3, [x1, #-32] // ...........................................................................*. + trn2 v26.4S, v2.4S, v11.4S // .......................................e..................................... + trn1 v30.4S, v2.4S, v11.4S // ......................................e...................................... + // gap // ............................................................................. + trn2 v14.4S, v18.4S, v23.4S // .........................................e................................... + trn1 v17.4S, v18.4S, v23.4S // ........................................e.................................... + // gap // ............................................................................. + // gap // ............................................................................. 
+ add v4.8H, v22.8H, v20.8H // .....................................................................*....... + // gap // ............................................................................. + // gap // ............................................................................. + mls v16.8H, v0.8H, v7.H[0] // ........................................................................*.... + trn1 v23.2D, v26.2D, v14.2D // .............................................e............................... + trn1 v2.2D, v30.2D, v17.2D // ............................................e................................ + // gap // ............................................................................. + // gap // ............................................................................. + str q4, [x1, #-48] // ..........................................................................*.. + trn2 v21.2D, v26.2D, v14.2D // ...........................................e................................. + trn2 v20.2D, v30.2D, v17.2D // ..........................................e.................................. + // gap // ............................................................................. + add v4.8H, v2.8H, v23.8H // ................................................e............................ + sub v26.8H, v2.8H, v23.8H // ...............................................e............................. + str q16, [x1, #-16] // ............................................................................* + // gap // ............................................................................. + + // original source code + // ldr q8, [x1, #(16*0)] // ...e.......................................................................|....e....................................................................... 
+ // ldr q9, [x1, #(16*1)] // .....e.....................................................................|......e..................................................................... + // ldr q10, [x1, #(16*2)] // .e.........................................................................|..e......................................................................... + // ldr q11, [x1, #(16*3)] // e..........................................................................|.e.......................................................................... + // trn1 v25.4s, v8.4s, v9.4s // .................e.........................................................|..................e......................................................... + // trn2 v26.4s, v8.4s, v9.4s // ..................e........................................................|...................e........................................................ + // trn1 v27.4s, v10.4s, v11.4s // .............e.............................................................|..............e............................................................. + // trn2 v28.4s, v10.4s, v11.4s // ..............e............................................................|...............e............................................................ + // trn2 v10.2d, v25.2d, v27.2d // ......................e....................................................|.......................e.................................................... + // trn2 v11.2d, v26.2d, v28.2d // .......................e...................................................|........................e................................................... + // trn1 v8.2d, v25.2d, v27.2d // .........................e.................................................|..........................e................................................. 
+ // trn1 v9.2d, v26.2d, v28.2d // ........................e..................................................|.........................e.................................................. + // ldr q0, [x4], #(6*16) // .........e.................................................................|..........e................................................................. + // ldr q4, [x4, #(-6*16 + 1*16)] // .....................e.....................................................|......................e..................................................... + // ldr q1, [x4, #(-6*16 + 2*16)] // ................e..........................................................|.................e.......................................................... + // ldr q5, [x4, #(-6*16 + 3*16)] // ...............e...........................................................|................e........................................................... + // ldr q2, [x4, #(-6*16 + 4*16)] // ........e..................................................................|.........e.................................................................. + // ldr q6, [x4, #(-6*16 + 5*16)] // ...........e...............................................................|............e............................................................... + // sub v24.8h, v8.8h, v9.8h // .............................e.............................................|..............................e............................................. + // add v8.8h, v8.8h, v9.8h // ............................e..............................................|.............................e.............................................. + // mul v9.8h, v24.8h, v1.8h // .................................e.........................................|..................................e......................................... 
+ // sqrdmulh v24.8h, v24.8h, v5.8h // ................................e..........................................|.................................e.......................................... + // mls v9.8h, v24.8h, v7.h[0] // ......................................e....................................|.......................................e.................................... + // sub v24.8h, v10.8h, v11.8h // ...........................e...............................................|............................e............................................... + // add v10.8h, v10.8h, v11.8h // ..........................e................................................|...........................e................................................ + // mul v11.8h, v24.8h, v2.8h // ...............................e...........................................|................................e........................................... + // sqrdmulh v24.8h, v24.8h, v6.8h // ..............................e............................................|...............................e............................................ + // mls v11.8h, v24.8h, v7.h[0] // .....................................e.....................................|......................................e..................................... + // sub v24.8h, v8.8h, v10.8h // ....................................e......................................|.....................................e...................................... + // add v8.8h, v8.8h, v10.8h // ........................................................e..................|.........................................................e.................. + // mul v10.8h, v24.8h, v0.8h // ........................................e..................................|.........................................e.................................. 
+ // sqrdmulh v24.8h, v24.8h, v4.8h // ..........................................e................................|...........................................e................................ + // mls v10.8h, v24.8h, v7.h[0] // ....................................................e......................|.....................................................e...................... + // sub v24.8h, v9.8h, v11.8h // ..............................................e............................|...............................................e............................ + // add v9.8h, v9.8h, v11.8h // ...................................................e.......................|....................................................e....................... + // mul v11.8h, v24.8h, v0.8h // .................................................e.........................|..................................................e......................... + // sqrdmulh v24.8h, v24.8h, v4.8h // ..................................................e........................|...................................................e........................ + // mls v11.8h, v24.8h, v7.h[0] // .......................................................e...................|........................................................e................... + // trn1 v25.4s, v8.4s, v9.4s // ..............................................................e............|...............................................................e............ + // trn2 v26.4s, v8.4s, v9.4s // .............................................................e.............|..............................................................e............. + // trn1 v27.4s, v10.4s, v11.4s // ................................................................e..........|.................................................................e.......... 
+ // trn2 v28.4s, v10.4s, v11.4s // ...............................................................e...........|................................................................e........... + // trn2 v10.2d, v25.2d, v27.2d // .......................................................................e...|........................................................................e... + // trn2 v11.2d, v26.2d, v28.2d // ......................................................................e....|.......................................................................e.... + // trn1 v8.2d, v25.2d, v27.2d // ....................................................................e......|.....................................................................e...... + // trn1 v9.2d, v26.2d, v28.2d // ...................................................................e.......|....................................................................e....... + // ldr q0, [x3], #16 // ...........................................................e...............|............................................................e............... + // sub v24.8h, v8.8h, v9.8h // .........................................................................e.|..........................................................................e. + // add v8.8h, v8.8h, v9.8h // ........................................................................e..|.........................................................................e.. + // mul v9.8h, v24.8h, v0.h[2] // .......*...................................................................|........*................................................................... + // sqrdmulh v24.8h, v24.8h, v0.h[3] // ..*........................................................................|...*........................................................................ 
+ // mls v9.8h, v24.8h, v7.h[0] // .......................................*...................................|........................................*................................... + // sub v24.8h, v10.8h, v11.8h // ...........................................................................*............................................................................ + // add v10.8h, v10.8h, v11.8h // ...........................................................................|*........................................................................... + // mul v11.8h, v24.8h, v0.h[4] // ......*....................................................................|.......*.................................................................... + // sqrdmulh v24.8h, v24.8h, v0.h[5] // ..........*................................................................|...........*................................................................ + // mls v11.8h, v24.8h, v7.h[0] // .............................................*.............................|..............................................*............................. + // sqdmulh v25.8h, v8.8h, v7.h[1] // ....*......................................................................|.....*...................................................................... + // srshr v25.8h, v25.8h, #11 // ...................*.......................................................|....................*....................................................... + // mls v8.8h, v25.8h, v7.h[0] // ..................................*........................................|...................................*........................................ + // sqdmulh v25.8h, v10.8h, v7.h[1] // ............*..............................................................|.............*.............................................................. 
+ // srshr v25.8h, v25.8h, #11 // ....................*......................................................|.....................*...................................................... + // mls v10.8h, v25.8h, v7.h[0] // ...................................*.......................................|....................................*....................................... + // sub v24.8h, v8.8h, v10.8h // ...........................................*...............................|............................................*............................... + // add v8.8h, v8.8h, v10.8h // .........................................*.................................|..........................................*................................. + // mul v10.8h, v24.8h, v0.h[0] // ...............................................*...........................|................................................*........................... + // sqrdmulh v24.8h, v24.8h, v0.h[1] // ................................................*..........................|.................................................*.......................... + // mls v10.8h, v24.8h, v7.h[0] // .....................................................*.....................|......................................................*..................... + // sub v24.8h, v9.8h, v11.8h // ......................................................*....................|.......................................................*.................... + // add v9.8h, v9.8h, v11.8h // .................................................................*.........|..................................................................*......... + // mul v11.8h, v24.8h, v0.h[0] // .........................................................*.................|..........................................................*................. 
+ // sqrdmulh v24.8h, v24.8h, v0.h[1] // ..........................................................*................|...........................................................*................ + // mls v11.8h, v24.8h, v7.h[0] // ..................................................................*........|...................................................................*........ + // str q8, [x1], #(64) // ............................................*..............................|.............................................*.............................. + // str q9, [x1, #(-64 + 16*1)] // .....................................................................*.....|......................................................................*..... + // str q10, [x1, #(-64 + 16*2)] // ............................................................*..............|.............................................................*.............. + // str q11, [x1, #(-64 + 16*3)] // ..........................................................................*|...........................................................................* + + sub count, count, #1 + cbnz count, layer4567_start + sub v0.8H, v20.8H, v21.8H // *........................... + add v2.8H, v20.8H, v21.8H // .*.......................... + // gap // ............................ + // gap // ............................ + sqdmulh v16.8H, v4.8H, v7.H[1] // ...*........................ + sqrdmulh v23.8H, v26.8H, v15.H[3] // ..*......................... + // gap // ............................ + // gap // ............................ + mul v21.8H, v26.8H, v15.H[2] // .....*...................... + sqdmulh v26.8H, v2.8H, v7.H[1] // .......*.................... + // gap // ............................ + // gap // ............................ + mul v20.8H, v0.8H, v15.H[4] // ....*....................... + sqrdmulh v0.8H, v0.8H, v15.H[5] // ......*..................... + // gap // ............................ 
+ // gap // ............................ + srshr v16.8H, v16.8H, #11 // ........*................... + // gap // ............................ + // gap // ............................ + // gap // ............................ + mls v21.8H, v23.8H, v7.H[0] // ............*............... + srshr v23.8H, v26.8H, #11 // .........*.................. + // gap // ............................ + // gap // ............................ + mls v20.8H, v0.8H, v7.H[0] // ................*........... + // gap // ............................ + // gap // ............................ + // gap // ............................ + mls v4.8H, v16.8H, v7.H[0] // ..........*................. + // gap // ............................ + // gap // ............................ + // gap // ............................ + mls v2.8H, v23.8H, v7.H[0] // ...........*................ + // gap // ............................ + // gap // ............................ + // gap // ............................ + sub v16.8H, v21.8H, v20.8H // ....................*....... + add v0.8H, v21.8H, v20.8H // ........................*... + // gap // ............................ + // gap // ............................ + // gap // ............................ + // gap // ............................ + // gap // ............................ + // gap // ............................ + sub v23.8H, v4.8H, v2.8H // ..............*............. + add v2.8H, v4.8H, v2.8H // .............*.............. + str q0, [x1, #16] // ..........................*. + // gap // ............................ + mul v0.8H, v16.8H, v15.H[0] // .....................*...... + sqrdmulh v16.8H, v16.8H, v15.H[1] // ......................*..... + // gap // ............................ + // gap // ............................ + mul v21.8H, v23.8H, v15.H[0] // .................*.......... + sqrdmulh v23.8H, v23.8H, v15.H[1] // ..................*......... + str q2, [x1], #(64) // ...............*............ + // gap // ............................ 
+ // gap // ............................ + // gap // ............................ + // gap // ............................ + // gap // ............................ + mls v0.8H, v16.8H, v7.H[0] // .........................*.. + // gap // ............................ + // gap // ............................ + // gap // ............................ + mls v21.8H, v23.8H, v7.H[0] // ...................*........ + // gap // ............................ + // gap // ............................ + // gap // ............................ + // gap // ............................ + // gap // ............................ + // gap // ............................ + // gap // ............................ + str q0, [x1, #-16] // ...........................* + // gap // ............................ + // gap // ............................ + // gap // ............................ + str q21, [x1, #-32] // .......................*.... + // gap // ............................ + // gap // ............................ + // gap // ............................ + + // original source code + // sub v11.8H, v20.8H, v21.8H // *........................... + // add v30.8H, v20.8H, v21.8H // .*.......................... + // sqrdmulh v17.8H, v26.8H, v15.H[3] // ...*........................ + // sqdmulh v2.8H, v4.8H, v7.H[1] // ..*......................... + // mul v20.8H, v11.8H, v15.H[4] // ......*..................... + // mul v22.8H, v26.8H, v15.H[2] // ....*....................... + // sqrdmulh v21.8H, v11.8H, v15.H[5] // .......*.................... + // sqdmulh v0.8H, v30.8H, v7.H[1] // .....*...................... + // srshr v23.8H, v2.8H, #11 // ........*................... + // srshr v0.8H, v0.8H, #11 // ..........*................. + // mls v4.8H, v23.8H, v7.H[0] // ............*............... + // mls v30.8H, v0.8H, v7.H[0] // .............*.............. + // mls v22.8H, v17.8H, v7.H[0] // .........*.................. + // add v0.8H, v4.8H, v30.8H // .................*.......... 
+ // sub v16.8H, v4.8H, v30.8H // ................*........... + // str q0, [x1], #(64) // .......................*.... + // mls v20.8H, v21.8H, v7.H[0] // ...........*................ + // mul v3.8H, v16.8H, v15.H[0] // .....................*...... + // sqrdmulh v0.8H, v16.8H, v15.H[1] // ......................*..... + // mls v3.8H, v0.8H, v7.H[0] // .........................*.. + // sub v31.8H, v22.8H, v20.8H // ..............*............. + // mul v16.8H, v31.8H, v15.H[0] // ...................*........ + // sqrdmulh v0.8H, v31.8H, v15.H[1] // ....................*....... + // str q3, [x1, #-32] // ...........................* + // add v4.8H, v22.8H, v20.8H // ...............*............ + // mls v16.8H, v0.8H, v7.H[0] // ........................*... + // str q4, [x1, #-48] // ..................*......... + // str q16, [x1, #-16] // ..........................*. + + + // --------------------------------------------------------------------- + + ninv .req v29 // Setup for the layer123_start loop. Register alias: v29 is named ninv; the loop uses it as the plain multiplier (mul ..., v29.8H) + ninv_tw .req v30 // Register alias: v30 is named ninv_tw; the loop uses it as the sqrdmulh operand paired with ninv + + ASM_LOAD(xtmp, ninv_addr) // ASM_LOAD is defined outside this chunk; presumably places the address of ninv_addr in xtmp - TODO confirm + ld1r {ninv.8h}, [xtmp] // Load a single halfword from [xtmp] and replicate it into all 8 lanes of ninv + ASM_LOAD(xtmp, ninv_tw_addr) // Same pattern for the companion constant ninv_tw_addr + ld1r {ninv_tw.8h}, [xtmp] // Replicate that halfword into all 8 lanes of ninv_tw + + mov count, #4 // Initial value of the loop counter (count is an alias defined outside this chunk) + ASM_LOAD(r_ptr0, roots_l012) // Presumably points r_ptr0 at the roots_l012 table - TODO confirm + load_roots_123 // Macro defined outside this chunk; presumably fills v0/v1, whose lanes (v0.H[n], v1.H[n]) serve as multipliers below - TODO confirm + + .p2align 2 // Align to a 4-byte boundary. The straight-line code that follows runs once before the loop: it consists of the same 24 operations that the loop body flags with 'e' in its diagram, with load offsets 16 bytes lower, reordered relative to the listing under "original source code" + + ldr q28, [x0, #64] // .*...................... + ldr q2, [x0, #0] // *....................... + // gap // ........................ + // gap // ........................ + ldr q17, [x0, #128] // .....*.................. + ldr q19, [x0, #192] // ....*................... + // gap // ........................ + // gap // ........................ + ldr q11, [x0, #448] // ...........*............ + ldr q3, [x0, #320] // ..*..................... + // gap // ........................ + // gap // ........................ + ldr q5, [x0, #384] // ..............*......... + // gap // ........................ + // gap // ........................ + // gap // ........................ + add v12.8H, v2.8H, v28.8H // ...............*........ + sub v2.8H, v2.8H, v28.8H // ......*.................
+ ldr q15, [x0, #256] // ...*.................... + // gap // ........................ + add v10.8H, v17.8H, v19.8H // ........*............... + sub v31.8H, v17.8H, v19.8H // .........*.............. + // gap // ........................ + // gap // ........................ + sqrdmulh v8.8H, v2.8H, v0.H[7] // .......*................ + mul v14.8H, v2.8H, v0.H[6] // ..........*............. + // gap // ........................ + // gap // ........................ + sqrdmulh v22.8H, v31.8H, v1.H[1] // ................*....... + add v25.8H, v12.8H, v10.8H // ..................*..... + // gap // ........................ + // gap // ........................ + mul v20.8H, v31.8H, v1.H[0] // ............*........... + sub v18.8H, v15.8H, v3.8H // .............*.......... + // gap // ........................ + // gap // ........................ + sqdmulh v2.8H, v25.8H, v7.H[1] // ......................*. + sub v9.8H, v12.8H, v10.8H // .................*...... + // gap // ........................ + // gap // ........................ + sqrdmulh v6.8H, v18.8H, v1.H[3] // .....................*.. + mls v14.8H, v8.8H, v7.H[0] // ...................*.... + // gap // ........................ + // gap // ........................ + mls v20.8H, v22.8H, v7.H[0] // ....................*... + // gap // ........................ + // gap // ........................ + sqrdmulh v8.8H, v9.8H, v0.H[3] // .......................* + + // original source code + // ldr q31, [x0, #0] // .*...................... + // ldr q28, [x0, #64] // *....................... + // ldr q3, [x0, #320] // .....*.................. + // ldr q15, [x0, #256] // .........*.............. + // ldr q10, [x0, #192] // ...*.................... + // ldr q21, [x0, #128] // ..*..................... + // sub v11.8H, v31.8H, v28.8H // ........*............... + // sqrdmulh v4.8H, v11.8H, v0.H[7] // ............*........... + // add v27.8H, v21.8H, v10.8H // ..........*.............
+ // sub v6.8H, v21.8H, v10.8H // ...........*............ + // mul v14.8H, v11.8H, v0.H[6] // .............*.......... + // ldr q11, [x0, #448] // ....*................... + // mul v20.8H, v6.8H, v1.H[0] // ................*....... + // sub v18.8H, v15.8H, v3.8H // .................*...... + // ldr q5, [x0, #384] // ......*................. + // add v12.8H, v31.8H, v28.8H // .......*................ + // sqrdmulh v2.8H, v6.8H, v1.H[1] // ..............*......... + // sub v9.8H, v12.8H, v27.8H // ...................*.... + // add v25.8H, v12.8H, v27.8H // ...............*........ + // mls v14.8H, v4.8H, v7.H[0] // .....................*.. + // mls v20.8H, v2.8H, v7.H[0] // ......................*. + // sqrdmulh v6.8H, v18.8H, v1.H[3] // ....................*... + // sqdmulh v2.8H, v25.8H, v7.H[1] // ..................*..... + // sqrdmulh v8.8H, v9.8H, v0.H[3] // .......................* + + sub count, count, #1 // Counter drops from 4 to 3 before the loop is entered. NOTE(review): presumably because the 'e'-flagged work was already issued above and a tail sequence outside this chunk finishes the last block - confirm against the loop end +layer123_start: + add v17.8H, v15.8H, v3.8H // ...................*.......................................................................... + mul v22.8H, v18.8H, v1.H[2] // ....................*......................................................................... + ldr q31, [x0, #16] // e............................................................................................. + ldr q28, [x0, #80] // .e............................................................................................ + add v23.8H, v5.8H, v11.8H // ........................*..................................................................... + ldr q3, [x0, #336] // .....e........................................................................................ + ldr q15, [x0, #272] // ....e......................................................................................... + sub v18.8H, v14.8H, v20.8H // .................................*............................................................
+ ldr q10, [x0, #208] // ...e.......................................................................................... + ldr q21, [x0, #144] // ..e........................................................................................... + sub v4.8H, v5.8H, v11.8H // .......................*...................................................................... + mul v19.8H, v9.8H, v0.H[2] // ..............................*............................................................... + mls v22.8H, v6.8H, v7.H[0] // ......................*....................................................................... + add v6.8H, v17.8H, v23.8H // .......................................*...................................................... + // gap // .............................................................................................. + // gap // .............................................................................................. + sub v11.8H, v31.8H, v28.8H // ........e..................................................................................... + // gap // .............................................................................................. + // gap // .............................................................................................. + sqrdmulh v24.8H, v18.8H, v0.H[3] // ....................................*......................................................... + sub v27.8H, v17.8H, v23.8H // ......................................*....................................................... + sqdmulh v16.8H, v6.8H, v7.H[1] // ...................................................*.......................................... + // gap // .............................................................................................. + // gap // .............................................................................................. 
+ // gap // .............................................................................................. + // gap // .............................................................................................. + mul v13.8H, v4.8H, v1.H[4] // .........................*.................................................................... + sqrdmulh v26.8H, v4.8H, v1.H[5] // ..........................*................................................................... + mul v17.8H, v18.8H, v0.H[2] // ...................................*.......................................................... + // gap // .............................................................................................. + // gap // .............................................................................................. + mul v12.8H, v27.8H, v0.H[4] // ........................................*..................................................... + srshr v16.8H, v16.8H, #11 // ....................................................*......................................... + // gap // .............................................................................................. + // gap // .............................................................................................. + srshr v2.8H, v2.8H, #11 // .................................................*............................................ + mls v13.8H, v26.8H, v7.H[0] // ...........................*.................................................................. + sqrdmulh v23.8H, v27.8H, v0.H[5] // .........................................*.................................................... + // gap // .............................................................................................. + // gap // .............................................................................................. 
+ mls v19.8H, v8.8H, v7.H[0] // ................................*............................................................. + sqrdmulh v4.8H, v11.8H, v0.H[7] // ...........e.................................................................................. + // gap // .............................................................................................. + // gap // .............................................................................................. + mls v25.8H, v2.8H, v7.H[0] // ..................................................*........................................... + // gap // .............................................................................................. + // gap // .............................................................................................. + mls v6.8H, v16.8H, v7.H[0] // .....................................................*........................................ + mls v12.8H, v23.8H, v7.H[0] // ..........................................*................................................... + mls v17.8H, v24.8H, v7.H[0] // .....................................*........................................................ + // gap // .............................................................................................. + // gap // .............................................................................................. + add v26.8H, v22.8H, v13.8H // ............................................*................................................. + add v27.8H, v21.8H, v10.8H // ..............e............................................................................... + // gap // .............................................................................................. + // gap // .............................................................................................. 
+ sub v18.8H, v25.8H, v6.8H // ......................................................*....................................... + add v25.8H, v25.8H, v6.8H // .......................................................*...................................... + // gap // .............................................................................................. + // gap // .............................................................................................. + add v8.8H, v19.8H, v12.8H // .................................................................*............................ + sub v23.8H, v19.8H, v12.8H // ................................................................*............................. + // gap // .............................................................................................. + // gap // .............................................................................................. + mul v5.8H, v18.8H, v0.H[0] // ........................................................*..................................... + // gap // .............................................................................................. + // gap // .............................................................................................. + sqrdmulh v2.8H, v25.8H, v30.8H // ...............................................................................*.............. + mul v25.8H, v25.8H, v29.8H // ..............................................................................*............... + // gap // .............................................................................................. + // gap // .............................................................................................. + sqrdmulh v16.8H, v8.8H, v30.8H // .....................................................................................*........ 
+ sub v6.8H, v21.8H, v10.8H // .............e................................................................................ + // gap // .............................................................................................. + // gap // .............................................................................................. + sqrdmulh v9.8H, v23.8H, v0.H[1] // ...................................................................*.......................... + add v24.8H, v14.8H, v20.8H // ..................................*........................................................... + // gap // .............................................................................................. + // gap // .............................................................................................. + mul v19.8H, v8.8H, v29.8H // ....................................................................................*......... + mls v25.8H, v2.8H, v7.H[0] // ................................................................................*............. + // gap // .............................................................................................. + // gap // .............................................................................................. + sub v12.8H, v22.8H, v13.8H // ...........................................*.................................................. + sqrdmulh v13.8H, v18.8H, v0.H[1] // .........................................................*.................................... + // gap // .............................................................................................. + // gap // .............................................................................................. + sub v22.8H, v24.8H, v26.8H // ...........................................................*.................................. 
+ sqrdmulh v20.8H, v12.8H, v0.H[5] // ..............................................*............................................... + mul v21.8H, v12.8H, v0.H[4] // .............................................*................................................ + // gap // .............................................................................................. + // gap // .............................................................................................. + sqrdmulh v10.8H, v22.8H, v0.H[1] // ..............................................................*............................... + str q25, [x0], #(16) // ..........................................................................................*... + mls v19.8H, v16.8H, v7.H[0] // ......................................................................................*....... + // gap // .............................................................................................. + add v2.8H, v24.8H, v26.8H // ............................................................*................................. + mul v14.8H, v11.8H, v0.H[6] // ..........e................................................................................... + ldr q11, [x0, #448] // .......e...................................................................................... + // gap // .............................................................................................. + mul v25.8H, v22.8H, v0.H[0] // .............................................................*................................ + mls v21.8H, v20.8H, v7.H[0] // ...............................................*.............................................. + // gap // .............................................................................................. + // gap // .............................................................................................. 
+ sqrdmulh v24.8H, v2.8H, v30.8H // ..................................................................................*........... + str q19, [x0, #112] // ............................................................................................*. + mul v20.8H, v6.8H, v1.H[0] // ...............e.............................................................................. + // gap // .............................................................................................. + sub v18.8H, v15.8H, v3.8H // ..................e........................................................................... + // gap // .............................................................................................. + // gap // .............................................................................................. + mls v5.8H, v13.8H, v7.H[0] // ..........................................................*................................... + add v26.8H, v17.8H, v21.8H // ......................................................................*....................... + sub v13.8H, v17.8H, v21.8H // .....................................................................*........................ + // gap // .............................................................................................. + // gap // .............................................................................................. + mul v17.8H, v2.8H, v29.8H // .................................................................................*............ + mls v25.8H, v10.8H, v7.H[0] // ...............................................................*.............................. + // gap // .............................................................................................. + // gap // .............................................................................................. 
+ str q5, [x0, #240] // ..........................................................................*................... + ldr q5, [x0, #384] // ......e....................................................................................... + mul v10.8H, v13.8H, v0.H[0] // .......................................................................*...................... + sqrdmulh v2.8H, v13.8H, v0.H[1] // ........................................................................*..................... + mul v16.8H, v26.8H, v29.8H // .......................................................................................*...... + mul v13.8H, v23.8H, v0.H[0] // ..................................................................*........................... + // gap // .............................................................................................. + // gap // .............................................................................................. + str q25, [x0, #304] // ...........................................................................*.................. + sqrdmulh v22.8H, v26.8H, v30.8H // ........................................................................................*..... + mls v17.8H, v24.8H, v7.H[0] // ...................................................................................*.......... + // gap // .............................................................................................. + add v12.8H, v31.8H, v28.8H // .........e.................................................................................... + // gap // .............................................................................................. + // gap // .............................................................................................. + mls v10.8H, v2.8H, v7.H[0] // .........................................................................*.................... 
+ sqrdmulh v2.8H, v6.8H, v1.H[1] // ................e............................................................................. + mls v13.8H, v9.8H, v7.H[0] // ....................................................................*......................... + // gap // .............................................................................................. + // gap // .............................................................................................. + str q17, [x0, #48] // ...........................................................................................*.. + // gap // .............................................................................................. + sub v9.8H, v12.8H, v27.8H // ............................e................................................................. + mls v16.8H, v22.8H, v7.H[0] // .........................................................................................*.... + add v25.8H, v12.8H, v27.8H // .............................e................................................................ + str q10, [x0, #432] // .............................................................................*................ + mls v14.8H, v4.8H, v7.H[0] // ............e................................................................................. + // gap // .............................................................................................. + mls v20.8H, v2.8H, v7.H[0] // .................e............................................................................ + str q13, [x0, #368] // ............................................................................*................. + sqrdmulh v6.8H, v18.8H, v1.H[3] // .....................e........................................................................ + // gap // .............................................................................................. 
+ str q16, [x0, #176] // .............................................................................................* + sqdmulh v2.8H, v25.8H, v7.H[1] // ................................................e............................................. + // gap // .............................................................................................. + sqrdmulh v8.8H, v9.8H, v0.H[3] // ...............................e.............................................................. + + // original source code + // ldr q8, [x0, #0] // e...........................................................................................|.e......................................................................................... + // ldr q9, [x0, #(1*(512/8))] // .e..........................................................................................|..e........................................................................................ + // ldr q10, [x0, #(2*(512/8))] // .......e....................................................................................|........e.................................................................................. + // ldr q11, [x0, #(3*(512/8))] // ......e.....................................................................................|.......e................................................................................... + // ldr q12, [x0, #(4*(512/8))] // ....e.......................................................................................|.....e..................................................................................... + // ldr q13, [x0, #(5*(512/8))] // ...e........................................................................................|....e...................................................................................... 
+ // ldr q14, [x0, #(6*(512/8))] // ....................................................................e.......................|.....................................................................e..................... + // ldr q15, [x0, #(7*(512/8))] // .......................................................e....................................|........................................................e.................................. + // sub v24.8h, v8.8h, v9.8h // ............e...............................................................................|.............e............................................................................. + // add v8.8h, v8.8h, v9.8h // ............................................................................e...............|.............................................................................e............. + // mul v9.8h, v24.8h, v0.h[6] // ......................................................e.....................................|.......................................................e................................... + // sqrdmulh v24.8h, v24.8h, v0.h[7] // .........................e..................................................................|..........................e................................................................ + // mls v9.8h, v24.8h, v7.h[0] // .....................................................................................e......|......................................................................................e.... + // sub v24.8h, v10.8h, v11.8h // ........................................e...................................................|.........................................e................................................. + // add v10.8h, v10.8h, v11.8h // ...............................e............................................................|................................e.......................................................... 
+ // mul v11.8h, v24.8h, v1.h[0] // ............................................................e...............................|.............................................................e............................. + // sqrdmulh v24.8h, v24.8h, v1.h[1] // ..............................................................................e.............|...............................................................................e........... + // mls v11.8h, v24.8h, v7.h[0] // ......................................................................................e.....|.......................................................................................e... + // sub v24.8h, v12.8h, v13.8h // .............................................................e..............................|..............................................................e............................ + // add v12.8h, v12.8h, v13.8h // ............................................................................................*........................................................................................... + // mul v13.8h, v24.8h, v1.h[2] // ............................................................................................|*.......................................................................................... + // sqrdmulh v24.8h, v24.8h, v1.h[3] // ........................................................................................e...|.........................................................................................e. + // mls v13.8h, v24.8h, v7.h[0] // ..........*.................................................................................|...........*............................................................................... + // sub v24.8h, v14.8h, v15.8h // ........*...................................................................................|.........*................................................................................. 
+ // add v14.8h, v14.8h, v15.8h // ..*.........................................................................................|...*....................................................................................... + // mul v15.8h, v24.8h, v1.h[4] // ................*...........................................................................|.................*......................................................................... + // sqrdmulh v24.8h, v24.8h, v1.h[5] // .................*..........................................................................|..................*........................................................................ + // mls v15.8h, v24.8h, v7.h[0] // ......................*.....................................................................|.......................*................................................................... + // sub v24.8h, v8.8h, v10.8h // .................................................................................e..........|..................................................................................e........ + // add v8.8h, v8.8h, v10.8h // ...................................................................................e........|....................................................................................e...... + // mul v10.8h, v24.8h, v0.h[2] // .........*..................................................................................|..........*................................................................................ + // sqrdmulh v24.8h, v24.8h, v0.h[3] // ...........................................................................................e|........................................................................................... + // mls v10.8h, v24.8h, v7.h[0] // ........................*...................................................................|.........................*................................................................. 
+ // sub v24.8h, v9.8h, v11.8h // .....*......................................................................................|......*.................................................................................... + // add v9.8h, v9.8h, v11.8h // ..........................................*.................................................|...........................................*............................................... + // mul v11.8h, v24.8h, v0.h[2] // ..................*.........................................................................|...................*....................................................................... + // sqrdmulh v24.8h, v24.8h, v0.h[3] // .............*..............................................................................|..............*............................................................................ + // mls v11.8h, v24.8h, v7.h[0] // .............................*..............................................................|..............................*............................................................ + // sub v24.8h, v12.8h, v14.8h // ..............*.............................................................................|...............*........................................................................... + // add v12.8h, v12.8h, v14.8h // ...........*................................................................................|............*.............................................................................. + // mul v14.8h, v24.8h, v0.h[4] // ...................*........................................................................|....................*...................................................................... + // sqrdmulh v24.8h, v24.8h, v0.h[5] // .......................*....................................................................|........................*.................................................................. 
+ // mls v14.8h, v24.8h, v7.h[0] // ............................*...............................................................|.............................*............................................................. + // sub v24.8h, v13.8h, v15.8h // .............................................*..............................................|..............................................*............................................ + // add v13.8h, v13.8h, v15.8h // ..............................*.............................................................|...............................*........................................................... + // mul v15.8h, v24.8h, v0.h[4] // .................................................*..........................................|..................................................*........................................ + // sqrdmulh v24.8h, v24.8h, v0.h[5] // ................................................*...........................................|.................................................*......................................... + // mls v15.8h, v24.8h, v7.h[0] // .........................................................*..................................|..........................................................*................................ + // sqdmulh v25.8h, v8.8h, v7.h[1] // ..........................................................................................e.|........................................................................................... + // srshr v25.8h, v25.8h, #11 // .....................*......................................................................|......................*.................................................................... + // mls v8.8h, v25.8h, v7.h[0] // ..........................*.................................................................|...........................*............................................................... 
+ // sqdmulh v25.8h, v12.8h, v7.h[1] // ...............*............................................................................|................*.......................................................................... + // srshr v25.8h, v25.8h, #11 // ....................*.......................................................................|.....................*..................................................................... + // mls v12.8h, v25.8h, v7.h[0] // ...........................*................................................................|............................*.............................................................. + // sub v24.8h, v8.8h, v12.8h // ................................*...........................................................|.................................*......................................................... + // add v8.8h, v8.8h, v12.8h // .................................*..........................................................|..................................*........................................................ + // mul v12.8h, v24.8h, v0.h[0] // ....................................*.......................................................|.....................................*..................................................... + // sqrdmulh v24.8h, v24.8h, v0.h[1] // ..............................................*.............................................|...............................................*........................................... + // mls v12.8h, v24.8h, v7.h[0] // ..............................................................*.............................|...............................................................*........................... + // sub v24.8h, v9.8h, v13.8h // ...............................................*............................................|................................................*.......................................... 
+ // add v9.8h, v9.8h, v13.8h // .....................................................*......................................|......................................................*.................................... + // mul v13.8h, v24.8h, v0.h[0] // ........................................................*...................................|.........................................................*................................. + // sqrdmulh v24.8h, v24.8h, v0.h[1] // ..................................................*.........................................|...................................................*....................................... + // mls v13.8h, v24.8h, v7.h[0] // ..................................................................*.........................|...................................................................*....................... + // sub v24.8h, v10.8h, v14.8h // ...................................*........................................................|....................................*...................................................... + // add v10.8h, v10.8h, v14.8h // ..................................*.........................................................|...................................*....................................................... + // mul v14.8h, v24.8h, v0.h[0] // ........................................................................*...................|.........................................................................*................. + // sqrdmulh v24.8h, v24.8h, v0.h[1] // .........................................*..................................................|..........................................*................................................ + // mls v14.8h, v24.8h, v7.h[0] // ...............................................................................*............|................................................................................*.......... 
+ // sub v24.8h, v11.8h, v15.8h // ................................................................*...........................|.................................................................*......................... + // add v11.8h, v11.8h, v15.8h // ...............................................................*............................|................................................................*.......................... + // mul v15.8h, v24.8h, v0.h[0] // .....................................................................*......................|......................................................................*.................... + // sqrdmulh v24.8h, v24.8h, v0.h[1] // ......................................................................*.....................|.......................................................................*................... + // mls v15.8h, v24.8h, v7.h[0] // .............................................................................*..............|..............................................................................*............ + // str q12, [x0, #(4*(512/8))] // ...................................................................*........................|....................................................................*...................... + // str q13, [x0, #(5*(512/8))] // .........................................................................*..................|..........................................................................*................ + // str q14, [x0, #(6*(512/8))] // .......................................................................................*....|........................................................................................*.. + // str q15, [x0, #(7*(512/8))] // ....................................................................................*.......|.....................................................................................*..... 
+ // mul v12.8h, v8.8h, v29.8h // ......................................*.....................................................|.......................................*................................................... + // sqrdmulh v8.8h, v8.8h, v30.8h // .....................................*......................................................|......................................*.................................................... + // mls v12.8h, v8.8h, v7.h[0] // ............................................*...............................................|.............................................*............................................. + // mul v13.8h, v9.8h, v29.8h // .................................................................*..........................|..................................................................*........................ + // sqrdmulh v9.8h, v9.8h, v30.8h // ..........................................................*.................................|...........................................................*............................... + // mls v13.8h, v9.8h, v7.h[0] // ...........................................................................*................|............................................................................*.............. + // mul v14.8h, v10.8h, v29.8h // ...........................................*................................................|............................................*.............................................. + // sqrdmulh v10.8h, v10.8h, v30.8h // .......................................*....................................................|........................................*.................................................. + // mls v14.8h, v10.8h, v7.h[0] // ....................................................*.......................................|.....................................................*..................................... 
+ // mul v15.8h, v11.8h, v29.8h // .......................................................................*....................|........................................................................*.................. + // sqrdmulh v11.8h, v11.8h, v30.8h // ..........................................................................*.................|...........................................................................*............... + // mls v15.8h, v11.8h, v7.h[0] // ..................................................................................*.........|...................................................................................*....... + // str q12, [x0], #(16) // ...................................................*........................................|....................................................*...................................... + // str q13, [x0, #(-16 + 1*(512/8))] // ................................................................................*...........|.................................................................................*......... + // str q14, [x0, #(-16 + 2*(512/8))] // ...........................................................*................................|............................................................*.............................. + // str q15, [x0, #(-16 + 3*(512/8))] // .........................................................................................*..|..........................................................................................* + + sub count, count, #1 + cbnz count, layer123_start + add v16.8H, v15.8H, v3.8H // *..................................................................... + mul v23.8H, v18.8H, v1.H[2] // .*.................................................................... + // gap // ...................................................................... + // gap // ...................................................................... 
+ mul v21.8H, v9.8H, v0.H[2] // .....*................................................................ + srshr v22.8H, v2.8H, #11 // ................*..................................................... + // gap // ...................................................................... + // gap // ...................................................................... + sub v10.8H, v14.8H, v20.8H // ...*.................................................................. + // gap // ...................................................................... + // gap // ...................................................................... + sub v15.8H, v5.8H, v11.8H // ....*................................................................. + add v20.8H, v14.8H, v20.8H // ..................................*................................... + add v18.8H, v5.8H, v11.8H // ..*................................................................... + // gap // ...................................................................... + // gap // ...................................................................... + mul v13.8H, v15.8H, v1.H[4] // ...........*.......................................................... + sqrdmulh v26.8H, v15.8H, v1.H[5] // ............*......................................................... + // gap // ...................................................................... + // gap // ...................................................................... + sub v19.8H, v16.8H, v18.8H // .........*............................................................ + add v16.8H, v16.8H, v18.8H // .......*.............................................................. + // gap // ...................................................................... + // gap // ...................................................................... + sqrdmulh v24.8H, v10.8H, v0.H[3] // ........*............................................................. 
+ mls v23.8H, v6.8H, v7.H[0] // ......*............................................................... + // gap // ...................................................................... + // gap // ...................................................................... + mls v13.8H, v26.8H, v7.H[0] // .................*.................................................... + sqdmulh v4.8H, v16.8H, v7.H[1] // ..........*........................................................... + // gap // ...................................................................... + // gap // ...................................................................... + mul v3.8H, v19.8H, v0.H[4] // ..............*....................................................... + sqrdmulh v2.8H, v19.8H, v0.H[5] // ..................*................................................... + // gap // ...................................................................... + // gap // ...................................................................... + mul v17.8H, v10.8H, v0.H[2] // .............*........................................................ + mls v25.8H, v22.8H, v7.H[0] // ....................*................................................. + // gap // ...................................................................... + // gap // ...................................................................... + srshr v11.8H, v4.8H, #11 // ...............*...................................................... + mls v21.8H, v8.8H, v7.H[0] // ...................*.................................................. + // gap // ...................................................................... + // gap // ...................................................................... + mls v3.8H, v2.8H, v7.H[0] // ......................*............................................... + add v2.8H, v23.8H, v13.8H // ........................*............................................. 
+ // gap // ...................................................................... + // gap // ...................................................................... + sub v23.8H, v23.8H, v13.8H // .....................................*................................ + mls v17.8H, v24.8H, v7.H[0] // .......................*.............................................. + // gap // ...................................................................... + // gap // ...................................................................... + mls v16.8H, v11.8H, v7.H[0] // .....................*................................................ + sub v4.8H, v20.8H, v2.8H // .......................................*.............................. + // gap // ...................................................................... + // gap // ...................................................................... + sqrdmulh v10.8H, v23.8H, v0.H[5] // ........................................*............................. + mul v23.8H, v23.8H, v0.H[4] // .........................................*............................ + // gap // ...................................................................... + // gap // ...................................................................... + add v26.8H, v21.8H, v3.8H // ...........................*.......................................... + add v2.8H, v20.8H, v2.8H // .............................................*........................ + // gap // ...................................................................... + // gap // ...................................................................... + sub v20.8H, v25.8H, v16.8H // .........................*............................................ + add v16.8H, v25.8H, v16.8H // ..........................*........................................... + // gap // ...................................................................... 
+ // gap // ...................................................................... + sqrdmulh v11.8H, v26.8H, v30.8H // ................................*..................................... + mul v26.8H, v26.8H, v29.8H // ...................................*.................................. + // gap // ...................................................................... + // gap // ...................................................................... + mul v13.8H, v20.8H, v0.H[0] // .............................*........................................ + sqrdmulh v20.8H, v20.8H, v0.H[1] // ......................................*............................... + // gap // ...................................................................... + // gap // ...................................................................... + sub v21.8H, v21.8H, v3.8H // ............................*......................................... + sqrdmulh v3.8H, v16.8H, v30.8H // ..............................*....................................... + // gap // ...................................................................... + // gap // ...................................................................... + mls v26.8H, v11.8H, v7.H[0] // ............................................*......................... + mul v16.8H, v16.8H, v29.8H // ...............................*...................................... + // gap // ...................................................................... + // gap // ...................................................................... + sqrdmulh v11.8H, v2.8H, v30.8H // ................................................*..................... + // gap // ...................................................................... + // gap // ...................................................................... + mls v13.8H, v20.8H, v7.H[0] // ..................................................*................... 
+ mul v2.8H, v2.8H, v29.8H // .....................................................*................ + mls v23.8H, v10.8H, v7.H[0] // ...............................................*...................... + // gap // ...................................................................... + // gap // ...................................................................... + str q26, [x0, #128] // .................................................*.................... + sqrdmulh v19.8H, v4.8H, v0.H[1] // ..........................................*........................... + // gap // ...................................................................... + mls v16.8H, v3.8H, v7.H[0] // ....................................*................................. + sqrdmulh v25.8H, v21.8H, v0.H[1] // .................................*.................................... + str q13, [x0, #256] // .......................................................*.............. + mul v21.8H, v21.8H, v0.H[0] // ...........................................................*.......... + // gap // ...................................................................... + // gap // ...................................................................... + // gap // ...................................................................... + // gap // ...................................................................... + mls v2.8H, v11.8H, v7.H[0] // ..............................................................*....... + str q16, [x0], #(16) // ...........................................*.......................... + sub v16.8H, v17.8H, v23.8H // ....................................................*................. + add v23.8H, v17.8H, v23.8H // ...................................................*.................. + // gap // ...................................................................... + mls v21.8H, v25.8H, v7.H[0] // ................................................................*..... 
+ mul v4.8H, v4.8H, v0.H[0] // ..............................................*....................... + // gap // ...................................................................... + // gap // ...................................................................... + mul v20.8H, v23.8H, v29.8H // ..........................................................*........... + sqrdmulh v23.8H, v23.8H, v30.8H // .............................................................*........ + str q2, [x0, #48] // .................................................................*.... + // gap // ...................................................................... + mul v26.8H, v16.8H, v0.H[0] // ........................................................*............. + sqrdmulh v16.8H, v16.8H, v0.H[1] // .........................................................*............ + // gap // ...................................................................... + // gap // ...................................................................... + str q21, [x0, #368] // ....................................................................*. + mls v4.8H, v19.8H, v7.H[0] // ......................................................*............... + // gap // ...................................................................... + // gap // ...................................................................... + mls v20.8H, v23.8H, v7.H[0] // ..................................................................*... + // gap // ...................................................................... + // gap // ...................................................................... + // gap // ...................................................................... + // gap // ...................................................................... + // gap // ...................................................................... 
+ // gap // ...................................................................... + mls v26.8H, v16.8H, v7.H[0] // ...............................................................*...... + str q4, [x0, #304] // ............................................................*......... + // gap // ...................................................................... + // gap // ...................................................................... + // gap // ...................................................................... + str q20, [x0, #176] // .....................................................................* + // gap // ...................................................................... + // gap // ...................................................................... + // gap // ...................................................................... + str q26, [x0, #432] // ...................................................................*.. + // gap // ...................................................................... + // gap // ...................................................................... + // gap // ...................................................................... + + // original source code + // add v17.8H, v15.8H, v3.8H // *..................................................................... + // mul v22.8H, v18.8H, v1.H[2] // .*.................................................................... + // add v23.8H, v5.8H, v11.8H // .......*.............................................................. + // sub v18.8H, v14.8H, v20.8H // ....*................................................................. + // sub v4.8H, v5.8H, v11.8H // .....*................................................................ + // mul v19.8H, v9.8H, v0.H[2] // ..*................................................................... + // mls v22.8H, v6.8H, v7.H[0] // .............*........................................................ 
+ // add v6.8H, v17.8H, v23.8H // ...........*.......................................................... + // sqrdmulh v24.8H, v18.8H, v0.H[3] // ............*......................................................... + // sub v27.8H, v17.8H, v23.8H // ..........*........................................................... + // sqdmulh v16.8H, v6.8H, v7.H[1] // ...............*...................................................... + // mul v13.8H, v4.8H, v1.H[4] // ........*............................................................. + // sqrdmulh v26.8H, v4.8H, v1.H[5] // .........*............................................................ + // mul v17.8H, v18.8H, v0.H[2] // ..................*................................................... + // mul v12.8H, v27.8H, v0.H[4] // ................*..................................................... + // srshr v16.8H, v16.8H, #11 // ....................*................................................. + // srshr v2.8H, v2.8H, #11 // ...*.................................................................. + // mls v13.8H, v26.8H, v7.H[0] // ..............*....................................................... + // sqrdmulh v23.8H, v27.8H, v0.H[5] // .................*.................................................... + // mls v19.8H, v8.8H, v7.H[0] // .....................*................................................ + // mls v25.8H, v2.8H, v7.H[0] // ...................*.................................................. + // mls v6.8H, v16.8H, v7.H[0] // ..........................*........................................... + // mls v12.8H, v23.8H, v7.H[0] // ......................*............................................... + // mls v17.8H, v24.8H, v7.H[0] // .........................*............................................ + // add v26.8H, v22.8H, v13.8H // .......................*.............................................. 
+ // sub v18.8H, v25.8H, v6.8H // ................................*..................................... + // add v25.8H, v25.8H, v6.8H // .................................*.................................... + // add v8.8H, v19.8H, v12.8H // ..............................*....................................... + // sub v23.8H, v19.8H, v12.8H // ......................................*............................... + // mul v5.8H, v18.8H, v0.H[0] // ....................................*................................. + // sqrdmulh v2.8H, v25.8H, v30.8H // .......................................*.............................. + // mul v25.8H, v25.8H, v29.8H // .........................................*............................ + // sqrdmulh v16.8H, v8.8H, v30.8H // ..................................*................................... + // sqrdmulh v9.8H, v23.8H, v0.H[1] // .................................................*.................... + // add v24.8H, v14.8H, v20.8H // ......*............................................................... + // mul v19.8H, v8.8H, v29.8H // ...................................*.................................. + // mls v25.8H, v2.8H, v7.H[0] // ................................................*..................... + // sub v12.8H, v22.8H, v13.8H // ........................*............................................. + // sqrdmulh v13.8H, v18.8H, v0.H[1] // .....................................*................................ + // sub v22.8H, v24.8H, v26.8H // ...........................*.......................................... + // sqrdmulh v20.8H, v12.8H, v0.H[5] // ............................*......................................... + // mul v21.8H, v12.8H, v0.H[4] // .............................*........................................ + // sqrdmulh v10.8H, v22.8H, v0.H[1] // ...............................................*...................... 
+ // str q25, [x0], #(16) // .....................................................*................ + // mls v19.8H, v16.8H, v7.H[0] // ........................................*............................. + // add v2.8H, v24.8H, v26.8H // ...............................*...................................... + // mul v25.8H, v22.8H, v0.H[0] // .........................................................*............ + // mls v21.8H, v20.8H, v7.H[0] // .............................................*........................ + // sqrdmulh v24.8H, v2.8H, v30.8H // ..........................................*........................... + // str q19, [x0, #112] // ..............................................*....................... + // mls v5.8H, v13.8H, v7.H[0] // ...........................................*.......................... + // add v26.8H, v17.8H, v21.8H // .......................................................*.............. + // sub v13.8H, v17.8H, v21.8H // ......................................................*............... + // mul v17.8H, v2.8H, v29.8H // ............................................*......................... + // mls v25.8H, v10.8H, v7.H[0] // ................................................................*..... + // str q5, [x0, #240] // ..................................................*................... + // mul v10.8H, v13.8H, v0.H[0] // .............................................................*........ + // sqrdmulh v2.8H, v13.8H, v0.H[1] // ..............................................................*....... + // mul v16.8H, v26.8H, v29.8H // ..........................................................*........... + // mul v13.8H, v23.8H, v0.H[0] // ...................................................*.................. + // str q25, [x0, #304] // ...................................................................*.. 
+ // sqrdmulh v22.8H, v26.8H, v30.8H // ...........................................................*.......... + // mls v17.8H, v24.8H, v7.H[0] // ....................................................*................. + // mls v10.8H, v2.8H, v7.H[0] // ..................................................................*... + // mls v13.8H, v9.8H, v7.H[0] // ........................................................*............. + // str q17, [x0, #48] // ............................................................*......... + // mls v16.8H, v22.8H, v7.H[0] // .................................................................*.... + // str q10, [x0, #432] // .....................................................................* + // str q13, [x0, #368] // ...............................................................*...... + // str q16, [x0, #176] // ....................................................................*. + + + pop_stack + ret \ No newline at end of file From c600d499bf3f6ada57272f413d8dbb1ee4b4fb80 Mon Sep 17 00:00:00 2001 From: Amin Abdulrahman Date: Sat, 23 Mar 2024 17:52:18 -0400 Subject: [PATCH 13/18] Format Kyber tests --- tests/ntt_kyber/main.c | 123 ++++++++++------------------------------- 1 file changed, 29 insertions(+), 94 deletions(-) diff --git a/tests/ntt_kyber/main.c b/tests/ntt_kyber/main.c index 52bd561..d8ac741 100644 --- a/tests/ntt_kyber/main.c +++ b/tests/ntt_kyber/main.c @@ -515,99 +515,40 @@ int main( void ) debug_printf( "ok\n" ); #if defined(DO_TEST) - if (test_ntt_asm() != 0) - { - return (1); - } - - if (test_ntt_asm_123_4567_scalar_load() != 0) - { - return (1); - } - - if (test_ntt_asm_123_4567_scalar_load_store() != 0) - { - return (1); - } - - if (test_ntt_asm_123_4567_scalar_store() != 0) - { - return (1); - } - - if (test_ntt_asm_1234_567() != 0) - { - return (1); - } - - if (test_ntt_asm_123_4567_inv() != 0) - { - return (1); - } + /* Clean */ + if (test_ntt_asm() != 0){return (1);} + if 
(test_ntt_asm_123_4567_scalar_load() != 0){return (1);} + if (test_ntt_asm_123_4567_scalar_load_store() != 0){return (1);} + if (test_ntt_asm_123_4567_scalar_store() != 0){return (1);} + if (test_ntt_asm_1234_567() != 0){return (1);} - if (test_ntt_asm_123_4567_inv_manual_ld4() != 0) - { - return (1); - } + if (test_ntt_asm_123_4567_inv() != 0){return (1);} + if (test_ntt_asm_123_4567_inv_manual_ld4() != 0){return (1);} if (test_ntt_asm_vs_pqclean_123_4567_inv() != 0){return (1);} if (test_ntt_asm_vs_pqclean_123_4567_inv_manual_ld4() != 0){return (1);} - if (test_ntt_asm_123_4567_manual_st4_opt_a55() != 0) - { - return (1); - } - - if (test_ntt_asm_123_4567_opt_a55() != 0) - { - return (1); - } - - if (test_ntt_asm_123_4567_scalar_load_opt_a55() != 0) - { - return (1); - } - - if (test_ntt_asm_123_4567_scalar_load_store_opt_a55() != 0) - { - return (1); - } - - if (test_ntt_asm_123_4567_scalar_store_opt_a55() != 0) - { - return (1); - } + /* A55 */ + if (test_ntt_asm_123_4567_manual_st4_opt_a55() != 0){return (1);} + if (test_ntt_asm_123_4567_opt_a55() != 0){return (1);} + if (test_ntt_asm_123_4567_scalar_load_opt_a55() != 0){return (1);} + if (test_ntt_asm_123_4567_scalar_load_store_opt_a55() != 0){return (1);} + if (test_ntt_asm_123_4567_scalar_store_opt_a55() != 0){return (1);} if (test_ntt_asm_123_4567_inv_opt_a55() != 0){return (1);} if (test_ntt_asm_123_4567_inv_manual_ld4_opt_a55() != 0){return (1);} + + /* A72 */ + if (test_ntt_asm_123_4567_manual_st4_opt_a72() != 0){return (1);} + if (test_ntt_asm_123_4567_opt_a72() != 0){return (1);} + if (test_ntt_asm_123_4567_scalar_load_opt_a72() != 0){return (1);} + if (test_ntt_asm_123_4567_scalar_load_store_opt_a72() != 0){return (1);} + if (test_ntt_asm_123_4567_scalar_store_opt_a72() != 0){return (1);} - if (test_ntt_asm_123_4567_manual_st4_opt_a72() != 0) - { - return (1); - } - - if (test_ntt_asm_123_4567_opt_a72() != 0) - { - return (1); - } - - if (test_ntt_asm_123_4567_scalar_load_opt_a72() != 0) - { - return 
(1); - } - - if (test_ntt_asm_123_4567_scalar_load_store_opt_a72() != 0) - { - return (1); - } - - if (test_ntt_asm_123_4567_scalar_store_opt_a72() != 0) - { - return (1); - } if (test_ntt_asm_123_4567_inv_opt_a72() != 0){return (1);} if (test_ntt_asm_123_4567_inv_manual_ld4_opt_a72() != 0){return (1);} - // M1 Firestorm + + /* M1 Firestorm */ if(test_ntt_asm_123_4567_opt_m1_firestorm() != 0){return (1);} if(test_ntt_asm_123_4567_scalar_load_opt_m1_firestorm() != 0){return (1);} if(test_ntt_asm_123_4567_scalar_load_store_opt_m1_firestorm() != 0){return (1);} @@ -615,10 +556,11 @@ int main( void ) if(test_ntt_asm_123_4567_scalar_store_opt_m1_firestorm() != 0){return (1);} /* if(test_ntt_asm_1234_567_opt_m1_firestorm() != 0){return (1);} */ /* if(test_ntt_asm_1234_567_manual_st4_opt_m1_firestorm() != 0){return (1);} */ + if (test_ntt_asm_123_4567_inv_opt_m1_firestorm() != 0){return (1);} if (test_ntt_asm_123_4567_inv_manual_ld4_opt_m1_firestorm() != 0){return (1);} - // M1 Icestorm + /* M1 Icestorm */ if(test_ntt_asm_123_4567_manual_st4_opt_m1_icestorm() != 0){return (1);} if(test_ntt_asm_123_4567_opt_m1_icestorm() != 0){return (1);} if(test_ntt_asm_123_4567_scalar_load_opt_m1_icestorm() != 0){return (1);} @@ -627,16 +569,9 @@ int main( void ) /* if(test_ntt_asm_1234_567_opt_m1_icestorm() != 0){return (1);} */ /* if(test_ntt_asm_1234_567_manual_st4_opt_m1_icestorm() != 0){return (1);} */ - /* Neon NTT */ - if(test_ntt_neonntt()!= 0) - { - return(1); - } - - if(test_ntt_neonntt_inv()!= 0) - { - return(1); - } + /* Other */ + if(test_ntt_neonntt()!= 0){return(1);} + if(test_ntt_neonntt_inv()!= 0){return(1);} if( test_ntt_pqclean()!= 0 ){return(1);} if( test_ntt_pqclean_inv()!= 0 ){return(1);} #endif /* DO_TEST */ @@ -689,7 +624,7 @@ int main( void ) bench_ntt_asm_123_4567_inv_opt_m1_icestorm(); bench_ntt_asm_123_4567_inv_manual_ld4_opt_m1_icestorm(); - + /* Other */ bench_ntt_neonntt(); bench_ntt_pqclean(); bench_ntt_neonntt_inv(); From 
785253f729ca14db93a820e0c9eeeecb61b4b198 Mon Sep 17 00:00:00 2001 From: Amin Abdulrahman Date: Sat, 23 Mar 2024 17:53:34 -0400 Subject: [PATCH 14/18] Add Icestorm Kyber test --- tests/ntt_kyber/main.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tests/ntt_kyber/main.c b/tests/ntt_kyber/main.c index d8ac741..1ef71e8 100644 --- a/tests/ntt_kyber/main.c +++ b/tests/ntt_kyber/main.c @@ -568,6 +568,9 @@ int main( void ) if(test_ntt_asm_123_4567_scalar_store_opt_m1_icestorm() != 0){return (1);} /* if(test_ntt_asm_1234_567_opt_m1_icestorm() != 0){return (1);} */ /* if(test_ntt_asm_1234_567_manual_st4_opt_m1_icestorm() != 0){return (1);} */ + + if (test_ntt_asm_123_4567_inv_opt_m1_icestorm() != 0){return (1);} + if (test_ntt_asm_123_4567_inv_manual_ld4_opt_m1_icestorm() != 0){return (1);} /* Other */ if(test_ntt_neonntt()!= 0){return(1);} From 534ac611374e870527e16e0a7324e9019b829369 Mon Sep 17 00:00:00 2001 From: Amin Abdulrahman Date: Tue, 2 Apr 2024 15:58:59 +0200 Subject: [PATCH 15/18] Update Kyber and Dilithium tests for invNTT --- ...um_1234_5678_manual_ld4_opt_m1_firestorm.s | 1 + ...ium_1234_5678_manual_ld4_opt_m1_icestorm.s | 1 + ...intt_dilithium_1234_5678_opt_m1_icestorm.s | 1 + ...ium_123_45678_manual_ld4_opt_m1_icestorm.s | 1 + ...intt_dilithium_123_45678_opt_m1_icestorm.s | 1 + tests/ntt_dilithium/main.c | 295 +- .../manual/intt_dilithium_1234_5678.s | 16 +- .../intt_dilithium_1234_5678_manual_ld4.s | 16 +- ...um_1234_5678_manual_ld4_opt_m1_firestorm.s | 1934 ++++++++ ...ium_1234_5678_manual_ld4_opt_m1_icestorm.s | 1710 +++++++ ...intt_dilithium_1234_5678_opt_m1_icestorm.s | 1764 ++++++++ .../manual/intt_dilithium_123_45678.s | 28 +- .../intt_dilithium_123_45678_manual_ld4.s | 28 +- ...t_dilithium_123_45678_manual_ld4_opt_a55.s | 3545 ++++++++------- ...t_dilithium_123_45678_manual_ld4_opt_a72.s | 4019 +++++++++-------- ...um_123_45678_manual_ld4_opt_m1_firestorm.s | 3879 ++++++++-------- ...ium_123_45678_manual_ld4_opt_m1_icestorm.s | 2091 +++++++++ 
.../manual/intt_dilithium_123_45678_opt_a55.s | 3353 +++++++------- .../manual/intt_dilithium_123_45678_opt_a72.s | 3944 ++++++++-------- ...ntt_dilithium_123_45678_opt_m1_firestorm.s | 3671 +++++++-------- ...intt_dilithium_123_45678_opt_m1_icestorm.s | 1999 ++++++++ .../manual/ntt_dilithium_1234_5678.s | 18 +- .../manual/ntt_dilithium_123_45678.s | 16 +- .../ntt_dilithium_123_45678_manual_st4.s | 16 +- .../manual/ntt_dilithium_123_45678_w_scalar.s | 16 +- tests/ntt_kyber/main.c | 1 + tests/ntt_kyber/manual/intt_kyber_123_4567.s | 5 +- .../manual/intt_kyber_123_4567_manual_ld4.s | 5 +- .../intt_kyber_123_4567_manual_ld4_opt_a55.s | 2160 ++++----- .../intt_kyber_123_4567_manual_ld4_opt_a72.s | 2842 ++++++------ ...ber_123_4567_manual_ld4_opt_m1_firestorm.s | 2548 ++++++----- ...yber_123_4567_manual_ld4_opt_m1_icestorm.s | 2048 ++++----- .../manual/intt_kyber_123_4567_opt_a55.s | 2190 ++++----- .../manual/intt_kyber_123_4567_opt_a72.s | 2718 ++++++----- .../intt_kyber_123_4567_opt_m1_firestorm.s | 2700 +++++------ .../intt_kyber_123_4567_opt_m1_icestorm.s | 1906 ++++---- 36 files changed, 30981 insertions(+), 20505 deletions(-) create mode 120000 asm/manual/ntt_dilithium/intt_dilithium_1234_5678_manual_ld4_opt_m1_firestorm.s create mode 120000 asm/manual/ntt_dilithium/intt_dilithium_1234_5678_manual_ld4_opt_m1_icestorm.s create mode 120000 asm/manual/ntt_dilithium/intt_dilithium_1234_5678_opt_m1_icestorm.s create mode 120000 asm/manual/ntt_dilithium/intt_dilithium_123_45678_manual_ld4_opt_m1_icestorm.s create mode 120000 asm/manual/ntt_dilithium/intt_dilithium_123_45678_opt_m1_icestorm.s create mode 100644 tests/ntt_dilithium/manual/intt_dilithium_1234_5678_manual_ld4_opt_m1_firestorm.s create mode 100644 tests/ntt_dilithium/manual/intt_dilithium_1234_5678_manual_ld4_opt_m1_icestorm.s create mode 100644 tests/ntt_dilithium/manual/intt_dilithium_1234_5678_opt_m1_icestorm.s create mode 100644 
tests/ntt_dilithium/manual/intt_dilithium_123_45678_manual_ld4_opt_m1_icestorm.s create mode 100644 tests/ntt_dilithium/manual/intt_dilithium_123_45678_opt_m1_icestorm.s diff --git a/asm/manual/ntt_dilithium/intt_dilithium_1234_5678_manual_ld4_opt_m1_firestorm.s b/asm/manual/ntt_dilithium/intt_dilithium_1234_5678_manual_ld4_opt_m1_firestorm.s new file mode 120000 index 0000000..5ab4de7 --- /dev/null +++ b/asm/manual/ntt_dilithium/intt_dilithium_1234_5678_manual_ld4_opt_m1_firestorm.s @@ -0,0 +1 @@ +../../../slothy/examples/opt/aarch64/intt_dilithium_1234_5678_manual_ld4_opt_m1_firestorm.s \ No newline at end of file diff --git a/asm/manual/ntt_dilithium/intt_dilithium_1234_5678_manual_ld4_opt_m1_icestorm.s b/asm/manual/ntt_dilithium/intt_dilithium_1234_5678_manual_ld4_opt_m1_icestorm.s new file mode 120000 index 0000000..4e67ca6 --- /dev/null +++ b/asm/manual/ntt_dilithium/intt_dilithium_1234_5678_manual_ld4_opt_m1_icestorm.s @@ -0,0 +1 @@ +../../../slothy/examples/opt/aarch64/intt_dilithium_1234_5678_manual_ld4_opt_m1_icestorm.s \ No newline at end of file diff --git a/asm/manual/ntt_dilithium/intt_dilithium_1234_5678_opt_m1_icestorm.s b/asm/manual/ntt_dilithium/intt_dilithium_1234_5678_opt_m1_icestorm.s new file mode 120000 index 0000000..10bb8b5 --- /dev/null +++ b/asm/manual/ntt_dilithium/intt_dilithium_1234_5678_opt_m1_icestorm.s @@ -0,0 +1 @@ +../../../slothy/examples/opt/aarch64/intt_dilithium_1234_5678_opt_m1_icestorm.s \ No newline at end of file diff --git a/asm/manual/ntt_dilithium/intt_dilithium_123_45678_manual_ld4_opt_m1_icestorm.s b/asm/manual/ntt_dilithium/intt_dilithium_123_45678_manual_ld4_opt_m1_icestorm.s new file mode 120000 index 0000000..78ea2e4 --- /dev/null +++ b/asm/manual/ntt_dilithium/intt_dilithium_123_45678_manual_ld4_opt_m1_icestorm.s @@ -0,0 +1 @@ +../../../slothy/examples/opt/aarch64/intt_dilithium_123_45678_manual_ld4_opt_m1_icestorm.s \ No newline at end of file diff --git 
a/asm/manual/ntt_dilithium/intt_dilithium_123_45678_opt_m1_icestorm.s b/asm/manual/ntt_dilithium/intt_dilithium_123_45678_opt_m1_icestorm.s new file mode 120000 index 0000000..8584f70 --- /dev/null +++ b/asm/manual/ntt_dilithium/intt_dilithium_123_45678_opt_m1_icestorm.s @@ -0,0 +1 @@ +../../../slothy/examples/opt/aarch64/intt_dilithium_123_45678_opt_m1_icestorm.s \ No newline at end of file diff --git a/tests/ntt_dilithium/main.c b/tests/ntt_dilithium/main.c index 58f1488..5db247d 100644 --- a/tests/ntt_dilithium/main.c +++ b/tests/ntt_dilithium/main.c @@ -74,17 +74,17 @@ void ntt_dilithium_1234_5678_manual_st4_opt_m1_firestorm(int32_t *); void intt_dilithium_123_45678_opt_m1_firestorm(int32_t *); void intt_dilithium_123_45678_manual_ld4_opt_m1_firestorm(int32_t *); void intt_dilithium_1234_5678_opt_m1_firestorm(int32_t *); -/* void intt_dilithium_1234_5678_manual_ld4_opt_m1_firestorm(int32_t *); */ // not done yet +void intt_dilithium_1234_5678_manual_ld4_opt_m1_firestorm(int32_t *); // M1 Icestorm void ntt_dilithium_123_45678_manual_st4_opt_m1_icestorm(int32_t *); void ntt_dilithium_123_45678_opt_m1_icestorm(int32_t *); void ntt_dilithium_123_45678_w_scalar_opt_m1_icestorm(int32_t *); void ntt_dilithium_1234_5678_opt_m1_icestorm(int32_t *); void ntt_dilithium_1234_5678_manual_st4_opt_m1_icestorm(int32_t *); -/* void intt_dilithium_123_45678_opt_m1_icestorm(int32_t *); +void intt_dilithium_123_45678_opt_m1_icestorm(int32_t *); void intt_dilithium_123_45678_manual_ld4_opt_m1_icestorm(int32_t *); void intt_dilithium_1234_5678_opt_m1_icestorm(int32_t *); -void intt_dilithium_1234_5678_manual_ld4_opt_m1_icestorm(int32_t *); */ // not done yet +void intt_dilithium_1234_5678_manual_ld4_opt_m1_icestorm(int32_t *); #define NTT_LAYERS 8 #define NTT_SIZE (1u << NTT_LAYERS) @@ -353,42 +353,47 @@ MAKE_TEST(asm_123_45678_w_scalar,0,ntt_dilithium_123_45678_w_scalar,ntt_u32_C,0, MAKE_TEST(asm_123_45678_manual_st4,0,ntt_dilithium_123_45678_manual_st4,ntt_u32_C,0,0,1) 
MAKE_TEST(asm_1234_5678,0,ntt_dilithium_1234_5678,ntt_u32_C,0,0,1) MAKE_TEST(asm_1234_5678_manual_st4,0,ntt_dilithium_1234_5678_manual_st4,ntt_u32_C,0,0,1) + MAKE_TEST(asm_1234_5678_inv,1,intt_dilithium_1234_5678,invntt_u32_tomont_C,0,1,1) MAKE_TEST(asm_1234_5678_inv_manual_ld4,1,intt_dilithium_1234_5678_manual_ld4,invntt_u32_tomont_C,0,1,1) MAKE_TEST(asm_123_45678_inv,1,intt_dilithium_123_45678,invntt_u32_tomont_C,0,1,1) MAKE_TEST(asm_123_45678_inv_manual_ld4,1,intt_dilithium_123_45678_manual_ld4,invntt_u32_tomont_C,0,1,1) +MAKE_TEST(asm_vs_pqclean_1234_5678_inv,1,intt_dilithium_1234_5678,pqclean_invntt_tomont,0,1,0) +MAKE_TEST(asm_vs_pqclean_1234_5678_inv_manual_ld4,1,intt_dilithium_1234_5678_manual_ld4,pqclean_invntt_tomont,0,1,0) +MAKE_TEST(asm_vs_pqclean_123_45678_inv,1,intt_dilithium_123_45678,pqclean_invntt_tomont,0,1,0) +MAKE_TEST(asm_vs_pqclean_123_45678_inv_manual_ld4,1,intt_dilithium_123_45678_manual_ld4,pqclean_invntt_tomont,0,1,0) + // A55 MAKE_TEST(asm_123_45678_opt_a55,0,ntt_dilithium_123_45678_opt_a55,ntt_u32_C,0,0,1) MAKE_TEST(asm_123_45678_manual_st4_opt_a55,0,ntt_dilithium_123_45678_manual_st4_opt_a55,ntt_u32_C,0,0,1) MAKE_TEST(asm_123_45678_w_scalar_opt_a55,0,ntt_dilithium_123_45678_w_scalar_opt_a55,ntt_u32_C,0,0,1) -MAKE_TEST(asm_123_45678_inv_opt_a55,1,intt_dilithium_123_45678_opt_a55,invntt_u32_tomont_C,0,1,1) -MAKE_TEST(asm_123_45678_inv_manual_ld4_opt_a55,1,intt_dilithium_123_45678_manual_ld4_opt_a55,invntt_u32_tomont_C,0,1,1) -MAKE_TEST(asm_1234_5678_inv_opt_a55,1,intt_dilithium_1234_5678_opt_a55,invntt_u32_tomont_C,0,1,1) -MAKE_TEST(asm_1234_5678_inv_manual_ld4_opt_a55,1,intt_dilithium_1234_5678_manual_ld4_opt_a55,invntt_u32_tomont_C,0,1,1) +MAKE_TEST(asm_123_45678_inv_opt_a55,1,intt_dilithium_123_45678_opt_a55,invntt_u32_tomont_C,0,0,1) +MAKE_TEST(asm_123_45678_inv_manual_ld4_opt_a55,1,intt_dilithium_123_45678_manual_ld4_opt_a55,invntt_u32_tomont_C,0,0,1) 
+MAKE_TEST(asm_1234_5678_inv_opt_a55,1,intt_dilithium_1234_5678_opt_a55,invntt_u32_tomont_C,0,0,1) +MAKE_TEST(asm_1234_5678_inv_manual_ld4_opt_a55,1,intt_dilithium_1234_5678_manual_ld4_opt_a55,invntt_u32_tomont_C,0,0,1) // A72 MAKE_TEST(asm_123_45678_opt_a72,0,ntt_dilithium_123_45678_opt_a72,ntt_u32_C,0,0,1) MAKE_TEST(asm_123_45678_manual_st4_opt_a72,0,ntt_dilithium_123_45678_manual_st4_opt_a72,ntt_u32_C,0,0,1) MAKE_TEST(asm_1234_5678_opt_a72,0,ntt_dilithium_1234_5678_opt_a72,ntt_u32_C,0,0,1) -MAKE_TEST(asm_123_45678_inv_opt_a72,1,intt_dilithium_123_45678_opt_a72,invntt_u32_tomont_C,0,1,1) -MAKE_TEST(asm_123_45678_inv_manual_ld4_opt_a72,1,intt_dilithium_123_45678_manual_ld4_opt_a72,invntt_u32_tomont_C,0,1,1) -MAKE_TEST(asm_1234_5678_inv_opt_a72,1,intt_dilithium_1234_5678_opt_a72,invntt_u32_tomont_C,0,1,1) -MAKE_TEST(asm_1234_5678_inv_manual_ld4_opt_a72,1,intt_dilithium_1234_5678_manual_ld4_opt_a72,invntt_u32_tomont_C,0,1,1) +MAKE_TEST(asm_123_45678_inv_opt_a72,1,intt_dilithium_123_45678_opt_a72,invntt_u32_tomont_C,0,0,1) +MAKE_TEST(asm_123_45678_inv_manual_ld4_opt_a72,1,intt_dilithium_123_45678_manual_ld4_opt_a72,invntt_u32_tomont_C,0,0,1) +MAKE_TEST(asm_1234_5678_inv_opt_a72,1,intt_dilithium_1234_5678_opt_a72,invntt_u32_tomont_C,0,0,1) +MAKE_TEST(asm_1234_5678_inv_manual_ld4_opt_a72,1,intt_dilithium_1234_5678_manual_ld4_opt_a72,invntt_u32_tomont_C,0,0,1) // M1 Firestorm MAKE_TEST(asm_123_45678_opt_m1_firestorm,0,ntt_dilithium_123_45678_opt_m1_firestorm,ntt_u32_C,0,0,1) MAKE_TEST(asm_123_45678_manual_st4_opt_m1_firestorm,0,ntt_dilithium_123_45678_manual_st4_opt_m1_firestorm,ntt_u32_C,0,0,1) -/* MAKE_TEST(asm_123_45678_w_scalar_opt_m1_firestorm,0,ntt_dilithium_123_45678_w_scalar_opt_m1_firestorm,ntt_u32_C,0,0) */ MAKE_TEST(asm_1234_5678_opt_m1_firestorm,0,ntt_dilithium_1234_5678_opt_m1_firestorm,ntt_u32_C,0,0,1) MAKE_TEST(asm_1234_5678_manual_st4_opt_m1_firestorm,0,ntt_dilithium_1234_5678_manual_st4_opt_m1_firestorm,ntt_u32_C,0,0,1) 
-MAKE_TEST(asm_123_45678_inv_opt_m1_firestorm,1,intt_dilithium_123_45678_opt_m1_firestorm,invntt_u32_tomont_C,0,1,1) -MAKE_TEST(asm_123_45678_inv_manual_ld4_opt_m1_firestorm,1,intt_dilithium_123_45678_manual_ld4_opt_m1_firestorm,invntt_u32_tomont_C,0,1,1) -MAKE_TEST(asm_1234_5678_inv_opt_m1_firestorm,1,intt_dilithium_1234_5678_opt_m1_firestorm,invntt_u32_tomont_C,0,1,1) -/* MAKE_TEST(asm_1234_5678_inv_manual_ld4_opt_m1_firestorm,1,intt_dilithium_1234_5678_manual_ld4_opt_m1_firestorm,invntt_u32_tomont_C,0,1) */ +MAKE_TEST(asm_123_45678_inv_opt_m1_firestorm,1,intt_dilithium_123_45678_opt_m1_firestorm,invntt_u32_tomont_C,0,0,1) +MAKE_TEST(asm_123_45678_inv_manual_ld4_opt_m1_firestorm,1,intt_dilithium_123_45678_manual_ld4_opt_m1_firestorm,invntt_u32_tomont_C,0,0,1) +MAKE_TEST(asm_1234_5678_inv_opt_m1_firestorm,1,intt_dilithium_1234_5678_opt_m1_firestorm,invntt_u32_tomont_C,0,0,1) +MAKE_TEST(asm_1234_5678_inv_manual_ld4_opt_m1_firestorm,1,intt_dilithium_1234_5678_manual_ld4_opt_m1_firestorm,invntt_u32_tomont_C,0,0,1) // M1 Icestorm MAKE_TEST(asm_123_45678_opt_m1_icestorm,0,ntt_dilithium_123_45678_opt_m1_icestorm,ntt_u32_C,0,0,1) @@ -397,10 +402,10 @@ MAKE_TEST(asm_123_45678_w_scalar_opt_m1_icestorm,0,ntt_dilithium_123_45678_w_sca MAKE_TEST(asm_1234_5678_opt_m1_icestorm,0,ntt_dilithium_1234_5678_opt_m1_icestorm,ntt_u32_C,0,0,1) MAKE_TEST(asm_1234_5678_manual_st4_opt_m1_icestorm,0,ntt_dilithium_1234_5678_manual_st4_opt_m1_icestorm,ntt_u32_C,0,0,1) -/* MAKE_TEST(asm_123_45678_inv_opt_m1_icestorm,1,intt_dilithium_123_45678_opt_m1_icestorm,invntt_u32_tomont_C,0,1) -MAKE_TEST(asm_123_45678_inv_manual_ld4_opt_m1_icestorm,1,intt_dilithium_123_45678_manual_ld4_opt_m1_icestorm,invntt_u32_tomont_C,0,1) -MAKE_TEST(asm_1234_5678_inv_opt_m1_icestorm,1,intt_dilithium_1234_5678_opt_m1_icestorm,invntt_u32_tomont_C,0,1) -MAKE_TEST(asm_1234_5678_inv_manual_ld4_opt_m1_icestorm,1,intt_dilithium_1234_5678_manual_ld4_opt_m1_icestorm,invntt_u32_tomont_C,0,1) */ 
+MAKE_TEST(asm_123_45678_inv_opt_m1_icestorm,1,intt_dilithium_123_45678_opt_m1_icestorm,invntt_u32_tomont_C,0,0,1) +MAKE_TEST(asm_123_45678_inv_manual_ld4_opt_m1_icestorm,1,intt_dilithium_123_45678_manual_ld4_opt_m1_icestorm,invntt_u32_tomont_C,0,0,1) +MAKE_TEST(asm_1234_5678_inv_opt_m1_icestorm,1,intt_dilithium_1234_5678_opt_m1_icestorm,invntt_u32_tomont_C,0,0,1) +MAKE_TEST(asm_1234_5678_inv_manual_ld4_opt_m1_icestorm,1,intt_dilithium_1234_5678_manual_ld4_opt_m1_icestorm,invntt_u32_tomont_C,0,0,1) // Other MAKE_TEST(neonntt_fwd,0,ntt,ntt_u32_C,0,0,1) @@ -471,14 +476,13 @@ MAKE_BENCH(asm_1234_5678_inv_manual_ld4_opt_a72,intt_dilithium_1234_5678_manual_ // M1 Firestorm MAKE_BENCH(asm_123_45678_opt_m1_firestorm,ntt_dilithium_123_45678_opt_m1_firestorm) MAKE_BENCH(asm_123_45678_manual_st4_opt_m1_firestorm,ntt_dilithium_123_45678_manual_st4_opt_m1_firestorm) -/* MAKE_BENCH(asm_123_45678_w_scalar_opt_m1_firestorm,ntt_dilithium_123_45678_w_scalar_opt_m1_firestorm) */ MAKE_BENCH(asm_1234_5678_opt_m1_firestorm,ntt_dilithium_1234_5678_opt_m1_firestorm) MAKE_BENCH(asm_1234_5678_manual_st4_opt_m1_firestorm,ntt_dilithium_1234_5678_manual_st4_opt_m1_firestorm) MAKE_BENCH(asm_123_45678_inv_opt_m1_firestorm,intt_dilithium_123_45678_opt_m1_firestorm) MAKE_BENCH(asm_123_45678_inv_manual_ld4_opt_m1_firestorm,intt_dilithium_123_45678_manual_ld4_opt_m1_firestorm) MAKE_BENCH(asm_1234_5678_inv_opt_m1_firestorm,intt_dilithium_1234_5678_opt_m1_firestorm) -/* MAKE_BENCH(asm_1234_5678_inv_manual_ld4_opt_m1_firestorm,intt_dilithium_1234_5678_manual_ld4_opt_m1_firestorm) */ +MAKE_BENCH(asm_1234_5678_inv_manual_ld4_opt_m1_firestorm,intt_dilithium_1234_5678_manual_ld4_opt_m1_firestorm) // M1 Icestorm MAKE_BENCH(asm_123_45678_opt_m1_icestorm,ntt_dilithium_123_45678_opt_m1_icestorm) @@ -487,10 +491,10 @@ MAKE_BENCH(asm_123_45678_w_scalar_opt_m1_icestorm,ntt_dilithium_123_45678_w_scal MAKE_BENCH(asm_1234_5678_opt_m1_icestorm,ntt_dilithium_1234_5678_opt_m1_icestorm) 
MAKE_BENCH(asm_1234_5678_manual_st4_opt_m1_icestorm,ntt_dilithium_1234_5678_manual_st4_opt_m1_icestorm) -/* MAKE_BENCH(asm_123_45678_inv_opt_m1_icestorm,intt_dilithium_123_45678_opt_m1_icestorm) +MAKE_BENCH(asm_123_45678_inv_opt_m1_icestorm,intt_dilithium_123_45678_opt_m1_icestorm) MAKE_BENCH(asm_123_45678_inv_manual_ld4_opt_m1_icestorm,intt_dilithium_123_45678_manual_ld4_opt_m1_icestorm) MAKE_BENCH(asm_1234_5678_inv_opt_m1_icestorm,intt_dilithium_1234_5678_opt_m1_icestorm) -MAKE_BENCH(asm_1234_5678_inv_manual_ld4_opt_m1_icestorm,intt_dilithium_1234_5678_manual_ld4_opt_m1_icestorm) */ +MAKE_BENCH(asm_1234_5678_inv_manual_ld4_opt_m1_icestorm,intt_dilithium_1234_5678_manual_ld4_opt_m1_icestorm) // Other MAKE_BENCH(neonntt_fwd,ntt) MAKE_BENCH(pqclean_ntt_fwd,pqclean_ntt) @@ -534,14 +538,13 @@ int main( void ) // M1 Firestorm bench_ntt_asm_123_45678_opt_m1_firestorm(); bench_ntt_asm_123_45678_manual_st4_opt_m1_firestorm(); - /* bench_ntt_asm_123_45678_w_scalar_opt_m1_firestorm(); */ bench_ntt_asm_1234_5678_opt_m1_firestorm(); bench_ntt_asm_1234_5678_manual_st4_opt_m1_firestorm(); bench_ntt_asm_123_45678_inv_opt_m1_firestorm(); bench_ntt_asm_123_45678_inv_manual_ld4_opt_m1_firestorm(); bench_ntt_asm_1234_5678_inv_opt_m1_firestorm(); - /* bench_ntt_asm_1234_5678_inv_manual_ld4_opt_m1_firestorm(); */ + bench_ntt_asm_1234_5678_inv_manual_ld4_opt_m1_firestorm(); // M1 Icestorm bench_ntt_asm_123_45678_opt_m1_icestorm(); @@ -550,10 +553,10 @@ int main( void ) bench_ntt_asm_1234_5678_opt_m1_icestorm(); bench_ntt_asm_1234_5678_manual_st4_opt_m1_icestorm(); - /* bench_ntt_asm_123_45678_inv_opt_m1_icestorm(); + bench_ntt_asm_123_45678_inv_opt_m1_icestorm(); bench_ntt_asm_123_45678_inv_manual_ld4_opt_m1_icestorm(); bench_ntt_asm_1234_5678_inv_opt_m1_icestorm(); - bench_ntt_asm_1234_5678_inv_manual_ld4_opt_m1_icestorm(); */ + bench_ntt_asm_1234_5678_inv_manual_ld4_opt_m1_icestorm(); // other bench_ntt_neonntt_fwd(); bench_ntt_pqclean_ntt_fwd(); @@ -563,198 +566,68 @@ int main( void 
) // Tests debug_printf("Tests:\n"); - // base - if (test_ntt_asm_123_45678() != 0) - { - return 1; - } - if (test_ntt_asm_123_45678_w_scalar() != 0) - { - return 1; - } - if (test_ntt_asm_123_45678_manual_st4() != 0) - { - return 1; - } - if (test_ntt_asm_1234_5678() != 0) - { - return 1; - } - if (test_ntt_asm_1234_5678_manual_st4() != 0) - { - return 1; - } - if (test_ntt_asm_1234_5678_inv() != 0) - { - return 1; - } - if (test_ntt_asm_1234_5678_inv_manual_ld4() != 0) - { - return 1; - } - if (test_ntt_asm_123_45678_inv() != 0) - { - return 1; - } - if (test_ntt_asm_123_45678_inv_manual_ld4() != 0) - { - return 1; - } - // A55 - if (test_ntt_asm_123_45678_opt_a55() != 0) - { - return 1; - } - if (test_ntt_asm_123_45678_manual_st4_opt_a55() != 0) - { - return 1; - } - if (test_ntt_asm_123_45678_w_scalar_opt_a55() != 0) - { - return 1; - } + // Clean + if (test_ntt_asm_123_45678() != 0){return 1;} + if (test_ntt_asm_123_45678_w_scalar() != 0){return 1;} + if (test_ntt_asm_123_45678_manual_st4() != 0){return 1;} + if (test_ntt_asm_1234_5678() != 0){return 1;} + if (test_ntt_asm_1234_5678_manual_st4() != 0){return 1;} + if (test_ntt_asm_1234_5678_inv() != 0){return 1;} + if (test_ntt_asm_1234_5678_inv_manual_ld4() != 0){return 1;} + if (test_ntt_asm_123_45678_inv() != 0){return 1;} + if (test_ntt_asm_123_45678_inv_manual_ld4() != 0){return 1;} + + if (test_ntt_asm_vs_pqclean_1234_5678_inv() != 0){return 1;} + if (test_ntt_asm_vs_pqclean_1234_5678_inv_manual_ld4() != 0){return 1;} + if (test_ntt_asm_vs_pqclean_123_45678_inv() != 0){return 1;} + if (test_ntt_asm_vs_pqclean_123_45678_inv_manual_ld4() != 0){return 1;} - if (test_ntt_asm_123_45678_inv_opt_a55() != 0) - { - return 1; - } - if (test_ntt_asm_123_45678_inv_manual_ld4_opt_a55() != 0) - { - return 1; - } - if (test_ntt_asm_1234_5678_inv_opt_a55() != 0) - { - return 1; - } - if (test_ntt_asm_1234_5678_inv_manual_ld4_opt_a55() != 0) - { - return 1; - } + // A55 + if (test_ntt_asm_123_45678_opt_a55() != 0){return 
1;} + if (test_ntt_asm_123_45678_manual_st4_opt_a55() != 0){return 1;} + if (test_ntt_asm_123_45678_w_scalar_opt_a55() != 0){return 1;} + + if (test_ntt_asm_123_45678_inv_opt_a55() != 0){return 1;} + if (test_ntt_asm_123_45678_inv_manual_ld4_opt_a55() != 0){return 1;} + if (test_ntt_asm_1234_5678_inv_opt_a55() != 0){return 1;} + if (test_ntt_asm_1234_5678_inv_manual_ld4_opt_a55() != 0){return 1;} // A72 - if (test_ntt_asm_123_45678_opt_a72() != 0) - { - return 1; - } - if (test_ntt_asm_123_45678_manual_st4_opt_a72() != 0) - { - return 1; - } - if (test_ntt_asm_1234_5678_opt_a72() != 0) - { - return 1; - } + if (test_ntt_asm_123_45678_opt_a72() != 0){return 1;} + if (test_ntt_asm_123_45678_manual_st4_opt_a72() != 0){return 1;} + if (test_ntt_asm_1234_5678_opt_a72() != 0){return 1;} - if (test_ntt_asm_123_45678_inv_opt_a72() != 0) - { - return 1; - } - if (test_ntt_asm_123_45678_inv_manual_ld4_opt_a72() != 0) - { - return 1; - } - if (test_ntt_asm_1234_5678_inv_opt_a72() != 0) - { - return 1; - } - if (test_ntt_asm_1234_5678_inv_manual_ld4_opt_a72() != 0) - { - return 1; - } + if (test_ntt_asm_123_45678_inv_opt_a72() != 0){return 1;} + /* if (test_ntt_asm_123_45678_inv_manual_ld4_opt_a72() != 0){return 1;} */ + if (test_ntt_asm_1234_5678_inv_opt_a72() != 0){return 1;} + if (test_ntt_asm_1234_5678_inv_manual_ld4_opt_a72() != 0){return 1;} // M1 Firestorm - if (test_ntt_asm_123_45678_opt_m1_firestorm() != 0) - { - return 1; - } - if (test_ntt_asm_123_45678_manual_st4_opt_m1_firestorm() != 0) - { - return 1; - } - /* if (test_ntt_asm_123_45678_w_scalar_opt_m1_firestorm() != 0) - { - return 1; - } */ - if (test_ntt_asm_1234_5678_opt_m1_firestorm() != 0) - { - return 1; - } - if (test_ntt_asm_1234_5678_manual_st4_opt_m1_firestorm() != 0) - { - return 1; - } + if (test_ntt_asm_123_45678_opt_m1_firestorm() != 0){return 1;} + if (test_ntt_asm_123_45678_manual_st4_opt_m1_firestorm() != 0){return 1;} + if (test_ntt_asm_1234_5678_opt_m1_firestorm() != 0){return 1;} + if 
(test_ntt_asm_1234_5678_manual_st4_opt_m1_firestorm() != 0){return 1;} - if (test_ntt_asm_123_45678_inv_opt_m1_firestorm() != 0) - { - return 1; - } - if (test_ntt_asm_123_45678_inv_manual_ld4_opt_m1_firestorm() != 0) - { - return 1; - } - if (test_ntt_asm_1234_5678_inv_opt_m1_firestorm() != 0) - { - return 1; - } - // if (test_ntt_asm_1234_5678_inv_manual_ld4_opt_m1_firestorm() != 0) - // { - // return 1; - // } + if (test_ntt_asm_123_45678_inv_opt_m1_firestorm() != 0){return 1;} + /* if (test_ntt_asm_123_45678_inv_manual_ld4_opt_m1_firestorm() != 0){return 1;} */ + if (test_ntt_asm_1234_5678_inv_opt_m1_firestorm() != 0){return 1;} + if (test_ntt_asm_1234_5678_inv_manual_ld4_opt_m1_firestorm() != 0){return 1;} // M1 Icestorm - if (test_ntt_asm_123_45678_opt_m1_icestorm() != 0) - { - return 1; - } - if (test_ntt_asm_123_45678_manual_st4_opt_m1_icestorm() != 0) - { - return 1; - } - if (test_ntt_asm_123_45678_w_scalar_opt_m1_icestorm() != 0) - { - return 1; - } - if (test_ntt_asm_1234_5678_opt_m1_icestorm() != 0) - { - return 1; - } - if (test_ntt_asm_1234_5678_manual_st4_opt_m1_icestorm() != 0) - { - return 1; - } - - // if (test_ntt_asm_123_45678_inv_opt_m1_icestorm() != 0) - // { - // return 1; - // } - // if (test_ntt_asm_123_45678_inv_manual_ld4_opt_m1_icestorm() != 0) - // { - // return 1; - // } - // if (test_ntt_asm_1234_5678_inv_opt_m1_icestorm() != 0) - // { - // return 1; - // } - // if (test_ntt_asm_1234_5678_inv_manual_ld4_opt_m1_icestorm() != 0) - // { - // return 1; - // } + if (test_ntt_asm_123_45678_opt_m1_icestorm() != 0){return 1;} + if (test_ntt_asm_123_45678_manual_st4_opt_m1_icestorm() != 0){return 1;} + if (test_ntt_asm_123_45678_w_scalar_opt_m1_icestorm() != 0){return 1;} + if (test_ntt_asm_1234_5678_opt_m1_icestorm() != 0){return 1;} + if (test_ntt_asm_1234_5678_manual_st4_opt_m1_icestorm() != 0){return 1;} + + if (test_ntt_asm_123_45678_inv_opt_m1_icestorm() != 0){return 1;} + /* if (test_ntt_asm_123_45678_inv_manual_ld4_opt_m1_icestorm() 
!= 0){return 1;} */ + if (test_ntt_asm_1234_5678_inv_opt_m1_icestorm() != 0){return 1;} + if (test_ntt_asm_1234_5678_inv_manual_ld4_opt_m1_icestorm() != 0){return 1;} // other - if (test_ntt_neonntt_fwd() != 0) - { - return 1; - } - if (test_ntt_pqclean_ntt_fwd() != 0) - { - return 1; - } - if (test_ntt_neonntt_inv() != 0) - { - return 1; - } - if (test_ntt_pqclean_ntt_inv() != 0) - { - return 1; - } + if (test_ntt_neonntt_fwd() != 0){return 1;} + if (test_ntt_pqclean_ntt_fwd() != 0){return 1;} + if (test_ntt_neonntt_inv() != 0){return 1;} + if (test_ntt_pqclean_ntt_inv() != 0){return 1;} return(0); } diff --git a/tests/ntt_dilithium/manual/intt_dilithium_1234_5678.s b/tests/ntt_dilithium/manual/intt_dilithium_1234_5678.s index 9a89926..8228b2c 100644 --- a/tests/ntt_dilithium/manual/intt_dilithium_1234_5678.s +++ b/tests/ntt_dilithium/manual/intt_dilithium_1234_5678.s @@ -177,7 +177,7 @@ trn1_d \data\()1, t1, t3 .endm -.macro save_gprs // slothy:no-unfold +.macro save_gprs // @slothy:no-unfold sub sp, sp, #(16*6) stp x19, x20, [sp, #16*0] stp x19, x20, [sp, #16*0] @@ -188,7 +188,7 @@ str x29, [sp, #16*5] .endm -.macro restore_gprs // slothy:no-unfold +.macro restore_gprs // @slothy:no-unfold ldp x19, x20, [sp, #16*0] ldp x21, x22, [sp, #16*1] ldp x23, x24, [sp, #16*2] @@ -198,7 +198,7 @@ add sp, sp, #(16*6) .endm -.macro save_vregs // slothy:no-unfold +.macro save_vregs // @slothy:no-unfold sub sp, sp, #(16*4) stp d8, d9, [sp, #16*0] stp d10, d11, [sp, #16*1] @@ -206,7 +206,7 @@ stp d14, d15, [sp, #16*3] .endm -.macro restore_vregs // slothy:no-unfold +.macro restore_vregs // @slothy:no-unfold ldp d8, d9, [sp, #16*0] ldp d10, d11, [sp, #16*1] ldp d12, d13, [sp, #16*2] @@ -217,19 +217,19 @@ #define STACK_SIZE 16 #define STACK0 0 -.macro restore a, loc // slothy:no-unfold +.macro restore a, loc // @slothy:no-unfold ldr \a, [sp, #\loc\()] .endm -.macro save loc, a // slothy:no-unfold +.macro save loc, a // @slothy:no-unfold str \a, [sp, #\loc\()] .endm -.macro 
push_stack // slothy:no-unfold +.macro push_stack // @slothy:no-unfold save_gprs save_vregs sub sp, sp, #STACK_SIZE .endm -.macro pop_stack // slothy:no-unfold +.macro pop_stack // @slothy:no-unfold add sp, sp, #STACK_SIZE restore_vregs restore_gprs diff --git a/tests/ntt_dilithium/manual/intt_dilithium_1234_5678_manual_ld4.s b/tests/ntt_dilithium/manual/intt_dilithium_1234_5678_manual_ld4.s index e7e3c1d..153895c 100644 --- a/tests/ntt_dilithium/manual/intt_dilithium_1234_5678_manual_ld4.s +++ b/tests/ntt_dilithium/manual/intt_dilithium_1234_5678_manual_ld4.s @@ -177,7 +177,7 @@ trn1_d \data\()1, t1, t3 .endm -.macro save_gprs // slothy:no-unfold +.macro save_gprs // @slothy:no-unfold sub sp, sp, #(16*6) stp x19, x20, [sp, #16*0] stp x19, x20, [sp, #16*0] @@ -188,7 +188,7 @@ str x29, [sp, #16*5] .endm -.macro restore_gprs // slothy:no-unfold +.macro restore_gprs // @slothy:no-unfold ldp x19, x20, [sp, #16*0] ldp x21, x22, [sp, #16*1] ldp x23, x24, [sp, #16*2] @@ -198,7 +198,7 @@ add sp, sp, #(16*6) .endm -.macro save_vregs // slothy:no-unfold +.macro save_vregs // @slothy:no-unfold sub sp, sp, #(16*4) stp d8, d9, [sp, #16*0] stp d10, d11, [sp, #16*1] @@ -206,7 +206,7 @@ stp d14, d15, [sp, #16*3] .endm -.macro restore_vregs // slothy:no-unfold +.macro restore_vregs // @slothy:no-unfold ldp d8, d9, [sp, #16*0] ldp d10, d11, [sp, #16*1] ldp d12, d13, [sp, #16*2] @@ -217,19 +217,19 @@ #define STACK_SIZE 16 #define STACK0 0 -.macro restore a, loc // slothy:no-unfold +.macro restore a, loc // @slothy:no-unfold ldr \a, [sp, #\loc\()] .endm -.macro save loc, a // slothy:no-unfold +.macro save loc, a // @slothy:no-unfold str \a, [sp, #\loc\()] .endm -.macro push_stack // slothy:no-unfold +.macro push_stack // @slothy:no-unfold save_gprs save_vregs sub sp, sp, #STACK_SIZE .endm -.macro pop_stack // slothy:no-unfold +.macro pop_stack // @slothy:no-unfold add sp, sp, #STACK_SIZE restore_vregs restore_gprs diff --git 
a/tests/ntt_dilithium/manual/intt_dilithium_1234_5678_manual_ld4_opt_m1_firestorm.s b/tests/ntt_dilithium/manual/intt_dilithium_1234_5678_manual_ld4_opt_m1_firestorm.s new file mode 100644 index 0000000..864bab4 --- /dev/null +++ b/tests/ntt_dilithium/manual/intt_dilithium_1234_5678_manual_ld4_opt_m1_firestorm.s @@ -0,0 +1,1934 @@ +/// +/// Copyright (c) 2022 Arm Limited +/// Copyright (c) 2022 Hanno Becker +/// Copyright (c) 2023 Amin Abdulrahman, Matthias Kannwischer +/// SPDX-License-Identifier: MIT +/// +/// Permission is hereby granted, free of charge, to any person obtaining a copy +/// of this software and associated documentation files (the "Software"), to deal +/// in the Software without restriction, including without limitation the rights +/// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +/// copies of the Software, and to permit persons to whom the Software is +/// furnished to do so, subject to the following conditions: +/// +/// The above copyright notice and this permission notice shall be included in all +/// copies or substantial portions of the Software. +/// +/// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +/// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +/// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +/// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +/// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +/// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +/// SOFTWARE. +/// + +// Needed to provide ASM_LOAD directive +#include + +// NOTE +// We use a lot of trivial macros to simplify the parsing burden for Slothy +// The macros are not unfolded by Slothy and thus interpreted as instructions, +// which are easier to parse due to e.g. the lack of size specifiers and simpler +// syntax for pre and post increment for loads and stores. 
+ +// Eventually, NeLight should include a proper parser for AArch64, +// but for initial investigations, the below is enough. + +.macro ldr_vo vec, base, offset + ldr qform_\vec, [\base, #\offset] +.endm +.macro ldr_vi vec, base, inc + ldr qform_\vec, [\base], #\inc +.endm +.macro str_vo vec, base, offset + str qform_\vec, [\base, #\offset] +.endm +.macro str_vi vec, base, inc + str qform_\vec, [\base], #\inc +.endm +.macro vsub d,a,b + sub \d\().4s, \a\().4s, \b\().4s +.endm +.macro vadd d,a,b + add \d\().4s, \a\().4s, \b\().4s +.endm +.macro vqrdmulh d,a,b + sqrdmulh \d\().4s, \a\().4s, \b\().4s +.endm +.macro vmul d,a,b + mul \d\().4s, \a\().4s, \b\().4s +.endm +.macro vmls d,a,b + mls \d\().4s, \a\().4s, \b\().4s +.endm +.macro vqrdmulhq d,a,b,i + sqrdmulh \d\().4s, \a\().4s, \b\().s[\i] +.endm +.macro vmulq d,a,b,i + mul \d\().4s, \a\().4s, \b\().s[\i] +.endm +.macro vmlsq d,a,b,i + mls \d\().4s, \a\().4s, \b\().s[\i] +.endm +.macro trn1_d d,a,b + trn1 \d\().2d, \a\().2d, \b\().2d +.endm +.macro trn2_d d,a,b + trn2 \d\().2d, \a\().2d, \b\().2d +.endm +.macro trn1_s d,a,b + trn1 \d\().4s, \a\().4s, \b\().4s +.endm +.macro trn2_s d,a,b + trn2 \d\().4s, \a\().4s, \b\().4s +.endm + +.macro mulmodq dst, src, const, idx0, idx1 + vmulq \dst, \src, \const, \idx0 + vqrdmulhq \src, \src, \const, \idx1 + vmls \dst, \src, modulus +.endm + +.macro mulmod dst, src, const, const_twisted + vmul \dst, \src, \const + vqrdmulh \src, \src, \const_twisted + vmls \dst, \src, modulus +.endm + +.macro montg_reduce a + srshr tmp.4S, \a\().4S, #23 + vmls \a, tmp, modulus +.endm + +.macro canonical_reduce a, modulus_half, neg_modulus_half, tmp1, tmp2 + cmge \tmp1\().4s, \neg_modulus_half\().4s, \a\().4s + cmge \tmp2\().4s, \a\().4s, \modulus_half\().4s + sub \tmp2\().4s, \tmp1\().4s, \tmp2\().4s + vmls \a, \tmp2, modulus +.endm + +.macro gs_butterfly a, b, root, idx0, idx1 + vsub tmp, \a, \b + vadd \a, \a, \b + mulmodq \b, tmp, \root, \idx0, \idx1 +.endm + +.macro mulmod_v dst, src, 
const, const_twisted + vmul \dst, \src, \const + vqrdmulh \src, \src, \const_twisted + vmls \dst, \src, modulus +.endm + +.macro gs_butterfly_v a, b, root, root_twisted + vsub tmp, \a, \b + vadd \a, \a, \b + mulmod \b, tmp, \root, \root_twisted +.endm + +.macro mul_ninv dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, src0, src1, src2, src3, src4, src5, src6, src7 + mulmod \dst0, \src0, ninv, ninv_tw + mulmod \dst1, \src1, ninv, ninv_tw + mulmod \dst2, \src2, ninv, ninv_tw + mulmod \dst3, \src3, ninv, ninv_tw + mulmod \dst4, \src4, ninv, ninv_tw + mulmod \dst5, \src5, ninv, ninv_tw + mulmod \dst6, \src6, ninv, ninv_tw + mulmod \dst7, \src7, ninv, ninv_tw +.endm + +.macro load_roots_1234 r_ptr + ldr_vi root0, \r_ptr, (8*16) + ldr_vo root1, \r_ptr, (-8*16 + 1*16) + ldr_vo root2, \r_ptr, (-8*16 + 2*16) + ldr_vo root3, \r_ptr, (-8*16 + 3*16) + ldr_vo root4, \r_ptr, (-8*16 + 4*16) + ldr_vo root5, \r_ptr, (-8*16 + 5*16) + ldr_vo root6, \r_ptr, (-8*16 + 6*16) + ldr_vo root7, \r_ptr, (-8*16 + 7*16) +.endm + +.macro load_next_roots_56 root0, r_ptr0 + ldr_vi \root0, \r_ptr0, 16 +.endm + +.macro load_next_roots_6 root0, r_ptr0 + ldr_vi \root0, \r_ptr0, 8 +.endm + +.macro load_next_roots_78 root0, root0_tw, root1, root1_tw, root2, root2_tw, r_ptr1 + ldr_vi \root0, \r_ptr1, (6*16) + ldr_vo \root0_tw, \r_ptr1, (-6*16 + 1*16) + ldr_vo \root1, \r_ptr1, (-6*16 + 2*16) + ldr_vo \root1_tw, \r_ptr1, (-6*16 + 3*16) + ldr_vo \root2, \r_ptr1, (-6*16 + 4*16) + ldr_vo \root2_tw, \r_ptr1, (-6*16 + 5*16) +.endm + +.macro transpose4 data + trn1_s t0, \data\()0, \data\()1 + trn2_s t1, \data\()0, \data\()1 + trn1_s t2, \data\()2, \data\()3 + trn2_s t3, \data\()2, \data\()3 + + trn2_d \data\()2, t0, t2 + trn2_d \data\()3, t1, t3 + trn1_d \data\()0, t0, t2 + trn1_d \data\()1, t1, t3 +.endm + +.macro save_gprs // slothy:no-unfold + sub sp, sp, #(16*6) + stp x19, x20, [sp, #16*0] + stp x19, x20, [sp, #16*0] + stp x21, x22, [sp, #16*1] + stp x23, x24, [sp, #16*2] + stp x25, x26, [sp, #16*3] + stp 
x27, x28, [sp, #16*4] + str x29, [sp, #16*5] +.endm + +.macro restore_gprs // slothy:no-unfold + ldp x19, x20, [sp, #16*0] + ldp x21, x22, [sp, #16*1] + ldp x23, x24, [sp, #16*2] + ldp x25, x26, [sp, #16*3] + ldp x27, x28, [sp, #16*4] + ldr x29, [sp, #16*5] + add sp, sp, #(16*6) +.endm + +.macro save_vregs // slothy:no-unfold + sub sp, sp, #(16*4) + stp d8, d9, [sp, #16*0] + stp d10, d11, [sp, #16*1] + stp d12, d13, [sp, #16*2] + stp d14, d15, [sp, #16*3] +.endm + +.macro restore_vregs // slothy:no-unfold + ldp d8, d9, [sp, #16*0] + ldp d10, d11, [sp, #16*1] + ldp d12, d13, [sp, #16*2] + ldp d14, d15, [sp, #16*3] + add sp, sp, #(16*4) +.endm + +#define STACK_SIZE 16 +#define STACK0 0 + +.macro restore a, loc // slothy:no-unfold + ldr \a, [sp, #\loc\()] +.endm +.macro save loc, a // slothy:no-unfold + str \a, [sp, #\loc\()] +.endm +.macro push_stack // slothy:no-unfold + save_gprs + save_vregs + sub sp, sp, #STACK_SIZE +.endm + +.macro pop_stack // slothy:no-unfold + add sp, sp, #STACK_SIZE + restore_vregs + restore_gprs +.endm + +.data +.p2align 4 +roots: +#include "intt_dilithium_1234_5678_twiddles.s" +.text + + .global intt_dilithium_1234_5678_manual_ld4_opt_m1_firestorm + .global _intt_dilithium_1234_5678_manual_ld4_opt_m1_firestorm + +.p2align 4 +modulus_addr: .quad 8380417 +ninv_addr: .quad 16382 +ninv_tw_addr: .quad 4197891 +intt_dilithium_1234_5678_manual_ld4_opt_m1_firestorm: +_intt_dilithium_1234_5678_manual_ld4_opt_m1_firestorm: + push_stack + + inp .req x0 + in .req x1 + count .req x2 + r_ptr0 .req x3 + r_ptr1 .req x4 + xtmp .req x5 + + data0 .req v8 + data1 .req v9 + data2 .req v10 + data3 .req v11 + data4 .req v12 + data5 .req v13 + data6 .req v14 + data7 .req v15 + data8 .req v16 + data9 .req v17 + data10 .req v18 + data11 .req v19 + data12 .req v20 + data13 .req v21 + data14 .req v22 + data15 .req v23 + + qform_data0 .req q8 + qform_data1 .req q9 + qform_data2 .req q10 + qform_data3 .req q11 + qform_data4 .req q12 + qform_data5 .req q13 + qform_data6 
.req q14 + qform_data7 .req q15 + qform_data8 .req q16 + qform_data9 .req q17 + qform_data10 .req q18 + qform_data11 .req q19 + qform_data12 .req q20 + qform_data13 .req q21 + qform_data14 .req q22 + qform_data15 .req q23 + + root0 .req v0 + root1 .req v1 + root2 .req v2 + root3 .req v3 + root0_tw .req v4 + root1_tw .req v5 + root2_tw .req v6 + root3_tw .req v7 + + + qform_root0 .req q0 + qform_root1 .req q1 + qform_root2 .req q2 + qform_root3 .req q3 + qform_root0_tw .req q4 + qform_root1_tw .req q5 + qform_root2_tw .req q6 + qform_root3_tw .req q7 + + + tmp .req v24 + qform_tmp .req q24 + t0 .req v25 + t1 .req v26 + t2 .req v27 + t3 .req v28 + + modulus .req v29 + + ASM_LOAD(r_ptr0, roots) + ASM_LOAD(r_ptr1, roots_l45) + + ASM_LOAD(xtmp, modulus_addr) + ld1r {modulus.4s}, [xtmp] + + save STACK0, inp + + mov count, #16 + + .p2align 2 + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + ldr q11, [x0, #0] // ..*..................................... + ldr q9, [x0, #48] // ...*.................................... + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + ldr q17, [x0, #16] // .*...................................... + ldr q2, [x0, #32] // ....*................................... + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + ldr q1, [x3, #32] // *....................................... + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ 
+ // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + trn1 v28.4S, v11.4S, v17.4S // .......*................................ + trn1 v12.4S, v2.4S, v9.4S // .........*.............................. + trn2 v8.4S, v11.4S, v17.4S // ........*............................... + trn2 v23.4S, v2.4S, v9.4S // ..........*............................. + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ 
+ trn2 v17.2D, v8.2D, v23.2D // ..............*......................... + trn2 v11.2D, v28.2D, v12.2D // ...............*........................ + trn1 v15.2D, v28.2D, v12.2D // ...........*............................ + ldr q0, [x3, #80] // .....*.................................. + trn1 v20.2D, v8.2D, v23.2D // ............*........................... + ldr q16, [x3, #64] // ......*................................. + ldr q25, [x3, #48] // .............*.......................... + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + sub v10.4S, v11.4S, v17.4S // ...................*.................... + sub v18.4S, v15.4S, v20.4S // ................*....................... + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + add v28.4S, v11.4S, v17.4S // .........................*.............. + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + mul v24.4S, v18.4S, v1.4S // .....................*.................. 
+ sqrdmulh v0.4S, v10.4S, v0.4S // .......................*................ + mul v30.4S, v10.4S, v16.4S // ......................*................. + sqrdmulh v17.4S, v18.4S, v25.4S // ....................*................... + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + ldr q31, [x3], #(6*16) // ..................*..................... + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + add v7.4S, v15.4S, v20.4S // .................*...................... + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + mls v30.4S, v0.4S, v29.4S // ...........................*............ + mls v24.4S, v17.4S, v29.4S // ........................*............... + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + ldr q15, [x3, #-80] // ..........................*............. + // gap // ........................................ + // gap // ........................................ 
+ // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + sub v6.4S, v7.4S, v28.4S // ............................*........... + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + sub v27.4S, v24.4S, v30.4S // ...............................*........ + add v4.4S, v24.4S, v30.4S // ................................*....... + add v24.4S, v7.4S, v28.4S // ..............................*......... + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + mul v10.4S, v6.4S, v31.4S // .................................*...... + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + mul v0.4S, v27.4S, v31.4S // ..................................*..... + sqrdmulh v26.4S, v6.4S, v15.4S // .............................*.......... + sqrdmulh v16.4S, v27.4S, v15.4S // ...................................*.... + // gap // ........................................ + // gap // ........................................ 
+ // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + trn2 v5.4S, v24.4S, v4.4S // .....................................*.. + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + trn1 v4.4S, v24.4S, v4.4S // ....................................*... + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + mls v10.4S, v26.4S, v29.4S // ......................................*. + mls v0.4S, v16.4S, v29.4S // .......................................* + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + + // original source code + // ldr q21, [x3, #32] // ....*................................... + // ldr q20, [x0, #16] // ..*..................................... + // ldr q19, [x0, #0] // *....................................... + // ldr q7, [x0, #48] // .*...................................... + // ldr q1, [x0, #32] // ...*.................................... + // ldr q0, [x3, #80] // ............*........................... 
+ // ldr q16, [x3, #64] // ..............*......................... + // trn1 v30.4S, v19.4S, v20.4S // .....*.................................. + // trn2 v25.4S, v19.4S, v20.4S // .......*................................ + // trn1 v3.4S, v1.4S, v7.4S // ......*................................. + // trn2 v2.4S, v1.4S, v7.4S // ........*............................... + // trn1 v26.2D, v30.2D, v3.2D // ...........*............................ + // trn1 v23.2D, v25.2D, v2.2D // .............*.......................... + // ldr q6, [x3, #48] // ...............*........................ + // trn2 v1.2D, v25.2D, v2.2D // .........*.............................. + // trn2 v24.2D, v30.2D, v3.2D // ..........*............................. + // sub v18.4S, v26.4S, v23.4S // .................*...................... + // add v25.4S, v26.4S, v23.4S // ........................*............... + // ldr q20, [x3], #(6*16) // .......................*................ + // sub v4.4S, v24.4S, v1.4S // ................*....................... + // sqrdmulh v27.4S, v18.4S, v6.4S // ......................*................. + // mul v3.4S, v18.4S, v21.4S // ...................*.................... + // mul v14.4S, v4.4S, v16.4S // .....................*.................. + // sqrdmulh v16.4S, v4.4S, v0.4S // ....................*................... + // mls v3.4S, v27.4S, v29.4S // ..........................*............. + // add v27.4S, v24.4S, v1.4S // ..................*..................... + // ldr q1, [x3, #-80] // ...........................*............ + // mls v14.4S, v16.4S, v29.4S // .........................*.............. + // sub v9.4S, v25.4S, v27.4S // ............................*........... + // sqrdmulh v16.4S, v9.4S, v1.4S // ..................................*..... + // add v30.4S, v25.4S, v27.4S // ...............................*........ + // sub v22.4S, v3.4S, v14.4S // .............................*.......... 
+ // add v13.4S, v3.4S, v14.4S // ..............................*......... + // mul v10.4S, v9.4S, v20.4S // ................................*....... + // mul v0.4S, v22.4S, v20.4S // .................................*...... + // sqrdmulh v12.4S, v22.4S, v1.4S // ...................................*.... + // trn1 v4.4S, v30.4S, v13.4S // .....................................*.. + // trn2 v5.4S, v30.4S, v13.4S // ....................................*... + // mls v10.4S, v16.4S, v29.4S // ......................................*. + // mls v0.4S, v12.4S, v29.4S // .......................................* + + sub count, count, #1 +layer5678_start: + ldr q21, [x3, #32] // ..............e............................................................. + ldr q20, [x0, #80] // .e.......................................................................... + ldr q19, [x0, #64] // e........................................................................... + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + ldr q7, [x0, #112] // ...e........................................................................ + ldr q1, [x0, #96] // ..e......................................................................... + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ 
+ // gap // ............................................................................ + // gap // ............................................................................ + trn2 v28.4S, v10.4S, v0.4S // .........................................*.................................. + trn1 v23.4S, v10.4S, v0.4S // ........................................*................................... + ldr q0, [x3, #80] // .................e.......................................................... + ldr q16, [x3, #64] // ................e........................................................... + ldr q22, [x4], #8 // ..............................................*............................. + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + trn1 v30.4S, v19.4S, v20.4S // ....e....................................................................... + trn2 v25.4S, v19.4S, v20.4S // .....e...................................................................... + ldr q20, [x4], #16 // ...............................................*............................ 
+ trn2 v10.2D, v5.2D, v28.2D // ...........................................*................................ + trn2 v8.2D, v4.2D, v23.2D // ..........................................*................................. + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + trn1 v3.4S, v1.4S, v7.4S // ......e..................................................................... + trn2 v2.4S, v1.4S, v7.4S // .......e.................................................................... + trn1 v7.2D, v4.2D, v23.2D // ............................................*............................... + trn1 v6.2D, v5.2D, v28.2D // .............................................*.............................. + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + add v17.4S, v8.4S, v10.4S // ......................................................*..................... + sub v18.4S, v8.4S, v10.4S // .....................................................*...................... + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ 
+ // gap // ............................................................................ + sub v28.4S, v7.4S, v6.4S // ................................................*........................... + add v5.4S, v7.4S, v6.4S // .................................................*.......................... + trn1 v26.2D, v30.2D, v3.2D // ..........e................................................................. + trn1 v23.2D, v25.2D, v2.2D // ...........e................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + ldr q6, [x3, #48] // ...............e............................................................ + trn2 v1.2D, v25.2D, v2.2D // .........e.................................................................. + mul v31.4S, v18.4S, v20.S[2] // .......................................................*.................... + sqrdmulh v7.4S, v18.4S, v20.S[3] // ........................................................*................... + trn2 v24.2D, v30.2D, v3.2D // ........e................................................................... + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + sub v18.4S, v26.4S, v23.4S // ..................e......................................................... + add v25.4S, v26.4S, v23.4S // ...................e........................................................ 
+ mul v2.4S, v28.4S, v20.S[0] // ..................................................*......................... + sqrdmulh v23.4S, v28.4S, v20.S[1] // ...................................................*........................ + ldr q20, [x3], #(6*16) // ............e............................................................... + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + sub v4.4S, v24.4S, v1.4S // .......................e.................................................... + sub v8.4S, v5.4S, v17.4S // ..........................................................*................. + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + add v10.4S, v5.4S, v17.4S // ...........................................................*................ + mls v31.4S, v7.4S, v29.4S // .........................................................*.................. + sqrdmulh v27.4S, v18.4S, v6.4S // .....................e...................................................... + mul v3.4S, v18.4S, v21.4S // ....................e....................................................... + // gap // ............................................................................ + // gap // ............................................................................ 
+ // gap // ............................................................................ + // gap // ............................................................................ + mul v14.4S, v4.4S, v16.4S // .........................e.................................................. + sqrdmulh v16.4S, v4.4S, v0.4S // ..........................e................................................. + mls v2.4S, v23.4S, v29.4S // ....................................................*....................... + mul v17.4S, v8.4S, v22.S[0] // ............................................................*............... + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + sqrdmulh v12.4S, v8.4S, v22.S[1] // .............................................................*.............. + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + srshr v30.4S, v10.4S, #23 // ....................................................................*....... + mls v3.4S, v27.4S, v29.4S // ......................e..................................................... + add v27.4S, v24.4S, v1.4S // ........................e................................................... 
+ ldr q1, [x3, #-80] // .............e.............................................................. + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + sub v4.4S, v2.4S, v31.4S // ...............................................................*............ + add v6.4S, v2.4S, v31.4S // ................................................................*........... + mls v14.4S, v16.4S, v29.4S // ...........................e................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + mls v17.4S, v12.4S, v29.4S // ..............................................................*............. + sub v9.4S, v25.4S, v27.4S // ............................e............................................... + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ 
+ mls v10.4S, v30.4S, v29.4S // .....................................................................*...... + mul v19.4S, v4.4S, v22.S[0] // .................................................................*.......... + sqrdmulh v4.4S, v4.4S, v22.S[1] // ..................................................................*......... + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + srshr v7.4S, v6.4S, #23 // ......................................................................*..... + sqrdmulh v16.4S, v9.4S, v1.4S // ...............................e............................................ + add v30.4S, v25.4S, v27.4S // .............................e.............................................. + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + sub v22.4S, v3.4S, v14.4S // .................................e.......................................... + add v13.4S, v3.4S, v14.4S // ..................................e......................................... + str q10, [x0], #(16*4) // ........................................................................*... + mul v10.4S, v9.4S, v20.4S // ..............................e............................................. 
+ str q17, [x0, #-32] // ..........................................................................*. + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + mul v0.4S, v22.4S, v20.4S // ...................................e........................................ + mls v6.4S, v7.4S, v29.4S // .......................................................................*.... + sqrdmulh v12.4S, v22.4S, v1.4S // ....................................e....................................... + mls v19.4S, v4.4S, v29.4S // ...................................................................*........ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + trn1 v4.4S, v30.4S, v13.4S // ......................................e..................................... + trn2 v5.4S, v30.4S, v13.4S // .......................................e.................................... + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ 
+ // gap // ............................................................................ + mls v10.4S, v16.4S, v29.4S // ................................e........................................... + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + mls v0.4S, v12.4S, v29.4S // .....................................e...................................... + str q6, [x0, #-48] // .........................................................................*.. + str q19, [x0, #-16] // ...........................................................................* + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + + // original source code + // ldr q8, [x0, #(16*0)] // ..e.........................................................................|.e......................................................................... + // ldr q9, [x0, #(16*1)] // .e..........................................................................|e.......................................................................... 
+ // ldr q10, [x0, #(16*2)] // ....e.......................................................................|...e....................................................................... + // ldr q11, [x0, #(16*3)] // ...e........................................................................|..e........................................................................ + // trn1 v25.4s, v8.4s, v9.4s // ..........e.................................................................|.........e................................................................. + // trn2 v26.4s, v8.4s, v9.4s // ...........e................................................................|..........e................................................................ + // trn1 v27.4s, v10.4s, v11.4s // ...............e............................................................|..............e............................................................ + // trn2 v28.4s, v10.4s, v11.4s // ................e...........................................................|...............e........................................................... + // trn2 v10.2d, v25.2d, v27.2d // .............................e..............................................|............................e.............................................. + // trn2 v11.2d, v26.2d, v28.2d // ..........................e.................................................|.........................e................................................. + // trn1 v8.2d, v25.2d, v27.2d // .......................e....................................................|......................e.................................................... + // trn1 v9.2d, v26.2d, v28.2d // ........................e...................................................|.......................e................................................... 
+ // ldr q0, [x3], #(6*16) // ..................................e.........................................|.................................e......................................... + // ldr q4, [x3, #(-6*16 + 1*16)] // .................................................e..........................|................................................e.......................... + // ldr q1, [x3, #(-6*16 + 2*16)] // e...........................................................................e........................................................................... + // ldr q5, [x3, #(-6*16 + 3*16)] // .........................e..................................................|........................e.................................................. + // ldr q2, [x3, #(-6*16 + 4*16)] // ........e...................................................................|.......e................................................................... + // ldr q6, [x3, #(-6*16 + 5*16)] // .......e....................................................................|......e.................................................................... + // sub v24.4s, v8.4s, v9.4s // ..............................e.............................................|.............................e............................................. + // add v8.4s, v8.4s, v9.4s // ...............................e............................................|..............................e............................................ + // mul v9.4s, v24.4s, v1.4s // ........................................e...................................|.......................................e................................... + // sqrdmulh v24.4s, v24.4s, v5.4s // .......................................e....................................|......................................e.................................... 
+ // mls v9.4s, v24.4s, v29.4s // ...............................................e............................|..............................................e............................ + // sub v24.4s, v10.4s, v11.4s // ...................................e........................................|..................................e........................................ + // add v10.4s, v10.4s, v11.4s // ................................................e...........................|...............................................e........................... + // mul v11.4s, v24.4s, v2.4s // .........................................e..................................|........................................e.................................. + // sqrdmulh v24.4s, v24.4s, v6.4s // ..........................................e.................................|.........................................e................................. + // mls v11.4s, v24.4s, v29.4s // ....................................................e.......................|...................................................e....................... + // sub v24.4s, v8.4s, v10.4s // ......................................................e.....................|.....................................................e..................... + // add v8.4s, v8.4s, v10.4s // ............................................................e...............|...........................................................e............... + // mul v10.4s, v24.4s, v0.4s // ................................................................e...........|...............................................................e........... + // sqrdmulh v24.4s, v24.4s, v4.4s // ...........................................................e................|..........................................................e................ 
+ // mls v10.4s, v24.4s, v29.4s // ........................................................................e...|.......................................................................e... + // sub v24.4s, v9.4s, v11.4s // .............................................................e..............|............................................................e.............. + // add v9.4s, v9.4s, v11.4s // ..............................................................e.............|.............................................................e............. + // mul v11.4s, v24.4s, v0.4s // ..................................................................e.........|.................................................................e......... + // sqrdmulh v24.4s, v24.4s, v4.4s // ....................................................................e.......|...................................................................e....... + // mls v11.4s, v24.4s, v29.4s // .........................................................................e..|........................................................................e.. + // trn1 v25.4s, v8.4s, v9.4s // ......................................................................e.....|.....................................................................e..... + // trn2 v26.4s, v8.4s, v9.4s // .......................................................................e....|......................................................................e.... + // trn1 v27.4s, v10.4s, v11.4s // ......*.....................................................................|.....*..................................................................... + // trn2 v28.4s, v10.4s, v11.4s // .....*......................................................................|....*...................................................................... 
+ // trn2 v10.2d, v25.2d, v27.2d // ..............*.............................................................|.............*............................................................. + // trn2 v11.2d, v26.2d, v28.2d // .............*..............................................................|............*.............................................................. + // trn1 v8.2d, v25.2d, v27.2d // .................*..........................................................|................*.......................................................... + // trn1 v9.2d, v26.2d, v28.2d // ..................*.........................................................|.................*......................................................... + // ldr q1, [x4], #8 // .........*..................................................................|........*.................................................................. + // ldr q0, [x4], #16 // ............*...............................................................|...........*............................................................... + // sub v24.4s, v8.4s, v9.4s // .....................*......................................................|....................*...................................................... + // add v8.4s, v8.4s, v9.4s // ......................*.....................................................|.....................*..................................................... + // mul v9.4s, v24.4s, v0.s[0] // ................................*...........................................|...............................*........................................... + // sqrdmulh v24.4s, v24.4s, v0.s[1] // .................................*..........................................|................................*.......................................... 
+ // mls v9.4s, v24.4s, v29.4s // ...........................................*................................|..........................................*................................ + // sub v24.4s, v10.4s, v11.4s // ....................*.......................................................|...................*....................................................... + // add v10.4s, v10.4s, v11.4s // ...................*........................................................|..................*........................................................ + // mul v11.4s, v24.4s, v0.s[2] // ...........................*................................................|..........................*................................................ + // sqrdmulh v24.4s, v24.4s, v0.s[3] // ............................*...............................................|...........................*............................................... + // mls v11.4s, v24.4s, v29.4s // ......................................*.....................................|.....................................*..................................... + // sub v24.4s, v8.4s, v10.4s // ....................................*.......................................|...................................*....................................... + // add v8.4s, v8.4s, v10.4s // .....................................*......................................|....................................*...................................... + // mul v10.4s, v24.4s, v1.s[0] // ............................................*...............................|...........................................*............................... + // sqrdmulh v24.4s, v24.4s, v1.s[1] // .............................................*..............................|............................................*.............................. 
+ // mls v10.4s, v24.4s, v29.4s // .....................................................*......................|....................................................*...................... + // sub v24.4s, v9.4s, v11.4s // ..................................................*.........................|.................................................*......................... + // add v9.4s, v9.4s, v11.4s // ...................................................*........................|..................................................*........................ + // mul v11.4s, v24.4s, v1.s[0] // ........................................................*...................|.......................................................*................... + // sqrdmulh v24.4s, v24.4s, v1.s[1] // .........................................................*..................|........................................................*.................. + // mls v11.4s, v24.4s, v29.4s // .....................................................................*......|....................................................................*...... + // srshr v24.4S, v8.4S, #23 // ..............................................*.............................|.............................................*............................. + // mls v8.4s, v24.4s, v29.4s // .......................................................*....................|......................................................*.................... + // srshr v24.4S, v9.4S, #23 // ..........................................................*.................|.........................................................*................. + // mls v9.4s, v24.4s, v29.4s // ...................................................................*........|..................................................................*........ 
+ // str q8, [x0], #(16*4) // ...............................................................*............|..............................................................*............ + // str q9, [x0, #(-16*4 + 1*16)] // ..........................................................................*.|.........................................................................*. + // str q10, [x0, #(-16*4 + 2*16)] // .................................................................*..........|................................................................*.......... + // str q11, [x0, #(-16*4 + 3*16)] // ...........................................................................*|..........................................................................* + + sub count, count, #1 + cbnz count, layer5678_start + trn1 v30.4S, v10.4S, v0.4S // .*.................................. + trn2 v10.4S, v10.4S, v0.4S // *................................... + ldr q14, [x4], #8 // ..*................................. + ldr q19, [x4], #16 // ...*................................ + // gap // .................................... + // gap // .................................... + // gap // .................................... + // gap // .................................... + // gap // .................................... + // gap // .................................... + // gap // .................................... + // gap // .................................... + // gap // .................................... + // gap // .................................... + // gap // .................................... + // gap // .................................... + trn2 v1.2D, v4.2D, v30.2D // .....*.............................. + trn1 v30.2D, v4.2D, v30.2D // ......*............................. + trn2 v24.2D, v5.2D, v10.2D // ....*............................... + trn1 v4.2D, v5.2D, v10.2D // .......*............................ + // gap // .................................... 
+ // gap // .................................... + // gap // .................................... + // gap // .................................... + // gap // .................................... + // gap // .................................... + // gap // .................................... + // gap // .................................... + // gap // .................................... + // gap // .................................... + // gap // .................................... + // gap // .................................... + sub v10.4S, v1.4S, v24.4S // .........*.......................... + sub v3.4S, v30.4S, v4.4S // ..........*......................... + add v30.4S, v30.4S, v4.4S // ...........*........................ + add v17.4S, v1.4S, v24.4S // ........*........................... + // gap // .................................... + // gap // .................................... + // gap // .................................... + // gap // .................................... + // gap // .................................... + // gap // .................................... + // gap // .................................... + // gap // .................................... + // gap // .................................... + // gap // .................................... + // gap // .................................... + // gap // .................................... + mul v27.4S, v10.4S, v19.S[2] // ............*....................... + sqrdmulh v7.4S, v10.4S, v19.S[3] // .............*...................... + mul v0.4S, v3.4S, v19.S[0] // ..............*..................... + sqrdmulh v24.4S, v3.4S, v19.S[1] // ...............*.................... + // gap // .................................... + // gap // .................................... + // gap // .................................... + // gap // .................................... + sub v4.4S, v30.4S, v17.4S // ................*................... 
+ add v16.4S, v30.4S, v17.4S // .................*.................. + // gap // .................................... + // gap // .................................... + // gap // .................................... + // gap // .................................... + // gap // .................................... + // gap // .................................... + // gap // .................................... + // gap // .................................... + // gap // .................................... + // gap // .................................... + // gap // .................................... + // gap // .................................... + // gap // .................................... + // gap // .................................... + mls v27.4S, v7.4S, v29.4S // ..................*................. + mls v0.4S, v24.4S, v29.4S // ...................*................ + srshr v26.4S, v16.4S, #23 // ......................*............. + sqrdmulh v30.4S, v4.4S, v14.S[1] // .....................*.............. + // gap // .................................... + // gap // .................................... + // gap // .................................... + // gap // .................................... + mul v6.4S, v4.4S, v14.S[0] // ....................*............... + // gap // .................................... + // gap // .................................... + // gap // .................................... + // gap // .................................... + // gap // .................................... + // gap // .................................... + // gap // .................................... + // gap // .................................... + // gap // .................................... + // gap // .................................... + // gap // .................................... + // gap // .................................... + // gap // .................................... + // gap // .................................... 
+ // gap // .................................... + sub v4.4S, v0.4S, v27.4S // .......................*............ + mls v16.4S, v26.4S, v29.4S // ..........................*......... + add v24.4S, v0.4S, v27.4S // ........................*........... + // gap // .................................... + // gap // .................................... + // gap // .................................... + // gap // .................................... + // gap // .................................... + mls v6.4S, v30.4S, v29.4S // .........................*.......... + // gap // .................................... + // gap // .................................... + // gap // .................................... + // gap // .................................... + // gap // .................................... + // gap // .................................... + // gap // .................................... + mul v10.4S, v4.4S, v14.S[0] // ...........................*........ + sqrdmulh v30.4S, v4.4S, v14.S[1] // ............................*....... + srshr v28.4S, v24.4S, #23 // .............................*...... + // gap // .................................... + // gap // .................................... + // gap // .................................... + // gap // .................................... + // gap // .................................... + str q16, [x0], #(16*4) // ..............................*..... + // gap // .................................... + // gap // .................................... + // gap // .................................... + // gap // .................................... + // gap // .................................... + // gap // .................................... + // gap // .................................... + str q6, [x0, #-32] // ...............................*.... + // gap // .................................... + // gap // .................................... + // gap // .................................... 
+ // gap // .................................... + // gap // .................................... + // gap // .................................... + // gap // .................................... + mls v24.4S, v28.4S, v29.4S // ................................*... + mls v10.4S, v30.4S, v29.4S // .................................*.. + // gap // .................................... + // gap // .................................... + // gap // .................................... + // gap // .................................... + // gap // .................................... + // gap // .................................... + // gap // .................................... + // gap // .................................... + // gap // .................................... + // gap // .................................... + // gap // .................................... + // gap // .................................... + // gap // .................................... + // gap // .................................... + // gap // .................................... + // gap // .................................... + // gap // .................................... + // gap // .................................... + // gap // .................................... + // gap // .................................... + // gap // .................................... + // gap // .................................... + str q24, [x0, #-48] // ..................................*. + str q10, [x0, #-16] // ...................................* + // gap // .................................... + // gap // .................................... + // gap // .................................... + // gap // .................................... + // gap // .................................... + // gap // .................................... + + // original source code + // trn2 v28.4S, v10.4S, v0.4S // .*.................................. 
+ // trn1 v23.4S, v10.4S, v0.4S // *................................... + // ldr q22, [x4], #8 // ..*................................. + // ldr q20, [x4], #16 // ...*................................ + // trn2 v10.2D, v5.2D, v28.2D // ......*............................. + // trn2 v8.2D, v4.2D, v23.2D // ....*............................... + // trn1 v7.2D, v4.2D, v23.2D // .....*.............................. + // trn1 v6.2D, v5.2D, v28.2D // .......*............................ + // add v17.4S, v8.4S, v10.4S // ...........*........................ + // sub v18.4S, v8.4S, v10.4S // ........*........................... + // sub v28.4S, v7.4S, v6.4S // .........*.......................... + // add v5.4S, v7.4S, v6.4S // ..........*......................... + // mul v31.4S, v18.4S, v20.S[2] // ............*....................... + // sqrdmulh v7.4S, v18.4S, v20.S[3] // .............*...................... + // mul v2.4S, v28.4S, v20.S[0] // ..............*..................... + // sqrdmulh v23.4S, v28.4S, v20.S[1] // ...............*.................... + // sub v8.4S, v5.4S, v17.4S // ................*................... + // add v10.4S, v5.4S, v17.4S // .................*.................. + // mls v31.4S, v7.4S, v29.4S // ..................*................. + // mls v2.4S, v23.4S, v29.4S // ...................*................ + // mul v17.4S, v8.4S, v22.S[0] // ......................*............. + // sqrdmulh v12.4S, v8.4S, v22.S[1] // .....................*.............. + // srshr v30.4S, v10.4S, #23 // ....................*............... + // sub v4.4S, v2.4S, v31.4S // .......................*............ + // add v6.4S, v2.4S, v31.4S // .........................*.......... + // mls v17.4S, v12.4S, v29.4S // ..........................*......... + // mls v10.4S, v30.4S, v29.4S // ........................*........... + // mul v19.4S, v4.4S, v22.S[0] // ...........................*........ 
+ // sqrdmulh v4.4S, v4.4S, v22.S[1] // ............................*....... + // srshr v7.4S, v6.4S, #23 // .............................*...... + // str q10, [x0], #(16*4) // ..............................*..... + // str q17, [x0, #-32] // ...............................*.... + // mls v6.4S, v7.4S, v29.4S // ................................*... + // mls v19.4S, v4.4S, v29.4S // .................................*.. + // str q6, [x0, #-48] // ..................................*. + // str q19, [x0, #-16] // ...................................* + + + .unreq root0_tw + .unreq root1_tw + .unreq root2_tw + .unreq root3_tw + .unreq qform_root0_tw + .unreq qform_root1_tw + .unreq qform_root2_tw + .unreq qform_root3_tw + .unreq t0 + .unreq t1 + + root4 .req v4 + root5 .req v5 + root6 .req v6 + root7 .req v7 + qform_root4 .req q4 + qform_root5 .req q5 + qform_root6 .req q6 + qform_root7 .req q7 + ninv .req v25 + ninv_tw .req v26 + modulus_half .req v30 + neg_modulus_half .req v31 + + + restore in, STACK0 + mov count, #4 + + ASM_LOAD(xtmp, ninv_addr) + ld1r {ninv.4s}, [xtmp] + ASM_LOAD(xtmp, ninv_tw_addr) + ld1r {ninv_tw.4s}, [xtmp] + + ushr modulus_half.4S, modulus.4S, #1 + neg neg_modulus_half.4S, modulus_half.4S + + load_roots_1234 r_ptr1 + + .p2align 2 + ldr q17, [x1, #0] // *....................................................................................................................................................................................................................................................................................... + ldr q12, [x1, #448] // .......*................................................................................................................................................................................................................................................................................ 
+ ldr q27, [x1, #384] // ......*................................................................................................................................................................................................................................................................................. + ldr q16, [x1, #512] // ........*............................................................................................................................................................................................................................................................................... + ldr q18, [x1, #64] // .*...................................................................................................................................................................................................................................................................................... + ldr q9, [x1, #576] // .........*.............................................................................................................................................................................................................................................................................. + ldr q14, [x1, #320] // .....*.................................................................................................................................................................................................................................................................................. + ldr q11, [x1, #256] // ....*................................................................................................................................................................................................................................................................................... 
+ ldr q22, [x1, #640] // ..........*............................................................................................................................................................................................................................................................................. + ldr q15, [x1, #192] // ...*.................................................................................................................................................................................................................................................................................... + ldr q28, [x1, #128] // ..*..................................................................................................................................................................................................................................................................................... + sub v8.4S, v27.4S, v12.4S // ...............................*........................................................................................................................................................................................................................................................ + ldr q19, [x1, #704] // ...........*............................................................................................................................................................................................................................................................................ + add v23.4S, v27.4S, v12.4S // ................................*....................................................................................................................................................................................................................................................... 
+ sub v27.4S, v17.4S, v18.4S // ................*....................................................................................................................................................................................................................................................................... + sub v21.4S, v16.4S, v9.4S // ....................................*................................................................................................................................................................................................................................................... + add v16.4S, v16.4S, v9.4S // .....................................*.................................................................................................................................................................................................................................................. + sub v12.4S, v11.4S, v14.4S // ..........................*............................................................................................................................................................................................................................................................. + add v20.4S, v11.4S, v14.4S // ...........................*............................................................................................................................................................................................................................................................ + mul v11.4S, v8.4S, v5.S[0] // .................................*...................................................................................................................................................................................................................................................... 
+ sqrdmulh v10.4S, v8.4S, v5.S[1] // ..................................*..................................................................................................................................................................................................................................................... + mul v9.4S, v27.4S, v3.S[2] // ..................*..................................................................................................................................................................................................................................................................... + sqrdmulh v24.4S, v27.4S, v3.S[3] // ...................*.................................................................................................................................................................................................................................................................... + sub v27.4S, v28.4S, v15.4S // .....................*.................................................................................................................................................................................................................................................................. + sqrdmulh v14.4S, v21.4S, v5.S[3] // .......................................*................................................................................................................................................................................................................................................ + add v8.4S, v28.4S, v15.4S // ......................*................................................................................................................................................................................................................................................................. 
+ mul v21.4S, v21.4S, v5.S[2] // ......................................*................................................................................................................................................................................................................................................. + sub v28.4S, v22.4S, v19.4S // .........................................*.............................................................................................................................................................................................................................................. + add v19.4S, v22.4S, v19.4S // ..........................................*............................................................................................................................................................................................................................................. + sqrdmulh v15.4S, v27.4S, v4.S[1] // ........................*............................................................................................................................................................................................................................................................... + mul v22.4S, v27.4S, v4.S[0] // .......................*................................................................................................................................................................................................................................................................ + ldr q13, [x1, #832] // .............*.......................................................................................................................................................................................................................................................................... 
+ mls v21.4S, v14.4S, v29.4S // ........................................*............................................................................................................................................................................................................................................... + ldr q14, [x1, #768] // ............*........................................................................................................................................................................................................................................................................... + mls v11.4S, v10.4S, v29.4S // ...................................*.................................................................................................................................................................................................................................................... + ldr q10, [x1, #896] // ..............*......................................................................................................................................................................................................................................................................... + add v27.4S, v20.4S, v23.4S // ...................................................................*.................................................................................................................................................................................................................... + mls v9.4S, v24.4S, v29.4S // ....................*................................................................................................................................................................................................................................................................... 
+ sub v23.4S, v20.4S, v23.4S // ..................................................................*..................................................................................................................................................................................................................... + ldr q20, [x1, #960] // ...............*........................................................................................................................................................................................................................................................................ + add v24.4S, v17.4S, v18.4S // .................*...................................................................................................................................................................................................................................................................... + mul v18.4S, v28.4S, v6.S[0] // ...........................................*............................................................................................................................................................................................................................................ + sqrdmulh v17.4S, v28.4S, v6.S[1] // ............................................*........................................................................................................................................................................................................................................... + add v28.4S, v16.4S, v19.4S // .............................................................................*.......................................................................................................................................................................................................... 
+ mls v22.4S, v15.4S, v29.4S // .........................*.............................................................................................................................................................................................................................................................. + mul v15.4S, v12.4S, v4.S[2] // ............................*........................................................................................................................................................................................................................................................... + sub v19.4S, v16.4S, v19.4S // ............................................................................*........................................................................................................................................................................................................... + sqrdmulh v12.4S, v12.4S, v4.S[3] // .............................*.......................................................................................................................................................................................................................................................... + sub v16.4S, v14.4S, v13.4S // ..............................................*......................................................................................................................................................................................................................................... + add v13.4S, v14.4S, v13.4S // ...............................................*........................................................................................................................................................................................................................................ 
+ mul v14.4S, v23.4S, v2.S[0] // ....................................................................*................................................................................................................................................................................................................... + sqrdmulh v23.4S, v23.4S, v2.S[1] // .....................................................................*.................................................................................................................................................................................................................. + mls v18.4S, v17.4S, v29.4S // .............................................*.......................................................................................................................................................................................................................................... + sub v17.4S, v10.4S, v20.4S // ...................................................*.................................................................................................................................................................................................................................... + add v20.4S, v10.4S, v20.4S // ....................................................*................................................................................................................................................................................................................................... + sub v10.4S, v24.4S, v8.4S // ........................................................*............................................................................................................................................................................................................................... 
+ mls v14.4S, v23.4S, v29.4S // ......................................................................*................................................................................................................................................................................................................. + add v23.4S, v13.4S, v20.4S // .......................................................................................*................................................................................................................................................................................................ + sub v20.4S, v13.4S, v20.4S // ......................................................................................*................................................................................................................................................................................................. + mls v15.4S, v12.4S, v29.4S // ..............................*......................................................................................................................................................................................................................................................... + add v12.4S, v9.4S, v22.4S // ..............................................................*......................................................................................................................................................................................................................... + sub v13.4S, v9.4S, v22.4S // .............................................................*.......................................................................................................................................................................................................................... 
+ add v24.4S, v24.4S, v8.4S // .........................................................*.............................................................................................................................................................................................................................. + mul v22.4S, v17.4S, v7.S[0] // .....................................................*.................................................................................................................................................................................................................................. + sqrdmulh v8.4S, v17.4S, v7.S[1] // ......................................................*................................................................................................................................................................................................................................. + add v17.4S, v15.4S, v11.4S // ........................................................................*............................................................................................................................................................................................................... + sub v11.4S, v15.4S, v11.4S // .......................................................................*................................................................................................................................................................................................................ + sqrdmulh v15.4S, v19.4S, v2.S[3] // ...............................................................................*........................................................................................................................................................................................................ 
+ mul v19.4S, v19.4S, v2.S[2] // ..............................................................................*......................................................................................................................................................................................................... + sub v9.4S, v24.4S, v27.4S // ................................................................................................*....................................................................................................................................................................................... + add v24.4S, v24.4S, v27.4S // .................................................................................................*...................................................................................................................................................................................... + mul v27.4S, v20.4S, v3.S[0] // ........................................................................................*............................................................................................................................................................................................... + mls v22.4S, v8.4S, v29.4S // .......................................................*................................................................................................................................................................................................................................ + sqrdmulh v8.4S, v20.4S, v3.S[1] // .........................................................................................*.............................................................................................................................................................................................. 
+ sub v20.4S, v21.4S, v18.4S // .................................................................................*...................................................................................................................................................................................................... + add v21.4S, v21.4S, v18.4S // ..................................................................................*..................................................................................................................................................................................................... + mls v19.4S, v15.4S, v29.4S // ................................................................................*....................................................................................................................................................................................................... + mul v18.4S, v16.4S, v6.S[2] // ................................................*....................................................................................................................................................................................................................................... + sqrdmulh v15.4S, v16.4S, v6.S[3] // .................................................*...................................................................................................................................................................................................................................... + add v16.4S, v28.4S, v23.4S // .....................................................................................................................*.................................................................................................................................................................. 
+ sub v28.4S, v28.4S, v23.4S // ....................................................................................................................*................................................................................................................................................................... + mul v23.4S, v10.4S, v1.S[2] // ..........................................................*............................................................................................................................................................................................................................. + mls v27.4S, v8.4S, v29.4S // ..........................................................................................*............................................................................................................................................................................................. + sqrdmulh v8.4S, v10.4S, v1.S[3] // ...........................................................*............................................................................................................................................................................................................................ + mls v18.4S, v15.4S, v29.4S // ..................................................*..................................................................................................................................................................................................................................... + sub v15.4S, v19.4S, v27.4S // ..............................................................................................................................*......................................................................................................................................................... 
+ mls v23.4S, v8.4S, v29.4S // ............................................................*........................................................................................................................................................................................................................... + add v27.4S, v19.4S, v27.4S // ...............................................................................................................................*........................................................................................................................................................ + add v8.4S, v24.4S, v16.4S // .........................................................................................................................................*.............................................................................................................................................. + sub v16.4S, v24.4S, v16.4S // ........................................................................................................................................*............................................................................................................................................... + sqrdmulh v24.4S, v13.4S, v1.S[3] // ................................................................*....................................................................................................................................................................................................................... + mul v13.4S, v13.4S, v1.S[2] // ...............................................................*........................................................................................................................................................................................................................ 
+ mul v19.4S, v11.4S, v2.S[0] // .........................................................................*.............................................................................................................................................................................................................. + add v10.4S, v23.4S, v14.4S // ...........................................................................................................*............................................................................................................................................................................ + sub v14.4S, v23.4S, v14.4S // ..........................................................................................................*............................................................................................................................................................................. + sqrdmulh v23.4S, v11.4S, v2.S[1] // ..........................................................................*............................................................................................................................................................................................................. + sub v11.4S, v18.4S, v22.4S // ...........................................................................................*............................................................................................................................................................................................ + add v18.4S, v18.4S, v22.4S // ............................................................................................*........................................................................................................................................................................................... 
+ mls v13.4S, v24.4S, v29.4S // .................................................................*...................................................................................................................................................................................................................... + mul v24.4S, v14.4S, v0.S[2] // ............................................................................................................*........................................................................................................................................................................... + sqrdmulh v14.4S, v14.4S, v0.S[3] // .............................................................................................................*.......................................................................................................................................................................... + mls v19.4S, v23.4S, v29.4S // ...........................................................................*............................................................................................................................................................................................................ + sqrdmulh v22.4S, v16.4S, v0.S[1] // ...........................................................................................................................................*............................................................................................................................................ + mul v16.4S, v16.4S, v0.S[0] // ..........................................................................................................................................*............................................................................................................................................. 
+ mls v24.4S, v14.4S, v29.4S // ..............................................................................................................*......................................................................................................................................................................... + mul v23.4S, v11.4S, v3.S[0] // .............................................................................................*.......................................................................................................................................................................................... + sqrdmulh v14.4S, v11.4S, v3.S[1] // ..............................................................................................*......................................................................................................................................................................................... + add v11.4S, v13.4S, v19.4S // ................................................................................................................*....................................................................................................................................................................... + sub v13.4S, v13.4S, v19.4S // ...............................................................................................................*........................................................................................................................................................................ + sub v19.4S, v12.4S, v17.4S // .....................................................................................................*.................................................................................................................................................................................. 
+ mls v16.4S, v22.4S, v29.4S // ............................................................................................................................................*........................................................................................................................................... + add v22.4S, v12.4S, v17.4S // ......................................................................................................*................................................................................................................................................................................. + sqrdmulh v17.4S, v9.4S, v0.S[3] // ...................................................................................................*.................................................................................................................................................................................... + sub v12.4S, v21.4S, v18.4S // .........................................................................................................................*.............................................................................................................................................................. + add v18.4S, v21.4S, v18.4S // ..........................................................................................................................*............................................................................................................................................................. + mul v9.4S, v9.4S, v0.S[2] // ..................................................................................................*..................................................................................................................................................................................... 
+ sqrdmulh v21.4S, v20.4S, v2.S[3] // ....................................................................................*................................................................................................................................................................................................... + mul v20.4S, v20.4S, v2.S[2] // ...................................................................................*.................................................................................................................................................................................................... + mls v23.4S, v14.4S, v29.4S // ...............................................................................................*........................................................................................................................................................................................ + mul v14.4S, v15.4S, v1.S[0] // ................................................................................................................................*....................................................................................................................................................... + sqrdmulh v15.4S, v15.4S, v1.S[1] // .................................................................................................................................*...................................................................................................................................................... + mls v9.4S, v17.4S, v29.4S // ....................................................................................................*................................................................................................................................................................................... 
+ sqrdmulh v17.4S, v12.4S, v1.S[1] // ............................................................................................................................*........................................................................................................................................................... + mls v20.4S, v21.4S, v29.4S // .....................................................................................*.................................................................................................................................................................................................. + mul v21.4S, v12.4S, v1.S[0] // ...........................................................................................................................*............................................................................................................................................................ + mul v12.4S, v28.4S, v1.S[0] // ......................................................................................................................*................................................................................................................................................................. + sqrdmulh v28.4S, v28.4S, v1.S[1] // .......................................................................................................................*................................................................................................................................................................ + mls v14.4S, v15.4S, v29.4S // ..................................................................................................................................*..................................................................................................................................................... 
+ mul v15.4S, v19.4S, v0.S[2] // .......................................................................................................*................................................................................................................................................................................ + sqrdmulh v19.4S, v19.4S, v0.S[3] // ........................................................................................................*............................................................................................................................................................................... + mls v21.4S, v17.4S, v29.4S // .............................................................................................................................*.......................................................................................................................................................... + mls v12.4S, v28.4S, v29.4S // ........................................................................................................................*............................................................................................................................................................... + cmge v17.4S, v16.4S, v30.4S // .................................................................................................................................................................................*...................................................................................................... + cmge v28.4S, v31.4S, v16.4S // ................................................................................................................................................................................*....................................................................................................... 
+ mls v15.4S, v19.4S, v29.4S // .........................................................................................................*.............................................................................................................................................................................. + sub v19.4S, v10.4S, v27.4S // ..................................................................................................................................................*..................................................................................................................................... + add v10.4S, v10.4S, v27.4S // ...................................................................................................................................................*.................................................................................................................................... + add v27.4S, v20.4S, v23.4S // ....................................................................................................................................*................................................................................................................................................... + sub v23.4S, v20.4S, v23.4S // ...................................................................................................................................*.................................................................................................................................................... + sub v17.4S, v28.4S, v17.4S // ..................................................................................................................................................................................*..................................................................................................... 
+ sub count, count, #1 +layer1234_start: + mls v16.4S, v17.4S, v29.4S // ...................................................................................................................................................................................*.................................................................................................... + sub v17.4S, v22.4S, v18.4S // .............................................................................................................................................*.......................................................................................................................................... + sub v20.4S, v9.4S, v12.4S // ............................................................................................................................................................*........................................................................................................................... + add v12.4S, v9.4S, v12.4S // .............................................................................................................................................................*.......................................................................................................................... + add v9.4S, v22.4S, v18.4S // ..............................................................................................................................................*......................................................................................................................................... + mul v28.4S, v13.4S, v0.S[2] // .................................................................................................................*...................................................................................................................................................................... 
+ sqrdmulh v22.4S, v13.4S, v0.S[3] // ..................................................................................................................*..................................................................................................................................................................... + sub v18.4S, v15.4S, v21.4S // .................................................................................................................................................................*...................................................................................................................... + add v13.4S, v15.4S, v21.4S // ..................................................................................................................................................................*..................................................................................................................... + mul v21.4S, v20.4S, v0.S[0] // ..............................................................................................................................................................*......................................................................................................................... + sqrdmulh v20.4S, v20.4S, v0.S[1] // ...............................................................................................................................................................*........................................................................................................................ + sub v15.4S, v24.4S, v14.4S // ......................................................................................................................................................................*................................................................................................................. 
+ add v14.4S, v24.4S, v14.4S // .......................................................................................................................................................................*................................................................................................................ + str q16, [x1, #512] // ................................................................................................................................................................................................................*....................................................................... + mul v24.4S, v23.4S, v1.S[0] // .....................................................................................................................................*.................................................................................................................................................. + sqrdmulh v16.4S, v18.4S, v0.S[1] // ....................................................................................................................................................................*................................................................................................................... + mls v28.4S, v22.4S, v29.4S // ...................................................................................................................*.................................................................................................................................................................... + sqrdmulh v22.4S, v23.4S, v1.S[1] // ......................................................................................................................................*................................................................................................................................................. 
+ sub v23.4S, v11.4S, v27.4S // .......................................................................................................................................................*................................................................................................................................ + add v11.4S, v11.4S, v27.4S // ........................................................................................................................................................*............................................................................................................................... + mls v21.4S, v20.4S, v29.4S // ................................................................................................................................................................*....................................................................................................................... + mul v27.4S, v18.4S, v0.S[0] // ...................................................................................................................................................................*.................................................................................................................... + mul v18.4S, v19.4S, v0.S[0] // ....................................................................................................................................................*................................................................................................................................... + sqrdmulh v20.4S, v19.4S, v0.S[1] // .....................................................................................................................................................*.................................................................................................................................. 
+ sqrdmulh v19.4S, v17.4S, v0.S[1] // ................................................................................................................................................*....................................................................................................................................... + mul v17.4S, v17.4S, v0.S[0] // ...............................................................................................................................................*........................................................................................................................................ + mls v24.4S, v22.4S, v29.4S // .......................................................................................................................................*................................................................................................................................................ + mul v22.4S, v15.4S, v0.S[0] // ........................................................................................................................................................................*............................................................................................................... + mls v27.4S, v16.4S, v29.4S // .....................................................................................................................................................................*.................................................................................................................. + mls v17.4S, v19.4S, v29.4S // .................................................................................................................................................*...................................................................................................................................... 
+ cmge v19.4S, v31.4S, v21.4S // ................................................................................................................................................................................................*....................................................................................... + cmge v16.4S, v21.4S, v30.4S // .................................................................................................................................................................................................*...................................................................................... + mls v18.4S, v20.4S, v29.4S // ......................................................................................................................................................*................................................................................................................................. + sqrdmulh v20.4S, v15.4S, v0.S[1] // .........................................................................................................................................................................*.............................................................................................................. + add v15.4S, v28.4S, v24.4S // ............................................................................................................................................................................*........................................................................................................... + sub v24.4S, v28.4S, v24.4S // ...........................................................................................................................................................................*............................................................................................................ 
+ sub v28.4S, v19.4S, v16.4S // ..................................................................................................................................................................................................*..................................................................................... + mul v19.4S, v23.4S, v0.S[0] // .........................................................................................................................................................*.............................................................................................................................. + sqrdmulh v16.4S, v23.4S, v0.S[1] // ..........................................................................................................................................................*............................................................................................................................. + mul v23.4S, v24.4S, v0.S[0] // .............................................................................................................................................................................*.......................................................................................................... + sqrdmulh v24.4S, v24.4S, v0.S[1] // ..............................................................................................................................................................................*......................................................................................................... + mls v21.4S, v28.4S, v29.4S // ...................................................................................................................................................................................................*.................................................................................... 
+ cmge v28.4S, v18.4S, v30.4S // .........................................................................................................................................................................................*.............................................................................................. + mls v22.4S, v20.4S, v29.4S // ..........................................................................................................................................................................*............................................................................................................. + cmge v20.4S, v31.4S, v18.4S // ........................................................................................................................................................................................*............................................................................................... + mls v19.4S, v16.4S, v29.4S // ...........................................................................................................................................................*............................................................................................................................ + mul v16.4S, v8.4S, v25.4S // ........................................................................................................................................................................................................................*............................................................... + sqrdmulh v8.4S, v8.4S, v26.4S // .........................................................................................................................................................................................................................*.............................................................. 
+ mls v23.4S, v24.4S, v29.4S // ...............................................................................................................................................................................*........................................................................................................ + sub v20.4S, v20.4S, v28.4S // ..........................................................................................................................................................................................*............................................................................................. + cmge v28.4S, v31.4S, v17.4S // ....................................................................................................................................................................................*................................................................................................... + cmge v24.4S, v17.4S, v30.4S // .....................................................................................................................................................................................*.................................................................................................. + str q21, [x1, #768] // ....................................................................................................................................................................................................................*................................................................... + mls v16.4S, v8.4S, v29.4S // ..........................................................................................................................................................................................................................*............................................................. 
+ sub v21.4S, v28.4S, v24.4S // ......................................................................................................................................................................................*................................................................................................. + cmge v28.4S, v19.4S, v30.4S // .............................................................................................................................................................................................*.......................................................................................... + mls v18.4S, v20.4S, v29.4S // ...........................................................................................................................................................................................*............................................................................................ + cmge v20.4S, v31.4S, v23.4S // ............................................................................................................................................................................................................*........................................................................... + cmge v8.4S, v23.4S, v30.4S // .............................................................................................................................................................................................................*.......................................................................... + cmge v24.4S, v31.4S, v19.4S // ............................................................................................................................................................................................*........................................................................................... 
+ mls v17.4S, v21.4S, v29.4S // .......................................................................................................................................................................................*................................................................................................ + mul v21.4S, v13.4S, v25.4S // .......................................................................................................................................................................................................................................*................................................ + sub v28.4S, v24.4S, v28.4S // ..............................................................................................................................................................................................*......................................................................................... + sqrdmulh v24.4S, v13.4S, v26.4S // ........................................................................................................................................................................................................................................*............................................... + sub v13.4S, v20.4S, v8.4S // ..............................................................................................................................................................................................................*......................................................................... + mul v20.4S, v12.4S, v25.4S // ....................................................................................................................................................................................................................................*................................................... 
+ str q18, [x1, #640] // ..................................................................................................................................................................................................................*..................................................................... + sqrdmulh v12.4S, v12.4S, v26.4S // .....................................................................................................................................................................................................................................*.................................................. + cmge v8.4S, v22.4S, v30.4S // .........................................................................................................................................................................................................*.............................................................................. + cmge v18.4S, v31.4S, v22.4S // ........................................................................................................................................................................................................*............................................................................... + mls v23.4S, v13.4S, v29.4S // ...............................................................................................................................................................................................................*........................................................................ + sqrdmulh v13.4S, v10.4S, v26.4S // ...............................................................................................................................................................................................................................*........................................................ 
+ str q17, [x1, #576] // .................................................................................................................................................................................................................*...................................................................... + mul v17.4S, v9.4S, v25.4S // ...........................................................................................................................................................................................................................*............................................................ + mls v19.4S, v28.4S, v29.4S // ...............................................................................................................................................................................................*........................................................................................ + sqrdmulh v9.4S, v9.4S, v26.4S // ............................................................................................................................................................................................................................*........................................................... + mls v21.4S, v24.4S, v29.4S // .........................................................................................................................................................................................................................................*.............................................. + sub v24.4S, v18.4S, v8.4S // ..........................................................................................................................................................................................................*............................................................................. 
+ mul v18.4S, v10.4S, v25.4S // ..............................................................................................................................................................................................................................*......................................................... + cmge v10.4S, v27.4S, v30.4S // .....................................................................................................................................................................................................*.................................................................................. + sqrdmulh v8.4S, v11.4S, v26.4S // ..................................................................................................................................................................................................................................*..................................................... + sqrdmulh v28.4S, v15.4S, v26.4S // ..............................................................................................................................................................................................................................................*......................................... + mls v20.4S, v12.4S, v29.4S // ......................................................................................................................................................................................................................................*................................................. + str q23, [x1, #960] // .......................................................................................................................................................................................................................*................................................................ 
+ str q19, [x1, #704] // ...................................................................................................................................................................................................................*.................................................................... + mls v22.4S, v24.4S, v29.4S // ...........................................................................................................................................................................................................*............................................................................ + mul v23.4S, v15.4S, v25.4S // .............................................................................................................................................................................................................................................*.......................................... + cmge v15.4S, v16.4S, v30.4S // .................................................................................................................................................................................................................................................*...................................... + cmge v12.4S, v31.4S, v27.4S // ....................................................................................................................................................................................................*................................................................................... + mul v19.4S, v11.4S, v25.4S // .................................................................................................................................................................................................................................*...................................................... 
+ cmge v24.4S, v20.4S, v30.4S // .................................................................................................................................................................................................................................................................*...................... + cmge v11.4S, v31.4S, v16.4S // ................................................................................................................................................................................................................................................*....................................... + sub v10.4S, v12.4S, v10.4S // ......................................................................................................................................................................................................*................................................................................. + cmge v12.4S, v31.4S, v20.4S // ................................................................................................................................................................................................................................................................*....................... + mls v18.4S, v13.4S, v29.4S // ................................................................................................................................................................................................................................*....................................................... + sqrdmulh v13.4S, v14.4S, v26.4S // ...........................................................................................................................................................................................................................................*............................................ 
+ sub v11.4S, v11.4S, v15.4S // ..................................................................................................................................................................................................................................................*..................................... + cmge v15.4S, v21.4S, v30.4S // .....................................................................................................................................................................................................................................................................*.................. + str q22, [x1, #896] // ......................................................................................................................................................................................................................*................................................................. + mul v22.4S, v14.4S, v25.4S // ..........................................................................................................................................................................................................................................*............................................. + mls v23.4S, v28.4S, v29.4S // ...............................................................................................................................................................................................................................................*........................................ + mls v17.4S, v9.4S, v29.4S // .............................................................................................................................................................................................................................*.......................................................... 
+ sub v14.4S, v12.4S, v24.4S // ..................................................................................................................................................................................................................................................................*..................... + mls v27.4S, v10.4S, v29.4S // .......................................................................................................................................................................................................*................................................................................ + mls v19.4S, v8.4S, v29.4S // ...................................................................................................................................................................................................................................*.................................................... + cmge v28.4S, v18.4S, v30.4S // .........................................................................................................................................................................................................................................................*.............................. + cmge v9.4S, v31.4S, v21.4S // ....................................................................................................................................................................................................................................................................*................... + mls v16.4S, v11.4S, v29.4S // ...................................................................................................................................................................................................................................................*.................................... 
+ cmge v8.4S, v31.4S, v18.4S // ........................................................................................................................................................................................................................................................*............................... + cmge v10.4S, v23.4S, v30.4S // .............................................................................................................................................................................................................................................................................*.......... + cmge v12.4S, v31.4S, v23.4S // ............................................................................................................................................................................................................................................................................*........... + mls v22.4S, v13.4S, v29.4S // ............................................................................................................................................................................................................................................*........................................... + sub v13.4S, v9.4S, v15.4S // ......................................................................................................................................................................................................................................................................*................. + sub v24.4S, v8.4S, v28.4S // ..........................................................................................................................................................................................................................................................*............................. 
+ str q27, [x1, #832] // .....................................................................................................................................................................................................................*.................................................................. + str q16, [x1], #(16) // ................................................................................................................................................................................................................................................................................*....... + cmge v27.4S, v31.4S, v17.4S // ....................................................................................................................................................................................................................................................*................................... + ldr q9, [x1, #256] // ....e................................................................................................................................................................................................................................................................................... + cmge v8.4S, v17.4S, v30.4S // .....................................................................................................................................................................................................................................................*.................................. + ldr q16, [x1, #320] // .....e.................................................................................................................................................................................................................................................................................. 
+ mls v20.4S, v14.4S, v29.4S // ...................................................................................................................................................................................................................................................................*.................... + sub v14.4S, v12.4S, v10.4S // ..............................................................................................................................................................................................................................................................................*......... + ldr q10, [x1, #384] // ......e................................................................................................................................................................................................................................................................................. + cmge v15.4S, v22.4S, v30.4S // .........................................................................................................................................................................................................................................................................*.............. + ldr q12, [x1, #448] // .......e................................................................................................................................................................................................................................................................................ + cmge v28.4S, v19.4S, v30.4S // .............................................................................................................................................................................................................................................................*.......................... 
+ ldr q11, [x1, #512] // ........e............................................................................................................................................................................................................................................................................... + mls v18.4S, v24.4S, v29.4S // ...........................................................................................................................................................................................................................................................*............................ + sub v24.4S, v27.4S, v8.4S // ......................................................................................................................................................................................................................................................*................................. + cmge v8.4S, v31.4S, v22.4S // ........................................................................................................................................................................................................................................................................*............... + cmge v27.4S, v31.4S, v19.4S // ............................................................................................................................................................................................................................................................*........................... + mls v23.4S, v14.4S, v29.4S // ...............................................................................................................................................................................................................................................................................*........ 
+ ldr q14, [x1, #576] // .........e.............................................................................................................................................................................................................................................................................. + mls v21.4S, v13.4S, v29.4S // .......................................................................................................................................................................................................................................................................*................ + mls v17.4S, v24.4S, v29.4S // .......................................................................................................................................................................................................................................................*................................ + add v24.4S, v9.4S, v16.4S // ...........................e............................................................................................................................................................................................................................................................ + sub v13.4S, v9.4S, v16.4S // ..........................e............................................................................................................................................................................................................................................................. + sub v16.4S, v27.4S, v28.4S // ..............................................................................................................................................................................................................................................................*......................... 
+ str q18, [x1, #112] // ..................................................................................................................................................................................................................................................................................*..... + sub v27.4S, v10.4S, v12.4S // ...............................e........................................................................................................................................................................................................................................................ + sub v15.4S, v8.4S, v15.4S // ..........................................................................................................................................................................................................................................................................*............. + sub v18.4S, v11.4S, v14.4S // ....................................e................................................................................................................................................................................................................................................... + str q20, [x1, #240] // ....................................................................................................................................................................................................................................................................................*... + add v20.4S, v11.4S, v14.4S // .....................................e.................................................................................................................................................................................................................................................. 
+ add v8.4S, v10.4S, v12.4S // ................................e....................................................................................................................................................................................................................................................... + ldr q10, [x1, #704] // ...........e............................................................................................................................................................................................................................................................................ + ldr q14, [x1, #192] // ...e.................................................................................................................................................................................................................................................................................... + ldr q28, [x1, #640] // ..........e............................................................................................................................................................................................................................................................................. + ldr q12, [x1, #64] // .e...................................................................................................................................................................................................................................................................................... + mls v19.4S, v16.4S, v29.4S // ...............................................................................................................................................................................................................................................................*........................ 
+ sqrdmulh v9.4S, v27.4S, v5.S[1] // ..................................e..................................................................................................................................................................................................................................................... + str q17, [x1, #48] // .................................................................................................................................................................................................................................................................................*...... + ldr q17, [x1, #0] // e....................................................................................................................................................................................................................................................................................... + ldr q16, [x1, #128] // ..e..................................................................................................................................................................................................................................................................................... + mul v11.4S, v27.4S, v5.S[0] // .................................e...................................................................................................................................................................................................................................................... + str q21, [x1, #304] // .....................................................................................................................................................................................................................................................................................*.. 
+ add v27.4S, v24.4S, v8.4S // ...................................................................e.................................................................................................................................................................................................................... + mul v21.4S, v18.4S, v5.S[2] // ......................................e................................................................................................................................................................................................................................................. + str q23, [x1, #432] // .......................................................................................................................................................................................................................................................................................* + sub v23.4S, v24.4S, v8.4S // ..................................................................e..................................................................................................................................................................................................................... + sqrdmulh v8.4S, v18.4S, v5.S[3] // .......................................e................................................................................................................................................................................................................................................ + mul v18.4S, v13.4S, v4.S[2] // ............................e........................................................................................................................................................................................................................................................... 
+ sqrdmulh v24.4S, v13.4S, v4.S[3] // .............................e.......................................................................................................................................................................................................................................................... + mls v22.4S, v15.4S, v29.4S // ...........................................................................................................................................................................................................................................................................*............ + add v15.4S, v28.4S, v10.4S // ..........................................e............................................................................................................................................................................................................................................. + str q19, [x1, #176] // ...................................................................................................................................................................................................................................................................................*.... + sub v19.4S, v28.4S, v10.4S // .........................................e.............................................................................................................................................................................................................................................. + mul v10.4S, v23.4S, v2.S[0] // ....................................................................e................................................................................................................................................................................................................... 
+ add v13.4S, v16.4S, v14.4S // ......................e................................................................................................................................................................................................................................................................. + mls v11.4S, v9.4S, v29.4S // ...................................e.................................................................................................................................................................................................................................................... + sqrdmulh v23.4S, v23.4S, v2.S[1] // .....................................................................e.................................................................................................................................................................................................................. + sub v9.4S, v16.4S, v14.4S // .....................e.................................................................................................................................................................................................................................................................. + ldr q28, [x1, #832] // .............e.......................................................................................................................................................................................................................................................................... + str q22, [x1, #368] // ......................................................................................................................................................................................................................................................................................*. 
+ mul v14.4S, v19.4S, v6.S[0] // ...........................................e............................................................................................................................................................................................................................................ + mls v18.4S, v24.4S, v29.4S // ..............................e......................................................................................................................................................................................................................................................... + mls v21.4S, v8.4S, v29.4S // ........................................e............................................................................................................................................................................................................................................... + sqrdmulh v16.4S, v19.4S, v6.S[1] // ............................................e........................................................................................................................................................................................................................................... + ldr q24, [x1, #768] // ............e........................................................................................................................................................................................................................................................................... + mul v19.4S, v9.4S, v4.S[0] // .......................e................................................................................................................................................................................................................................................................ 
+ sqrdmulh v22.4S, v9.4S, v4.S[1] // ........................e............................................................................................................................................................................................................................................................... + add v9.4S, v17.4S, v12.4S // .................e...................................................................................................................................................................................................................................................................... + sub v17.4S, v17.4S, v12.4S // ................e....................................................................................................................................................................................................................................................................... + ldr q12, [x1, #960] // ...............e........................................................................................................................................................................................................................................................................ + ldr q8, [x1, #896] // ..............e......................................................................................................................................................................................................................................................................... + mls v10.4S, v23.4S, v29.4S // ......................................................................e................................................................................................................................................................................................................. 
+ sub v23.4S, v20.4S, v15.4S // ............................................................................e........................................................................................................................................................................................................... + mls v14.4S, v16.4S, v29.4S // .............................................e.......................................................................................................................................................................................................................................... + add v16.4S, v20.4S, v15.4S // .............................................................................e.......................................................................................................................................................................................................... + add v20.4S, v9.4S, v13.4S // .........................................................e.............................................................................................................................................................................................................................. + mls v19.4S, v22.4S, v29.4S // .........................e.............................................................................................................................................................................................................................................................. + add v22.4S, v24.4S, v28.4S // ...............................................e........................................................................................................................................................................................................................................ 
+ sub v15.4S, v24.4S, v28.4S // ..............................................e......................................................................................................................................................................................................................................... + sub v24.4S, v9.4S, v13.4S // ........................................................e............................................................................................................................................................................................................................... + add v13.4S, v8.4S, v12.4S // ....................................................e................................................................................................................................................................................................................................... + sub v8.4S, v8.4S, v12.4S // ...................................................e.................................................................................................................................................................................................................................... + sub v28.4S, v22.4S, v13.4S // ......................................................................................e................................................................................................................................................................................................. + add v13.4S, v22.4S, v13.4S // .......................................................................................e................................................................................................................................................................................................ 
+ mul v22.4S, v17.4S, v3.S[2] // ..................e..................................................................................................................................................................................................................................................................... + sqrdmulh v17.4S, v17.4S, v3.S[3] // ...................e.................................................................................................................................................................................................................................................................... + mul v12.4S, v15.4S, v6.S[2] // ................................................e....................................................................................................................................................................................................................................... + sqrdmulh v15.4S, v15.4S, v6.S[3] // .................................................e...................................................................................................................................................................................................................................... + sub v9.4S, v20.4S, v27.4S // ................................................................................................e....................................................................................................................................................................................... + add v27.4S, v20.4S, v27.4S // .................................................................................................e...................................................................................................................................................................................... 
+ sub v20.4S, v16.4S, v13.4S // ....................................................................................................................e................................................................................................................................................................... + add v16.4S, v16.4S, v13.4S // .....................................................................................................................e.................................................................................................................................................................. + mul v13.4S, v24.4S, v1.S[2] // ..........................................................e............................................................................................................................................................................................................................. + sqrdmulh v24.4S, v24.4S, v1.S[3] // ...........................................................e............................................................................................................................................................................................................................ + mls v22.4S, v17.4S, v29.4S // ....................e................................................................................................................................................................................................................................................................... + add v17.4S, v18.4S, v11.4S // ........................................................................e............................................................................................................................................................................................................... 
+ mls v12.4S, v15.4S, v29.4S // ..................................................e..................................................................................................................................................................................................................................... + sub v18.4S, v18.4S, v11.4S // .......................................................................e................................................................................................................................................................................................................ + mul v15.4S, v8.4S, v7.S[0] // .....................................................e.................................................................................................................................................................................................................................. + sqrdmulh v11.4S, v8.4S, v7.S[1] // ......................................................e................................................................................................................................................................................................................................. + sqrdmulh v8.4S, v28.4S, v3.S[1] // .........................................................................................e.............................................................................................................................................................................................. + mls v13.4S, v24.4S, v29.4S // ............................................................e........................................................................................................................................................................................................................... 
+ add v24.4S, v22.4S, v19.4S // ..............................................................e......................................................................................................................................................................................................................... + sub v19.4S, v22.4S, v19.4S // .............................................................e.......................................................................................................................................................................................................................... + mul v22.4S, v28.4S, v3.S[0] // ........................................................................................e............................................................................................................................................................................................... + mls v15.4S, v11.4S, v29.4S // .......................................................e................................................................................................................................................................................................................................ + mul v11.4S, v19.4S, v1.S[2] // ...............................................................e........................................................................................................................................................................................................................ + sqrdmulh v28.4S, v19.4S, v1.S[3] // ................................................................e....................................................................................................................................................................................................................... 
+ sqrdmulh v19.4S, v23.4S, v2.S[3] // ...............................................................................e........................................................................................................................................................................................................ + mul v23.4S, v23.4S, v2.S[2] // ..............................................................................e......................................................................................................................................................................................................... + mls v11.4S, v28.4S, v29.4S // .................................................................e...................................................................................................................................................................................................................... + mls v23.4S, v19.4S, v29.4S // ................................................................................e....................................................................................................................................................................................................... + add v28.4S, v12.4S, v15.4S // ............................................................................................e........................................................................................................................................................................................... + sub v15.4S, v12.4S, v15.4S // ...........................................................................................e............................................................................................................................................................................................ 
+ sub v12.4S, v27.4S, v16.4S // ........................................................................................................................................e............................................................................................................................................... + mls v22.4S, v8.4S, v29.4S // ..........................................................................................e............................................................................................................................................................................................. + add v8.4S, v27.4S, v16.4S // .........................................................................................................................................e.............................................................................................................................................. + add v19.4S, v13.4S, v10.4S // ...........................................................................................................e............................................................................................................................................................................ + sub v13.4S, v13.4S, v10.4S // ..........................................................................................................e............................................................................................................................................................................. + mul v16.4S, v12.4S, v0.S[0] // ..........................................................................................................................................e............................................................................................................................................. 
+ sqrdmulh v12.4S, v12.4S, v0.S[1] // ...........................................................................................................................................e............................................................................................................................................ + sub v10.4S, v24.4S, v17.4S // .....................................................................................................e.................................................................................................................................................................................. + add v27.4S, v23.4S, v22.4S // ...............................................................................................................................e........................................................................................................................................................ + sub v23.4S, v23.4S, v22.4S // ..............................................................................................................................e......................................................................................................................................................... + add v22.4S, v24.4S, v17.4S // ......................................................................................................e................................................................................................................................................................................. + mul v24.4S, v13.4S, v0.S[2] // ............................................................................................................e........................................................................................................................................................................... 
+ sqrdmulh v17.4S, v18.4S, v2.S[1] // ..........................................................................e............................................................................................................................................................................................................. + mul v18.4S, v18.4S, v2.S[0] // .........................................................................e.............................................................................................................................................................................................................. + sqrdmulh v13.4S, v13.4S, v0.S[3] // .............................................................................................................e.......................................................................................................................................................................... + mls v16.4S, v12.4S, v29.4S // ............................................................................................................................................e........................................................................................................................................... + mul v12.4S, v20.4S, v1.S[0] // ......................................................................................................................e................................................................................................................................................................. + sqrdmulh v20.4S, v20.4S, v1.S[1] // .......................................................................................................................e................................................................................................................................................................ 
+ mls v24.4S, v13.4S, v29.4S // ..............................................................................................................e......................................................................................................................................................................... + mls v18.4S, v17.4S, v29.4S // ...........................................................................e............................................................................................................................................................................................................ + sqrdmulh v13.4S, v9.4S, v0.S[3] // ...................................................................................................e.................................................................................................................................................................................... + mul v9.4S, v9.4S, v0.S[2] // ..................................................................................................e..................................................................................................................................................................................... + sub v17.4S, v21.4S, v14.4S // .................................................................................e...................................................................................................................................................................................................... + mls v12.4S, v20.4S, v29.4S // ........................................................................................................................e............................................................................................................................................................... 
+ mul v20.4S, v15.4S, v3.S[0] // .............................................................................................e.......................................................................................................................................................................................... + sqrdmulh v15.4S, v15.4S, v3.S[1] // ..............................................................................................e......................................................................................................................................................................................... + add v21.4S, v21.4S, v14.4S // ..................................................................................e..................................................................................................................................................................................................... + mul v14.4S, v23.4S, v1.S[0] // ................................................................................................................................e....................................................................................................................................................... + sqrdmulh v23.4S, v23.4S, v1.S[1] // .................................................................................................................................e...................................................................................................................................................... + mls v9.4S, v13.4S, v29.4S // ....................................................................................................e................................................................................................................................................................................... 
+ sub v13.4S, v11.4S, v18.4S // ...............................................................................................................e........................................................................................................................................................................ + add v11.4S, v11.4S, v18.4S // ................................................................................................................e....................................................................................................................................................................... + add v18.4S, v21.4S, v28.4S // ..........................................................................................................................e............................................................................................................................................................. + sub v21.4S, v21.4S, v28.4S // .........................................................................................................................e.............................................................................................................................................................. + mul v28.4S, v17.4S, v2.S[2] // ...................................................................................e.................................................................................................................................................................................................... + sqrdmulh v17.4S, v17.4S, v2.S[3] // ....................................................................................e................................................................................................................................................................................................... 
+ mls v20.4S, v15.4S, v29.4S // ...............................................................................................e........................................................................................................................................................................................ + mul v15.4S, v10.4S, v0.S[2] // .......................................................................................................e................................................................................................................................................................................ + sqrdmulh v10.4S, v10.4S, v0.S[3] // ........................................................................................................e............................................................................................................................................................................... + mls v14.4S, v23.4S, v29.4S // ..................................................................................................................................e..................................................................................................................................................... + mls v28.4S, v17.4S, v29.4S // .....................................................................................e.................................................................................................................................................................................................. + sqrdmulh v23.4S, v21.4S, v1.S[1] // ............................................................................................................................e........................................................................................................................................................... 
+ mul v21.4S, v21.4S, v1.S[0] // ...........................................................................................................................e............................................................................................................................................................ + cmge v17.4S, v31.4S, v16.4S // ................................................................................................................................................................................e....................................................................................................... + mls v15.4S, v10.4S, v29.4S // .........................................................................................................e.............................................................................................................................................................................. + add v10.4S, v19.4S, v27.4S // ...................................................................................................................................................e.................................................................................................................................... + sub v19.4S, v19.4S, v27.4S // ..................................................................................................................................................e..................................................................................................................................... + cmge v27.4S, v16.4S, v30.4S // .................................................................................................................................................................................e...................................................................................................... 
+ mls v21.4S, v23.4S, v29.4S // .............................................................................................................................e.......................................................................................................................................................... + sub v17.4S, v17.4S, v27.4S // ..................................................................................................................................................................................e..................................................................................................... + add v27.4S, v28.4S, v20.4S // ....................................................................................................................................e................................................................................................................................................... + sub v23.4S, v28.4S, v20.4S // ...................................................................................................................................e.................................................................................................................................................... + + // original source code + // ldr q8, [x1, #0] // ...................................e...............................................................................................................................|.......................................................................................................................................................e..................... 
+ // ldr q9, [x1, #(1*(512/8))] // ...............................e...................................................................................................................................|...................................................................................................................................................e......................... + // ldr q10, [x1, #(2*(512/8))] // ....................................e..............................................................................................................................|........................................................................................................................................................e.................... + // ldr q11, [x1, #(3*(512/8))] // .............................e.....................................................................................................................................|.................................................................................................................................................e........................... + // ldr q12, [x1, #(4*(512/8))] // e..................................................................................................................................................................|....................................................................................................................e........................................................ + // ldr q13, [x1, #(5*(512/8))] // ..e................................................................................................................................................................|......................................................................................................................e...................................................... 
+ // ldr q14, [x1, #(6*(512/8))] // .....e.............................................................................................................................................................|.........................................................................................................................e................................................... + // ldr q15, [x1, #(7*(512/8))] // .......e...........................................................................................................................................................|...........................................................................................................................e................................................. + // ldr q16, [x1, #(8*(512/8))] // .........e.........................................................................................................................................................|.............................................................................................................................e............................................... + // ldr q17, [x1, #(9*(512/8))] // ...............e...................................................................................................................................................|...................................................................................................................................e......................................... + // ldr q18, [x1, #(10*(512/8))] // ..............................e....................................................................................................................................|..................................................................................................................................................e.......................... 
+ // ldr q19, [x1, #(11*(512/8))] // ............................e......................................................................................................................................|................................................................................................................................................e............................ + // ldr q20, [x1, #(12*(512/8))] // .............................................................e.....................................................................................................|............................................................................................................................................................................. + // ldr q21, [x1, #(13*(512/8))] // .......................................................e...........................................................................................................|...........................................................................................................................................................................e. + // ldr q22, [x1, #(14*(512/8))] // ...................................................................e...............................................................................................|............................................................................................................................................................................. + // ldr q23, [x1, #(15*(512/8))] // ..................................................................e................................................................................................|............................................................................................................................................................................. 
+ // sub v24.4s, v8.4s, v9.4s // .................................................................e.................................................................................................|............................................................................................................................................................................. + // add v8.4s, v8.4s, v9.4s // ................................................................e..................................................................................................|............................................................................................................................................................................. + // mul v9.4s, v24.4s, v3.s[2] // .................................................................................e.................................................................................|............................................................................................................................................................................. + // sqrdmulh v24.4s, v24.4s, v3.s[3] // ..................................................................................e................................................................................|............................................................................................................................................................................. + // mls v9.4s, v24.4s, v29.4s // ...........................................................................................e.......................................................................|............................................................................................................................................................................. 
+ // sub v24.4s, v10.4s, v11.4s // ......................................................e............................................................................................................|..........................................................................................................................................................................e.. + // add v10.4s, v10.4s, v11.4s // ...................................................e...............................................................................................................|.......................................................................................................................................................................e..... + // mul v11.4s, v24.4s, v4.s[0] // ..............................................................e....................................................................................................|............................................................................................................................................................................. + // sqrdmulh v24.4s, v24.4s, v4.s[1] // ...............................................................e...................................................................................................|............................................................................................................................................................................. + // mls v11.4s, v24.4s, v29.4s // .........................................................................e.........................................................................................|............................................................................................................................................................................. 
+ // sub v24.4s, v12.4s, v13.4s // ...................e...............................................................................................................................................|.......................................................................................................................................e..................................... + // add v12.4s, v12.4s, v13.4s // ..................e................................................................................................................................................|......................................................................................................................................e...................................... + // mul v13.4s, v24.4s, v4.s[2] // ............................................e......................................................................................................................|................................................................................................................................................................e............ + // sqrdmulh v24.4s, v24.4s, v4.s[3] // .............................................e.....................................................................................................................|.................................................................................................................................................................e........... + // mls v13.4s, v24.4s, v29.4s // ..........................................................e........................................................................................................|............................................................................................................................................................................. 
+ // sub v24.4s, v14.4s, v15.4s // ......................e............................................................................................................................................|..........................................................................................................................................e.................................. + // add v14.4s, v14.4s, v15.4s // ...........................e.......................................................................................................................................|...............................................................................................................................................e............................. + // mul v15.4s, v24.4s, v5.s[0] // .....................................e.............................................................................................................................|.........................................................................................................................................................e................... + // sqrdmulh v24.4s, v24.4s, v5.s[1] // .................................e.................................................................................................................................|.....................................................................................................................................................e....................... + // mls v15.4s, v24.4s, v29.4s // ....................................................e..............................................................................................................|........................................................................................................................................................................e.... 
+ // sub v24.4s, v16.4s, v17.4s // ........................e..........................................................................................................................................|............................................................................................................................................e................................ + // add v16.4s, v16.4s, v17.4s // ..........................e........................................................................................................................................|..............................................................................................................................................e.............................. + // mul v17.4s, v24.4s, v5.s[2] // ........................................e..........................................................................................................................|............................................................................................................................................................e................ + // sqrdmulh v24.4s, v24.4s, v5.s[3] // ...........................................e.......................................................................................................................|...............................................................................................................................................................e............. + // mls v17.4s, v24.4s, v29.4s // ...........................................................e.......................................................................................................|............................................................................................................................................................................. 
+ // sub v24.4s, v18.4s, v19.4s // .................................................e.................................................................................................................|.....................................................................................................................................................................e....... + // add v18.4s, v18.4s, v19.4s // ...............................................e...................................................................................................................|...................................................................................................................................................................e......... + // mul v19.4s, v24.4s, v6.s[0] // .........................................................e.........................................................................................................|............................................................................................................................................................................. + // sqrdmulh v24.4s, v24.4s, v6.s[1] // ............................................................e......................................................................................................|............................................................................................................................................................................. + // mls v19.4s, v24.4s, v29.4s // ......................................................................e............................................................................................|............................................................................................................................................................................. 
+ // sub v24.4s, v20.4s, v21.4s // ...........................................................................e.......................................................................................|............................................................................................................................................................................. + // add v20.4s, v20.4s, v21.4s // ..........................................................................e........................................................................................|............................................................................................................................................................................. + // mul v21.4s, v24.4s, v6.s[2] // ...................................................................................e...............................................................................|............................................................................................................................................................................. + // sqrdmulh v24.4s, v24.4s, v6.s[3] // ....................................................................................e..............................................................................|............................................................................................................................................................................. + // mls v21.4s, v24.4s, v29.4s // .............................................................................................e.....................................................................|............................................................................................................................................................................. 
+ // sub v24.4s, v22.4s, v23.4s // ..............................................................................e....................................................................................|............................................................................................................................................................................. + // add v22.4s, v22.4s, v23.4s // .............................................................................e.....................................................................................|............................................................................................................................................................................. + // mul v23.4s, v24.4s, v7.s[0] // ...............................................................................................e...................................................................|............................................................................................................................................................................. + // sqrdmulh v24.4s, v24.4s, v7.s[1] // ................................................................................................e..................................................................|............................................................................................................................................................................. + // mls v23.4s, v24.4s, v29.4s // ......................................................................................................e............................................................|............................................................................................................................................................................. 
+ // sub v24.4s, v8.4s, v10.4s // ............................................................................e......................................................................................|............................................................................................................................................................................. + // add v8.4s, v8.4s, v10.4s // ........................................................................e..........................................................................................|............................................................................................................................................................................. + // mul v10.4s, v24.4s, v1.s[2] // .........................................................................................e.........................................................................|............................................................................................................................................................................. + // sqrdmulh v24.4s, v24.4s, v1.s[3] // ..........................................................................................e........................................................................|............................................................................................................................................................................. + // mls v10.4s, v24.4s, v29.4s // ..................................................................................................e................................................................|............................................................................................................................................................................. 
+ // sub v24.4s, v9.4s, v11.4s // ....................................................................................................e..............................................................|............................................................................................................................................................................. + // add v9.4s, v9.4s, v11.4s // ...................................................................................................e...............................................................|............................................................................................................................................................................. + // mul v11.4s, v24.4s, v1.s[2] // .......................................................................................................e...........................................................|............................................................................................................................................................................. + // sqrdmulh v24.4s, v24.4s, v1.s[3] // ........................................................................................................e..........................................................|............................................................................................................................................................................. + // mls v11.4s, v24.4s, v29.4s // ...........................................................................................................e.......................................................|............................................................................................................................................................................. 
+ // sub v24.4s, v12.4s, v14.4s // ..........................................e........................................................................................................................|..............................................................................................................................................................e.............. + // add v12.4s, v12.4s, v14.4s // .......................................e...........................................................................................................................|...........................................................................................................................................................e................. + // mul v14.4s, v24.4s, v2.s[0] // ..................................................e................................................................................................................|......................................................................................................................................................................e...... + // sqrdmulh v24.4s, v24.4s, v2.s[1] // .....................................................e.............................................................................................................|.........................................................................................................................................................................e... + // mls v14.4s, v24.4s, v29.4s // ....................................................................e..............................................................................................|............................................................................................................................................................................. 
+ // sub v24.4s, v13.4s, v15.4s // ..............................................................................................e....................................................................|............................................................................................................................................................................. + // add v13.4s, v13.4s, v15.4s // ............................................................................................e......................................................................|............................................................................................................................................................................. + // mul v15.4s, v24.4s, v2.s[0] // ............................................................................................................................e......................................|............................................................................................................................................................................. + // sqrdmulh v24.4s, v24.4s, v2.s[1] // ...........................................................................................................................e.......................................|............................................................................................................................................................................. + // mls v15.4s, v24.4s, v29.4s // ..................................................................................................................................e................................|............................................................................................................................................................................. 
+ // sub v24.4s, v16.4s, v18.4s // .....................................................................e.............................................................................................|............................................................................................................................................................................. + // add v16.4s, v16.4s, v18.4s // .......................................................................e...........................................................................................|............................................................................................................................................................................. + // mul v18.4s, v24.4s, v2.s[2] // ..........................................................................................................e........................................................|............................................................................................................................................................................. + // sqrdmulh v24.4s, v24.4s, v2.s[3] // .........................................................................................................e.........................................................|............................................................................................................................................................................. + // mls v18.4s, v24.4s, v29.4s // ............................................................................................................e......................................................|............................................................................................................................................................................. 
+ // sub v24.4s, v17.4s, v19.4s // .....................................................................................................................................e.............................|............................................................................................................................................................................. + // add v17.4s, v17.4s, v19.4s // .........................................................................................................................................e.........................|............................................................................................................................................................................. + // mul v19.4s, v24.4s, v2.s[2] // .................................................................................................................................................e.................|............................................................................................................................................................................. + // sqrdmulh v24.4s, v24.4s, v2.s[3] // ..................................................................................................................................................e................|............................................................................................................................................................................. + // mls v19.4s, v24.4s, v29.4s // .......................................................................................................................................................e...........|............................................................................................................................................................................. 
+ // sub v24.4s, v20.4s, v22.4s // ...............................................................................e...................................................................................|............................................................................................................................................................................. + // add v20.4s, v20.4s, v22.4s // ................................................................................e..................................................................................|............................................................................................................................................................................. + // mul v22.4s, v24.4s, v3.s[0] // .....................................................................................................e.............................................................|............................................................................................................................................................................. + // sqrdmulh v24.4s, v24.4s, v3.s[1] // .................................................................................................e.................................................................|............................................................................................................................................................................. + // mls v22.4s, v24.4s, v29.4s // ................................................................................................................e..................................................|............................................................................................................................................................................. 
+ // sub v24.4s, v21.4s, v23.4s // ..............................................................................................................e....................................................|............................................................................................................................................................................. + // add v21.4s, v21.4s, v23.4s // .............................................................................................................e.....................................................|............................................................................................................................................................................. + // mul v23.4s, v24.4s, v3.s[0] // .......................................................................................................................................e...........................|............................................................................................................................................................................. + // sqrdmulh v24.4s, v24.4s, v3.s[1] // ........................................................................................................................................e..........................|............................................................................................................................................................................. + // mls v23.4s, v24.4s, v29.4s // ...................................................................................................................................................e...............|............................................................................................................................................................................. 
+ // sub v24.4s, v8.4s, v12.4s // .....................................................................................e.............................................................................|............................................................................................................................................................................. + // add v8.4s, v8.4s, v12.4s // ......................................................................................e............................................................................|............................................................................................................................................................................. + // mul v12.4s, v24.4s, v0.s[2] // ....................................................................................................................................e..............................|............................................................................................................................................................................. + // sqrdmulh v24.4s, v24.4s, v0.s[3] // ...................................................................................................................................e...............................|............................................................................................................................................................................. + // mls v12.4s, v24.4s, v29.4s // ............................................................................................................................................e......................|............................................................................................................................................................................. 
+ // sub v24.4s, v9.4s, v13.4s // ......................................................................................................................e............................................|............................................................................................................................................................................. + // add v9.4s, v9.4s, v13.4s // .........................................................................................................................e.........................................|............................................................................................................................................................................. + // mul v13.4s, v24.4s, v0.s[2] // ....................................................................................................................................................e..............|............................................................................................................................................................................. + // sqrdmulh v24.4s, v24.4s, v0.s[3] // .....................................................................................................................................................e.............|............................................................................................................................................................................. + // mls v13.4s, v24.4s, v29.4s // ...........................................................................................................................................................e.......|............................................................................................................................................................................. 
+ // sub v24.4s, v10.4s, v14.4s // ...................................................................................................................e...............................................|............................................................................................................................................................................. + // add v10.4s, v10.4s, v14.4s // ..................................................................................................................e................................................|............................................................................................................................................................................. + // mul v14.4s, v24.4s, v0.s[2] // ..........................................................................................................................e........................................|............................................................................................................................................................................. + // sqrdmulh v24.4s, v24.4s, v0.s[3] // .............................................................................................................................e.....................................|............................................................................................................................................................................. + // mls v14.4s, v24.4s, v29.4s // .................................................................................................................................e.................................|............................................................................................................................................................................. 
+ // sub v24.4s, v11.4s, v15.4s // .............................................................................................................................................e.....................|............................................................................................................................................................................. + // add v11.4s, v11.4s, v15.4s // ..............................................................................................................................................e....................|............................................................................................................................................................................. + // mul v15.4s, v24.4s, v0.s[2] // ...................................................................................................................................................................|....*........................................................................................................................................................................ + // sqrdmulh v24.4s, v24.4s, v0.s[3] // ...................................................................................................................................................................|.....*....................................................................................................................................................................... + // mls v15.4s, v24.4s, v29.4s // ...................................................................................................................................................................|...............*............................................................................................................................................................. 
+ // sub v24.4s, v16.4s, v20.4s // .......................................................................................e...........................................................................|............................................................................................................................................................................. + // add v16.4s, v16.4s, v20.4s // ........................................................................................e..........................................................................|............................................................................................................................................................................. + // mul v20.4s, v24.4s, v1.s[0] // ...............................................................................................................................e...................................|............................................................................................................................................................................. + // sqrdmulh v24.4s, v24.4s, v1.s[1] // ................................................................................................................................e..................................|............................................................................................................................................................................. + // mls v20.4s, v24.4s, v29.4s // ......................................................................................................................................e............................|............................................................................................................................................................................. 
+ // sub v24.4s, v17.4s, v21.4s // ................................................................................................................................................e..................|............................................................................................................................................................................. + // add v17.4s, v17.4s, v21.4s // ...............................................................................................................................................e...................|............................................................................................................................................................................. + // mul v21.4s, v24.4s, v1.s[0] // .........................................................................................................................................................e.........|............................................................................................................................................................................. + // sqrdmulh v24.4s, v24.4s, v1.s[1] // ........................................................................................................................................................e..........|............................................................................................................................................................................. + // mls v21.4s, v24.4s, v29.4s // ...............................................................................................................................................................e...|............................................................................................................................................................................. 
+ // sub v24.4s, v18.4s, v22.4s // ........................................................................................................................e..........................................|............................................................................................................................................................................. + // add v18.4s, v18.4s, v22.4s // .......................................................................................................................e...........................................|............................................................................................................................................................................. + // mul v22.4s, v24.4s, v1.s[0] // ..........................................................................................................................................e........................|............................................................................................................................................................................. + // sqrdmulh v24.4s, v24.4s, v1.s[1] // ...........................................................................................................................................e.......................|............................................................................................................................................................................. + // mls v22.4s, v24.4s, v29.4s // ......................................................................................................................................................e............|............................................................................................................................................................................. 
+ // sub v24.4s, v19.4s, v23.4s // ..................................................................................................................................................................e|............................................................................................................................................................................. + // add v19.4s, v19.4s, v23.4s // .................................................................................................................................................................e.|............................................................................................................................................................................. + // mul v23.4s, v24.4s, v1.s[0] // ...................................................................................................................................................................|.............*............................................................................................................................................................... + // sqrdmulh v24.4s, v24.4s, v1.s[1] // ...................................................................................................................................................................|................*............................................................................................................................................................ + // mls v23.4s, v24.4s, v29.4s // ...................................................................................................................................................................|.........................*................................................................................................................................................... 
+ // sub v24.4s, v8.4s, v16.4s // ...............................................................................................................e...................................................|............................................................................................................................................................................. + // add v8.4s, v8.4s, v16.4s // .................................................................................................................e.................................................|............................................................................................................................................................................. + // mul v16.4s, v24.4s, v0.s[0] // ....................................................................................................................e..............................................|............................................................................................................................................................................. + // sqrdmulh v24.4s, v24.4s, v0.s[1] // .....................................................................................................................e.............................................|............................................................................................................................................................................. + // mls v16.4s, v24.4s, v29.4s // ..............................................................................................................................e....................................|............................................................................................................................................................................. 
+ // sub v24.4s, v9.4s, v17.4s // ...................................................................................................................................................................|*............................................................................................................................................................................ + // add v9.4s, v9.4s, v17.4s // ...................................................................................................................................................................|...*......................................................................................................................................................................... + // mul v17.4s, v24.4s, v0.s[0] // ...................................................................................................................................................................|........................*.................................................................................................................................................... + // sqrdmulh v24.4s, v24.4s, v0.s[1] // ...................................................................................................................................................................|.......................*..................................................................................................................................................... + // mls v17.4s, v24.4s, v29.4s // ...................................................................................................................................................................|............................*................................................................................................................................................ 
+ // sub v24.4s, v10.4s, v18.4s // .............................................................................................................................................................e.....|............................................................................................................................................................................. + // add v10.4s, v10.4s, v18.4s // ............................................................................................................................................................e......|............................................................................................................................................................................. + // mul v18.4s, v24.4s, v0.s[0] // ...................................................................................................................................................................|.....................*....................................................................................................................................................... + // sqrdmulh v24.4s, v24.4s, v0.s[1] // ...................................................................................................................................................................|......................*...................................................................................................................................................... + // mls v18.4s, v24.4s, v29.4s // ...................................................................................................................................................................|...............................*............................................................................................................................................. 
+ // sub v24.4s, v11.4s, v19.4s // ...................................................................................................................................................................|.................*........................................................................................................................................................... + // add v11.4s, v11.4s, v19.4s // ...................................................................................................................................................................|..................*.......................................................................................................................................................... + // mul v19.4s, v24.4s, v0.s[0] // ...................................................................................................................................................................|....................................*........................................................................................................................................ + // sqrdmulh v24.4s, v24.4s, v0.s[1] // ...................................................................................................................................................................|.....................................*....................................................................................................................................... + // mls v19.4s, v24.4s, v29.4s // ...................................................................................................................................................................|............................................*................................................................................................................................ 
+ // sub v24.4s, v12.4s, v20.4s // ...................................................................................................................................................................|.*........................................................................................................................................................................... + // add v12.4s, v12.4s, v20.4s // ...................................................................................................................................................................|..*.......................................................................................................................................................................... + // mul v20.4s, v24.4s, v0.s[0] // ...................................................................................................................................................................|........*.................................................................................................................................................................... + // sqrdmulh v24.4s, v24.4s, v0.s[1] // ...................................................................................................................................................................|.........*................................................................................................................................................................... + // mls v20.4s, v24.4s, v29.4s // ...................................................................................................................................................................|...................*......................................................................................................................................................... 
+ // sub v24.4s, v13.4s, v21.4s // ...................................................................................................................................................................|......*...................................................................................................................................................................... + // add v13.4s, v13.4s, v21.4s // ...................................................................................................................................................................|.......*..................................................................................................................................................................... + // mul v21.4s, v24.4s, v0.s[0] // ...................................................................................................................................................................|....................*........................................................................................................................................................ + // sqrdmulh v24.4s, v24.4s, v0.s[1] // ...................................................................................................................................................................|..............*.............................................................................................................................................................. + // mls v21.4s, v24.4s, v29.4s // ...................................................................................................................................................................|...........................*................................................................................................................................................. 
+ // sub v24.4s, v14.4s, v22.4s // ...................................................................................................................................................................|..........*.................................................................................................................................................................. + // add v14.4s, v14.4s, v22.4s // ...................................................................................................................................................................|...........*................................................................................................................................................................. + // mul v22.4s, v24.4s, v0.s[0] // ...................................................................................................................................................................|..........................*.................................................................................................................................................. + // sqrdmulh v24.4s, v24.4s, v0.s[1] // ...................................................................................................................................................................|................................*............................................................................................................................................ + // mls v22.4s, v24.4s, v29.4s // ...................................................................................................................................................................|..........................................*.................................................................................................................................. 
+ // sub v24.4s, v15.4s, v23.4s // ...................................................................................................................................................................|..................................*.......................................................................................................................................... + // add v15.4s, v15.4s, v23.4s // ...................................................................................................................................................................|.................................*........................................................................................................................................... + // mul v23.4s, v24.4s, v0.s[0] // ...................................................................................................................................................................|......................................*...................................................................................................................................... + // sqrdmulh v24.4s, v24.4s, v0.s[1] // ...................................................................................................................................................................|.......................................*..................................................................................................................................... + // mls v23.4s, v24.4s, v29.4s // ...................................................................................................................................................................|...............................................*............................................................................................................................. 
+ // cmge v27.4s, v31.4s, v16.4s // ..........................................................................................................................................................e........|............................................................................................................................................................................. + // cmge v28.4s, v16.4s, v30.4s // ..............................................................................................................................................................e....|............................................................................................................................................................................. + // sub v28.4s, v27.4s, v28.4s // ................................................................................................................................................................e..|............................................................................................................................................................................. + // mls v16.4s, v28.4s, v29.4s // ...................................................................................................................................................................*............................................................................................................................................................................. + // cmge v27.4s, v31.4s, v17.4s // ...................................................................................................................................................................|.................................................*........................................................................................................................... 
+ // cmge v28.4s, v17.4s, v30.4s // ...................................................................................................................................................................|..................................................*.......................................................................................................................... + // sub v28.4s, v27.4s, v28.4s // ...................................................................................................................................................................|.....................................................*....................................................................................................................... + // mls v17.4s, v28.4s, v29.4s // ...................................................................................................................................................................|...........................................................*................................................................................................................. + // cmge v27.4s, v31.4s, v18.4s // ...................................................................................................................................................................|...........................................*................................................................................................................................. + // cmge v28.4s, v18.4s, v30.4s // ...................................................................................................................................................................|.........................................*................................................................................................................................... 
+ // sub v28.4s, v27.4s, v28.4s // ...................................................................................................................................................................|................................................*............................................................................................................................ + // mls v18.4s, v28.4s, v29.4s // ...................................................................................................................................................................|.......................................................*..................................................................................................................... + // cmge v27.4s, v31.4s, v19.4s // ...................................................................................................................................................................|..........................................................*.................................................................................................................. + // cmge v28.4s, v19.4s, v30.4s // ...................................................................................................................................................................|......................................................*...................................................................................................................... + // sub v28.4s, v27.4s, v28.4s // ...................................................................................................................................................................|.............................................................*............................................................................................................... 
+ // mls v19.4s, v28.4s, v29.4s // ...................................................................................................................................................................|.........................................................................*................................................................................................... + // cmge v27.4s, v31.4s, v20.4s // ...................................................................................................................................................................|.............................*............................................................................................................................................... + // cmge v28.4s, v20.4s, v30.4s // ...................................................................................................................................................................|..............................*.............................................................................................................................................. + // sub v28.4s, v27.4s, v28.4s // ...................................................................................................................................................................|...................................*......................................................................................................................................... + // mls v20.4s, v28.4s, v29.4s // ...................................................................................................................................................................|........................................*.................................................................................................................................... 
+ // cmge v27.4s, v31.4s, v21.4s // ...................................................................................................................................................................|.......................................................................................*..................................................................................... + // cmge v28.4s, v21.4s, v30.4s // ...................................................................................................................................................................|..............................................................................*.............................................................................................. + // sub v28.4s, v27.4s, v28.4s // ...................................................................................................................................................................|...........................................................................................*................................................................................. + // mls v21.4s, v28.4s, v29.4s // ...................................................................................................................................................................|......................................................................................................*...................................................................... + // cmge v27.4s, v31.4s, v22.4s // ...................................................................................................................................................................|....................................................................*........................................................................................................ 
+ // cmge v28.4s, v22.4s, v30.4s // ...................................................................................................................................................................|...................................................................*......................................................................................................... + // sub v28.4s, v27.4s, v28.4s // ...................................................................................................................................................................|............................................................................*................................................................................................ + // mls v22.4s, v28.4s, v29.4s // ...................................................................................................................................................................|....................................................................................*........................................................................................ + // cmge v27.4s, v31.4s, v23.4s // ...................................................................................................................................................................|........................................................*.................................................................................................................... + // cmge v28.4s, v23.4s, v30.4s // ...................................................................................................................................................................|.........................................................*................................................................................................................... 
+ // sub v28.4s, v27.4s, v28.4s // ...................................................................................................................................................................|...............................................................*............................................................................................................. + // mls v23.4s, v28.4s, v29.4s // ...................................................................................................................................................................|.....................................................................*....................................................................................................... + // str q16, [x1, #(8*(512/8))] // ...................................................................................................................................................................|............*................................................................................................................................................................ + // str q17, [x1, #(9*(512/8))] // ...................................................................................................................................................................|.......................................................................*..................................................................................................... + // str q18, [x1, #(10*(512/8))] // ...................................................................................................................................................................|.................................................................*........................................................................................................... 
+ // str q19, [x1, #(11*(512/8))] // ...................................................................................................................................................................|...................................................................................*......................................................................................... + // str q20, [x1, #(12*(512/8))] // ...................................................................................................................................................................|...................................................*......................................................................................................................... + // str q21, [x1, #(13*(512/8))] // ...................................................................................................................................................................|.................................................................................................................*........................................................... + // str q22, [x1, #(14*(512/8))] // ...................................................................................................................................................................|.................................................................................................*........................................................................... + // str q23, [x1, #(15*(512/8))] // ...................................................................................................................................................................|..................................................................................*.......................................................................................... 
+ // mul v16.4s, v8.4s, v25.4s // ...................................................................................................................................................................|.............................................*............................................................................................................................... + // sqrdmulh v8.4s, v8.4s, v26.4s // ...................................................................................................................................................................|..............................................*.............................................................................................................................. + // mls v16.4s, v8.4s, v29.4s // ...................................................................................................................................................................|....................................................*........................................................................................................................ + // mul v17.4s, v9.4s, v25.4s // ...................................................................................................................................................................|........................................................................*.................................................................................................... + // sqrdmulh v9.4s, v9.4s, v26.4s // ...................................................................................................................................................................|..........................................................................*.................................................................................................. 
+ // mls v17.4s, v9.4s, v29.4s // ...................................................................................................................................................................|....................................................................................................*........................................................................ + // mul v18.4s, v10.4s, v25.4s // ...................................................................................................................................................................|.............................................................................*............................................................................................... + // sqrdmulh v10.4s, v10.4s, v26.4s // ...................................................................................................................................................................|......................................................................*...................................................................................................... + // mls v18.4s, v10.4s, v29.4s // ...................................................................................................................................................................|.............................................................................................*............................................................................... + // mul v19.4s, v11.4s, v25.4s // ...................................................................................................................................................................|........................................................................................*.................................................................................... 
+ // sqrdmulh v11.4s, v11.4s, v26.4s // ...................................................................................................................................................................|...............................................................................*............................................................................................. + // mls v19.4s, v11.4s, v29.4s // ...................................................................................................................................................................|.......................................................................................................*..................................................................... + // mul v20.4s, v12.4s, v25.4s // ...................................................................................................................................................................|................................................................*............................................................................................................ + // sqrdmulh v12.4s, v12.4s, v26.4s // ...................................................................................................................................................................|..................................................................*.......................................................................................................... + // mls v20.4s, v12.4s, v29.4s // ...................................................................................................................................................................|.................................................................................*........................................................................................... 
+ // mul v21.4s, v13.4s, v25.4s // ...................................................................................................................................................................|............................................................*................................................................................................................ + // sqrdmulh v13.4s, v13.4s, v26.4s // ...................................................................................................................................................................|..............................................................*.............................................................................................................. + // mls v21.4s, v13.4s, v29.4s // ...................................................................................................................................................................|...........................................................................*................................................................................................. + // mul v22.4s, v14.4s, v25.4s // ...................................................................................................................................................................|..................................................................................................*.......................................................................... + // sqrdmulh v14.4s, v14.4s, v26.4s // ...................................................................................................................................................................|..............................................................................................*.............................................................................. 
+ // mls v22.4s, v14.4s, v29.4s // ...................................................................................................................................................................|..............................................................................................................*.............................................................. + // mul v23.4s, v15.4s, v25.4s // ...................................................................................................................................................................|.....................................................................................*....................................................................................... + // sqrdmulh v15.4s, v15.4s, v26.4s // ...................................................................................................................................................................|................................................................................*............................................................................................ + // mls v23.4s, v15.4s, v29.4s // ...................................................................................................................................................................|...................................................................................................*......................................................................... + // cmge v27.4s, v31.4s, v16.4s // ...................................................................................................................................................................|..........................................................................................*.................................................................................. 
+ // cmge v28.4s, v16.4s, v30.4s // ...................................................................................................................................................................|......................................................................................*...................................................................................... + // sub v28.4s, v27.4s, v28.4s // ...................................................................................................................................................................|...............................................................................................*............................................................................. + // mls v16.4s, v28.4s, v29.4s // ...................................................................................................................................................................|..........................................................................................................*.................................................................. + // cmge v27.4s, v31.4s, v17.4s // ...................................................................................................................................................................|...................................................................................................................*......................................................... + // cmge v28.4s, v17.4s, v30.4s // .*.................................................................................................................................................................|.....................................................................................................................*....................................................... 
+ // sub v28.4s, v27.4s, v28.4s // ...........*.......................................................................................................................................................|...............................................................................................................................*............................................. + // mls v17.4s, v28.4s, v29.4s // .................*.................................................................................................................................................|.....................................................................................................................................*....................................... + // cmge v27.4s, v31.4s, v18.4s // ...................................................................................................................................................................|...........................................................................................................*................................................................. + // cmge v28.4s, v18.4s, v30.4s // ...................................................................................................................................................................|........................................................................................................*.................................................................... + // sub v28.4s, v27.4s, v28.4s // ...................................................................................................................................................................|................................................................................................................*............................................................ 
+ // mls v18.4s, v28.4s, v29.4s // ..........*........................................................................................................................................................|..............................................................................................................................*.............................................. + // cmge v27.4s, v31.4s, v19.4s // .............*.....................................................................................................................................................|.................................................................................................................................*........................................... + // cmge v28.4s, v19.4s, v30.4s // ........*..........................................................................................................................................................|............................................................................................................................*................................................ + // sub v28.4s, v27.4s, v28.4s // ....................*..............................................................................................................................................|........................................................................................................................................*.................................... + // mls v19.4s, v28.4s, v29.4s // ................................*..................................................................................................................................|....................................................................................................................................................*........................ 
+ // cmge v27.4s, v31.4s, v20.4s // ...................................................................................................................................................................|............................................................................................*................................................................................ + // cmge v28.4s, v20.4s, v30.4s // ...................................................................................................................................................................|.........................................................................................*................................................................................... + // sub v28.4s, v27.4s, v28.4s // ...................................................................................................................................................................|.....................................................................................................*....................................................................... + // mls v20.4s, v28.4s, v29.4s // ...*...............................................................................................................................................................|.......................................................................................................................*..................................................... + // cmge v27.4s, v31.4s, v21.4s // ...................................................................................................................................................................|.........................................................................................................*................................................................... 
+ // cmge v28.4s, v21.4s, v30.4s // ...................................................................................................................................................................|................................................................................................*............................................................................ + // sub v28.4s, v27.4s, v28.4s // ...................................................................................................................................................................|...............................................................................................................*............................................................. + // mls v21.4s, v28.4s, v29.4s // ................*..................................................................................................................................................|....................................................................................................................................*........................................ + // cmge v27.4s, v31.4s, v22.4s // ............*......................................................................................................................................................|................................................................................................................................*............................................ + // cmge v28.4s, v22.4s, v30.4s // ......*............................................................................................................................................................|..........................................................................................................................*.................................................. 
+ // sub v28.4s, v27.4s, v28.4s // .......................*...........................................................................................................................................|...........................................................................................................................................*................................. + // mls v22.4s, v28.4s, v29.4s // ..............................................*....................................................................................................................|..................................................................................................................................................................*.......... + // cmge v27.4s, v31.4s, v23.4s // ...................................................................................................................................................................|.............................................................................................................*............................................................... + // cmge v28.4s, v23.4s, v30.4s // ...................................................................................................................................................................|............................................................................................................*................................................................ + // sub v28.4s, v27.4s, v28.4s // ....*..............................................................................................................................................................|........................................................................................................................*.................................................... 
+ // mls v23.4s, v28.4s, v29.4s // ..............*....................................................................................................................................................|..................................................................................................................................*.......................................... + // str q16, [x1], #(16) // ...................................................................................................................................................................|..................................................................................................................*.......................................................... + // str q17, [x1, #(-16 + 1*(512/8))] // ..................................*................................................................................................................................|......................................................................................................................................................*...................... + // str q18, [x1, #(-16 + 2*(512/8))] // .....................*.............................................................................................................................................|.........................................................................................................................................*................................... + // str q19, [x1, #(-16 + 3*(512/8))] // ................................................*..................................................................................................................|....................................................................................................................................................................*........ 
+ // str q20, [x1, #(-16 + 4*(512/8))] // .........................*.........................................................................................................................................|.............................................................................................................................................*............................... + // str q21, [x1, #(-16 + 5*(512/8))] // ......................................*............................................................................................................................|..........................................................................................................................................................*.................. + // str q22, [x1, #(-16 + 6*(512/8))] // ........................................................*..........................................................................................................|............................................................................................................................................................................* + // str q23, [x1, #(-16 + 7*(512/8))] // .........................................*.........................................................................................................................|.............................................................................................................................................................*............... + + sub count, count, #1 + cbnz count, layer1234_start + mls v16.4S, v17.4S, v29.4S // ...................................................................................................................................................................................*.................................................................................................... 
+ sub v17.4S, v22.4S, v18.4S // .............................................................................................................................................*.......................................................................................................................................... + sub v28.4S, v9.4S, v12.4S // ............................................................................................................................................................*........................................................................................................................... + add v12.4S, v9.4S, v12.4S // .............................................................................................................................................................*.......................................................................................................................... + add v9.4S, v22.4S, v18.4S // ..............................................................................................................................................*......................................................................................................................................... + mul v18.4S, v19.4S, v0.S[0] // ....................................................................................................................................................*................................................................................................................................... + sqrdmulh v19.4S, v19.4S, v0.S[1] // .....................................................................................................................................................*.................................................................................................................................. 
+ sub v20.4S, v11.4S, v27.4S // .......................................................................................................................................................*................................................................................................................................ + sub v22.4S, v24.4S, v14.4S // ......................................................................................................................................................................*................................................................................................................. + add v14.4S, v24.4S, v14.4S // .......................................................................................................................................................................*................................................................................................................ + sqrdmulh v24.4S, v17.4S, v0.S[1] // ................................................................................................................................................*....................................................................................................................................... + mul v17.4S, v17.4S, v0.S[0] // ...............................................................................................................................................*........................................................................................................................................ + str q16, [x1, #512] // ................................................................................................................................................................................................................*....................................................................... 
+ mul v16.4S, v13.4S, v0.S[2] // .................................................................................................................*...................................................................................................................................................................... + add v11.4S, v11.4S, v27.4S // ........................................................................................................................................................*............................................................................................................................... + sqrdmulh v13.4S, v13.4S, v0.S[3] // ..................................................................................................................*..................................................................................................................................................................... + sqrdmulh v27.4S, v20.4S, v0.S[1] // ..........................................................................................................................................................*............................................................................................................................. + mls v18.4S, v19.4S, v29.4S // ......................................................................................................................................................*................................................................................................................................. + mul v19.4S, v20.4S, v0.S[0] // .........................................................................................................................................................*.............................................................................................................................. 
+ sqrdmulh v20.4S, v23.4S, v1.S[1] // ......................................................................................................................................*................................................................................................................................................. + mul v23.4S, v23.4S, v1.S[0] // .....................................................................................................................................*.................................................................................................................................................. + mls v17.4S, v24.4S, v29.4S // .................................................................................................................................................*...................................................................................................................................... + sqrdmulh v24.4S, v22.4S, v0.S[1] // .........................................................................................................................................................................*.............................................................................................................. + mul v22.4S, v22.4S, v0.S[0] // ........................................................................................................................................................................*............................................................................................................... + mls v16.4S, v13.4S, v29.4S // ...................................................................................................................*.................................................................................................................................................................... 
+ add v13.4S, v15.4S, v21.4S // ..................................................................................................................................................................*..................................................................................................................... + sub v21.4S, v15.4S, v21.4S // .................................................................................................................................................................*...................................................................................................................... + mls v19.4S, v27.4S, v29.4S // ...........................................................................................................................................................*............................................................................................................................ + mls v23.4S, v20.4S, v29.4S // .......................................................................................................................................*................................................................................................................................................ + mls v22.4S, v24.4S, v29.4S // ..........................................................................................................................................................................*............................................................................................................. + sqrdmulh v24.4S, v28.4S, v0.S[1] // ...............................................................................................................................................................*........................................................................................................................ 
+ mul v28.4S, v28.4S, v0.S[0] // ..............................................................................................................................................................*......................................................................................................................... + mul v27.4S, v21.4S, v0.S[0] // ...................................................................................................................................................................*.................................................................................................................... + sqrdmulh v20.4S, v21.4S, v0.S[1] // ....................................................................................................................................................................*................................................................................................................... + cmge v15.4S, v17.4S, v30.4S // .....................................................................................................................................................................................*.................................................................................................. + cmge v21.4S, v31.4S, v17.4S // ....................................................................................................................................................................................*................................................................................................... + mls v28.4S, v24.4S, v29.4S // ................................................................................................................................................................*....................................................................................................................... 
+ sub v24.4S, v21.4S, v15.4S // ......................................................................................................................................................................................*................................................................................................. + add v15.4S, v16.4S, v23.4S // ............................................................................................................................................................................*........................................................................................................... + sub v23.4S, v16.4S, v23.4S // ...........................................................................................................................................................................*............................................................................................................ + mls v27.4S, v20.4S, v29.4S // .....................................................................................................................................................................*.................................................................................................................. + cmge v21.4S, v31.4S, v18.4S // ........................................................................................................................................................................................*............................................................................................... + cmge v20.4S, v18.4S, v30.4S // .........................................................................................................................................................................................*.............................................................................................. 
+ sqrdmulh v16.4S, v23.4S, v0.S[1] // ..............................................................................................................................................................................*......................................................................................................... + mls v17.4S, v24.4S, v29.4S // .......................................................................................................................................................................................*................................................................................................ + mul v23.4S, v23.4S, v0.S[0] // .............................................................................................................................................................................*.......................................................................................................... + sub v20.4S, v21.4S, v20.4S // ..........................................................................................................................................................................................*............................................................................................. + cmge v24.4S, v28.4S, v30.4S // .................................................................................................................................................................................................*...................................................................................... + cmge v21.4S, v31.4S, v28.4S // ................................................................................................................................................................................................*....................................................................................... 
+ mls v18.4S, v20.4S, v29.4S // ...........................................................................................................................................................................................*............................................................................................ + str q17, [x1, #576] // .................................................................................................................................................................................................................*...................................................................... + sub v24.4S, v21.4S, v24.4S // ..................................................................................................................................................................................................*..................................................................................... + cmge v20.4S, v31.4S, v22.4S // ........................................................................................................................................................................................................*............................................................................... + cmge v17.4S, v22.4S, v30.4S // .........................................................................................................................................................................................................*.............................................................................. + mls v23.4S, v16.4S, v29.4S // ...............................................................................................................................................................................*........................................................................................................ 
+ mul v16.4S, v8.4S, v25.4S // ........................................................................................................................................................................................................................*............................................................... + sub v17.4S, v20.4S, v17.4S // ..........................................................................................................................................................................................................*............................................................................. + str q18, [x1, #640] // ..................................................................................................................................................................................................................*..................................................................... + sqrdmulh v21.4S, v8.4S, v26.4S // .........................................................................................................................................................................................................................*.............................................................. + cmge v18.4S, v31.4S, v19.4S // ............................................................................................................................................................................................*........................................................................................... + cmge v20.4S, v19.4S, v30.4S // .............................................................................................................................................................................................*.......................................................................................... 
+ cmge v8.4S, v23.4S, v30.4S // .............................................................................................................................................................................................................*.......................................................................... + mls v28.4S, v24.4S, v29.4S // ...................................................................................................................................................................................................*.................................................................................... + mls v22.4S, v17.4S, v29.4S // ...........................................................................................................................................................................................................*............................................................................ + sub v20.4S, v18.4S, v20.4S // ..............................................................................................................................................................................................*......................................................................................... + mul v18.4S, v10.4S, v25.4S // ..............................................................................................................................................................................................................................*......................................................... + cmge v24.4S, v31.4S, v23.4S // ............................................................................................................................................................................................................*........................................................................... 
+ mls v16.4S, v21.4S, v29.4S // ..........................................................................................................................................................................................................................*............................................................. + mul v17.4S, v9.4S, v25.4S // ...........................................................................................................................................................................................................................*............................................................ + mul v21.4S, v13.4S, v25.4S // .......................................................................................................................................................................................................................................*................................................ + sqrdmulh v10.4S, v10.4S, v26.4S // ...............................................................................................................................................................................................................................*........................................................ + str q28, [x1, #768] // ....................................................................................................................................................................................................................*................................................................... + sub v28.4S, v24.4S, v8.4S // ..............................................................................................................................................................................................................*......................................................................... 
+ sqrdmulh v24.4S, v13.4S, v26.4S // ........................................................................................................................................................................................................................................*............................................... + sqrdmulh v13.4S, v9.4S, v26.4S // ............................................................................................................................................................................................................................*........................................................... + mls v19.4S, v20.4S, v29.4S // ...............................................................................................................................................................................................*........................................................................................ + mul v20.4S, v12.4S, v25.4S // ....................................................................................................................................................................................................................................*................................................... + cmge v8.4S, v27.4S, v30.4S // .....................................................................................................................................................................................................*.................................................................................. + str q22, [x1, #896] // ......................................................................................................................................................................................................................*................................................................. 
+ sqrdmulh v9.4S, v14.4S, v26.4S // ...........................................................................................................................................................................................................................................*............................................ + mul v22.4S, v14.4S, v25.4S // ..........................................................................................................................................................................................................................................*............................................. + mls v23.4S, v28.4S, v29.4S // ...............................................................................................................................................................................................................*........................................................................ + mls v18.4S, v10.4S, v29.4S // ................................................................................................................................................................................................................................*....................................................... + sqrdmulh v14.4S, v11.4S, v26.4S // ..................................................................................................................................................................................................................................*..................................................... + cmge v10.4S, v31.4S, v16.4S // ................................................................................................................................................................................................................................................*....................................... 
+ sqrdmulh v28.4S, v15.4S, v26.4S // ..............................................................................................................................................................................................................................................*......................................... + str q19, [x1, #704] // ...................................................................................................................................................................................................................*.................................................................... + mul v19.4S, v11.4S, v25.4S // .................................................................................................................................................................................................................................*...................................................... + str q23, [x1, #960] // .......................................................................................................................................................................................................................*................................................................ + mul v23.4S, v15.4S, v25.4S // .............................................................................................................................................................................................................................................*.......................................... + cmge v11.4S, v16.4S, v30.4S // .................................................................................................................................................................................................................................................*...................................... 
+ cmge v15.4S, v31.4S, v27.4S // ....................................................................................................................................................................................................*................................................................................... + sqrdmulh v12.4S, v12.4S, v26.4S // .....................................................................................................................................................................................................................................*.................................................. + mls v17.4S, v13.4S, v29.4S // .............................................................................................................................................................................................................................*.......................................................... + mls v22.4S, v9.4S, v29.4S // ............................................................................................................................................................................................................................................*........................................... + cmge v9.4S, v18.4S, v30.4S // .........................................................................................................................................................................................................................................................*.............................. + cmge v13.4S, v31.4S, v18.4S // ........................................................................................................................................................................................................................................................*............................... 
+ mls v21.4S, v24.4S, v29.4S // .........................................................................................................................................................................................................................................*.............................................. + sub v10.4S, v10.4S, v11.4S // ..................................................................................................................................................................................................................................................*..................................... + mls v19.4S, v14.4S, v29.4S // ...................................................................................................................................................................................................................................*.................................................... + sub v15.4S, v15.4S, v8.4S // ......................................................................................................................................................................................................*................................................................................. + mls v23.4S, v28.4S, v29.4S // ...............................................................................................................................................................................................................................................*........................................ + sub v24.4S, v13.4S, v9.4S // ..........................................................................................................................................................................................................................................................*............................. 
+ cmge v14.4S, v31.4S, v17.4S // ....................................................................................................................................................................................................................................................*................................... + cmge v9.4S, v17.4S, v30.4S // .....................................................................................................................................................................................................................................................*.................................. + mls v20.4S, v12.4S, v29.4S // ......................................................................................................................................................................................................................................*................................................. + mls v16.4S, v10.4S, v29.4S // ...................................................................................................................................................................................................................................................*.................................... + cmge v13.4S, v21.4S, v30.4S // .....................................................................................................................................................................................................................................................................*.................. + mls v27.4S, v15.4S, v29.4S // .......................................................................................................................................................................................................*................................................................................ 
+ cmge v15.4S, v31.4S, v21.4S // ....................................................................................................................................................................................................................................................................*................... + cmge v11.4S, v19.4S, v30.4S // .............................................................................................................................................................................................................................................................*.......................... + cmge v12.4S, v31.4S, v19.4S // ............................................................................................................................................................................................................................................................*........................... + str q27, [x1, #832] // .....................................................................................................................................................................................................................*.................................................................. + str q16, [x1], #(16) // ................................................................................................................................................................................................................................................................................*....... + cmge v27.4S, v31.4S, v23.4S // ............................................................................................................................................................................................................................................................................*........... 
+ cmge v16.4S, v23.4S, v30.4S // .............................................................................................................................................................................................................................................................................*.......... + cmge v10.4S, v31.4S, v22.4S // ........................................................................................................................................................................................................................................................................*............... + cmge v8.4S, v31.4S, v20.4S // ................................................................................................................................................................................................................................................................*....................... + sub v15.4S, v15.4S, v13.4S // ......................................................................................................................................................................................................................................................................*................. + sub v11.4S, v12.4S, v11.4S // ..............................................................................................................................................................................................................................................................*......................... + cmge v28.4S, v22.4S, v30.4S // .........................................................................................................................................................................................................................................................................*.............. 
+ cmge v13.4S, v20.4S, v30.4S // .................................................................................................................................................................................................................................................................*...................... + sub v14.4S, v14.4S, v9.4S // ......................................................................................................................................................................................................................................................*................................. + mls v18.4S, v24.4S, v29.4S // ...........................................................................................................................................................................................................................................................*............................ + sub v9.4S, v27.4S, v16.4S // ..............................................................................................................................................................................................................................................................................*......... + mls v21.4S, v15.4S, v29.4S // .......................................................................................................................................................................................................................................................................*................ + mls v19.4S, v11.4S, v29.4S // ...............................................................................................................................................................................................................................................................*........................ 
+ sub v16.4S, v10.4S, v28.4S // ..........................................................................................................................................................................................................................................................................*............. + sub v27.4S, v8.4S, v13.4S // ..................................................................................................................................................................................................................................................................*..................... + mls v23.4S, v9.4S, v29.4S // ...............................................................................................................................................................................................................................................................................*........ + mls v17.4S, v14.4S, v29.4S // .......................................................................................................................................................................................................................................................*................................ + mls v22.4S, v16.4S, v29.4S // ...........................................................................................................................................................................................................................................................................*............ + mls v20.4S, v27.4S, v29.4S // ...................................................................................................................................................................................................................................................................*.................... 
+ str q18, [x1, #112] // ..................................................................................................................................................................................................................................................................................*..... + str q21, [x1, #304] // .....................................................................................................................................................................................................................................................................................*.. + str q19, [x1, #176] // ...................................................................................................................................................................................................................................................................................*.... + str q23, [x1, #432] // .......................................................................................................................................................................................................................................................................................* + str q17, [x1, #48] // .................................................................................................................................................................................................................................................................................*...... + str q22, [x1, #368] // ......................................................................................................................................................................................................................................................................................*. 
+ str q20, [x1, #240] // ....................................................................................................................................................................................................................................................................................*... + + pop_stack + ret \ No newline at end of file diff --git a/tests/ntt_dilithium/manual/intt_dilithium_1234_5678_manual_ld4_opt_m1_icestorm.s b/tests/ntt_dilithium/manual/intt_dilithium_1234_5678_manual_ld4_opt_m1_icestorm.s new file mode 100644 index 0000000..53d25aa --- /dev/null +++ b/tests/ntt_dilithium/manual/intt_dilithium_1234_5678_manual_ld4_opt_m1_icestorm.s @@ -0,0 +1,1710 @@ +/// +/// Copyright (c) 2022 Arm Limited +/// Copyright (c) 2022 Hanno Becker +/// Copyright (c) 2023 Amin Abdulrahman, Matthias Kannwischer +/// SPDX-License-Identifier: MIT +/// +/// Permission is hereby granted, free of charge, to any person obtaining a copy +/// of this software and associated documentation files (the "Software"), to deal +/// in the Software without restriction, including without limitation the rights +/// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +/// copies of the Software, and to permit persons to whom the Software is +/// furnished to do so, subject to the following conditions: +/// +/// The above copyright notice and this permission notice shall be included in all +/// copies or substantial portions of the Software. +/// +/// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +/// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +/// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +/// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +/// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +/// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +/// SOFTWARE. 
+/// + +// Needed to provide ASM_LOAD directive +#include <hal_env.h> + +// NOTE +// We use a lot of trivial macros to simplify the parsing burden for Slothy +// The macros are not unfolded by Slothy and thus interpreted as instructions, +// which are easier to parse due to e.g. the lack of size specifiers and simpler +// syntax for pre and post increment for loads and stores. + +// Eventually, NeLight should include a proper parser for AArch64, +// but for initial investigations, the below is enough. + +.macro ldr_vo vec, base, offset // load the 128-bit register qform_vec from base at an immediate offset + ldr qform_\vec, [\base, #\offset] +.endm +.macro ldr_vi vec, base, inc // load qform_vec from base, then post-increment base by inc + ldr qform_\vec, [\base], #\inc +.endm +.macro str_vo vec, base, offset // store qform_vec to base at an immediate offset + str qform_\vec, [\base, #\offset] +.endm +.macro str_vi vec, base, inc // store qform_vec to base, then post-increment base by inc + str qform_\vec, [\base], #\inc +.endm +.macro vsub d,a,b // d = a - b on four 32-bit lanes + sub \d\().4s, \a\().4s, \b\().4s +.endm +.macro vadd d,a,b // d = sum of a and b on four 32-bit lanes + add \d\().4s, \a\().4s, \b\().4s +.endm +.macro vqrdmulh d,a,b // d = sqrdmulh(a, b) on four 32-bit lanes + sqrdmulh \d\().4s, \a\().4s, \b\().4s +.endm +.macro vmul d,a,b // d = a * b on four 32-bit lanes + mul \d\().4s, \a\().4s, \b\().4s +.endm +.macro vmls d,a,b // d = d - a * b on four 32-bit lanes + mls \d\().4s, \a\().4s, \b\().4s +.endm +.macro vqrdmulhq d,a,b,i // like vqrdmulh, with the single lane b.s[i] as second factor + sqrdmulh \d\().4s, \a\().4s, \b\().s[\i] +.endm +.macro vmulq d,a,b,i // like vmul, with the single lane b.s[i] as second factor + mul \d\().4s, \a\().4s, \b\().s[\i] +.endm +.macro vmlsq d,a,b,i // like vmls, with the single lane b.s[i] as second factor + mls \d\().4s, \a\().4s, \b\().s[\i] +.endm +.macro trn1_d d,a,b // trn1 on 64-bit lanes + trn1 \d\().2d, \a\().2d, \b\().2d +.endm +.macro trn2_d d,a,b // trn2 on 64-bit lanes + trn2 \d\().2d, \a\().2d, \b\().2d +.endm +.macro trn1_s d,a,b // trn1 on 32-bit lanes + trn1 \d\().4s, \a\().4s, \b\().4s +.endm +.macro trn2_s d,a,b // trn2 on 32-bit lanes + trn2 \d\().4s, \a\().4s, \b\().4s +.endm + +.macro mulmodq dst, src, const, idx0, idx1 // dst = src * const.s[idx0] - sqrdmulh(src, const.s[idx1]) * modulus; src is overwritten + vmulq \dst, \src, \const, \idx0 + vqrdmulhq \src, \src, \const, \idx1 + vmls \dst, \src, modulus +.endm + +.macro mulmod dst, src, const, const_twisted // dst = src * const - sqrdmulh(src, const_twisted) * modulus; src is overwritten + vmul \dst, \src, \const + vqrdmulh \src, \src, \const_twisted + vmls \dst, \src, modulus +.endm + +.macro montg_reduce a // a = a - srshr(a, 23) * modulus; uses the tmp register alias as scratch + srshr tmp.4S, \a\().4S, #23 + vmls \a, tmp, modulus +.endm + +.macro canonical_reduce a, modulus_half,
neg_modulus_half, tmp1, tmp2 // adds modulus to lanes where a is at or below neg_modulus_half and subtracts it from lanes where a is at or above modulus_half; clobbers tmp1 and tmp2 + cmge \tmp1\().4s, \neg_modulus_half\().4s, \a\().4s + cmge \tmp2\().4s, \a\().4s, \modulus_half\().4s + sub \tmp2\().4s, \tmp1\().4s, \tmp2\().4s + vmls \a, \tmp2, modulus +.endm + +.macro gs_butterfly a, b, root, idx0, idx1 // a becomes the sum of a and b; b becomes mulmodq of the old difference a - b with root.s[idx0] and root.s[idx1]; clobbers tmp + vsub tmp, \a, \b + vadd \a, \a, \b + mulmodq \b, tmp, \root, \idx0, \idx1 +.endm + +.macro mulmod_v dst, src, const, const_twisted // NOTE(review): body is identical to mulmod; looks like a duplicate + vmul \dst, \src, \const + vqrdmulh \src, \src, \const_twisted + vmls \dst, \src, modulus +.endm + +.macro gs_butterfly_v a, b, root, root_twisted // as gs_butterfly, but with full-vector root and root_twisted operands; expands mulmod, not mulmod_v; clobbers tmp + vsub tmp, \a, \b + vadd \a, \a, \b + mulmod \b, tmp, \root, \root_twisted +.endm + +.macro mul_ninv dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, src0, src1, src2, src3, src4, src5, src6, src7 // dstK = mulmod of srcK with ninv and ninv_tw for K = 0..7; every srcK is overwritten + mulmod \dst0, \src0, ninv, ninv_tw + mulmod \dst1, \src1, ninv, ninv_tw + mulmod \dst2, \src2, ninv, ninv_tw + mulmod \dst3, \src3, ninv, ninv_tw + mulmod \dst4, \src4, ninv, ninv_tw + mulmod \dst5, \src5, ninv, ninv_tw + mulmod \dst6, \src6, ninv, ninv_tw + mulmod \dst7, \src7, ninv, ninv_tw +.endm + +.macro load_roots_1234 r_ptr // loads eight consecutive 16-byte vectors into root0..root7 and leaves r_ptr advanced by 128 bytes; NOTE(review): only root0..root3 have .req aliases near the function entry - confirm before use + ldr_vi root0, \r_ptr, (8*16) + ldr_vo root1, \r_ptr, (-8*16 + 1*16) + ldr_vo root2, \r_ptr, (-8*16 + 2*16) + ldr_vo root3, \r_ptr, (-8*16 + 3*16) + ldr_vo root4, \r_ptr, (-8*16 + 4*16) + ldr_vo root5, \r_ptr, (-8*16 + 5*16) + ldr_vo root6, \r_ptr, (-8*16 + 6*16) + ldr_vo root7, \r_ptr, (-8*16 + 7*16) +.endm + +.macro load_next_roots_56 root0, r_ptr0 // loads one 16-byte vector and advances r_ptr0 by 16 bytes + ldr_vi \root0, \r_ptr0, 16 +.endm + +.macro load_next_roots_6 root0, r_ptr0 // loads a full 16-byte vector but advances r_ptr0 by only 8 bytes + ldr_vi \root0, \r_ptr0, 8 +.endm + +.macro load_next_roots_78 root0, root0_tw, root1, root1_tw, root2, root2_tw, r_ptr1 // loads six consecutive 16-byte vectors in argument order and leaves r_ptr1 advanced by 96 bytes + ldr_vi \root0, \r_ptr1, (6*16) + ldr_vo \root0_tw, \r_ptr1, (-6*16 + 1*16) + ldr_vo \root1, \r_ptr1, (-6*16 + 2*16) + ldr_vo \root1_tw, \r_ptr1, (-6*16 + 3*16) + ldr_vo \root2, \r_ptr1, (-6*16 + 4*16) + ldr_vo \root2_tw, \r_ptr1, (-6*16 + 5*16) +.endm + +.macro transpose4 data // 4x4 transpose of 32-bit lanes across the registers named data0..data3 (data is a name prefix); clobbers t0..t3 + trn1_s t0, \data\()0, \data\()1 + trn2_s t1, \data\()0, \data\()1 + trn1_s t2, \data\()2,
\data\()3 + trn2_s t3, \data\()2, \data\()3 + + trn2_d \data\()2, t0, t2 + trn2_d \data\()3, t1, t3 + trn1_d \data\()0, t0, t2 + trn1_d \data\()1, t1, t3 +.endm + +.macro save_gprs // slothy:no-unfold + sub sp, sp, #(16*6) // reserve 96 bytes for x19 through x29 + stp x19, x20, [sp, #16*0] + stp x19, x20, [sp, #16*0] // NOTE(review): repeats the store on the previous line; looks redundant + stp x21, x22, [sp, #16*1] + stp x23, x24, [sp, #16*2] + stp x25, x26, [sp, #16*3] + stp x27, x28, [sp, #16*4] + str x29, [sp, #16*5] +.endm + +.macro restore_gprs // slothy:no-unfold + ldp x19, x20, [sp, #16*0] + ldp x21, x22, [sp, #16*1] + ldp x23, x24, [sp, #16*2] + ldp x25, x26, [sp, #16*3] + ldp x27, x28, [sp, #16*4] + ldr x29, [sp, #16*5] + add sp, sp, #(16*6) // release the 96 bytes reserved by save_gprs +.endm + +.macro save_vregs // slothy:no-unfold + sub sp, sp, #(16*4) // reserve 64 bytes; only the 64-bit d views of registers 8 to 15 are stored + stp d8, d9, [sp, #16*0] + stp d10, d11, [sp, #16*1] + stp d12, d13, [sp, #16*2] + stp d14, d15, [sp, #16*3] +.endm + +.macro restore_vregs // slothy:no-unfold + ldp d8, d9, [sp, #16*0] + ldp d10, d11, [sp, #16*1] + ldp d12, d13, [sp, #16*2] + ldp d14, d15, [sp, #16*3] + add sp, sp, #(16*4) // release the 64 bytes reserved by save_vregs +.endm + +#define STACK_SIZE 16 +#define STACK0 0 + +.macro restore a, loc // slothy:no-unfold + ldr \a, [sp, #\loc\()] // read register a back from stack offset loc +.endm +.macro save loc, a // slothy:no-unfold + str \a, [sp, #\loc\()] // spill register a to stack offset loc +.endm +.macro push_stack // slothy:no-unfold + save_gprs + save_vregs + sub sp, sp, #STACK_SIZE // extra scratch area on top of the saved registers, addressed through save and restore +.endm + +.macro pop_stack // slothy:no-unfold + add sp, sp, #STACK_SIZE // drop the scratch area first, mirroring push_stack in reverse + restore_vregs + restore_gprs +.endm + +.data +.p2align 4 +roots: +#include "intt_dilithium_1234_5678_twiddles.s" +.text + + .global intt_dilithium_1234_5678_manual_ld4_opt_m1_icestorm + .global _intt_dilithium_1234_5678_manual_ld4_opt_m1_icestorm + +.p2align 4 +modulus_addr: .quad 8380417 // 8380417 equals 2^23 minus 2^13 plus 1 +ninv_addr: .quad 16382 +ninv_tw_addr: .quad 4197891 +intt_dilithium_1234_5678_manual_ld4_opt_m1_icestorm: +_intt_dilithium_1234_5678_manual_ld4_opt_m1_icestorm: + push_stack + + inp .req x0 + in .req x1 + count .req x2 + r_ptr0 .req x3 + r_ptr1 .req x4 + xtmp .req x5 + + data0 .req v8 + data1 .req v9 + data2 .req v10 + data3 .req
v11 + data4 .req v12 + data5 .req v13 + data6 .req v14 + data7 .req v15 + data8 .req v16 + data9 .req v17 + data10 .req v18 + data11 .req v19 + data12 .req v20 + data13 .req v21 + data14 .req v22 + data15 .req v23 + + qform_data0 .req q8 + qform_data1 .req q9 + qform_data2 .req q10 + qform_data3 .req q11 + qform_data4 .req q12 + qform_data5 .req q13 + qform_data6 .req q14 + qform_data7 .req q15 + qform_data8 .req q16 + qform_data9 .req q17 + qform_data10 .req q18 + qform_data11 .req q19 + qform_data12 .req q20 + qform_data13 .req q21 + qform_data14 .req q22 + qform_data15 .req q23 + + root0 .req v0 + root1 .req v1 + root2 .req v2 + root3 .req v3 + root0_tw .req v4 + root1_tw .req v5 + root2_tw .req v6 + root3_tw .req v7 + + + qform_root0 .req q0 + qform_root1 .req q1 + qform_root2 .req q2 + qform_root3 .req q3 + qform_root0_tw .req q4 + qform_root1_tw .req q5 + qform_root2_tw .req q6 + qform_root3_tw .req q7 + + + tmp .req v24 + qform_tmp .req q24 + t0 .req v25 + t1 .req v26 + t2 .req v27 + t3 .req v28 + + modulus .req v29 + + ASM_LOAD(r_ptr0, roots) + ASM_LOAD(r_ptr1, roots_l45) + + ASM_LOAD(xtmp, modulus_addr) + ld1r {modulus.4s}, [xtmp] + + save STACK0, inp + + mov count, #16 + + .p2align 2 + // gap // ................................................. + ldr q6, [x0, #32] // ..*.............................................. + ldr q18, [x0, #48] // ...*............................................. + // gap // ................................................. + ldr q11, [x0, #16] // .*............................................... + ldr q5, [x0, #0] // *................................................ + // gap // ................................................. + // gap // ................................................. + ldr q4, [x3, #64] // ..........*...................................... + // gap // ................................................. + // gap // ................................................. 
+ // gap // ................................................. + ldr q1, [x3, #80] // ....*............................................ + // gap // ................................................. + // gap // ................................................. + // gap // ................................................. + trn2 v12.4S, v6.4S, v18.4S // .......*......................................... + trn1 v7.4S, v6.4S, v18.4S // .........*....................................... + ldr q28, [x3, #48] // ........*........................................ + // gap // ................................................. + trn2 v3.4S, v5.4S, v11.4S // ......*.......................................... + trn1 v16.4S, v5.4S, v11.4S // .....*........................................... + ldr q5, [x3, #32] // ...........*..................................... + // gap // ................................................. + ldr q30, [x4], #8 // ...................................*............. + // gap // ................................................. + // gap // ................................................. + // gap // ................................................. + trn2 v27.2D, v16.2D, v7.2D // ............*.................................... + ldr q19, [x3, #16] // ....................*............................ + trn2 v14.2D, v3.2D, v12.2D // .............*................................... + // gap // ................................................. + trn1 v22.2D, v3.2D, v12.2D // ...............*................................. + trn1 v10.2D, v16.2D, v7.2D // ..............*.................................. + ldr q12, [x3], #(6*16) // .................*............................... + // gap // ................................................. + sub v2.4S, v27.4S, v14.4S // ................*................................ + // gap // ................................................. + // gap // ................................................. 
+ // gap // ................................................. + sub v17.4S, v10.4S, v22.4S // ...................*............................. + // gap // ................................................. + // gap // ................................................. + // gap // ................................................. + mul v21.4S, v2.4S, v4.4S // .......................*......................... + sqrdmulh v11.4S, v2.4S, v1.4S // ......................*.......................... + // gap // ................................................. + // gap // ................................................. + sqrdmulh v24.4S, v17.4S, v28.4S // ........................*........................ + mul v28.4S, v17.4S, v5.4S // .........................*....................... + ldr q17, [x4], #16 // ....................................*............ + // gap // ................................................. + add v8.4S, v27.4S, v14.4S // ..................*.............................. + // gap // ................................................. + // gap // ................................................. + // gap // ................................................. + add v9.4S, v10.4S, v22.4S // .....................*........................... + mls v21.4S, v11.4S, v29.4S // ..........................*...................... + // gap // ................................................. + // gap // ................................................. + mls v28.4S, v24.4S, v29.4S // ...........................*..................... + // gap // ................................................. + // gap // ................................................. + // gap // ................................................. + sub v23.4S, v9.4S, v8.4S // ............................*.................... + // gap // ................................................. + // gap // ................................................. 
+ // gap // ................................................. + add v9.4S, v9.4S, v8.4S // .....................................*........... + // gap // ................................................. + // gap // ................................................. + // gap // ................................................. + sub v26.4S, v28.4S, v21.4S // .............................*................... + sqrdmulh v6.4S, v23.4S, v19.4S // ..............................*.................. + // gap // ................................................. + // gap // ................................................. + mul v5.4S, v23.4S, v12.4S // ...............................*................. + // gap // ................................................. + // gap // ................................................. + // gap // ................................................. + sqrdmulh v2.4S, v26.4S, v19.4S // .................................*............... + mul v20.4S, v26.4S, v12.4S // ................................*................ + // gap // ................................................. + // gap // ................................................. + add v13.4S, v28.4S, v21.4S // ..................................*.............. + // gap // ................................................. + // gap // ................................................. + // gap // ................................................. + mls v5.4S, v6.4S, v29.4S // ......................................*.......... + // gap // ................................................. + // gap // ................................................. + // gap // ................................................. + mls v20.4S, v2.4S, v29.4S // .......................................*......... + // gap // ................................................. + // gap // ................................................. + // gap // ................................................. 
+ trn1 v21.4S, v9.4S, v13.4S // ........................................*........ + // gap // ................................................. + // gap // ................................................. + // gap // ................................................. + trn2 v2.4S, v9.4S, v13.4S // .........................................*....... + // gap // ................................................. + // gap // ................................................. + // gap // ................................................. + trn2 v22.4S, v5.4S, v20.4S // ...........................................*..... + trn1 v14.4S, v5.4S, v20.4S // ..........................................*...... + // gap // ................................................. + // gap // ................................................. + // gap // ................................................. + // gap // ................................................. + // gap // ................................................. + // gap // ................................................. + trn1 v18.2D, v2.2D, v22.2D // ............................................*.... + trn1 v31.2D, v21.2D, v14.2D // .............................................*... + // gap // ................................................. + // gap // ................................................. + trn2 v5.2D, v2.2D, v22.2D // ...............................................*. + // gap // ................................................. + // gap // ................................................. + // gap // ................................................. + trn2 v22.2D, v21.2D, v14.2D // ..............................................*.. + sub v25.4S, v31.4S, v18.4S // ................................................* + // gap // ................................................. + // gap // ................................................. 
+ + // original source code + // ldr q14, [x0, #0] // ...*............................................. + // ldr q15, [x0, #16] // ..*.............................................. + // ldr q18, [x0, #32] // *................................................ + // ldr q13, [x0, #48] // .*............................................... + // ldr q24, [x3, #80] // .....*........................................... + // trn1 v12.4S, v14.4S, v15.4S // ..........*...................................... + // trn2 v31.4S, v14.4S, v15.4S // .........*....................................... + // trn2 v15.4S, v18.4S, v13.4S // ......*.......................................... + // ldr q28, [x3, #48] // ........*........................................ + // trn1 v14.4S, v18.4S, v13.4S // .......*......................................... + // ldr q21, [x3, #64] // ....*............................................ + // ldr q23, [x3, #32] // ...........*..................................... + // trn2 v25.2D, v12.2D, v14.2D // .............*................................... + // trn2 v27.2D, v31.2D, v15.2D // ...............*................................. + // trn1 v19.2D, v12.2D, v14.2D // .................*............................... + // trn1 v1.2D, v31.2D, v15.2D // ................*................................ + // sub v11.4S, v25.4S, v27.4S // ...................*............................. + // ldr q20, [x3], #(6*16) // ..................*.............................. + // add v3.4S, v25.4S, v27.4S // ..........................*...................... + // sub v18.4S, v19.4S, v1.4S // ....................*............................ + // ldr q6, [x3, #-80] // ..............*.................................. + // add v7.4S, v19.4S, v1.4S // ...........................*..................... + // sqrdmulh v31.4S, v11.4S, v24.4S // ......................*.......................... + // mul v19.4S, v11.4S, v21.4S // .....................*........................... 
+ // sqrdmulh v28.4S, v18.4S, v28.4S // .......................*......................... + // mul v14.4S, v18.4S, v23.4S // ........................*........................ + // mls v19.4S, v31.4S, v29.4S // ............................*.................... + // mls v14.4S, v28.4S, v29.4S // .............................*................... + // sub v16.4S, v7.4S, v3.4S // ..............................*.................. + // sub v31.4S, v14.4S, v19.4S // ................................*................ + // sqrdmulh v25.4S, v16.4S, v6.4S // .................................*............... + // mul v5.4S, v16.4S, v20.4S // ..................................*.............. + // mul v8.4S, v31.4S, v20.4S // ....................................*............ + // sqrdmulh v1.4S, v31.4S, v6.4S // ...................................*............. + // add v31.4S, v14.4S, v19.4S // .....................................*........... + // ldr q30, [x4], #8 // ............*.................................... + // ldr q17, [x4], #16 // .........................*....................... + // add v6.4S, v7.4S, v3.4S // ...............................*................. + // mls v5.4S, v25.4S, v29.4S // ......................................*.......... + // mls v8.4S, v1.4S, v29.4S // .......................................*......... + // trn1 v7.4S, v6.4S, v31.4S // ........................................*........ + // trn2 v4.4S, v6.4S, v31.4S // .........................................*....... + // trn1 v9.4S, v5.4S, v8.4S // ...........................................*..... + // trn2 v14.4S, v5.4S, v8.4S // ..........................................*...... + // trn1 v18.2D, v4.2D, v14.2D // ............................................*.... + // trn1 v31.2D, v7.2D, v9.2D // .............................................*... + // trn2 v22.2D, v7.2D, v9.2D // ...............................................*. 
+ // trn2 v5.2D, v4.2D, v14.2D // ..............................................*.. + // sub v25.4S, v31.4S, v18.4S // ................................................* + + sub count, count, #1 +layer5678_start: + ldr q14, [x0, #64] // e........................................................................... + add v10.4S, v31.4S, v18.4S // .................................................*.......................... + ldr q15, [x0, #80] // .e.......................................................................... + sub v0.4S, v22.4S, v5.4S // .....................................................*...................... + ldr q18, [x0, #96] // ..e......................................................................... + sqrdmulh v2.4S, v25.4S, v17.S[1] // ...................................................*........................ + add v5.4S, v22.4S, v5.4S // ......................................................*..................... + ldr q13, [x0, #112] // ...e........................................................................ + mul v8.4S, v25.4S, v17.S[0] // ..................................................*......................... + // gap // ............................................................................ + // gap // ............................................................................ + sqrdmulh v19.4S, v0.4S, v17.S[3] // ........................................................*................... + mul v9.4S, v0.4S, v17.S[2] // .......................................................*.................... + add v0.4S, v10.4S, v5.4S // ...........................................................*................ + // gap // ............................................................................ + ldr q24, [x3, #80] // .................e.......................................................... + // gap // ............................................................................ 
+ // gap // ............................................................................ + trn1 v12.4S, v14.4S, v15.4S // ....e....................................................................... + trn2 v31.4S, v14.4S, v15.4S // .....e...................................................................... + trn2 v15.4S, v18.4S, v13.4S // .......e.................................................................... + // gap // ............................................................................ + ldr q28, [x3, #48] // ...............e............................................................ + trn1 v14.4S, v18.4S, v13.4S // ......e..................................................................... + mls v9.4S, v19.4S, v29.4S // .........................................................*.................. + ldr q21, [x3, #64] // ................e........................................................... + ldr q23, [x3, #32] // ..............e............................................................. + mls v8.4S, v2.4S, v29.4S // ....................................................*....................... + // gap // ............................................................................ + trn2 v25.2D, v12.2D, v14.2D // ........e................................................................... + // gap // ............................................................................ + trn2 v27.2D, v31.2D, v15.2D // .........e.................................................................. + // gap // ............................................................................ + trn1 v19.2D, v12.2D, v14.2D // ..........e................................................................. + trn1 v1.2D, v31.2D, v15.2D // ...........e................................................................ + // gap // ............................................................................ 
+ sub v11.4S, v25.4S, v27.4S // .......................e.................................................... + // gap // ............................................................................ + ldr q20, [x3], #(6*16) // ............e............................................................... + add v3.4S, v25.4S, v27.4S // ........................e................................................... + sub v18.4S, v19.4S, v1.4S // ..................e......................................................... + ldr q6, [x3, #-80] // .............e.............................................................. + add v7.4S, v19.4S, v1.4S // ...................e........................................................ + // gap // ............................................................................ + // gap // ............................................................................ + sqrdmulh v31.4S, v11.4S, v24.4S // ..........................e................................................. + mul v19.4S, v11.4S, v21.4S // .........................e.................................................. + // gap // ............................................................................ + sqrdmulh v28.4S, v18.4S, v28.4S // .....................e...................................................... + mul v14.4S, v18.4S, v23.4S // ....................e....................................................... + // gap // ............................................................................ + // gap // ............................................................................ + sub v2.4S, v8.4S, v9.4S // ...............................................................*............ + // gap // ............................................................................ + // gap // ............................................................................ 
+ sub v15.4S, v10.4S, v5.4S // ..........................................................*................. + // gap // ............................................................................ + // gap // ............................................................................ + mls v19.4S, v31.4S, v29.4S // ...........................e................................................ + add v12.4S, v8.4S, v9.4S // ................................................................*........... + mls v14.4S, v28.4S, v29.4S // ......................e..................................................... + // gap // ............................................................................ + // gap // ............................................................................ + sub v16.4S, v7.4S, v3.4S // ............................e............................................... + srshr v9.4S, v12.4S, #23 // ......................................................................*..... + // gap // ............................................................................ + // gap // ............................................................................ + sqrdmulh v27.4S, v15.4S, v30.S[1] // .............................................................*.............. + // gap // ............................................................................ + // gap // ............................................................................ + srshr v18.4S, v0.4S, #23 // ....................................................................*....... + mul v13.4S, v2.4S, v30.S[0] // .................................................................*.......... + // gap // ............................................................................ + // gap // ............................................................................ + sub v31.4S, v14.4S, v19.4S // .................................e.......................................... 
+ sqrdmulh v25.4S, v16.4S, v6.4S // ...............................e............................................ + mls v12.4S, v9.4S, v29.4S // .......................................................................*.... + // gap // ............................................................................ + // gap // ............................................................................ + mul v5.4S, v16.4S, v20.4S // ..............................e............................................. + mul v8.4S, v31.4S, v20.4S // ...................................e........................................ + // gap // ............................................................................ + // gap // ............................................................................ + sqrdmulh v1.4S, v31.4S, v6.4S // ....................................e....................................... + add v31.4S, v14.4S, v19.4S // ..................................e......................................... + // gap // ............................................................................ + // gap // ............................................................................ + sqrdmulh v2.4S, v2.4S, v30.S[1] // ..................................................................*......... + mul v19.4S, v15.4S, v30.S[0] // ............................................................*............... + ldr q30, [x4], #8 // ..............................................e............................. + ldr q17, [x4], #16 // ...............................................e............................ + add v6.4S, v7.4S, v3.4S // .............................e.............................................. + mls v5.4S, v25.4S, v29.4S // ................................e........................................... + // gap // ............................................................................ 
+ // gap // ............................................................................ + mls v8.4S, v1.4S, v29.4S // .....................................e...................................... + // gap // ............................................................................ + mls v0.4S, v18.4S, v29.4S // .....................................................................*...... + // gap // ............................................................................ + mls v13.4S, v2.4S, v29.4S // ...................................................................*........ + trn1 v7.4S, v6.4S, v31.4S // ......................................e..................................... + trn2 v4.4S, v6.4S, v31.4S // .......................................e.................................... + // gap // ............................................................................ + // gap // ............................................................................ + trn1 v9.4S, v5.4S, v8.4S // ........................................e................................... + trn2 v14.4S, v5.4S, v8.4S // .........................................e.................................. + // gap // ............................................................................ + // gap // ............................................................................ + str q0, [x0], #(16*4) // ........................................................................*... + mls v19.4S, v27.4S, v29.4S // ..............................................................*............. + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + trn1 v18.2D, v4.2D, v14.2D // .............................................e.............................. 
+ trn1 v31.2D, v7.2D, v9.2D // ............................................e............................... + str q12, [x0, #-48] // .........................................................................*.. + trn2 v22.2D, v7.2D, v9.2D // ..........................................e................................. + trn2 v5.2D, v4.2D, v14.2D // ...........................................e................................ + str q13, [x0, #-16] // ...........................................................................* + // gap // ............................................................................ + sub v25.4S, v31.4S, v18.4S // ................................................e........................... + str q19, [x0, #-32] // ..........................................................................*. + // gap // ............................................................................ + // gap // ............................................................................ + + // original source code + // ldr q8, [x0, #(16*0)] // e...........................................................................e........................................................................... + // ldr q9, [x0, #(16*1)] // ..e.........................................................................|.e......................................................................... + // ldr q10, [x0, #(16*2)] // ....e.......................................................................|...e....................................................................... + // ldr q11, [x0, #(16*3)] // .......e....................................................................|......e.................................................................... + // trn1 v25.4s, v8.4s, v9.4s // .............e..............................................................|............e.............................................................. 
+ // trn2 v26.4s, v8.4s, v9.4s // ..............e.............................................................|.............e............................................................. + // trn1 v27.4s, v10.4s, v11.4s // .................e..........................................................|................e.......................................................... + // trn2 v28.4s, v10.4s, v11.4s // ...............e............................................................|..............e............................................................ + // trn2 v10.2d, v25.2d, v27.2d // ......................e.....................................................|.....................e..................................................... + // trn2 v11.2d, v26.2d, v28.2d // .......................e....................................................|......................e.................................................... + // trn1 v8.2d, v25.2d, v27.2d // ........................e...................................................|.......................e................................................... + // trn1 v9.2d, v26.2d, v28.2d // .........................e..................................................|........................e.................................................. + // ldr q0, [x3], #(6*16) // ...........................e................................................|..........................e................................................ + // ldr q4, [x3, #(-6*16 + 1*16)] // ..............................e.............................................|.............................e............................................. + // ldr q1, [x3, #(-6*16 + 2*16)] // ....................e.......................................................|...................e....................................................... 
+ // ldr q5, [x3, #(-6*16 + 3*16)] // ................e...........................................................|...............e........................................................... + // ldr q2, [x3, #(-6*16 + 4*16)] // ...................e........................................................|..................e........................................................ + // ldr q6, [x3, #(-6*16 + 5*16)] // ............e...............................................................|...........e............................................................... + // sub v24.4s, v8.4s, v9.4s // .............................e..............................................|............................e.............................................. + // add v8.4s, v8.4s, v9.4s // ...............................e............................................|..............................e............................................ + // mul v9.4s, v24.4s, v1.4s // ...................................e........................................|..................................e........................................ + // sqrdmulh v24.4s, v24.4s, v5.4s // ..................................e.........................................|.................................e......................................... + // mls v9.4s, v24.4s, v29.4s // ........................................e...................................|.......................................e................................... + // sub v24.4s, v10.4s, v11.4s // ..........................e.................................................|.........................e................................................. + // add v10.4s, v10.4s, v11.4s // ............................e...............................................|...........................e............................................... 
+ // mul v11.4s, v24.4s, v2.4s // .................................e..........................................|................................e.......................................... + // sqrdmulh v24.4s, v24.4s, v6.4s // ................................e...........................................|...............................e........................................... + // mls v11.4s, v24.4s, v29.4s // ......................................e.....................................|.....................................e..................................... + // sub v24.4s, v8.4s, v10.4s // .........................................e..................................|........................................e.................................. + // add v8.4s, v8.4s, v10.4s // .........................................................e..................|........................................................e.................. + // mul v10.4s, v24.4s, v0.4s // .................................................e..........................|................................................e.......................... + // sqrdmulh v24.4s, v24.4s, v4.4s // ...............................................e............................|..............................................e............................ + // mls v10.4s, v24.4s, v29.4s // ..........................................................e.................|.........................................................e................. + // sub v24.4s, v9.4s, v11.4s // ..............................................e.............................|.............................................e............................. + // add v9.4s, v9.4s, v11.4s // ....................................................e.......................|...................................................e....................... 
+ // mul v11.4s, v24.4s, v0.4s // ..................................................e.........................|.................................................e......................... + // sqrdmulh v24.4s, v24.4s, v4.4s // ...................................................e........................|..................................................e........................ + // mls v11.4s, v24.4s, v29.4s // ...........................................................e................|..........................................................e................ + // trn1 v25.4s, v8.4s, v9.4s // ..............................................................e.............|.............................................................e............. + // trn2 v26.4s, v8.4s, v9.4s // ...............................................................e............|..............................................................e............ + // trn1 v27.4s, v10.4s, v11.4s // ................................................................e...........|...............................................................e........... + // trn2 v28.4s, v10.4s, v11.4s // .................................................................e..........|................................................................e.......... + // trn2 v10.2d, v25.2d, v27.2d // .......................................................................e....|......................................................................e.... + // trn2 v11.2d, v26.2d, v28.2d // ........................................................................e...|.......................................................................e... + // trn1 v8.2d, v25.2d, v27.2d // .....................................................................e......|....................................................................e...... 
+ // trn1 v9.2d, v26.2d, v28.2d // ....................................................................e.......|...................................................................e....... + // ldr q1, [x4], #8 // .......................................................e....................|......................................................e.................... + // ldr q0, [x4], #16 // ........................................................e...................|.......................................................e................... + // sub v24.4s, v8.4s, v9.4s // ..........................................................................e.|.........................................................................e. + // add v8.4s, v8.4s, v9.4s // .*..........................................................................|*.......................................................................... + // mul v9.4s, v24.4s, v0.s[0] // ........*...................................................................|.......*................................................................... + // sqrdmulh v24.4s, v24.4s, v0.s[1] // .....*......................................................................|....*...................................................................... + // mls v9.4s, v24.4s, v29.4s // .....................*......................................................|....................*...................................................... + // sub v24.4s, v10.4s, v11.4s // ...*........................................................................|..*........................................................................ + // add v10.4s, v10.4s, v11.4s // ......*.....................................................................|.....*..................................................................... 
+ // mul v11.4s, v24.4s, v0.s[2] // ..........*.................................................................|.........*................................................................. + // sqrdmulh v24.4s, v24.4s, v0.s[3] // .........*..................................................................|........*.................................................................. + // mls v11.4s, v24.4s, v29.4s // ..................*.........................................................|.................*......................................................... + // sub v24.4s, v8.4s, v10.4s // .....................................*......................................|....................................*...................................... + // add v8.4s, v8.4s, v10.4s // ...........*................................................................|..........*................................................................ + // mul v10.4s, v24.4s, v1.s[0] // ......................................................*.....................|.....................................................*..................... + // sqrdmulh v24.4s, v24.4s, v1.s[1] // ...........................................*................................|..........................................*................................ + // mls v10.4s, v24.4s, v29.4s // ...................................................................*........|..................................................................*........ + // sub v24.4s, v9.4s, v11.4s // ....................................*.......................................|...................................*....................................... + // add v9.4s, v9.4s, v11.4s // .......................................*....................................|......................................*.................................... 
+ // mul v11.4s, v24.4s, v1.s[0] // .............................................*..............................|............................................*.............................. + // sqrdmulh v24.4s, v24.4s, v1.s[1] // .....................................................*......................|....................................................*...................... + // mls v11.4s, v24.4s, v29.4s // .............................................................*..............|............................................................*.............. + // srshr v24.4S, v8.4S, #23 // ............................................*...............................|...........................................*............................... + // mls v8.4s, v24.4s, v29.4s // ............................................................*...............|...........................................................*............... + // srshr v24.4S, v9.4S, #23 // ..........................................*.................................|.........................................*................................. + // mls v9.4s, v24.4s, v29.4s // ................................................*...........................|...............................................*........................... + // str q8, [x0], #(16*4) // ..................................................................*.........|.................................................................*......... + // str q9, [x0, #(-16*4 + 1*16)] // ......................................................................*.....|.....................................................................*..... 
+ // str q10, [x0, #(-16*4 + 2*16)] // ...........................................................................*|..........................................................................* + // str q11, [x0, #(-16*4 + 3*16)] // .........................................................................*..|........................................................................*.. + + sub count, count, #1 + cbnz count, layer5678_start + sub v23.4S, v22.4S, v5.4S // .*......................... + sqrdmulh v21.4S, v25.4S, v17.S[1] // ..*........................ + // gap // ........................... + // gap // ........................... + add v3.4S, v31.4S, v18.4S // *.......................... + mul v11.4S, v25.4S, v17.S[0] // ....*...................... + // gap // ........................... + // gap // ........................... + sqrdmulh v7.4S, v23.4S, v17.S[3] // .....*..................... + mul v8.4S, v23.4S, v17.S[2] // ......*.................... + // gap // ........................... + // gap // ........................... + add v23.4S, v22.4S, v5.4S // ...*....................... + // gap // ........................... + // gap // ........................... + // gap // ........................... + mls v11.4S, v21.4S, v29.4S // .........*................. + // gap // ........................... + // gap // ........................... + // gap // ........................... + mls v8.4S, v7.4S, v29.4S // ........*.................. + add v10.4S, v3.4S, v23.4S // .......*................... + // gap // ........................... + // gap // ........................... + sub v12.4S, v3.4S, v23.4S // ...........*............... + // gap // ........................... + // gap // ........................... + // gap // ........................... + srshr v23.4S, v10.4S, #23 // ...............*........... + // gap // ........................... + // gap // ........................... + // gap // ........................... 
+ sub v18.4S, v11.4S, v8.4S // ..........*................ + mul v21.4S, v12.4S, v30.S[0] // ...................*....... + // gap // ........................... + // gap // ........................... + add v3.4S, v11.4S, v8.4S // ............*.............. + sqrdmulh v11.4S, v12.4S, v30.S[1] // ..............*............ + // gap // ........................... + // gap // ........................... + sqrdmulh v2.4S, v18.4S, v30.S[1] // ..................*........ + mul v18.4S, v18.4S, v30.S[0] // ................*.......... + // gap // ........................... + // gap // ........................... + mls v10.4S, v23.4S, v29.4S // ....................*...... + srshr v30.4S, v3.4S, #23 // .............*............. + // gap // ........................... + // gap // ........................... + mls v21.4S, v11.4S, v29.4S // .......................*... + // gap // ........................... + // gap // ........................... + // gap // ........................... + mls v18.4S, v2.4S, v29.4S // .....................*..... + // gap // ........................... + // gap // ........................... + // gap // ........................... + mls v3.4S, v30.4S, v29.4S // .................*......... + str q10, [x0], #(16*4) // ......................*.... + // gap // ........................... + // gap // ........................... + str q21, [x0, #-32] // ..........................* + // gap // ........................... + // gap // ........................... + // gap // ........................... + str q18, [x0, #-16] // .........................*. + // gap // ........................... + // gap // ........................... + // gap // ........................... + str q3, [x0, #-48] // ........................*.. + // gap // ........................... + // gap // ........................... + // gap // ........................... + + // original source code + // add v10.4S, v31.4S, v18.4S // ..*........................ 
+ // sub v0.4S, v22.4S, v5.4S // *.......................... + // sqrdmulh v2.4S, v25.4S, v17.S[1] // .*......................... + // add v5.4S, v22.4S, v5.4S // ......*.................... + // mul v8.4S, v25.4S, v17.S[0] // ...*....................... + // sqrdmulh v19.4S, v0.4S, v17.S[3] // ....*...................... + // mul v9.4S, v0.4S, v17.S[2] // .....*..................... + // add v0.4S, v10.4S, v5.4S // .........*................. + // mls v9.4S, v19.4S, v29.4S // ........*.................. + // mls v8.4S, v2.4S, v29.4S // .......*................... + // sub v2.4S, v8.4S, v9.4S // ............*.............. + // sub v15.4S, v10.4S, v5.4S // ..........*................ + // add v12.4S, v8.4S, v9.4S // ..............*............ + // srshr v9.4S, v12.4S, #23 // ...................*....... + // sqrdmulh v27.4S, v15.4S, v30.S[1] // ...............*........... + // srshr v18.4S, v0.4S, #23 // ...........*............... + // mul v13.4S, v2.4S, v30.S[0] // .................*......... + // mls v12.4S, v9.4S, v29.4S // ......................*.... + // sqrdmulh v2.4S, v2.4S, v30.S[1] // ................*.......... + // mul v19.4S, v15.4S, v30.S[0] // .............*............. + // mls v0.4S, v18.4S, v29.4S // ..................*........ + // mls v13.4S, v2.4S, v29.4S // .....................*..... + // str q0, [x0], #(16*4) // .......................*... + // mls v19.4S, v27.4S, v29.4S // ....................*...... + // str q12, [x0, #-48] // ..........................* + // str q13, [x0, #-16] // .........................*. + // str q19, [x0, #-32] // ........................*.. 
+ + + .unreq root0_tw + .unreq root1_tw + .unreq root2_tw + .unreq root3_tw + .unreq qform_root0_tw + .unreq qform_root1_tw + .unreq qform_root2_tw + .unreq qform_root3_tw + .unreq t0 + .unreq t1 + + root4 .req v4 + root5 .req v5 + root6 .req v6 + root7 .req v7 + qform_root4 .req q4 + qform_root5 .req q5 + qform_root6 .req q6 + qform_root7 .req q7 + ninv .req v25 + ninv_tw .req v26 + modulus_half .req v30 + neg_modulus_half .req v31 + + + restore in, STACK0 + mov count, #4 + + ASM_LOAD(xtmp, ninv_addr) + ld1r {ninv.4s}, [xtmp] + ASM_LOAD(xtmp, ninv_tw_addr) + ld1r {ninv_tw.4s}, [xtmp] + + ushr modulus_half.4S, modulus.4S, #1 + neg neg_modulus_half.4S, modulus_half.4S + + load_roots_1234 r_ptr1 + + .p2align 2 + ldr q9, [x1, #64] // .*...................................................................................................................................................................................................................................................................................... + ldr q20, [x1, #0] // *....................................................................................................................................................................................................................................................................................... + ldr q10, [x1, #128] // ..*..................................................................................................................................................................................................................................................................................... + ldr q11, [x1, #192] // ...*.................................................................................................................................................................................................................................................................................... 
+ ldr q15, [x1, #768] // ............*........................................................................................................................................................................................................................................................................... + ldr q19, [x1, #384] // ......*................................................................................................................................................................................................................................................................................. + ldr q16, [x1, #320] // .....*.................................................................................................................................................................................................................................................................................. + sub v14.4S, v20.4S, v9.4S // ................*....................................................................................................................................................................................................................................................................... + add v13.4S, v20.4S, v9.4S // .................*...................................................................................................................................................................................................................................................................... + ldr q12, [x1, #256] // ....*................................................................................................................................................................................................................................................................................... 
+ sub v22.4S, v10.4S, v11.4S // .....................*.................................................................................................................................................................................................................................................................. + ldr q28, [x1, #448] // .......*................................................................................................................................................................................................................................................................................ + ldr q24, [x1, #832] // .............*.......................................................................................................................................................................................................................................................................... + mul v27.4S, v14.4S, v3.S[2] // ..................*..................................................................................................................................................................................................................................................................... + sqrdmulh v14.4S, v14.4S, v3.S[3] // ...................*.................................................................................................................................................................................................................................................................... + sqrdmulh v9.4S, v22.4S, v4.S[1] // ........................*............................................................................................................................................................................................................................................................... 
+ mul v8.4S, v22.4S, v4.S[0] // .......................*................................................................................................................................................................................................................................................................ + ldr q23, [x1, #512] // ........*............................................................................................................................................................................................................................................................................... + add v17.4S, v12.4S, v16.4S // ...........................*............................................................................................................................................................................................................................................................ + sub v21.4S, v12.4S, v16.4S // ..........................*............................................................................................................................................................................................................................................................. + ldr q16, [x1, #576] // .........*.............................................................................................................................................................................................................................................................................. + mls v27.4S, v14.4S, v29.4S // ....................*................................................................................................................................................................................................................................................................... 
+ sub v20.4S, v19.4S, v28.4S // ...............................*........................................................................................................................................................................................................................................................ + ldr q18, [x1, #704] // ...........*............................................................................................................................................................................................................................................................................ + mls v8.4S, v9.4S, v29.4S // .........................*.............................................................................................................................................................................................................................................................. + mul v9.4S, v21.4S, v4.S[2] // ............................*........................................................................................................................................................................................................................................................... + ldr q22, [x1, #640] // ..........*............................................................................................................................................................................................................................................................................. + mul v12.4S, v20.4S, v5.S[0] // .................................*...................................................................................................................................................................................................................................................... 
+ sqrdmulh v14.4S, v21.4S, v4.S[3] // .............................*.......................................................................................................................................................................................................................................................... + sqrdmulh v21.4S, v20.4S, v5.S[1] // ..................................*..................................................................................................................................................................................................................................................... + add v20.4S, v15.4S, v24.4S // ...............................................*........................................................................................................................................................................................................................................ + add v10.4S, v10.4S, v11.4S // ......................*................................................................................................................................................................................................................................................................. + sub v24.4S, v15.4S, v24.4S // ..............................................*......................................................................................................................................................................................................................................... + add v15.4S, v27.4S, v8.4S // ..............................................................*......................................................................................................................................................................................................................... 
+ mls v12.4S, v21.4S, v29.4S // ...................................*.................................................................................................................................................................................................................................................... + mls v9.4S, v14.4S, v29.4S // ..............................*......................................................................................................................................................................................................................................................... + mul v21.4S, v24.4S, v6.S[2] // ................................................*....................................................................................................................................................................................................................................... + sqrdmulh v14.4S, v24.4S, v6.S[3] // .................................................*...................................................................................................................................................................................................................................... + sub v24.4S, v13.4S, v10.4S // ........................................................*............................................................................................................................................................................................................................... + sub v11.4S, v27.4S, v8.4S // .............................................................*.......................................................................................................................................................................................................................... 
+ add v27.4S, v9.4S, v12.4S // ........................................................................*............................................................................................................................................................................................................... + sub v12.4S, v9.4S, v12.4S // .......................................................................*................................................................................................................................................................................................................ + mls v21.4S, v14.4S, v29.4S // ..................................................*..................................................................................................................................................................................................................................... + sub v14.4S, v23.4S, v16.4S // ....................................*................................................................................................................................................................................................................................................... + add v8.4S, v13.4S, v10.4S // .........................................................*.............................................................................................................................................................................................................................. + sub v13.4S, v15.4S, v27.4S // .....................................................................................................*.................................................................................................................................................................................. 
+ add v9.4S, v15.4S, v27.4S // ......................................................................................................*................................................................................................................................................................................. + mul v27.4S, v14.4S, v5.S[2] // ......................................*................................................................................................................................................................................................................................................. + mul v15.4S, v24.4S, v1.S[2] // ..........................................................*............................................................................................................................................................................................................................. + sqrdmulh v10.4S, v14.4S, v5.S[3] // .......................................*................................................................................................................................................................................................................................................ + sqrdmulh v14.4S, v24.4S, v1.S[3] // ...........................................................*............................................................................................................................................................................................................................ + add v24.4S, v22.4S, v18.4S // ..........................................*............................................................................................................................................................................................................................................. 
+ add v28.4S, v19.4S, v28.4S // ................................*....................................................................................................................................................................................................................................................... + sqrdmulh v19.4S, v11.4S, v1.S[3] // ................................................................*....................................................................................................................................................................................................................... + mls v27.4S, v10.4S, v29.4S // ........................................*............................................................................................................................................................................................................................................... + sub v22.4S, v22.4S, v18.4S // .........................................*.............................................................................................................................................................................................................................................. + mls v15.4S, v14.4S, v29.4S // ............................................................*........................................................................................................................................................................................................................... + sub v14.4S, v17.4S, v28.4S // ..................................................................*..................................................................................................................................................................................................................... 
+ sqrdmulh v18.4S, v22.4S, v6.S[1] // ............................................*........................................................................................................................................................................................................................................... + mul v10.4S, v22.4S, v6.S[0] // ...........................................*............................................................................................................................................................................................................................................ + sqrdmulh v22.4S, v14.4S, v2.S[1] // .....................................................................*.................................................................................................................................................................................................................. + mul v14.4S, v14.4S, v2.S[0] // ....................................................................*................................................................................................................................................................................................................... + add v23.4S, v23.4S, v16.4S // .....................................*.................................................................................................................................................................................................................................................. + add v28.4S, v17.4S, v28.4S // ...................................................................*.................................................................................................................................................................................................................... 
+ mls v10.4S, v18.4S, v29.4S // .............................................*.......................................................................................................................................................................................................................................... + sqrdmulh v18.4S, v12.4S, v2.S[1] // ..........................................................................*............................................................................................................................................................................................................. + mls v14.4S, v22.4S, v29.4S // ......................................................................*................................................................................................................................................................................................................. + mul v22.4S, v11.4S, v1.S[2] // ...............................................................*........................................................................................................................................................................................................................ + mul v11.4S, v12.4S, v2.S[0] // .........................................................................*.............................................................................................................................................................................................................. + add v16.4S, v23.4S, v24.4S // .............................................................................*.......................................................................................................................................................................................................... 
+ sub v12.4S, v27.4S, v10.4S // .................................................................................*...................................................................................................................................................................................................... + add v17.4S, v27.4S, v10.4S // ..................................................................................*..................................................................................................................................................................................................... + mls v22.4S, v19.4S, v29.4S // .................................................................*...................................................................................................................................................................................................................... + add v10.4S, v15.4S, v14.4S // ...........................................................................................................*............................................................................................................................................................................ + mul v19.4S, v12.4S, v2.S[2] // ...................................................................................*.................................................................................................................................................................................................... + sqrdmulh v27.4S, v12.4S, v2.S[3] // ....................................................................................*................................................................................................................................................................................................... 
+ sub v12.4S, v8.4S, v28.4S // ................................................................................................*....................................................................................................................................................................................... + mls v11.4S, v18.4S, v29.4S // ...........................................................................*............................................................................................................................................................................................................ + add v8.4S, v8.4S, v28.4S // .................................................................................................*...................................................................................................................................................................................... + sub v28.4S, v15.4S, v14.4S // ..........................................................................................................*............................................................................................................................................................................. + ldr q18, [x1, #960] // ...............*........................................................................................................................................................................................................................................................................ + ldr q14, [x1, #896] // ..............*......................................................................................................................................................................................................................................................................... 
+ mls v19.4S, v27.4S, v29.4S // .....................................................................................*.................................................................................................................................................................................................. + sub v15.4S, v23.4S, v24.4S // ............................................................................*........................................................................................................................................................................................................... + sqrdmulh v27.4S, v12.4S, v0.S[3] // ...................................................................................................*.................................................................................................................................................................................... + sub v23.4S, v14.4S, v18.4S // ...................................................*.................................................................................................................................................................................................................................... + mul v12.4S, v12.4S, v0.S[2] // ..................................................................................................*..................................................................................................................................................................................... + add v24.4S, v14.4S, v18.4S // ....................................................*................................................................................................................................................................................................................................... 
+ sqrdmulh v18.4S, v23.4S, v7.S[1] // ......................................................*................................................................................................................................................................................................................................. + mul v23.4S, v23.4S, v7.S[0] // .....................................................*.................................................................................................................................................................................................................................. + sqrdmulh v14.4S, v13.4S, v0.S[3] // ........................................................................................................*............................................................................................................................................................................... + mul v13.4S, v13.4S, v0.S[2] // .......................................................................................................*................................................................................................................................................................................ + mls v12.4S, v27.4S, v29.4S // ....................................................................................................*................................................................................................................................................................................... + add v27.4S, v20.4S, v24.4S // .......................................................................................*................................................................................................................................................................................................ 
+ sub v24.4S, v20.4S, v24.4S // ......................................................................................*................................................................................................................................................................................................. + mls v23.4S, v18.4S, v29.4S // .......................................................*................................................................................................................................................................................................................................ + mls v13.4S, v14.4S, v29.4S // .........................................................................................................*.............................................................................................................................................................................. + mul v18.4S, v15.4S, v2.S[2] // ..............................................................................*......................................................................................................................................................................................................... + sqrdmulh v15.4S, v15.4S, v2.S[3] // ...............................................................................*........................................................................................................................................................................................................ + sub v14.4S, v16.4S, v27.4S // ....................................................................................................................*................................................................................................................................................................... 
+ sub v20.4S, v21.4S, v23.4S // ...........................................................................................*............................................................................................................................................................................................ + add v16.4S, v16.4S, v27.4S // .....................................................................................................................*.................................................................................................................................................................. + add v23.4S, v21.4S, v23.4S // ............................................................................................*........................................................................................................................................................................................... + mul v27.4S, v24.4S, v3.S[0] // ........................................................................................*............................................................................................................................................................................................... + mul v21.4S, v20.4S, v3.S[0] // .............................................................................................*.......................................................................................................................................................................................... + sqrdmulh v20.4S, v20.4S, v3.S[1] // ..............................................................................................*......................................................................................................................................................................................... 
+ mls v18.4S, v15.4S, v29.4S // ................................................................................*....................................................................................................................................................................................................... + add v15.4S, v17.4S, v23.4S // ..........................................................................................................................*............................................................................................................................................................. + sub v17.4S, v17.4S, v23.4S // .........................................................................................................................*.............................................................................................................................................................. + sqrdmulh v23.4S, v24.4S, v3.S[1] // .........................................................................................*.............................................................................................................................................................................................. + mls v21.4S, v20.4S, v29.4S // ...............................................................................................*........................................................................................................................................................................................ + sub v24.4S, v22.4S, v11.4S // ...............................................................................................................*........................................................................................................................................................................ 
+ mls v27.4S, v23.4S, v29.4S // ..........................................................................................*............................................................................................................................................................................................. + add v20.4S, v19.4S, v21.4S // ....................................................................................................................................*................................................................................................................................................... + sub v23.4S, v9.4S, v15.4S // .............................................................................................................................................*.......................................................................................................................................... + add v9.4S, v9.4S, v15.4S // ..............................................................................................................................................*......................................................................................................................................... + mul v15.4S, v14.4S, v1.S[0] // ......................................................................................................................*................................................................................................................................................................. + sub v21.4S, v19.4S, v21.4S // ...................................................................................................................................*.................................................................................................................................................... 
+ sqrdmulh v19.4S, v14.4S, v1.S[1] // .......................................................................................................................*................................................................................................................................................................ + sub v14.4S, v18.4S, v27.4S // ..............................................................................................................................*......................................................................................................................................................... + add v18.4S, v18.4S, v27.4S // ...............................................................................................................................*........................................................................................................................................................ + add v22.4S, v22.4S, v11.4S // ................................................................................................................*....................................................................................................................................................................... + mul v27.4S, v14.4S, v1.S[0] // ................................................................................................................................*....................................................................................................................................................... + sqrdmulh v14.4S, v14.4S, v1.S[1] // .................................................................................................................................*...................................................................................................................................................... 
+ mls v15.4S, v19.4S, v29.4S // ........................................................................................................................*............................................................................................................................................................... + add v11.4S, v22.4S, v20.4S // ........................................................................................................................................................*............................................................................................................................... + sub v19.4S, v22.4S, v20.4S // .......................................................................................................................................................*................................................................................................................................ + mul v20.4S, v17.4S, v1.S[0] // ...........................................................................................................................*............................................................................................................................................................ + sqrdmulh v17.4S, v17.4S, v1.S[1] // ............................................................................................................................*........................................................................................................................................................... + mls v27.4S, v14.4S, v29.4S // ..................................................................................................................................*..................................................................................................................................................... 
+ sub v22.4S, v12.4S, v15.4S // ............................................................................................................................................................*........................................................................................................................... + sqrdmulh v14.4S, v23.4S, v0.S[1] // ................................................................................................................................................*....................................................................................................................................... + add v12.4S, v12.4S, v15.4S // .............................................................................................................................................................*.......................................................................................................................... + sub v15.4S, v10.4S, v18.4S // ..................................................................................................................................................*..................................................................................................................................... + mls v20.4S, v17.4S, v29.4S // .............................................................................................................................*.......................................................................................................................................................... + mul v17.4S, v23.4S, v0.S[0] // ...............................................................................................................................................*........................................................................................................................................ 
+ add v10.4S, v10.4S, v18.4S // ...................................................................................................................................................*.................................................................................................................................... + sqrdmulh v23.4S, v15.4S, v0.S[1] // .....................................................................................................................................................*.................................................................................................................................. + mul v18.4S, v15.4S, v0.S[0] // ....................................................................................................................................................*................................................................................................................................... + mul v15.4S, v24.4S, v0.S[2] // .................................................................................................................*...................................................................................................................................................................... + sub count, count, #1 +layer1234_start: + sqrdmulh v24.4S, v24.4S, v0.S[3] // ..................................................................................................................*..................................................................................................................................................................... + mls v18.4S, v23.4S, v29.4S // ......................................................................................................................................................*................................................................................................................................. 
+ sqrdmulh v23.4S, v22.4S, v0.S[1] // ...............................................................................................................................................................*........................................................................................................................ + mul v22.4S, v22.4S, v0.S[0] // ..............................................................................................................................................................*......................................................................................................................... + mls v17.4S, v14.4S, v29.4S // .................................................................................................................................................*...................................................................................................................................... + mul v14.4S, v28.4S, v0.S[2] // ............................................................................................................*........................................................................................................................................................................... + sqrdmulh v28.4S, v28.4S, v0.S[3] // .............................................................................................................*.......................................................................................................................................................................... + mls v15.4S, v24.4S, v29.4S // ...................................................................................................................*.................................................................................................................................................................... 
+ mls v22.4S, v23.4S, v29.4S // ................................................................................................................................................................*....................................................................................................................... + mul v23.4S, v21.4S, v1.S[0] // .....................................................................................................................................*.................................................................................................................................................. + sqrdmulh v24.4S, v21.4S, v1.S[1] // ......................................................................................................................................*................................................................................................................................................. + sub v21.4S, v8.4S, v16.4S // ........................................................................................................................................*............................................................................................................................................... + add v8.4S, v8.4S, v16.4S // .........................................................................................................................................*.............................................................................................................................................. + mls v14.4S, v28.4S, v29.4S // ..............................................................................................................*......................................................................................................................................................................... 
+ cmge v28.4S, v31.4S, v22.4S // ................................................................................................................................................................................................*....................................................................................... + cmge v16.4S, v22.4S, v30.4S // .................................................................................................................................................................................................*...................................................................................... + mls v23.4S, v24.4S, v29.4S // .......................................................................................................................................*................................................................................................................................................ + sqrdmulh v24.4S, v21.4S, v0.S[1] // ...........................................................................................................................................*............................................................................................................................................ + sub v28.4S, v28.4S, v16.4S // ..................................................................................................................................................................................................*..................................................................................... + mul v16.4S, v21.4S, v0.S[0] // ..........................................................................................................................................*............................................................................................................................................. 
+ add v21.4S, v14.4S, v27.4S // .......................................................................................................................................................................*................................................................................................................ + mls v22.4S, v28.4S, v29.4S // ...................................................................................................................................................................................................*.................................................................................... + cmge v28.4S, v17.4S, v30.4S // .....................................................................................................................................................................................*.................................................................................................. + sub v27.4S, v14.4S, v27.4S // ......................................................................................................................................................................*................................................................................................................. + cmge v14.4S, v31.4S, v17.4S // ....................................................................................................................................................................................*................................................................................................... + mls v16.4S, v24.4S, v29.4S // ............................................................................................................................................*........................................................................................................................................... 
+ cmge v24.4S, v31.4S, v18.4S // ........................................................................................................................................................................................*............................................................................................... + str q22, [x1, #768] // ....................................................................................................................................................................................................................*................................................................... + sub v22.4S, v14.4S, v28.4S // ......................................................................................................................................................................................*................................................................................................. + cmge v14.4S, v18.4S, v30.4S // .........................................................................................................................................................................................*.............................................................................................. + add v28.4S, v15.4S, v23.4S // ............................................................................................................................................................................*........................................................................................................... + sub v15.4S, v15.4S, v23.4S // ...........................................................................................................................................................................*............................................................................................................ 
+ mls v17.4S, v22.4S, v29.4S // .......................................................................................................................................................................................*................................................................................................ + sub v24.4S, v24.4S, v14.4S // ..........................................................................................................................................................................................*............................................................................................. + sub v14.4S, v13.4S, v20.4S // .................................................................................................................................................................*...................................................................................................................... + add v23.4S, v13.4S, v20.4S // ..................................................................................................................................................................*..................................................................................................................... + mul v20.4S, v12.4S, v25.4S // ....................................................................................................................................................................................................................................*................................................... + cmge v22.4S, v31.4S, v16.4S // ................................................................................................................................................................................*....................................................................................................... 
+ str q17, [x1, #576] // .................................................................................................................................................................................................................*...................................................................... + mul v13.4S, v14.4S, v0.S[0] // ...................................................................................................................................................................*.................................................................................................................... + sqrdmulh v12.4S, v12.4S, v26.4S // .....................................................................................................................................................................................................................................*.................................................. + mul v17.4S, v23.4S, v25.4S // .......................................................................................................................................................................................................................................*................................................ + sqrdmulh v23.4S, v23.4S, v26.4S // ........................................................................................................................................................................................................................................*............................................... + mls v18.4S, v24.4S, v29.4S // ...........................................................................................................................................................................................*............................................................................................ 
+ sqrdmulh v14.4S, v14.4S, v0.S[1] // ....................................................................................................................................................................*................................................................................................................... + sqrdmulh v24.4S, v27.4S, v0.S[1] // .........................................................................................................................................................................*.............................................................................................................. + mul v27.4S, v27.4S, v0.S[0] // ........................................................................................................................................................................*............................................................................................................... + mls v17.4S, v23.4S, v29.4S // .........................................................................................................................................................................................................................................*.............................................. + cmge v23.4S, v16.4S, v30.4S // .................................................................................................................................................................................*...................................................................................................... + str q18, [x1, #640] // ..................................................................................................................................................................................................................*..................................................................... 
+ mls v20.4S, v12.4S, v29.4S // ......................................................................................................................................................................................................................................*................................................. + mls v13.4S, v14.4S, v29.4S // .....................................................................................................................................................................*.................................................................................................................. + mls v27.4S, v24.4S, v29.4S // ..........................................................................................................................................................................*............................................................................................................. + sub v12.4S, v22.4S, v23.4S // ..................................................................................................................................................................................*..................................................................................................... + mul v22.4S, v21.4S, v25.4S // ..........................................................................................................................................................................................................................................*............................................. + sqrdmulh v18.4S, v21.4S, v26.4S // ...........................................................................................................................................................................................................................................*............................................ 
+ sqrdmulh v23.4S, v15.4S, v0.S[1] // ..............................................................................................................................................................................*......................................................................................................... + mul v21.4S, v15.4S, v0.S[0] // .............................................................................................................................................................................*.......................................................................................................... + cmge v24.4S, v27.4S, v30.4S // .........................................................................................................................................................................................................*.............................................................................. + cmge v14.4S, v31.4S, v27.4S // ........................................................................................................................................................................................................*............................................................................... + mls v16.4S, v12.4S, v29.4S // ...................................................................................................................................................................................*.................................................................................................... + mul v12.4S, v28.4S, v25.4S // .............................................................................................................................................................................................................................................*.......................................... 
+ sub v15.4S, v14.4S, v24.4S // ..........................................................................................................................................................................................................*............................................................................. + mls v21.4S, v23.4S, v29.4S // ...............................................................................................................................................................................*........................................................................................................ + mul v24.4S, v19.4S, v0.S[0] // .........................................................................................................................................................*.............................................................................................................................. + sqrdmulh v28.4S, v28.4S, v26.4S // ..............................................................................................................................................................................................................................................*......................................... + str q16, [x1, #512] // ................................................................................................................................................................................................................*....................................................................... + sqrdmulh v23.4S, v9.4S, v26.4S // ............................................................................................................................................................................................................................*........................................................... 
+ sqrdmulh v16.4S, v19.4S, v0.S[1] // ..........................................................................................................................................................*............................................................................................................................. + sqrdmulh v19.4S, v11.4S, v26.4S // ..................................................................................................................................................................................................................................*..................................................... + mls v27.4S, v15.4S, v29.4S // ...........................................................................................................................................................................................................*............................................................................ + mls v12.4S, v28.4S, v29.4S // ...............................................................................................................................................................................................................................................*........................................ + mul v15.4S, v11.4S, v25.4S // .................................................................................................................................................................................................................................*...................................................... + mls v24.4S, v16.4S, v29.4S // ...........................................................................................................................................................*............................................................................................................................ 
+ mul v9.4S, v9.4S, v25.4S // ...........................................................................................................................................................................................................................*............................................................ + cmge v28.4S, v31.4S, v17.4S // ....................................................................................................................................................................................................................................................................*................... + mul v16.4S, v8.4S, v25.4S // ........................................................................................................................................................................................................................*............................................................... + cmge v11.4S, v17.4S, v30.4S // .....................................................................................................................................................................................................................................................................*.................. + mls v15.4S, v19.4S, v29.4S // ...................................................................................................................................................................................................................................*.................................................... + mls v9.4S, v23.4S, v29.4S // .............................................................................................................................................................................................................................*.......................................................... 
+ cmge v14.4S, v13.4S, v30.4S // .....................................................................................................................................................................................................*.................................................................................. + sub v19.4S, v28.4S, v11.4S // ......................................................................................................................................................................................................................................................................*................. + cmge v28.4S, v31.4S, v13.4S // ....................................................................................................................................................................................................*................................................................................... + cmge v23.4S, v31.4S, v15.4S // ............................................................................................................................................................................................................................................................*........................... + sub v14.4S, v28.4S, v14.4S // ......................................................................................................................................................................................................*................................................................................. + str q27, [x1, #896] // ......................................................................................................................................................................................................................*................................................................. 
+ cmge v27.4S, v21.4S, v30.4S // .............................................................................................................................................................................................................*.......................................................................... + cmge v11.4S, v12.4S, v30.4S // .............................................................................................................................................................................................................................................................................*.......... + sqrdmulh v8.4S, v8.4S, v26.4S // .........................................................................................................................................................................................................................*.............................................................. + cmge v28.4S, v31.4S, v12.4S // ............................................................................................................................................................................................................................................................................*........... + mls v13.4S, v14.4S, v29.4S // .......................................................................................................................................................................................................*................................................................................ + mls v17.4S, v19.4S, v29.4S // .......................................................................................................................................................................................................................................................................*................ 
+ cmge v14.4S, v9.4S, v30.4S // .....................................................................................................................................................................................................................................................*.................................. + sub v19.4S, v28.4S, v11.4S // ..............................................................................................................................................................................................................................................................................*......... + mls v16.4S, v8.4S, v29.4S // ..........................................................................................................................................................................................................................*............................................................. + str q13, [x1, #832] // .....................................................................................................................................................................................................................*.................................................................. + mls v22.4S, v18.4S, v29.4S // ............................................................................................................................................................................................................................................*........................................... + mul v18.4S, v10.4S, v25.4S // ..............................................................................................................................................................................................................................*......................................................... 
+ str q17, [x1, #320] // .....................................................................................................................................................................................................................................................................................*.. + cmge v28.4S, v31.4S, v24.4S // ............................................................................................................................................................................................*........................................................................................... + cmge v17.4S, v15.4S, v30.4S // .............................................................................................................................................................................................................................................................*.......................... + cmge v8.4S, v16.4S, v30.4S // .................................................................................................................................................................................................................................................*...................................... + cmge v11.4S, v31.4S, v16.4S // ................................................................................................................................................................................................................................................*....................................... + cmge v13.4S, v31.4S, v21.4S // ............................................................................................................................................................................................................*........................................................................... 
+ sub v17.4S, v23.4S, v17.4S // ..............................................................................................................................................................................................................................................................*......................... + sub v8.4S, v11.4S, v8.4S // ..................................................................................................................................................................................................................................................*..................................... + cmge v11.4S, v31.4S, v9.4S // ....................................................................................................................................................................................................................................................*................................... + mls v15.4S, v17.4S, v29.4S // ...............................................................................................................................................................................................................................................................*........................ + cmge v23.4S, v24.4S, v30.4S // .............................................................................................................................................................................................*.......................................................................................... + mls v16.4S, v8.4S, v29.4S // ...................................................................................................................................................................................................................................................*.................................... 
+ sub v8.4S, v11.4S, v14.4S // ......................................................................................................................................................................................................................................................*................................. + sub v17.4S, v13.4S, v27.4S // ..............................................................................................................................................................................................................*......................................................................... + cmge v27.4S, v31.4S, v20.4S // ................................................................................................................................................................................................................................................................*....................... + ldr q14, [x1, #80] // .e...................................................................................................................................................................................................................................................................................... + str q16, [x1], #(16) // ................................................................................................................................................................................................................................................................................*....... + cmge v16.4S, v20.4S, v30.4S // .................................................................................................................................................................................................................................................................*...................... 
+ ldr q11, [x1, #0] // e....................................................................................................................................................................................................................................................................................... + cmge v13.4S, v31.4S, v22.4S // ........................................................................................................................................................................................................................................................................*............... + str q15, [x1, #176] // ...................................................................................................................................................................................................................................................................................*.... + cmge v15.4S, v22.4S, v30.4S // .........................................................................................................................................................................................................................................................................*.............. + sub v27.4S, v27.4S, v16.4S // ..................................................................................................................................................................................................................................................................*..................... + sqrdmulh v16.4S, v10.4S, v26.4S // ...............................................................................................................................................................................................................................*........................................................ 
+ sub v10.4S, v28.4S, v23.4S // ..............................................................................................................................................................................................*......................................................................................... + ldr q28, [x1, #640] // ..........e............................................................................................................................................................................................................................................................................. + mls v12.4S, v19.4S, v29.4S // ...............................................................................................................................................................................................................................................................................*........ + ldr q19, [x1, #448] // .......e................................................................................................................................................................................................................................................................................ + mls v20.4S, v27.4S, v29.4S // ...................................................................................................................................................................................................................................................................*.................... + mls v24.4S, v10.4S, v29.4S // ...............................................................................................................................................................................................*........................................................................................ 
+ add v10.4S, v11.4S, v14.4S // .................e...................................................................................................................................................................................................................................................................... + mls v18.4S, v16.4S, v29.4S // ................................................................................................................................................................................................................................*....................................................... + sub v11.4S, v11.4S, v14.4S // ................e....................................................................................................................................................................................................................................................................... + ldr q14, [x1, #320] // .....e.................................................................................................................................................................................................................................................................................. + str q20, [x1, #240] // ....................................................................................................................................................................................................................................................................................*... + mls v21.4S, v17.4S, v29.4S // ...............................................................................................................................................................................................................*........................................................................ 
+ ldr q17, [x1, #256] // ....e................................................................................................................................................................................................................................................................................... + sub v13.4S, v13.4S, v15.4S // ..........................................................................................................................................................................................................................................................................*............. + sqrdmulh v23.4S, v11.4S, v3.S[3] // ...................e.................................................................................................................................................................................................................................................................... + str q24, [x1, #688] // ...................................................................................................................................................................................................................*.................................................................... + ldr q24, [x1, #384] // ......e................................................................................................................................................................................................................................................................................. + mul v16.4S, v11.4S, v3.S[2] // ..................e..................................................................................................................................................................................................................................................................... 
+ sub v27.4S, v17.4S, v14.4S // ..........................e............................................................................................................................................................................................................................................................. + add v15.4S, v24.4S, v19.4S // ................................e....................................................................................................................................................................................................................................................... + ldr q11, [x1, #128] // ..e..................................................................................................................................................................................................................................................................................... + ldr q20, [x1, #192] // ...e.................................................................................................................................................................................................................................................................................... + mls v22.4S, v13.4S, v29.4S // ...........................................................................................................................................................................................................................................................................*............ 
+ str q12, [x1, #432] // .......................................................................................................................................................................................................................................................................................* + add v12.4S, v17.4S, v14.4S // ...........................e............................................................................................................................................................................................................................................................ + mul v13.4S, v27.4S, v4.S[2] // ............................e........................................................................................................................................................................................................................................................... + sqrdmulh v14.4S, v27.4S, v4.S[3] // .............................e.......................................................................................................................................................................................................................................................... + add v17.4S, v12.4S, v15.4S // ...................................................................e.................................................................................................................................................................................................................... + mls v9.4S, v8.4S, v29.4S // .......................................................................................................................................................................................................................................................*................................ 
+ str q21, [x1, #944] // .......................................................................................................................................................................................................................*................................................................ + add v27.4S, v11.4S, v20.4S // ......................e................................................................................................................................................................................................................................................................. + sub v24.4S, v24.4S, v19.4S // ...............................e........................................................................................................................................................................................................................................................ + str q22, [x1, #368] // ......................................................................................................................................................................................................................................................................................*. + ldr q19, [x1, #704] // ...........e............................................................................................................................................................................................................................................................................ + mls v16.4S, v23.4S, v29.4S // ....................e................................................................................................................................................................................................................................................................... 
+ sub v8.4S, v11.4S, v20.4S // .....................e.................................................................................................................................................................................................................................................................. + mls v13.4S, v14.4S, v29.4S // ..............................e......................................................................................................................................................................................................................................................... + sub v14.4S, v12.4S, v15.4S // ..................................................................e..................................................................................................................................................................................................................... + ldr q15, [x1, #512] // ........e............................................................................................................................................................................................................................................................................... + sqrdmulh v22.4S, v8.4S, v4.S[1] // ........................e............................................................................................................................................................................................................................................................... + mul v21.4S, v8.4S, v4.S[0] // .......................e................................................................................................................................................................................................................................................................ 
+ ldr q11, [x1, #768] // ............e........................................................................................................................................................................................................................................................................... + add v23.4S, v10.4S, v27.4S // .........................................................e.............................................................................................................................................................................................................................. + sqrdmulh v12.4S, v24.4S, v5.S[1] // ..................................e..................................................................................................................................................................................................................................................... + sub v10.4S, v10.4S, v27.4S // ........................................................e............................................................................................................................................................................................................................... + cmge v20.4S, v18.4S, v30.4S // .........................................................................................................................................................................................................................................................*.............................. + ldr q8, [x1, #576] // .........e.............................................................................................................................................................................................................................................................................. 
+ mul v24.4S, v24.4S, v5.S[0] // .................................e...................................................................................................................................................................................................................................................... + str q9, [x1, #48] // .................................................................................................................................................................................................................................................................................*...... + cmge v27.4S, v31.4S, v18.4S // ........................................................................................................................................................................................................................................................*............................... + sub v9.4S, v28.4S, v19.4S // .........................................e.............................................................................................................................................................................................................................................. + mls v21.4S, v22.4S, v29.4S // .........................e.............................................................................................................................................................................................................................................................. + sub v20.4S, v27.4S, v20.4S // ..........................................................................................................................................................................................................................................................*............................. 
+ add v27.4S, v28.4S, v19.4S // ..........................................e............................................................................................................................................................................................................................................. + mul v19.4S, v9.4S, v6.S[0] // ...........................................e............................................................................................................................................................................................................................................ + mls v24.4S, v12.4S, v29.4S // ...................................e.................................................................................................................................................................................................................................................... + mls v18.4S, v20.4S, v29.4S // ...........................................................................................................................................................................................................................................................*............................ + sub v12.4S, v16.4S, v21.4S // .............................................................e.......................................................................................................................................................................................................................... + add v21.4S, v16.4S, v21.4S // ..............................................................e......................................................................................................................................................................................................................... 
+ add v16.4S, v15.4S, v8.4S // .....................................e.................................................................................................................................................................................................................................................. + sub v22.4S, v13.4S, v24.4S // .......................................................................e................................................................................................................................................................................................................ + add v20.4S, v13.4S, v24.4S // ........................................................................e............................................................................................................................................................................................................... + str q18, [x1, #112] // ..................................................................................................................................................................................................................................................................................*..... + add v18.4S, v16.4S, v27.4S // .............................................................................e.......................................................................................................................................................................................................... + sqrdmulh v28.4S, v9.4S, v6.S[1] // ............................................e........................................................................................................................................................................................................................................... 
+ sub v13.4S, v21.4S, v20.4S // .....................................................................................................e.................................................................................................................................................................................. + add v9.4S, v21.4S, v20.4S // ......................................................................................................e................................................................................................................................................................................. + mul v24.4S, v12.4S, v1.S[2] // ...............................................................e........................................................................................................................................................................................................................ + mul v21.4S, v22.4S, v2.S[0] // .........................................................................e.............................................................................................................................................................................................................. + mls v19.4S, v28.4S, v29.4S // .............................................e.......................................................................................................................................................................................................................................... + sqrdmulh v20.4S, v12.4S, v1.S[3] // ................................................................e....................................................................................................................................................................................................................... 
+ ldr q12, [x1, #832] // .............e.......................................................................................................................................................................................................................................................................... + sqrdmulh v28.4S, v13.4S, v0.S[3] // ........................................................................................................e............................................................................................................................................................................... + mul v13.4S, v13.4S, v0.S[2] // .......................................................................................................e................................................................................................................................................................................ + sub v27.4S, v16.4S, v27.4S // ............................................................................e........................................................................................................................................................................................................... + mls v13.4S, v28.4S, v29.4S // .........................................................................................................e.............................................................................................................................................................................. + sqrdmulh v16.4S, v14.4S, v2.S[1] // .....................................................................e.................................................................................................................................................................................................................. 
+ mul v28.4S, v14.4S, v2.S[0] // ....................................................................e................................................................................................................................................................................................................... + mul v14.4S, v10.4S, v1.S[2] // ..........................................................e............................................................................................................................................................................................................................. + sqrdmulh v10.4S, v10.4S, v1.S[3] // ...........................................................e............................................................................................................................................................................................................................ + mls v24.4S, v20.4S, v29.4S // .................................................................e...................................................................................................................................................................................................................... + sub v15.4S, v15.4S, v8.4S // ....................................e................................................................................................................................................................................................................................................... + sqrdmulh v8.4S, v22.4S, v2.S[1] // ..........................................................................e............................................................................................................................................................................................................. 
+ mul v22.4S, v27.4S, v2.S[2] // ..............................................................................e......................................................................................................................................................................................................... + mls v28.4S, v16.4S, v29.4S // ......................................................................e................................................................................................................................................................................................................. + sqrdmulh v20.4S, v27.4S, v2.S[3] // ...............................................................................e........................................................................................................................................................................................................ + mls v14.4S, v10.4S, v29.4S // ............................................................e........................................................................................................................................................................................................................... + mul v27.4S, v15.4S, v5.S[2] // ......................................e................................................................................................................................................................................................................................................. + add v16.4S, v11.4S, v12.4S // ...............................................e........................................................................................................................................................................................................................................ 
+ mls v21.4S, v8.4S, v29.4S // ...........................................................................e............................................................................................................................................................................................................ + sqrdmulh v8.4S, v15.4S, v5.S[3] // .......................................e................................................................................................................................................................................................................................................ + sub v15.4S, v11.4S, v12.4S // ..............................................e......................................................................................................................................................................................................................................... + mls v22.4S, v20.4S, v29.4S // ................................................................................e....................................................................................................................................................................................................... + add v10.4S, v14.4S, v28.4S // ...........................................................................................................e............................................................................................................................................................................ + add v11.4S, v24.4S, v21.4S // ................................................................................................................e....................................................................................................................................................................... 
+ sub v28.4S, v14.4S, v28.4S // ..........................................................................................................e............................................................................................................................................................................. + mls v27.4S, v8.4S, v29.4S // ........................................e............................................................................................................................................................................................................................................... + ldr q14, [x1, #896] // ..............e......................................................................................................................................................................................................................................................................... + sub v24.4S, v24.4S, v21.4S // ...............................................................................................................e........................................................................................................................................................................ + ldr q20, [x1, #960] // ...............e........................................................................................................................................................................................................................................................................ + mul v21.4S, v15.4S, v6.S[2] // ................................................e....................................................................................................................................................................................................................................... 
+ sqrdmulh v15.4S, v15.4S, v6.S[3] // .................................................e...................................................................................................................................................................................................................................... + mls v21.4S, v15.4S, v29.4S // ..................................................e..................................................................................................................................................................................................................................... + sub v8.4S, v27.4S, v19.4S // .................................................................................e...................................................................................................................................................................................................... + add v19.4S, v27.4S, v19.4S // ..................................................................................e..................................................................................................................................................................................................... + add v12.4S, v14.4S, v20.4S // ....................................................e................................................................................................................................................................................................................................... + sqrdmulh v27.4S, v8.4S, v2.S[3] // ....................................................................................e................................................................................................................................................................................................... 
+ mul v15.4S, v8.4S, v2.S[2] // ...................................................................................e.................................................................................................................................................................................................... + sub v20.4S, v14.4S, v20.4S // ...................................................e.................................................................................................................................................................................................................................... + sub v14.4S, v23.4S, v17.4S // ................................................................................................e....................................................................................................................................................................................... + add v8.4S, v23.4S, v17.4S // .................................................................................................e...................................................................................................................................................................................... + sub v17.4S, v16.4S, v12.4S // ......................................................................................e................................................................................................................................................................................................. + add v12.4S, v16.4S, v12.4S // .......................................................................................e................................................................................................................................................................................................ 
+ mls v15.4S, v27.4S, v29.4S // .....................................................................................e.................................................................................................................................................................................................. + mul v23.4S, v17.4S, v3.S[0] // ........................................................................................e............................................................................................................................................................................................... + sqrdmulh v17.4S, v17.4S, v3.S[1] // .........................................................................................e.............................................................................................................................................................................................. + mul v27.4S, v20.4S, v7.S[0] // .....................................................e.................................................................................................................................................................................................................................. + add v16.4S, v18.4S, v12.4S // .....................................................................................................................e.................................................................................................................................................................. + sub v12.4S, v18.4S, v12.4S // ....................................................................................................................e................................................................................................................................................................... 
+ sqrdmulh v18.4S, v20.4S, v7.S[1] // ......................................................e................................................................................................................................................................................................................................. + mls v23.4S, v17.4S, v29.4S // ..........................................................................................e............................................................................................................................................................................................. + sqrdmulh v17.4S, v14.4S, v0.S[3] // ...................................................................................................e.................................................................................................................................................................................... + mul v20.4S, v14.4S, v0.S[2] // ..................................................................................................e..................................................................................................................................................................................... + sqrdmulh v14.4S, v12.4S, v1.S[1] // .......................................................................................................................e................................................................................................................................................................ + mls v27.4S, v18.4S, v29.4S // .......................................................e................................................................................................................................................................................................................................ 
+ mul v12.4S, v12.4S, v1.S[0] // ......................................................................................................................e................................................................................................................................................................. + add v18.4S, v22.4S, v23.4S // ...............................................................................................................................e........................................................................................................................................................ + sub v22.4S, v22.4S, v23.4S // ..............................................................................................................................e......................................................................................................................................................... + mls v20.4S, v17.4S, v29.4S // ....................................................................................................e................................................................................................................................................................................... + sub v23.4S, v10.4S, v18.4S // ..................................................................................................................................................e..................................................................................................................................... + add v10.4S, v10.4S, v18.4S // ...................................................................................................................................................e.................................................................................................................................... 
+ sub v17.4S, v21.4S, v27.4S // ...........................................................................................e............................................................................................................................................................................................ + add v27.4S, v21.4S, v27.4S // ............................................................................................e........................................................................................................................................................................................... + sqrdmulh v21.4S, v22.4S, v1.S[1] // .................................................................................................................................e...................................................................................................................................................... + mls v12.4S, v14.4S, v29.4S // ........................................................................................................................e............................................................................................................................................................... + sqrdmulh v14.4S, v17.4S, v3.S[1] // ..............................................................................................e......................................................................................................................................................................................... + mul v18.4S, v17.4S, v3.S[0] // .............................................................................................e.......................................................................................................................................................................................... 
+ add v17.4S, v19.4S, v27.4S // ..........................................................................................................................e............................................................................................................................................................. + sub v19.4S, v19.4S, v27.4S // .........................................................................................................................e.............................................................................................................................................................. + mul v27.4S, v22.4S, v1.S[0] // ................................................................................................................................e....................................................................................................................................................... + sub v22.4S, v20.4S, v12.4S // ............................................................................................................................................................e........................................................................................................................... + add v12.4S, v20.4S, v12.4S // .............................................................................................................................................................e.......................................................................................................................... + mls v18.4S, v14.4S, v29.4S // ...............................................................................................e........................................................................................................................................................................................ 
+ sub v14.4S, v9.4S, v17.4S // .............................................................................................................................................e.......................................................................................................................................... + add v9.4S, v9.4S, v17.4S // ..............................................................................................................................................e......................................................................................................................................... + sqrdmulh v17.4S, v19.4S, v1.S[1] // ............................................................................................................................e........................................................................................................................................................... + mul v20.4S, v19.4S, v1.S[0] // ...........................................................................................................................e............................................................................................................................................................ + mls v27.4S, v21.4S, v29.4S // ..................................................................................................................................e..................................................................................................................................................... + sub v21.4S, v15.4S, v18.4S // ...................................................................................................................................e.................................................................................................................................................... 
+ add v15.4S, v15.4S, v18.4S // ....................................................................................................................................e................................................................................................................................................... + mul v18.4S, v23.4S, v0.S[0] // ....................................................................................................................................................e................................................................................................................................... + sqrdmulh v23.4S, v23.4S, v0.S[1] // .....................................................................................................................................................e.................................................................................................................................. + sub v19.4S, v11.4S, v15.4S // .......................................................................................................................................................e................................................................................................................................ + add v11.4S, v11.4S, v15.4S // ........................................................................................................................................................e............................................................................................................................... + mls v20.4S, v17.4S, v29.4S // .............................................................................................................................e.......................................................................................................................................................... 
+ mul v15.4S, v24.4S, v0.S[2] // .................................................................................................................e...................................................................................................................................................................... + mul v17.4S, v14.4S, v0.S[0] // ...............................................................................................................................................e........................................................................................................................................ + sqrdmulh v14.4S, v14.4S, v0.S[1] // ................................................................................................................................................e....................................................................................................................................... + + // original source code + // ldr q8, [x1, #0] // ...e...................................................................................................................................................................|...................................................................................................................e.................................................................... + // ldr q9, [x1, #(1*(512/8))] // e......................................................................................................................................................................|................................................................................................................e....................................................................... 
+ // ldr q10, [x1, #(2*(512/8))] // .............................e.........................................................................................................................................|.............................................................................................................................................e.......................................... + // ldr q11, [x1, #(3*(512/8))] // ..............................e........................................................................................................................................|..............................................................................................................................................e......................................... + // ldr q12, [x1, #(4*(512/8))] // .....................e.................................................................................................................................................|.....................................................................................................................................e.................................................. + // ldr q13, [x1, #(5*(512/8))] // ..................e....................................................................................................................................................|..................................................................................................................................e..................................................... + // ldr q14, [x1, #(6*(512/8))] // .........................e.............................................................................................................................................|.........................................................................................................................................e.............................................. 
+ // ldr q15, [x1, #(7*(512/8))] // ............e..........................................................................................................................................................|............................................................................................................................e........................................................... + // ldr q16, [x1, #(8*(512/8))] // ...............................................e.......................................................................................................................|...............................................................................................................................................................e........................ + // ldr q17, [x1, #(9*(512/8))] // .......................................................e...............................................................................................................|.......................................................................................................................................................................e................ + // ldr q18, [x1, #(10*(512/8))] // ..........e............................................................................................................................................................|..........................................................................................................................e............................................................. + // ldr q19, [x1, #(11*(512/8))] // ..........................................e............................................................................................................................|..........................................................................................................................................................e............................. 
+ // ldr q20, [x1, #(12*(512/8))] // ..................................................e....................................................................................................................|..................................................................................................................................................................e..................... + // ldr q21, [x1, #(13*(512/8))] // ................................................................................e......................................................................................|........................................................................................................................................................................................ + // ldr q22, [x1, #(14*(512/8))] // ..........................................................................................................e............................................................|........................................................................................................................................................................................ + // ldr q23, [x1, #(15*(512/8))] // ............................................................................................................e..........................................................|........................................................................................................................................................................................ + // sub v24.4s, v8.4s, v9.4s // .................e.....................................................................................................................................................|.................................................................................................................................e...................................................... 
+ // add v8.4s, v8.4s, v9.4s // ...............e.......................................................................................................................................................|...............................................................................................................................e........................................................ + // mul v9.4s, v24.4s, v3.s[2] // ..........................e............................................................................................................................................|..........................................................................................................................................e............................................. + // sqrdmulh v24.4s, v24.4s, v3.s[3] // .......................e...............................................................................................................................................|.......................................................................................................................................e................................................ + // mls v9.4s, v24.4s, v29.4s // ...........................................e...........................................................................................................................|...........................................................................................................................................................e............................ + // sub v24.4s, v10.4s, v11.4s // ............................................e..........................................................................................................................|............................................................................................................................................................e........................... 
+ // add v10.4s, v10.4s, v11.4s // .......................................e...............................................................................................................................|.......................................................................................................................................................e................................ + // mul v11.4s, v24.4s, v4.s[0] // .................................................e.....................................................................................................................|.................................................................................................................................................................e...................... + // sqrdmulh v24.4s, v24.4s, v4.s[1] // ................................................e......................................................................................................................|................................................................................................................................................................e....................... + // mls v11.4s, v24.4s, v29.4s // ............................................................e..........................................................................................................|............................................................................................................................................................................e........... + // sub v24.4s, v12.4s, v13.4s // ...........................e...........................................................................................................................................|...........................................................................................................................................e............................................ 
+ // add v12.4s, v12.4s, v13.4s // .................................e.....................................................................................................................................|.................................................................................................................................................e...................................... + // mul v13.4s, v24.4s, v4.s[2] // ..................................e....................................................................................................................................|..................................................................................................................................................e..................................... + // sqrdmulh v24.4s, v24.4s, v4.s[3] // ...................................e...................................................................................................................................|...................................................................................................................................................e.................................... + // mls v13.4s, v24.4s, v29.4s // .............................................e.........................................................................................................................|.............................................................................................................................................................e.......................... + // sub v24.4s, v14.4s, v15.4s // ........................................e..............................................................................................................................|........................................................................................................................................................e............................... 
+ // add v14.4s, v14.4s, v15.4s // ............................e..........................................................................................................................................|............................................................................................................................................e........................................... + // mul v15.4s, v24.4s, v5.s[0] // ........................................................e..............................................................................................................|........................................................................................................................................................................e............... + // sqrdmulh v24.4s, v24.4s, v5.s[1] // ....................................................e..................................................................................................................|....................................................................................................................................................................e................... + // mls v15.4s, v24.4s, v29.4s // ................................................................e......................................................................................................|................................................................................................................................................................................e....... + // sub v24.4s, v16.4s, v17.4s // ..........................................................................................e............................................................................|........................................................................................................................................................................................ 
+ // add v16.4s, v16.4s, v17.4s // ....................................................................e..................................................................................................|....................................................................................................................................................................................e... + // mul v17.4s, v24.4s, v5.s[2] // ................................................................................................e......................................................................|........................................................................................................................................................................................ + // sqrdmulh v24.4s, v24.4s, v5.s[3] // ...................................................................................................e...................................................................|........................................................................................................................................................................................ + // mls v17.4s, v24.4s, v29.4s // .........................................................................................................e.............................................................|........................................................................................................................................................................................ + // sub v24.4s, v18.4s, v19.4s // ...........................................................e...........................................................................................................|...........................................................................................................................................................................e............ 
+ // add v18.4s, v18.4s, v19.4s // ..............................................................e........................................................................................................|..............................................................................................................................................................................e......... + // mul v19.4s, v24.4s, v6.s[0] // ...............................................................e.......................................................................................................|...............................................................................................................................................................................e........ + // sqrdmulh v24.4s, v24.4s, v6.s[1] // .........................................................................e.............................................................................................|........................................................................................................................................................................................ + // mls v19.4s, v24.4s, v29.4s // ..............................................................................e........................................................................................|........................................................................................................................................................................................ + // sub v24.4s, v20.4s, v21.4s // ....................................................................................................e..................................................................|........................................................................................................................................................................................ 
+ // add v20.4s, v20.4s, v21.4s // .................................................................................................e.....................................................................|........................................................................................................................................................................................ + // mul v21.4s, v24.4s, v6.s[2] // .............................................................................................................e.........................................................|........................................................................................................................................................................................ + // sqrdmulh v24.4s, v24.4s, v6.s[3] // ..............................................................................................................e........................................................|........................................................................................................................................................................................ + // mls v21.4s, v24.4s, v29.4s // ...............................................................................................................e.......................................................|........................................................................................................................................................................................ + // sub v24.4s, v22.4s, v23.4s // .....................................................................................................................e.................................................|........................................................................................................................................................................................ 
+ // add v22.4s, v22.4s, v23.4s // ..................................................................................................................e....................................................|........................................................................................................................................................................................ + // mul v23.4s, v24.4s, v7.s[0] // .............................................................................................................................e.........................................|........................................................................................................................................................................................ + // sqrdmulh v24.4s, v24.4s, v7.s[1] // ................................................................................................................................e......................................|........................................................................................................................................................................................ + // mls v23.4s, v24.4s, v29.4s // .....................................................................................................................................e.................................|........................................................................................................................................................................................ + // sub v24.4s, v8.4s, v10.4s // .....................................................e.................................................................................................................|.....................................................................................................................................................................e.................. 
+ // add v8.4s, v8.4s, v10.4s // ...................................................e...................................................................................................................|...................................................................................................................................................................e.................... + // mul v10.4s, v24.4s, v1.s[2] // .......................................................................................e...............................................................................|........................................................................................................................................................................................ + // sqrdmulh v24.4s, v24.4s, v1.s[3] // ........................................................................................e..............................................................................|........................................................................................................................................................................................ + // mls v10.4s, v24.4s, v29.4s // ...............................................................................................e.......................................................................|........................................................................................................................................................................................ + // sub v24.4s, v9.4s, v11.4s // ..................................................................e....................................................................................................|..................................................................................................................................................................................e..... 
+ // add v9.4s, v9.4s, v11.4s // ...................................................................e...................................................................................................|...................................................................................................................................................................................e.... + // mul v11.4s, v24.4s, v1.s[2] // ............................................................................e..........................................................................................|........................................................................................................................................................................................ + // sqrdmulh v24.4s, v24.4s, v1.s[3] // ...............................................................................e.......................................................................................|........................................................................................................................................................................................ + // mls v11.4s, v24.4s, v29.4s // .........................................................................................e.............................................................................|........................................................................................................................................................................................ + // sub v24.4s, v12.4s, v14.4s // ..............................................e........................................................................................................................|..............................................................................................................................................................e......................... 
+ // add v12.4s, v12.4s, v14.4s // ....................................e..................................................................................................................................|....................................................................................................................................................e................................... + // mul v14.4s, v24.4s, v2.s[0] // ......................................................................................e................................................................................|........................................................................................................................................................................................ + // sqrdmulh v24.4s, v24.4s, v2.s[1] // .....................................................................................e.................................................................................|........................................................................................................................................................................................ + // mls v14.4s, v24.4s, v29.4s // .............................................................................................e.........................................................................|........................................................................................................................................................................................ + // sub v24.4s, v13.4s, v15.4s // .....................................................................e.................................................................................................|.....................................................................................................................................................................................e.. 
+ // add v13.4s, v13.4s, v15.4s // ......................................................................e................................................................................................|......................................................................................................................................................................................e. + // mul v15.4s, v24.4s, v2.s[0] // .............................................................................e.........................................................................................|........................................................................................................................................................................................ + // sqrdmulh v24.4s, v24.4s, v2.s[1] // ...........................................................................................e...........................................................................|........................................................................................................................................................................................ + // mls v15.4s, v24.4s, v29.4s // ..................................................................................................e....................................................................|........................................................................................................................................................................................ + // sub v24.4s, v16.4s, v18.4s // ...................................................................................e...................................................................................|........................................................................................................................................................................................ 
+ // add v16.4s, v16.4s, v18.4s // ........................................................................e..............................................................................................|........................................................................................................................................................................................ + // mul v18.4s, v24.4s, v2.s[2] // ............................................................................................e..........................................................................|........................................................................................................................................................................................ + // sqrdmulh v24.4s, v24.4s, v2.s[3] // ..............................................................................................e........................................................................|........................................................................................................................................................................................ + // mls v18.4s, v24.4s, v29.4s // .....................................................................................................e.................................................................|........................................................................................................................................................................................ + // sub v24.4s, v17.4s, v19.4s // ................................................................................................................e......................................................|........................................................................................................................................................................................ 
+ // add v17.4s, v17.4s, v19.4s // .................................................................................................................e.....................................................|........................................................................................................................................................................................ + // mul v19.4s, v24.4s, v2.s[2] // ....................................................................................................................e..................................................|........................................................................................................................................................................................ + // sqrdmulh v24.4s, v24.4s, v2.s[3] // ...................................................................................................................e...................................................|........................................................................................................................................................................................ + // mls v19.4s, v24.4s, v29.4s // ..........................................................................................................................e............................................|........................................................................................................................................................................................ + // sub v24.4s, v20.4s, v22.4s // ........................................................................................................................e..............................................|........................................................................................................................................................................................ 
+ // add v20.4s, v20.4s, v22.4s // .........................................................................................................................e.............................................|........................................................................................................................................................................................ + // mul v22.4s, v24.4s, v3.s[0] // ...........................................................................................................................e...........................................|........................................................................................................................................................................................ + // sqrdmulh v24.4s, v24.4s, v3.s[1] // ............................................................................................................................e..........................................|........................................................................................................................................................................................ + // mls v22.4s, v24.4s, v29.4s // .................................................................................................................................e.....................................|........................................................................................................................................................................................ + // sub v24.4s, v21.4s, v23.4s // ............................................................................................................................................e..........................|........................................................................................................................................................................................ 
+ // add v21.4s, v21.4s, v23.4s // .............................................................................................................................................e.........................|........................................................................................................................................................................................ + // mul v23.4s, v24.4s, v3.s[0] // .................................................................................................................................................e.....................|........................................................................................................................................................................................ + // sqrdmulh v24.4s, v24.4s, v3.s[1] // ................................................................................................................................................e......................|........................................................................................................................................................................................ + // mls v23.4s, v24.4s, v29.4s // .......................................................................................................................................................e...............|........................................................................................................................................................................................ + // sub v24.4s, v8.4s, v12.4s // ......................................................................................................................e................................................|........................................................................................................................................................................................ 
+ // add v8.4s, v8.4s, v12.4s // .......................................................................................................................e...............................................|........................................................................................................................................................................................ + // mul v12.4s, v24.4s, v0.s[2] // ...................................................................................................................................e...................................|........................................................................................................................................................................................ + // sqrdmulh v24.4s, v24.4s, v0.s[3] // ..................................................................................................................................e....................................|........................................................................................................................................................................................ + // mls v12.4s, v24.4s, v29.4s // .........................................................................................................................................e.............................|........................................................................................................................................................................................ + // sub v24.4s, v9.4s, v13.4s // ..........................................................................e............................................................................................|........................................................................................................................................................................................ 
+ // add v9.4s, v9.4s, v13.4s // ...........................................................................e...........................................................................................|........................................................................................................................................................................................ + // mul v13.4s, v24.4s, v0.s[2] // ..................................................................................e....................................................................................|........................................................................................................................................................................................ + // sqrdmulh v24.4s, v24.4s, v0.s[3] // .................................................................................e.....................................................................................|........................................................................................................................................................................................ + // mls v13.4s, v24.4s, v29.4s // ....................................................................................e..................................................................................|........................................................................................................................................................................................ + // sub v24.4s, v10.4s, v14.4s // ........................................................................................................e..............................................................|........................................................................................................................................................................................ 
+ // add v10.4s, v10.4s, v14.4s // ......................................................................................................e................................................................|........................................................................................................................................................................................ + // mul v14.4s, v24.4s, v0.s[2] // .......................................................................................................................................................................|....*................................................................................................................................................................................... + // sqrdmulh v24.4s, v24.4s, v0.s[3] // .......................................................................................................................................................................|.....*.................................................................................................................................................................................. + // mls v14.4s, v24.4s, v29.4s // .......................................................................................................................................................................|............*........................................................................................................................................................................... + // sub v24.4s, v11.4s, v15.4s // ...........................................................................................................e...........................................................|........................................................................................................................................................................................ 
+ // add v11.4s, v11.4s, v15.4s // .......................................................................................................e...............................................................|........................................................................................................................................................................................ + // mul v15.4s, v24.4s, v0.s[2] // ....................................................................................................................................................................e..|........................................................................................................................................................................................ + // sqrdmulh v24.4s, v24.4s, v0.s[3] // .......................................................................................................................................................................*........................................................................................................................................................................................ + // mls v15.4s, v24.4s, v29.4s // .......................................................................................................................................................................|......*................................................................................................................................................................................. + // sub v24.4s, v16.4s, v20.4s // ...............................................................................................................................e.......................................|........................................................................................................................................................................................ 
+ // add v16.4s, v16.4s, v20.4s // ..............................................................................................................................e........................................|........................................................................................................................................................................................ + // mul v20.4s, v24.4s, v1.s[0] // ......................................................................................................................................e................................|........................................................................................................................................................................................ + // sqrdmulh v24.4s, v24.4s, v1.s[1] // ....................................................................................................................................e..................................|........................................................................................................................................................................................ + // mls v20.4s, v24.4s, v29.4s // ...............................................................................................................................................e.......................|........................................................................................................................................................................................ + // sub v24.4s, v17.4s, v21.4s // ...................................................................................................................................................e...................|........................................................................................................................................................................................ 
+ // add v17.4s, v17.4s, v21.4s // ..................................................................................................................................................e....................|........................................................................................................................................................................................ + // mul v21.4s, v24.4s, v1.s[0] // ...........................................................................................................................................................e...........|........................................................................................................................................................................................ + // sqrdmulh v24.4s, v24.4s, v1.s[1] // ..........................................................................................................................................................e............|........................................................................................................................................................................................ + // mls v21.4s, v24.4s, v29.4s // ...................................................................................................................................................................e...|........................................................................................................................................................................................ + // sub v24.4s, v18.4s, v22.4s // ........................................................................................................................................e..............................|........................................................................................................................................................................................ 
+ // add v18.4s, v18.4s, v22.4s // .......................................................................................................................................e...............................|........................................................................................................................................................................................ + // mul v22.4s, v24.4s, v1.s[0] // ....................................................................................................................................................e..................|........................................................................................................................................................................................ + // sqrdmulh v24.4s, v24.4s, v1.s[1] // ..............................................................................................................................................e........................|........................................................................................................................................................................................ + // mls v22.4s, v24.4s, v29.4s // ............................................................................................................................................................e..........|........................................................................................................................................................................................ + // sub v24.4s, v19.4s, v23.4s // .............................................................................................................................................................e.........|........................................................................................................................................................................................ 
+ // add v19.4s, v19.4s, v23.4s // ..............................................................................................................................................................e........|........................................................................................................................................................................................ + // mul v23.4s, v24.4s, v1.s[0] // .......................................................................................................................................................................|........*............................................................................................................................................................................... + // sqrdmulh v24.4s, v24.4s, v1.s[1] // .......................................................................................................................................................................|.........*.............................................................................................................................................................................. + // mls v23.4s, v24.4s, v29.4s // .......................................................................................................................................................................|...............*........................................................................................................................................................................ + // sub v24.4s, v8.4s, v16.4s // .......................................................................................................................................................................|..........*............................................................................................................................................................................. 
+ // add v8.4s, v8.4s, v16.4s // .......................................................................................................................................................................|...........*............................................................................................................................................................................ + // mul v16.4s, v24.4s, v0.s[0] // .......................................................................................................................................................................|..................*..................................................................................................................................................................... + // sqrdmulh v24.4s, v24.4s, v0.s[1] // .......................................................................................................................................................................|................*....................................................................................................................................................................... + // mls v16.4s, v24.4s, v29.4s // .......................................................................................................................................................................|........................*............................................................................................................................................................... + // sub v24.4s, v9.4s, v17.4s // ........................................................................................................................................................e..............|........................................................................................................................................................................................ 
+ // add v9.4s, v9.4s, v17.4s // .........................................................................................................................................................e.............|........................................................................................................................................................................................ + // mul v17.4s, v24.4s, v0.s[0] // .....................................................................................................................................................................e.|........................................................................................................................................................................................ + // sqrdmulh v24.4s, v24.4s, v0.s[1] // ......................................................................................................................................................................e|........................................................................................................................................................................................ + // mls v17.4s, v24.4s, v29.4s // .......................................................................................................................................................................|...*.................................................................................................................................................................................... + // sub v24.4s, v10.4s, v18.4s // ..........................................................................................................................................e............................|........................................................................................................................................................................................ 
+ // add v10.4s, v10.4s, v18.4s // ...........................................................................................................................................e...........................|........................................................................................................................................................................................ + // mul v18.4s, v24.4s, v0.s[0] // ...............................................................................................................................................................e.......|........................................................................................................................................................................................ + // sqrdmulh v24.4s, v24.4s, v0.s[1] // ................................................................................................................................................................e......|........................................................................................................................................................................................ + // mls v18.4s, v24.4s, v29.4s // .......................................................................................................................................................................|*....................................................................................................................................................................................... + // sub v24.4s, v11.4s, v19.4s // .................................................................................................................................................................e.....|........................................................................................................................................................................................ 
+ // add v11.4s, v11.4s, v19.4s // ..................................................................................................................................................................e....|........................................................................................................................................................................................ + // mul v19.4s, v24.4s, v0.s[0] // .......................................................................................................................................................................|...............................................................*........................................................................................................................ + // sqrdmulh v24.4s, v24.4s, v0.s[1] // .......................................................................................................................................................................|...................................................................*.................................................................................................................... + // mls v19.4s, v24.4s, v29.4s // .......................................................................................................................................................................|........................................................................*............................................................................................................... + // sub v24.4s, v12.4s, v20.4s // .....................................................................................................................................................e.................|........................................................................................................................................................................................ 
+ // add v12.4s, v12.4s, v20.4s // ......................................................................................................................................................e................|........................................................................................................................................................................................ + // mul v20.4s, v24.4s, v0.s[0] // .......................................................................................................................................................................|..*..................................................................................................................................................................................... + // sqrdmulh v24.4s, v24.4s, v0.s[1] // .......................................................................................................................................................................|.*...................................................................................................................................................................................... + // mls v20.4s, v24.4s, v29.4s // .......................................................................................................................................................................|.......*................................................................................................................................................................................ + // sub v24.4s, v13.4s, v21.4s // .......................................................................................................................................................................|.................................*...................................................................................................................................................... 
+ // add v13.4s, v13.4s, v21.4s // .......................................................................................................................................................................|..................................*..................................................................................................................................................... + // mul v21.4s, v24.4s, v0.s[0] // .......................................................................................................................................................................|......................................*................................................................................................................................................. + // sqrdmulh v24.4s, v24.4s, v0.s[1] // .......................................................................................................................................................................|...........................................*............................................................................................................................................ + // mls v21.4s, v24.4s, v29.4s // .......................................................................................................................................................................|..................................................*..................................................................................................................................... + // sub v24.4s, v14.4s, v22.4s // .......................................................................................................................................................................|......................*................................................................................................................................................................. 
+ // add v14.4s, v14.4s, v22.4s // .......................................................................................................................................................................|...................*.................................................................................................................................................................... + // mul v22.4s, v24.4s, v0.s[0] // .......................................................................................................................................................................|.............................................*.......................................................................................................................................... + // sqrdmulh v24.4s, v24.4s, v0.s[1] // .......................................................................................................................................................................|............................................*........................................................................................................................................... + // mls v22.4s, v24.4s, v29.4s // .......................................................................................................................................................................|...................................................*.................................................................................................................................... + // sub v24.4s, v15.4s, v23.4s // .......................................................................................................................................................................|..............................*......................................................................................................................................................... 
+ // add v15.4s, v15.4s, v23.4s // .......................................................................................................................................................................|.............................*.......................................................................................................................................................... + // mul v23.4s, v24.4s, v0.s[0] // .......................................................................................................................................................................|........................................................*............................................................................................................................... + // sqrdmulh v24.4s, v24.4s, v0.s[1] // .......................................................................................................................................................................|.......................................................*................................................................................................................................ + // mls v23.4s, v24.4s, v29.4s // .......................................................................................................................................................................|..............................................................*......................................................................................................................... + // cmge v27.4s, v31.4s, v16.4s // .......................................................................................................................................................................|....................................*................................................................................................................................................... 
+ // cmge v28.4s, v16.4s, v30.4s // .......................................................................................................................................................................|...............................................*........................................................................................................................................ + // sub v28.4s, v27.4s, v28.4s // .......................................................................................................................................................................|....................................................*................................................................................................................................... + // mls v16.4s, v28.4s, v29.4s // .......................................................................................................................................................................|...........................................................*............................................................................................................................ + // cmge v27.4s, v31.4s, v17.4s // .......................................................................................................................................................................|.......................*................................................................................................................................................................ + // cmge v28.4s, v17.4s, v30.4s // .......................................................................................................................................................................|.....................*.................................................................................................................................................................. 
+ // sub v28.4s, v27.4s, v28.4s // .......................................................................................................................................................................|...........................*............................................................................................................................................................ + // mls v17.4s, v28.4s, v29.4s // .......................................................................................................................................................................|...............................*........................................................................................................................................................ + // cmge v27.4s, v31.4s, v18.4s // .......................................................................................................................................................................|.........................*.............................................................................................................................................................. + // cmge v28.4s, v18.4s, v30.4s // .......................................................................................................................................................................|............................*........................................................................................................................................................... + // sub v28.4s, v27.4s, v28.4s // .......................................................................................................................................................................|................................*....................................................................................................................................................... 
+ // mls v18.4s, v28.4s, v29.4s // .......................................................................................................................................................................|..........................................*............................................................................................................................................. + // cmge v27.4s, v31.4s, v19.4s // .......................................................................................................................................................................|..................................................................................................*..................................................................................... + // cmge v28.4s, v19.4s, v30.4s // .......................................................................................................................................................................|...........................................................................................................*............................................................................ + // sub v28.4s, v27.4s, v28.4s // .........*.............................................................................................................................................................|.........................................................................................................................*.............................................................. + // mls v19.4s, v28.4s, v29.4s // ..............*........................................................................................................................................................|..............................................................................................................................*......................................................... 
+ // cmge v27.4s, v31.4s, v20.4s // .......................................................................................................................................................................|.............*.......................................................................................................................................................................... + // cmge v28.4s, v20.4s, v30.4s // .......................................................................................................................................................................|..............*......................................................................................................................................................................... + // sub v28.4s, v27.4s, v28.4s // .......................................................................................................................................................................|.................*...................................................................................................................................................................... + // mls v20.4s, v28.4s, v29.4s // .......................................................................................................................................................................|....................*................................................................................................................................................................... + // cmge v27.4s, v31.4s, v21.4s // .......................................................................................................................................................................|.................................................................................*...................................................................................................... 
+ // cmge v28.4s, v21.4s, v30.4s // .......................................................................................................................................................................|...............................................................................*........................................................................................................ + // sub v28.4s, v27.4s, v28.4s // .......................................................................................................................................................................|...................................................................................*.................................................................................................... + // mls v21.4s, v28.4s, v29.4s // .......................................................................................................................................................................|.........................................................................................*.............................................................................................. + // cmge v27.4s, v31.4s, v22.4s // .......................................................................................................................................................................|..........................................................*............................................................................................................................. + // cmge v28.4s, v22.4s, v30.4s // .......................................................................................................................................................................|.........................................................*.............................................................................................................................. 
+ // sub v28.4s, v27.4s, v28.4s // .......................................................................................................................................................................|.............................................................*.......................................................................................................................... + // mls v22.4s, v28.4s, v29.4s // .......................................................................................................................................................................|.....................................................................*.................................................................................................................. + // cmge v27.4s, v31.4s, v23.4s // .......................................................................................................................................................................|......................................................................................................*................................................................................. + // cmge v28.4s, v23.4s, v30.4s // .......................................................................................................................................................................|.....................................................................................*.................................................................................................. + // sub v28.4s, v27.4s, v28.4s // .......................................................................................................................................................................|..............................................................................................................*......................................................................... 
+ // mls v23.4s, v28.4s, v29.4s // ....................*..................................................................................................................................................|....................................................................................................................................*................................................... + // str q16, [x1, #(8*(512/8))] // .......................................................................................................................................................................|.................................................................*...................................................................................................................... + // str q17, [x1, #(9*(512/8))] // .......................................................................................................................................................................|.....................................*.................................................................................................................................................. + // str q18, [x1, #(10*(512/8))] // .......................................................................................................................................................................|................................................*....................................................................................................................................... + // str q19, [x1, #(11*(512/8))] // ........................*..............................................................................................................................................|........................................................................................................................................*............................................... 
+ // str q20, [x1, #(12*(512/8))] // .......................................................................................................................................................................|..........................*............................................................................................................................................................. + // str q21, [x1, #(13*(512/8))] // .......................................................................................................................................................................|..............................................................................................*......................................................................................... + // str q22, [x1, #(14*(512/8))] // .......................................................................................................................................................................|....................................................................................*................................................................................................... + // str q23, [x1, #(15*(512/8))] // ......................................*................................................................................................................................|......................................................................................................................................................*................................. + // mul v16.4s, v8.4s, v25.4s // .......................................................................................................................................................................|...........................................................................*............................................................................................................ 
+ // sqrdmulh v8.4s, v8.4s, v26.4s // .......................................................................................................................................................................|.......................................................................................*................................................................................................ + // mls v16.4s, v8.4s, v29.4s // .......................................................................................................................................................................|.............................................................................................*.......................................................................................... + // mul v17.4s, v9.4s, v25.4s // .......................................................................................................................................................................|.........................................................................*.............................................................................................................. + // sqrdmulh v9.4s, v9.4s, v26.4s // .......................................................................................................................................................................|..................................................................*..................................................................................................................... + // mls v17.4s, v9.4s, v29.4s // .......................................................................................................................................................................|..............................................................................*......................................................................................................... 
+ // mul v18.4s, v10.4s, v25.4s // .......................................................................................................................................................................|................................................................................................*....................................................................................... + // sqrdmulh v10.4s, v10.4s, v26.4s // ........*..............................................................................................................................................................|........................................................................................................................*............................................................... + // mls v18.4s, v10.4s, v29.4s // ................*......................................................................................................................................................|................................................................................................................................*....................................................... + // mul v19.4s, v11.4s, v25.4s // .......................................................................................................................................................................|.......................................................................*................................................................................................................ + // sqrdmulh v11.4s, v11.4s, v26.4s // .......................................................................................................................................................................|....................................................................*................................................................................................................... 
+ // mls v19.4s, v11.4s, v29.4s // .......................................................................................................................................................................|.............................................................................*.......................................................................................................... + // mul v20.4s, v12.4s, v25.4s // .......................................................................................................................................................................|...................................*.................................................................................................................................................... + // sqrdmulh v12.4s, v12.4s, v26.4s // .......................................................................................................................................................................|.......................................*................................................................................................................................................ + // mls v20.4s, v12.4s, v29.4s // .......................................................................................................................................................................|.................................................*...................................................................................................................................... + // mul v21.4s, v13.4s, v25.4s // .......................................................................................................................................................................|........................................*............................................................................................................................................... 
+ // sqrdmulh v13.4s, v13.4s, v26.4s // .......................................................................................................................................................................|.........................................*.............................................................................................................................................. + // mls v21.4s, v13.4s, v29.4s // .......................................................................................................................................................................|..............................................*......................................................................................................................................... + // mul v22.4s, v14.4s, v25.4s // .......................................................................................................................................................................|.....................................................*.................................................................................................................................. + // sqrdmulh v14.4s, v14.4s, v26.4s // .......................................................................................................................................................................|......................................................*................................................................................................................................. + // mls v22.4s, v14.4s, v29.4s // .......................................................................................................................................................................|...............................................................................................*........................................................................................ 
+ // mul v23.4s, v15.4s, v25.4s // .......................................................................................................................................................................|............................................................*........................................................................................................................... + // sqrdmulh v15.4s, v15.4s, v26.4s // .......................................................................................................................................................................|................................................................*....................................................................................................................... + // mls v23.4s, v15.4s, v29.4s // .......................................................................................................................................................................|......................................................................*................................................................................................................. + // cmge v27.4s, v31.4s, v16.4s // .......................................................................................................................................................................|.....................................................................................................*.................................................................................. + // cmge v28.4s, v16.4s, v30.4s // .......................................................................................................................................................................|....................................................................................................*................................................................................... 
+ // sub v28.4s, v27.4s, v28.4s // .......................................................................................................................................................................|........................................................................................................*............................................................................... + // mls v16.4s, v28.4s, v29.4s // .......................................................................................................................................................................|............................................................................................................*........................................................................... + // cmge v27.4s, v31.4s, v17.4s // .......................................................................................................................................................................|.........................................................................................................*.............................................................................. + // cmge v28.4s, v17.4s, v30.4s // .......................................................................................................................................................................|...........................................................................................*............................................................................................ + // sub v28.4s, v27.4s, v28.4s // .......................................................................................................................................................................|.............................................................................................................*.......................................................................... 
+ // mls v17.4s, v28.4s, v29.4s // .....................................*.................................................................................................................................|.....................................................................................................................................................*.................................. + // cmge v27.4s, v31.4s, v18.4s // ..........................................................*............................................................................................................|..........................................................................................................................................................................*............. + // cmge v28.4s, v18.4s, v30.4s // ......................................................*................................................................................................................|......................................................................................................................................................................*................. + // sub v28.4s, v27.4s, v28.4s // .............................................................*.........................................................................................................|.............................................................................................................................................................................*.......... + // mls v18.4s, v28.4s, v29.4s // .................................................................*.....................................................................................................|.................................................................................................................................................................................*...... 
+ // cmge v27.4s, v31.4s, v19.4s // .......................................................................................................................................................................|..................................................................................*..................................................................................................... + // cmge v28.4s, v19.4s, v30.4s // .......................................................................................................................................................................|...................................................................................................*.................................................................................... + // sub v28.4s, v27.4s, v28.4s // .......................................................................................................................................................................|.......................................................................................................*................................................................................ + // mls v19.4s, v28.4s, v29.4s // .......................................................................................................................................................................|..........................................................................................................*............................................................................. + // cmge v27.4s, v31.4s, v20.4s // .......................................................................................................................................................................|...............................................................................................................*........................................................................ 
+ // cmge v28.4s, v20.4s, v30.4s // ..*....................................................................................................................................................................|..................................................................................................................*..................................................................... + // sub v28.4s, v27.4s, v28.4s // .......*...............................................................................................................................................................|.......................................................................................................................*................................................................ + // mls v20.4s, v28.4s, v29.4s // .............*.........................................................................................................................................................|.............................................................................................................................*.......................................................... + // cmge v27.4s, v31.4s, v21.4s // .......................................................................................................................................................................|..........................................................................*............................................................................................................. + // cmge v28.4s, v21.4s, v30.4s // .......................................................................................................................................................................|............................................................................*........................................................................................................... 
+ // sub v28.4s, v27.4s, v28.4s // .......................................................................................................................................................................|................................................................................*....................................................................................................... + // mls v21.4s, v28.4s, v29.4s // .......................................................................................................................................................................|..........................................................................................*............................................................................................. + // cmge v27.4s, v31.4s, v22.4s // ....*..................................................................................................................................................................|....................................................................................................................*................................................................... + // cmge v28.4s, v22.4s, v30.4s // ......*................................................................................................................................................................|......................................................................................................................*................................................................. + // sub v28.4s, v27.4s, v28.4s // ......................*................................................................................................................................................|......................................................................................................................................*................................................. 
+ // mls v22.4s, v28.4s, v29.4s // ...............................*.......................................................................................................................................|...............................................................................................................................................*........................................ + // cmge v27.4s, v31.4s, v23.4s // .......................................................................................................................................................................|........................................................................................*............................................................................................... + // cmge v28.4s, v23.4s, v30.4s // .......................................................................................................................................................................|......................................................................................*................................................................................................. + // sub v28.4s, v27.4s, v28.4s // .......................................................................................................................................................................|............................................................................................*........................................................................................... + // mls v23.4s, v28.4s, v29.4s // ...........*...........................................................................................................................................................|...........................................................................................................................*............................................................ 
+ // str q16, [x1], #(16) // .*.....................................................................................................................................................................|.................................................................................................................*...................................................................... + // str q17, [x1, #(-16 + 1*(512/8))] // .........................................................*.............................................................................................................|.........................................................................................................................................................................*.............. + // str q18, [x1, #(-16 + 2*(512/8))] // .......................................................................*...............................................................................................|.......................................................................................................................................................................................* + // str q19, [x1, #(-16 + 3*(512/8))] // .....*.................................................................................................................................................................|.....................................................................................................................*.................................................................. + // str q20, [x1, #(-16 + 4*(512/8))] // ...................*...................................................................................................................................................|...................................................................................................................................*.................................................... 
+ // str q21, [x1, #(-16 + 5*(512/8))] // .......................................................................................................................................................................|.................................................................................................*...................................................................................... + // str q22, [x1, #(-16 + 6*(512/8))] // .........................................*.............................................................................................................................|.........................................................................................................................................................*.............................. + // str q23, [x1, #(-16 + 7*(512/8))] // ................................*......................................................................................................................................|................................................................................................................................................*....................................... + + sub count, count, #1 + cbnz count, layer1234_start + mls v17.4S, v14.4S, v29.4S // .................................................................................................................................................*...................................................................................................................................... + sqrdmulh v14.4S, v24.4S, v0.S[3] // ..................................................................................................................*..................................................................................................................................................................... 
+ mul v24.4S, v22.4S, v0.S[0] // ..............................................................................................................................................................*......................................................................................................................... + sqrdmulh v22.4S, v22.4S, v0.S[1] // ...............................................................................................................................................................*........................................................................................................................ + mls v18.4S, v23.4S, v29.4S // ......................................................................................................................................................*................................................................................................................................. + sub v23.4S, v13.4S, v20.4S // .................................................................................................................................................................*...................................................................................................................... + add v13.4S, v13.4S, v20.4S // ..................................................................................................................................................................*..................................................................................................................... + mls v15.4S, v14.4S, v29.4S // ...................................................................................................................*.................................................................................................................................................................... 
+ sqrdmulh v14.4S, v21.4S, v1.S[1] // ......................................................................................................................................*................................................................................................................................................. + mls v24.4S, v22.4S, v29.4S // ................................................................................................................................................................*....................................................................................................................... + mul v20.4S, v21.4S, v1.S[0] // .....................................................................................................................................*.................................................................................................................................................. + mul v21.4S, v23.4S, v0.S[0] // ...................................................................................................................................................................*.................................................................................................................... + sqrdmulh v22.4S, v23.4S, v0.S[1] // ....................................................................................................................................................................*................................................................................................................... + sqrdmulh v23.4S, v28.4S, v0.S[3] // .............................................................................................................*.......................................................................................................................................................................... 
+ mul v28.4S, v28.4S, v0.S[2] // ............................................................................................................*........................................................................................................................................................................... + mls v20.4S, v14.4S, v29.4S // .......................................................................................................................................*................................................................................................................................................ + cmge v14.4S, v17.4S, v30.4S // .....................................................................................................................................................................................*.................................................................................................. + mls v21.4S, v22.4S, v29.4S // .....................................................................................................................................................................*.................................................................................................................. + cmge v22.4S, v31.4S, v17.4S // ....................................................................................................................................................................................*................................................................................................... + mls v28.4S, v23.4S, v29.4S // ..............................................................................................................*......................................................................................................................................................................... 
+ cmge v23.4S, v31.4S, v24.4S // ................................................................................................................................................................................................*....................................................................................... + sub v22.4S, v22.4S, v14.4S // ......................................................................................................................................................................................*................................................................................................. + sub v14.4S, v8.4S, v16.4S // ........................................................................................................................................*............................................................................................................................................... + add v8.4S, v8.4S, v16.4S // .........................................................................................................................................*.............................................................................................................................................. + cmge v16.4S, v24.4S, v30.4S // .................................................................................................................................................................................................*...................................................................................... + mls v17.4S, v22.4S, v29.4S // .......................................................................................................................................................................................*................................................................................................ 
+ add v22.4S, v28.4S, v27.4S // .......................................................................................................................................................................*................................................................................................................ + sub v23.4S, v23.4S, v16.4S // ..................................................................................................................................................................................................*..................................................................................... + str q17, [x1, #576] // .................................................................................................................................................................................................................*...................................................................... + mul v16.4S, v14.4S, v0.S[0] // ..........................................................................................................................................*............................................................................................................................................. + sub v28.4S, v28.4S, v27.4S // ......................................................................................................................................................................*................................................................................................................. + mls v24.4S, v23.4S, v29.4S // ...................................................................................................................................................................................................*.................................................................................... 
+ sqrdmulh v23.4S, v22.4S, v26.4S // ...........................................................................................................................................................................................................................................*............................................ + mul v22.4S, v22.4S, v25.4S // ..........................................................................................................................................................................................................................................*............................................. + sqrdmulh v17.4S, v14.4S, v0.S[1] // ...........................................................................................................................................*............................................................................................................................................ + cmge v14.4S, v31.4S, v18.4S // ........................................................................................................................................................................................*............................................................................................... + sub v27.4S, v15.4S, v20.4S // ...........................................................................................................................................................................*............................................................................................................ + str q24, [x1, #768] // ....................................................................................................................................................................................................................*................................................................... 
+ cmge v24.4S, v18.4S, v30.4S // .........................................................................................................................................................................................*.............................................................................................. + add v15.4S, v15.4S, v20.4S // ............................................................................................................................................................................*........................................................................................................... + sqrdmulh v20.4S, v28.4S, v0.S[1] // .........................................................................................................................................................................*.............................................................................................................. + mls v16.4S, v17.4S, v29.4S // ............................................................................................................................................*........................................................................................................................................... + mls v22.4S, v23.4S, v29.4S // ............................................................................................................................................................................................................................................*........................................... + sub v14.4S, v14.4S, v24.4S // ..........................................................................................................................................................................................*............................................................................................. 
+ mul v17.4S, v28.4S, v0.S[0] // ........................................................................................................................................................................*............................................................................................................... + sqrdmulh v24.4S, v27.4S, v0.S[1] // ..............................................................................................................................................................................*......................................................................................................... + mls v18.4S, v14.4S, v29.4S // ...........................................................................................................................................................................................*............................................................................................ + cmge v23.4S, v16.4S, v30.4S // .................................................................................................................................................................................*...................................................................................................... + cmge v28.4S, v31.4S, v16.4S // ................................................................................................................................................................................*....................................................................................................... + cmge v14.4S, v21.4S, v30.4S // .....................................................................................................................................................................................................*.................................................................................. 
+ mls v17.4S, v20.4S, v29.4S // ..........................................................................................................................................................................*............................................................................................................. + cmge v20.4S, v31.4S, v21.4S // ....................................................................................................................................................................................................*................................................................................... + sub v28.4S, v28.4S, v23.4S // ..................................................................................................................................................................................*..................................................................................................... + mul v23.4S, v27.4S, v0.S[0] // .............................................................................................................................................................................*.......................................................................................................... + sqrdmulh v27.4S, v19.4S, v0.S[1] // ..........................................................................................................................................................*............................................................................................................................. + mul v19.4S, v19.4S, v0.S[0] // .........................................................................................................................................................*.............................................................................................................................. 
+ sub v14.4S, v20.4S, v14.4S // ......................................................................................................................................................................................................*................................................................................. + str q18, [x1, #640] // ..................................................................................................................................................................................................................*..................................................................... + mls v16.4S, v28.4S, v29.4S // ...................................................................................................................................................................................*.................................................................................................... + mul v28.4S, v13.4S, v25.4S // .......................................................................................................................................................................................................................................*................................................ + sqrdmulh v18.4S, v13.4S, v26.4S // ........................................................................................................................................................................................................................................*............................................... + sqrdmulh v13.4S, v11.4S, v26.4S // ..................................................................................................................................................................................................................................*..................................................... 
+ mls v23.4S, v24.4S, v29.4S // ...............................................................................................................................................................................*........................................................................................................ + str q16, [x1, #512] // ................................................................................................................................................................................................................*....................................................................... + mls v19.4S, v27.4S, v29.4S // ...........................................................................................................................................................*............................................................................................................................ + mls v21.4S, v14.4S, v29.4S // .......................................................................................................................................................................................................*................................................................................ + sqrdmulh v16.4S, v9.4S, v26.4S // ............................................................................................................................................................................................................................*........................................................... + mls v28.4S, v18.4S, v29.4S // .........................................................................................................................................................................................................................................*.............................................. 
+ cmge v24.4S, v23.4S, v30.4S // .............................................................................................................................................................................................................*.......................................................................... + cmge v20.4S, v31.4S, v23.4S // ............................................................................................................................................................................................................*........................................................................... + cmge v14.4S, v31.4S, v19.4S // ............................................................................................................................................................................................*........................................................................................... + cmge v18.4S, v19.4S, v30.4S // .............................................................................................................................................................................................*.......................................................................................... + str q21, [x1, #832] // .....................................................................................................................................................................................................................*.................................................................. + cmge v21.4S, v31.4S, v28.4S // ....................................................................................................................................................................................................................................................................*................... 
+ cmge v27.4S, v28.4S, v30.4S // .....................................................................................................................................................................................................................................................................*.................. + sub v14.4S, v14.4S, v18.4S // ..............................................................................................................................................................................................*......................................................................................... + sub v18.4S, v20.4S, v24.4S // ..............................................................................................................................................................................................................*......................................................................... + sub v27.4S, v21.4S, v27.4S // ......................................................................................................................................................................................................................................................................*................. + cmge v20.4S, v31.4S, v17.4S // ........................................................................................................................................................................................................*............................................................................... + mls v23.4S, v18.4S, v29.4S // ...............................................................................................................................................................................................................*........................................................................ 
+ cmge v21.4S, v17.4S, v30.4S // .........................................................................................................................................................................................................*.............................................................................. + mls v28.4S, v27.4S, v29.4S // .......................................................................................................................................................................................................................................................................*................ + mls v19.4S, v14.4S, v29.4S // ...............................................................................................................................................................................................*........................................................................................ + sub v20.4S, v20.4S, v21.4S // ..........................................................................................................................................................................................................*............................................................................. + sqrdmulh v27.4S, v12.4S, v26.4S // .....................................................................................................................................................................................................................................*.................................................. + mls v17.4S, v20.4S, v29.4S // ...........................................................................................................................................................................................................*............................................................................ 
+ str q23, [x1, #960] // .......................................................................................................................................................................................................................*................................................................ + str q19, [x1, #704] // ...................................................................................................................................................................................................................*.................................................................... + mul v20.4S, v12.4S, v25.4S // ....................................................................................................................................................................................................................................*................................................... + sqrdmulh v14.4S, v10.4S, v26.4S // ...............................................................................................................................................................................................................................*........................................................ + mul v12.4S, v15.4S, v25.4S // .............................................................................................................................................................................................................................................*.......................................... + str q28, [x1, #320] // .....................................................................................................................................................................................................................................................................................*.. 
+ mul v18.4S, v10.4S, v25.4S // ..............................................................................................................................................................................................................................*......................................................... + str q17, [x1, #896] // ......................................................................................................................................................................................................................*................................................................. + mul v17.4S, v9.4S, v25.4S // ...........................................................................................................................................................................................................................*............................................................ + sqrdmulh v21.4S, v15.4S, v26.4S // ..............................................................................................................................................................................................................................................*......................................... + mul v15.4S, v11.4S, v25.4S // .................................................................................................................................................................................................................................*...................................................... + mls v20.4S, v27.4S, v29.4S // ......................................................................................................................................................................................................................................*................................................. 
+ mls v18.4S, v14.4S, v29.4S // ................................................................................................................................................................................................................................*....................................................... + sqrdmulh v27.4S, v8.4S, v26.4S // .........................................................................................................................................................................................................................*.............................................................. + mls v17.4S, v16.4S, v29.4S // .............................................................................................................................................................................................................................*.......................................................... + mul v16.4S, v8.4S, v25.4S // ........................................................................................................................................................................................................................*............................................................... + cmge v23.4S, v31.4S, v20.4S // ................................................................................................................................................................................................................................................................*....................... + mls v15.4S, v13.4S, v29.4S // ...................................................................................................................................................................................................................................*.................................................... 
+ cmge v11.4S, v31.4S, v18.4S // ........................................................................................................................................................................................................................................................*............................... + cmge v8.4S, v18.4S, v30.4S // .........................................................................................................................................................................................................................................................*.............................. + mls v16.4S, v27.4S, v29.4S // ..........................................................................................................................................................................................................................*............................................................. + mls v12.4S, v21.4S, v29.4S // ...............................................................................................................................................................................................................................................*........................................ + sub v9.4S, v11.4S, v8.4S // ..........................................................................................................................................................................................................................................................*............................. + cmge v19.4S, v15.4S, v30.4S // .............................................................................................................................................................................................................................................................*.......................... 
+ cmge v28.4S, v31.4S, v22.4S // ........................................................................................................................................................................................................................................................................*............... + cmge v21.4S, v31.4S, v15.4S // ............................................................................................................................................................................................................................................................*........................... + sub v14.4S, v21.4S, v19.4S // ..............................................................................................................................................................................................................................................................*......................... + cmge v11.4S, v17.4S, v30.4S // .....................................................................................................................................................................................................................................................*.................................. + mls v18.4S, v9.4S, v29.4S // ...........................................................................................................................................................................................................................................................*............................ + cmge v27.4S, v31.4S, v17.4S // ....................................................................................................................................................................................................................................................*................................... 
+ mls v15.4S, v14.4S, v29.4S // ...............................................................................................................................................................................................................................................................*........................ + cmge v13.4S, v31.4S, v16.4S // ................................................................................................................................................................................................................................................*....................................... + sub v24.4S, v27.4S, v11.4S // ......................................................................................................................................................................................................................................................*................................. + cmge v27.4S, v22.4S, v30.4S // .........................................................................................................................................................................................................................................................................*.............. + str q18, [x1, #128] // ..................................................................................................................................................................................................................................................................................*..... + cmge v8.4S, v20.4S, v30.4S // .................................................................................................................................................................................................................................................................*...................... 
+ mls v17.4S, v24.4S, v29.4S // .......................................................................................................................................................................................................................................................*................................ + sub v28.4S, v28.4S, v27.4S // ..........................................................................................................................................................................................................................................................................*............. + str q15, [x1, #192] // ...................................................................................................................................................................................................................................................................................*.... + cmge v21.4S, v12.4S, v30.4S // .............................................................................................................................................................................................................................................................................*.......... + cmge v24.4S, v16.4S, v30.4S // .................................................................................................................................................................................................................................................*...................................... + sub v15.4S, v23.4S, v8.4S // ..................................................................................................................................................................................................................................................................*..................... 
+ cmge v27.4S, v31.4S, v12.4S // ............................................................................................................................................................................................................................................................................*........... + mls v22.4S, v28.4S, v29.4S // ...........................................................................................................................................................................................................................................................................*............ + sub v13.4S, v13.4S, v24.4S // ..................................................................................................................................................................................................................................................*..................................... + str q17, [x1, #64] // .................................................................................................................................................................................................................................................................................*...... + mls v20.4S, v15.4S, v29.4S // ...................................................................................................................................................................................................................................................................*.................... + sub v14.4S, v27.4S, v21.4S // ..............................................................................................................................................................................................................................................................................*......... 
+ mls v16.4S, v13.4S, v29.4S // ...................................................................................................................................................................................................................................................*.................................... + mls v12.4S, v14.4S, v29.4S // ...............................................................................................................................................................................................................................................................................*........ + str q22, [x1, #384] // ......................................................................................................................................................................................................................................................................................*. + str q20, [x1, #256] // ....................................................................................................................................................................................................................................................................................*... + str q16, [x1], #(16) // ................................................................................................................................................................................................................................................................................*....... 
+ str q12, [x1, #432] // .......................................................................................................................................................................................................................................................................................* + + pop_stack + ret \ No newline at end of file diff --git a/tests/ntt_dilithium/manual/intt_dilithium_1234_5678_opt_m1_icestorm.s b/tests/ntt_dilithium/manual/intt_dilithium_1234_5678_opt_m1_icestorm.s new file mode 100644 index 0000000..e734625 --- /dev/null +++ b/tests/ntt_dilithium/manual/intt_dilithium_1234_5678_opt_m1_icestorm.s @@ -0,0 +1,1764 @@ +/// +/// Copyright (c) 2022 Arm Limited +/// Copyright (c) 2022 Hanno Becker +/// Copyright (c) 2023 Amin Abdulrahman, Matthias Kannwischer +/// SPDX-License-Identifier: MIT +/// +/// Permission is hereby granted, free of charge, to any person obtaining a copy +/// of this software and associated documentation files (the "Software"), to deal +/// in the Software without restriction, including without limitation the rights +/// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +/// copies of the Software, and to permit persons to whom the Software is +/// furnished to do so, subject to the following conditions: +/// +/// The above copyright notice and this permission notice shall be included in all +/// copies or substantial portions of the Software. +/// +/// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +/// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +/// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +/// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +/// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +/// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +/// SOFTWARE. 
+/// + +// Needed to provide ASM_LOAD directive +#include <hal_env.h> + +// NOTE +// We use a lot of trivial macros to simplify the parsing burden for Slothy +// The macros are not unfolded by Slothy and thus interpreted as instructions, +// which are easier to parse due to e.g. the lack of size specifiers and simpler +// syntax for pre and post increment for loads and stores. + +// Eventually, NeLight should include a proper parser for AArch64, +// but for initial investigations, the below is enough. +// The wrappers from ldr_vo through trn2_s each expand to a single instruction with the q-form name or lane arrangement filled in. +.macro ldr_vo vec, base, offset + ldr qform_\vec, [\base, #\offset] +.endm +.macro ldr_vi vec, base, inc + ldr qform_\vec, [\base], #\inc +.endm +.macro str_vo vec, base, offset + str qform_\vec, [\base, #\offset] +.endm +.macro str_vi vec, base, inc + str qform_\vec, [\base], #\inc +.endm +.macro vsub d,a,b + sub \d\().4s, \a\().4s, \b\().4s +.endm +.macro vadd d,a,b + add \d\().4s, \a\().4s, \b\().4s +.endm +.macro vqrdmulh d,a,b + sqrdmulh \d\().4s, \a\().4s, \b\().4s +.endm +.macro vmul d,a,b + mul \d\().4s, \a\().4s, \b\().4s +.endm +.macro vmls d,a,b + mls \d\().4s, \a\().4s, \b\().4s +.endm +.macro vqrdmulhq d,a,b,i + sqrdmulh \d\().4s, \a\().4s, \b\().s[\i] +.endm +.macro vmulq d,a,b,i + mul \d\().4s, \a\().4s, \b\().s[\i] +.endm +.macro vmlsq d,a,b,i + mls \d\().4s, \a\().4s, \b\().s[\i] +.endm +.macro trn1_d d,a,b + trn1 \d\().2d, \a\().2d, \b\().2d +.endm +.macro trn2_d d,a,b + trn2 \d\().2d, \a\().2d, \b\().2d +.endm +.macro trn1_s d,a,b + trn1 \d\().4s, \a\().4s, \b\().4s +.endm +.macro trn2_s d,a,b + trn2 \d\().4s, \a\().4s, \b\().4s +.endm +// mulmodq: dst = low product of src and const.s[idx0]; src = sqrdmulh(src, const.s[idx1]); dst then has src * modulus subtracted. Overwrites src. +.macro mulmodq dst, src, const, idx0, idx1 + vmulq \dst, \src, \const, \idx0 + vqrdmulhq \src, \src, \const, \idx1 + vmls \dst, \src, modulus +.endm +// mulmod: same three steps as mulmodq, with full-vector operands const and const_twisted instead of indexed lanes. Overwrites src. +.macro mulmod dst, src, const, const_twisted + vmul \dst, \src, \const + vqrdmulh \src, \src, \const_twisted + vmls \dst, \src, modulus +.endm +// montg_reduce: tmp = a shifted right by 23 with rounding (signed); a then has tmp * modulus subtracted. Clobbers tmp. +.macro montg_reduce a + srshr tmp.4S, \a\().4S, #23 + vmls \a, tmp, modulus +.endm +// canonical_reduce: per lane (signed compares), adds modulus where a <= neg_modulus_half and subtracts modulus where a >= modulus_half. Clobbers tmp1, tmp2. +.macro canonical_reduce a, modulus_half,
neg_modulus_half, tmp1, tmp2 + cmge \tmp1\().4s, \neg_modulus_half\().4s, \a\().4s + cmge \tmp2\().4s, \a\().4s, \modulus_half\().4s + sub \tmp2\().4s, \tmp1\().4s, \tmp2\().4s + vmls \a, \tmp2, modulus +.endm +// gs_butterfly: tmp = a minus b; a = sum of a and b; b = mulmodq of tmp by lanes idx0/idx1 of root. Clobbers tmp. +.macro gs_butterfly a, b, root, idx0, idx1 + vsub tmp, \a, \b + vadd \a, \a, \b + mulmodq \b, tmp, \root, \idx0, \idx1 +.endm +// mulmod_v: body is identical to mulmod. +.macro mulmod_v dst, src, const, const_twisted + vmul \dst, \src, \const + vqrdmulh \src, \src, \const_twisted + vmls \dst, \src, modulus +.endm +// gs_butterfly_v: like gs_butterfly but with full-vector root operands; note that it expands mulmod, not mulmod_v. Clobbers tmp. +.macro gs_butterfly_v a, b, root, root_twisted + vsub tmp, \a, \b + vadd \a, \a, \b + mulmod \b, tmp, \root, \root_twisted +.endm +// mul_ninv: applies mulmod with ninv/ninv_tw to each srcN, writing dstN, for N = 0..7. Every srcN register is overwritten. +.macro mul_ninv dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, src0, src1, src2, src3, src4, src5, src6, src7 + mulmod \dst0, \src0, ninv, ninv_tw + mulmod \dst1, \src1, ninv, ninv_tw + mulmod \dst2, \src2, ninv, ninv_tw + mulmod \dst3, \src3, ninv, ninv_tw + mulmod \dst4, \src4, ninv, ninv_tw + mulmod \dst5, \src5, ninv, ninv_tw + mulmod \dst6, \src6, ninv, ninv_tw + mulmod \dst7, \src7, ninv, ninv_tw +.endm +// load_roots_1234: loads root0..root7 from eight consecutive 16-byte slots and leaves r_ptr advanced by 8*16 bytes. +.macro load_roots_1234 r_ptr + ldr_vi root0, \r_ptr, (8*16) + ldr_vo root1, \r_ptr, (-8*16 + 1*16) + ldr_vo root2, \r_ptr, (-8*16 + 2*16) + ldr_vo root3, \r_ptr, (-8*16 + 3*16) + ldr_vo root4, \r_ptr, (-8*16 + 4*16) + ldr_vo root5, \r_ptr, (-8*16 + 5*16) + ldr_vo root6, \r_ptr, (-8*16 + 6*16) + ldr_vo root7, \r_ptr, (-8*16 + 7*16) +.endm +// load_next_roots_56: one 16-byte load, pointer post-incremented by 16. +.macro load_next_roots_56 root0, r_ptr0 + ldr_vi \root0, \r_ptr0, 16 +.endm +// load_next_roots_6: 16-byte load but the pointer advances by only 8. NOTE(review): presumably an intentional overlapping read; confirm against the twiddle layout. +.macro load_next_roots_6 root0, r_ptr0 + ldr_vi \root0, \r_ptr0, 8 +.endm +// load_next_roots_78: loads six consecutive 16-byte vectors in the order root0, root0_tw, root1, root1_tw, root2, root2_tw and leaves r_ptr1 advanced by 6*16 bytes. +.macro load_next_roots_78 root0, root0_tw, root1, root1_tw, root2, root2_tw, r_ptr1 + ldr_vi \root0, \r_ptr1, (6*16) + ldr_vo \root0_tw, \r_ptr1, (-6*16 + 1*16) + ldr_vo \root1, \r_ptr1, (-6*16 + 2*16) + ldr_vo \root1_tw, \r_ptr1, (-6*16 + 3*16) + ldr_vo \root2, \r_ptr1, (-6*16 + 4*16) + ldr_vo \root2_tw, \r_ptr1, (-6*16 + 5*16) +.endm +// transpose4: 4x4 transpose of the 32-bit lanes held in the four registers named by the argument prefix followed by 0..3. Clobbers t0..t3. +.macro transpose4 data + trn1_s t0, \data\()0, \data\()1 + trn2_s t1, \data\()0, \data\()1 + trn1_s t2, \data\()2,
\data\()3 + trn2_s t3, \data\()2, \data\()3 + + trn2_d \data\()2, t0, t2 + trn2_d \data\()3, t1, t3 + trn1_d \data\()0, t0, t2 + trn1_d \data\()1, t1, t3 +.endm +// save_gprs: reserves a 96-byte frame and stores x19..x28 and x29 into it; restore_gprs reads the same slots back and releases the frame. +.macro save_gprs // slothy:no-unfold + sub sp, sp, #(16*6) + stp x19, x20, [sp, #16*0] + // x19/x20 need only the single store above; the slot offsets below match restore_gprs + stp x21, x22, [sp, #16*1] + stp x23, x24, [sp, #16*2] + stp x25, x26, [sp, #16*3] + stp x27, x28, [sp, #16*4] + str x29, [sp, #16*5] +.endm + +.macro restore_gprs // slothy:no-unfold + ldp x19, x20, [sp, #16*0] + ldp x21, x22, [sp, #16*1] + ldp x23, x24, [sp, #16*2] + ldp x25, x26, [sp, #16*3] + ldp x27, x28, [sp, #16*4] + ldr x29, [sp, #16*5] + add sp, sp, #(16*6) +.endm +// save_vregs: reserves a 64-byte frame and stores d8..d15 (the 64-bit d-form only); restore_vregs reads them back and releases the frame. +.macro save_vregs // slothy:no-unfold + sub sp, sp, #(16*4) + stp d8, d9, [sp, #16*0] + stp d10, d11, [sp, #16*1] + stp d12, d13, [sp, #16*2] + stp d14, d15, [sp, #16*3] +.endm + +.macro restore_vregs // slothy:no-unfold + ldp d8, d9, [sp, #16*0] + ldp d10, d11, [sp, #16*1] + ldp d12, d13, [sp, #16*2] + ldp d14, d15, [sp, #16*3] + add sp, sp, #(16*4) +.endm +// STACK_SIZE bytes of scratch are reserved by push_stack after the register frames; STACK0 is a byte offset into that scratch for the save/restore macros. +#define STACK_SIZE 16 +#define STACK0 0 + +.macro restore a, loc // slothy:no-unfold + ldr \a, [sp, #\loc\()] +.endm +.macro save loc, a // slothy:no-unfold + str \a, [sp, #\loc\()] +.endm +.macro push_stack // slothy:no-unfold + save_gprs + save_vregs + sub sp, sp, #STACK_SIZE +.endm + +.macro pop_stack // slothy:no-unfold + add sp, sp, #STACK_SIZE + restore_vregs + restore_gprs +.endm + +.data +.p2align 4 +roots: +#include "intt_dilithium_1234_5678_twiddles.s" +.text + + .global intt_dilithium_1234_5678_opt_m1_icestorm + .global _intt_dilithium_1234_5678_opt_m1_icestorm + +.p2align 4 +modulus_addr: .quad 8380417 +ninv_addr: .quad 16382 +ninv_tw_addr: .quad 4197891 +intt_dilithium_1234_5678_opt_m1_icestorm: +_intt_dilithium_1234_5678_opt_m1_icestorm: + push_stack + + inp .req x0 + in .req x1 + count .req x2 + r_ptr0 .req x3 + r_ptr1 .req x4 + xtmp .req x5 + + data0 .req v8 + data1 .req v9 + data2 .req v10 + data3 .req v11 + data4 .req v12 + data5 .req v13 +
data6 .req v14 + data7 .req v15 + data8 .req v16 + data9 .req v17 + data10 .req v18 + data11 .req v19 + data12 .req v20 + data13 .req v21 + data14 .req v22 + data15 .req v23 + + qform_data0 .req q8 + qform_data1 .req q9 + qform_data2 .req q10 + qform_data3 .req q11 + qform_data4 .req q12 + qform_data5 .req q13 + qform_data6 .req q14 + qform_data7 .req q15 + qform_data8 .req q16 + qform_data9 .req q17 + qform_data10 .req q18 + qform_data11 .req q19 + qform_data12 .req q20 + qform_data13 .req q21 + qform_data14 .req q22 + qform_data15 .req q23 + + root0 .req v0 + root1 .req v1 + root2 .req v2 + root3 .req v3 + root0_tw .req v4 + root1_tw .req v5 + root2_tw .req v6 + root3_tw .req v7 + + + qform_root0 .req q0 + qform_root1 .req q1 + qform_root2 .req q2 + qform_root3 .req q3 + qform_root0_tw .req q4 + qform_root1_tw .req q5 + qform_root2_tw .req q6 + qform_root3_tw .req q7 + + + tmp .req v24 + qform_tmp .req q24 + t0 .req v25 + t1 .req v26 + t2 .req v27 + t3 .req v28 + + modulus .req v29 + + ASM_LOAD(r_ptr0, roots) + ASM_LOAD(r_ptr1, roots_l45) + + ASM_LOAD(xtmp, modulus_addr) + ld1r {modulus.4s}, [xtmp] + + save STACK0, inp + + mov count, #16 + + .p2align 2 + ld4 {v19.4S, v20.4S, v21.4S, v22.4S}, [x0] // *........................................ + ldr q5, [x3, #32] // ..*...................................... + ldr q8, [x3], #(6*16) // .*....................................... + // gap // ......................................... + ldr q9, [x3, #-48] // ....*.................................... + ldr q0, [x3, #-80] // ...*..................................... + // gap // ......................................... + // gap // ......................................... + ldr q2, [x3, #-32] // .....*................................... + ldr q12, [x3, #-16] // ......*.................................. + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... 
+ // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + sub v15.4S, v19.4S, v20.4S // .......*................................. + add v19.4S, v19.4S, v20.4S // ........*................................ + // gap // ......................................... + // gap // ......................................... + sub v13.4S, v21.4S, v22.4S // .........*............................... + add v18.4S, v21.4S, v22.4S // ..............*.......................... + // gap // ......................................... + // gap // ......................................... + mul v5.4S, v15.4S, v5.4S // ..........*.............................. + sqrdmulh v9.4S, v15.4S, v9.4S // ...........*............................. + // gap // ......................................... + // gap // ......................................... + mul v2.4S, v13.4S, v2.4S // ............*............................ + sqrdmulh v12.4S, v13.4S, v12.4S // .............*........................... + // gap // ......................................... + // gap // ......................................... + sub v15.4S, v19.4S, v18.4S // .................*....................... + add v19.4S, v19.4S, v18.4S // ..................*...................... + // gap // ......................................... + // gap // ......................................... + mls v5.4S, v9.4S, v29.4S // ...............*......................... 
+ // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + mls v2.4S, v12.4S, v29.4S // ................*........................ + mul v9.4S, v15.4S, v8.4S // .....................*................... + // gap // ......................................... + // gap // ......................................... + sqrdmulh v12.4S, v15.4S, v0.4S // ......................*.................. + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + sub v15.4S, v5.4S, v2.4S // ...................*..................... + add v5.4S, v5.4S, v2.4S // ....................*.................... + // gap // ......................................... + // gap // ......................................... + mls v9.4S, v12.4S, v29.4S // ...........................*............. + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + mul v8.4S, v15.4S, v8.4S // .......................*................. + sqrdmulh v0.4S, v15.4S, v0.4S // ........................*................ + // gap // ......................................... + // gap // ......................................... + trn1 v2.4S, v19.4S, v5.4S // .........................*............... + trn2 v19.4S, v19.4S, v5.4S // ............................*............ + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... 
+ // gap // ......................................... + // gap // ......................................... + mls v8.4S, v0.4S, v29.4S // ..........................*.............. + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + trn1 v5.4S, v9.4S, v8.4S // .............................*........... + trn2 v8.4S, v9.4S, v8.4S // ..............................*.......... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + trn2 v7.2D, v2.2D, v5.2D // ...............................*......... + trn2 v12.2D, v19.2D, v8.2D // ................................*........ + // gap // ......................................... + // gap // ......................................... + trn1 v2.2D, v2.2D, v5.2D // .................................*....... + trn1 v0.2D, v19.2D, v8.2D // ..................................*...... + // gap // ......................................... + // gap // ......................................... + add v8.4S, v7.4S, v12.4S // ...................................*..... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... 
+ add v9.4S, v2.4S, v0.4S // ....................................*.... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + add v19.4S, v9.4S, v8.4S // .....................................*... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + srshr v5.4S, v19.4S, #23 // ......................................*.. + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + mls v19.4S, v5.4S, v29.4S // .......................................*. + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... 
+ // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + str q19, [x0], #(16*4) // ........................................* + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + + // original source code + // ld4 {v13.4S, v14.4S, v15.4S, v16.4S}, [x0] // *........................................ + // ldr q26, [x3], #(6*16) // ..*...................................... + // ldr q27, [x3, #-64] // .*....................................... + // ldr q3, [x3, #-80] // ....*.................................... + // ldr q17, [x3, #-48] // ...*..................................... + // ldr q18, [x3, #-32] // .....*................................... + // ldr q20, [x3, #-16] // ......*.................................. + // sub v22.4S, v13.4S, v14.4S // .......*................................. + // add v5.4S, v13.4S, v14.4S // ........*................................ + // sub v9.4S, v15.4S, v16.4S // .........*............................... + // mul v11.4S, v22.4S, v27.4S // ...........*............................. + // sqrdmulh v24.4S, v22.4S, v17.4S // ............*............................ + // mul v28.4S, v9.4S, v18.4S // .............*........................... + // sqrdmulh v21.4S, v9.4S, v20.4S // ..............*.......................... + // add v14.4S, v15.4S, v16.4S // ..........*.............................. + // mls v11.4S, v24.4S, v29.4S // .................*....................... + // mls v28.4S, v21.4S, v29.4S // ..................*...................... + // sub v6.4S, v5.4S, v14.4S // ...............*......................... + // add v14.4S, v5.4S, v14.4S // ................*........................ 
+ // sub v7.4S, v11.4S, v28.4S // .....................*................... + // add v13.4S, v11.4S, v28.4S // ......................*.................. + // mul v0.4S, v6.4S, v26.4S // ...................*..................... + // sqrdmulh v23.4S, v6.4S, v3.4S // ....................*.................... + // mul v18.4S, v7.4S, v26.4S // ........................*................ + // sqrdmulh v20.4S, v7.4S, v3.4S // .........................*............... + // trn1 v26.4S, v14.4S, v13.4S // ..........................*.............. + // mls v18.4S, v20.4S, v29.4S // ............................*............ + // mls v0.4S, v23.4S, v29.4S // .......................*................. + // trn2 v25.4S, v14.4S, v13.4S // ...........................*............. + // trn1 v6.4S, v0.4S, v18.4S // .............................*........... + // trn2 v27.4S, v0.4S, v18.4S // ..............................*.......... + // trn2 v7.2D, v26.2D, v6.2D // ...............................*......... + // trn2 v12.2D, v25.2D, v27.2D // ................................*........ + // trn1 v2.2D, v26.2D, v6.2D // .................................*....... + // trn1 v0.2D, v25.2D, v27.2D // ..................................*...... + // add v8.4S, v7.4S, v12.4S // ...................................*..... + // add v9.4S, v2.4S, v0.4S // ....................................*.... + // add v22.4S, v9.4S, v8.4S // .....................................*... + // srshr v13.4S, v22.4S, #23 // ......................................*.. + // mls v22.4S, v13.4S, v29.4S // .......................................*. + // str q22, [x0], #(16*4) // ........................................* + + sub count, count, #1 +layer5678_start: + sub v19.4S, v9.4S, v8.4S // ...............................................*................. + ld4 {v13.4S, v14.4S, v15.4S, v16.4S}, [x0] // e................................................................ 
+ ldr q26, [x3], #(6*16) // .e............................................................... + ldr q27, [x3, #-64] // ...e............................................................. + ldr q3, [x3, #-80] // ..e.............................................................. + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + ldr q17, [x3, #-48] // ....e............................................................ + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + ldr q18, [x3, #-32] // .....e........................................................... + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + ldr q20, [x3, #-16] // ......e.......................................................... + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + sub v30.4S, v2.4S, v0.4S // .....................................*........................... + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + sub v22.4S, v13.4S, v14.4S // .......e......................................................... 
+ add v5.4S, v13.4S, v14.4S // ........e........................................................ + // gap // ................................................................. + // gap // ................................................................. + sub v9.4S, v15.4S, v16.4S // ............e.................................................... + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + mul v11.4S, v22.4S, v27.4S // .........e....................................................... + sqrdmulh v24.4S, v22.4S, v17.4S // ..........e...................................................... + // gap // ................................................................. + // gap // ................................................................. + mul v28.4S, v9.4S, v18.4S // ..............e.................................................. + sqrdmulh v21.4S, v9.4S, v20.4S // ...............e................................................. + // gap // ................................................................. + // gap // ................................................................. + add v14.4S, v15.4S, v16.4S // .............e................................................... + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + mls v11.4S, v24.4S, v29.4S // ...........e..................................................... + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. 
+ mls v28.4S, v21.4S, v29.4S // ................e................................................ + sub v6.4S, v5.4S, v14.4S // .................e............................................... + // gap // ................................................................. + // gap // ................................................................. + add v14.4S, v5.4S, v14.4S // ..................e.............................................. + ldr q5, [x4], #8 // ...................................*............................. + ldr q10, [x4], #16 // ....................................*............................ + // gap // ................................................................. + sub v1.4S, v7.4S, v12.4S // ..........................................*...................... + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + sub v7.4S, v11.4S, v28.4S // ......................e.......................................... + add v13.4S, v11.4S, v28.4S // .......................e......................................... + // gap // ................................................................. + // gap // ................................................................. + mul v0.4S, v6.4S, v26.4S // ...................e............................................. + sqrdmulh v23.4S, v6.4S, v3.4S // ....................e............................................ + // gap // ................................................................. + // gap // ................................................................. + mul v18.4S, v7.4S, v26.4S // ........................e........................................ + sqrdmulh v20.4S, v7.4S, v3.4S // .........................e....................................... 
+ // gap // ................................................................. + // gap // ................................................................. + sqrdmulh v24.4S, v30.4S, v10.S[1] // ........................................*........................ + trn1 v26.4S, v14.4S, v13.4S // ...........................e..................................... + // gap // ................................................................. + // gap // ................................................................. + mul v30.4S, v30.4S, v10.S[0] // .......................................*......................... + mul v31.4S, v1.4S, v10.S[2] // ............................................*.................... + // gap // ................................................................. + // gap // ................................................................. + mls v18.4S, v20.4S, v29.4S // ..........................e...................................... + mls v0.4S, v23.4S, v29.4S // .....................e........................................... + // gap // ................................................................. + // gap // ................................................................. + trn2 v25.4S, v14.4S, v13.4S // ............................e.................................... + sqrdmulh v16.4S, v1.4S, v10.S[3] // .............................................*................... + // gap // ................................................................. + // gap // ................................................................. + mls v30.4S, v24.4S, v29.4S // .........................................*....................... + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. 
+ trn1 v6.4S, v0.4S, v18.4S // .............................e................................... + trn2 v27.4S, v0.4S, v18.4S // ..............................e.................................. + // gap // ................................................................. + // gap // ................................................................. + mls v31.4S, v16.4S, v29.4S // ..............................................*.................. + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + trn2 v7.2D, v26.2D, v6.2D // ...............................e................................. + trn2 v12.2D, v25.2D, v27.2D // ................................e................................ + // gap // ................................................................. + // gap // ................................................................. + trn1 v2.2D, v26.2D, v6.2D // .................................e............................... + trn1 v0.2D, v25.2D, v27.2D // ..................................e.............................. + // gap // ................................................................. + // gap // ................................................................. + sub v20.4S, v30.4S, v31.4S // ....................................................*............ + add v8.4S, v7.4S, v12.4S // ...........................................e..................... + // gap // ................................................................. + // gap // ................................................................. + add v14.4S, v30.4S, v31.4S // .....................................................*........... + add v9.4S, v2.4S, v0.4S // ......................................e.......................... 
+ // gap // ................................................................. + // gap // ................................................................. + mul v18.4S, v19.4S, v5.S[0] // .................................................*............... + sqrdmulh v19.4S, v19.4S, v5.S[1] // ..................................................*.............. + // gap // ................................................................. + // gap // ................................................................. + srshr v30.4S, v14.4S, #23 // ...........................................................*..... + add v22.4S, v9.4S, v8.4S // ................................................e................ + // gap // ................................................................. + // gap // ................................................................. + mul v3.4S, v20.4S, v5.S[0] // ......................................................*.......... + sqrdmulh v26.4S, v20.4S, v5.S[1] // .......................................................*......... + // gap // ................................................................. + // gap // ................................................................. + mls v18.4S, v19.4S, v29.4S // ...................................................*............. + srshr v13.4S, v22.4S, #23 // .........................................................e....... + // gap // ................................................................. + // gap // ................................................................. + mls v14.4S, v30.4S, v29.4S // ............................................................*.... + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. 
+ mls v3.4S, v26.4S, v29.4S // ........................................................*........ + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + mls v22.4S, v13.4S, v29.4S // ..........................................................e...... + str q18, [x0, #-32] // ...............................................................*. + // gap // ................................................................. + // gap // ................................................................. + str q14, [x0, #-48] // ..............................................................*.. + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + str q3, [x0, #-16] // ................................................................* + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + str q22, [x0], #(16*4) // .............................................................e... + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + + // original source code + // ld4 {v8.4S, v9.4S, v10.4S, v11.4S}, [x0] // e...............................................................|e.............................................................. 
+ // ldr q0, [x3], #(6*16) // .e..............................................................|.e............................................................. + // ldr q4, [x3, #(-6*16 + 1*16)] // ...e............................................................|...e........................................................... + // ldr q1, [x3, #(-6*16 + 2*16)] // ..e.............................................................|..e............................................................ + // ldr q5, [x3, #(-6*16 + 3*16)] // ....e...........................................................|....e.......................................................... + // ldr q2, [x3, #(-6*16 + 4*16)] // .....e..........................................................|.....e......................................................... + // ldr q6, [x3, #(-6*16 + 5*16)] // ......e.........................................................|......e........................................................ + // sub v24.4s, v8.4s, v9.4s // ........e.......................................................|........e...................................................... + // add v8.4s, v8.4s, v9.4s // .........e......................................................|.........e..................................................... + // mul v9.4s, v24.4s, v1.4s // ...........e....................................................|...........e................................................... + // sqrdmulh v24.4s, v24.4s, v5.4s // ............e...................................................|............e.................................................. + // mls v9.4s, v24.4s, v29.4s // ................e...............................................|................e.............................................. + // sub v24.4s, v10.4s, v11.4s // ..........e.....................................................|..........e.................................................... 
+ // add v10.4s, v10.4s, v11.4s // ...............e................................................|...............e............................................... + // mul v11.4s, v24.4s, v2.4s // .............e..................................................|.............e................................................. + // sqrdmulh v24.4s, v24.4s, v6.4s // ..............e.................................................|..............e................................................ + // mls v11.4s, v24.4s, v29.4s // .................e..............................................|.................e............................................. + // sub v24.4s, v8.4s, v10.4s // ..................e.............................................|..................e............................................ + // add v8.4s, v8.4s, v10.4s // ...................e............................................|...................e........................................... + // mul v10.4s, v24.4s, v0.4s // .........................e......................................|.........................e..................................... + // sqrdmulh v24.4s, v24.4s, v4.4s // ..........................e.....................................|..........................e.................................... + // mls v10.4s, v24.4s, v29.4s // ..................................e.............................|..................................e............................ + // sub v24.4s, v9.4s, v11.4s // .......................e........................................|.......................e....................................... + // add v9.4s, v9.4s, v11.4s // ........................e.......................................|........................e...................................... + // mul v11.4s, v24.4s, v0.4s // ...........................e....................................|...........................e................................... 
+ // sqrdmulh v24.4s, v24.4s, v4.4s // ............................e...................................|............................e.................................. + // mls v11.4s, v24.4s, v29.4s // .................................e..............................|.................................e............................. + // trn1 v25.4s, v8.4s, v9.4s // ..............................e.................................|..............................e................................ + // trn2 v26.4s, v8.4s, v9.4s // ...................................e............................|...................................e........................... + // trn1 v27.4s, v10.4s, v11.4s // ......................................e.........................|......................................e........................ + // trn2 v28.4s, v10.4s, v11.4s // .......................................e........................|.......................................e....................... + // trn2 v10.2d, v25.2d, v27.2d // .........................................e......................|.........................................e..................... + // trn2 v11.2d, v26.2d, v28.2d // ..........................................e.....................|..........................................e.................... + // trn1 v8.2d, v25.2d, v27.2d // ...........................................e....................|...........................................e................... + // trn1 v9.2d, v26.2d, v28.2d // ............................................e...................|............................................e.................. + // ldr q1, [x4], #8 // ....................*...........................................|....................*.......................................... + // ldr q0, [x4], #16 // .....................*..........................................|.....................*......................................... 
+ // sub v24.4s, v8.4s, v9.4s // .......*........................................................|.......*....................................................... + // add v8.4s, v8.4s, v9.4s // ................................................e...............|................................................e.............. + // mul v9.4s, v24.4s, v0.s[0] // ...............................*................................|...............................*............................... + // sqrdmulh v24.4s, v24.4s, v0.s[1] // .............................*..................................|.............................*................................. + // mls v9.4s, v24.4s, v29.4s // .....................................*..........................|.....................................*......................... + // sub v24.4s, v10.4s, v11.4s // ......................*.........................................|......................*........................................ + // add v10.4s, v10.4s, v11.4s // ..............................................e.................|..............................................e................ + // mul v11.4s, v24.4s, v0.s[2] // ................................*...............................|................................*.............................. + // sqrdmulh v24.4s, v24.4s, v0.s[3] // ....................................*...........................|....................................*.......................... + // mls v11.4s, v24.4s, v29.4s // ........................................*.......................|........................................*...................... + // sub v24.4s, v8.4s, v10.4s // ................................................................*............................................................... + // add v8.4s, v8.4s, v10.4s // ....................................................e...........|....................................................e.......... 
+ // mul v10.4s, v24.4s, v1.s[0] // .................................................*..............|.................................................*............. + // sqrdmulh v24.4s, v24.4s, v1.s[1] // ..................................................*.............|..................................................*............ + // mls v10.4s, v24.4s, v29.4s // .......................................................*........|.......................................................*....... + // sub v24.4s, v9.4s, v11.4s // .............................................*..................|.............................................*................. + // add v9.4s, v9.4s, v11.4s // ...............................................*................|...............................................*............... + // mul v11.4s, v24.4s, v1.s[0] // .....................................................*..........|.....................................................*......... + // sqrdmulh v24.4s, v24.4s, v1.s[1] // ......................................................*.........|......................................................*........ + // mls v11.4s, v24.4s, v29.4s // ..........................................................*.....|..........................................................*.... + // srshr v24.4S, v8.4S, #23 // ........................................................e.......|........................................................e...... + // mls v8.4s, v24.4s, v29.4s // ...........................................................e....|...........................................................e... + // srshr v24.4S, v9.4S, #23 // ...................................................*............|...................................................*........... + // mls v9.4s, v24.4s, v29.4s // .........................................................*......|.........................................................*..... 
+ // str q8, [x0], #(16*4) // ...............................................................e|............................................................... + // str q9, [x0, #(-16*4 + 1*16)] // .............................................................*..|.............................................................*. + // str q10, [x0, #(-16*4 + 2*16)] // ............................................................*...|............................................................*.. + // str q11, [x0, #(-16*4 + 3*16)] // ..............................................................*.|..............................................................* + + sub count, count, #1 + cbnz count, layer5678_start + sub v13.4S, v9.4S, v8.4S // *....................... + sub v8.4S, v2.4S, v0.4S // .*...................... + ldr q2, [x4], #8 // ..*..................... + ldr q5, [x4], #16 // ...*.................... + sub v19.4S, v7.4S, v12.4S // ....*................... + // gap // ........................ + // gap // ........................ + // gap // ........................ + // gap // ........................ + // gap // ........................ + // gap // ........................ + // gap // ........................ + // gap // ........................ + // gap // ........................ + // gap // ........................ + // gap // ........................ + sqrdmulh v0.4S, v8.4S, v5.S[1] // .....*.................. + mul v9.4S, v8.4S, v5.S[0] // ......*................. + // gap // ........................ + // gap // ........................ + mul v8.4S, v19.4S, v5.S[2] // .......*................ + sqrdmulh v5.4S, v19.4S, v5.S[3] // ........*............... + // gap // ........................ + // gap // ........................ + mul v16.4S, v13.4S, v2.S[0] // .............*.......... + // gap // ........................ + // gap // ........................ + // gap // ........................ + mls v9.4S, v0.4S, v29.4S // .........*.............. 
+ sqrdmulh v19.4S, v13.4S, v2.S[1] // ..............*......... + // gap // ........................ + // gap // ........................ + mls v8.4S, v5.4S, v29.4S // ..........*............. + // gap // ........................ + // gap // ........................ + // gap // ........................ + // gap // ........................ + // gap // ........................ + // gap // ........................ + // gap // ........................ + mls v16.4S, v19.4S, v29.4S // ..................*..... + // gap // ........................ + // gap // ........................ + // gap // ........................ + sub v5.4S, v9.4S, v8.4S // ...........*............ + // gap // ........................ + // gap // ........................ + // gap // ........................ + add v9.4S, v9.4S, v8.4S // ............*........... + // gap // ........................ + // gap // ........................ + // gap // ........................ + sqrdmulh v19.4S, v5.4S, v2.S[1] // .................*...... + mul v5.4S, v5.4S, v2.S[0] // ................*....... + str q16, [x0, #-32] // .....................*.. + // gap // ........................ + srshr v8.4S, v9.4S, #23 // ...............*........ + // gap // ........................ + // gap // ........................ + // gap // ........................ + // gap // ........................ + // gap // ........................ + // gap // ........................ + // gap // ........................ + mls v5.4S, v19.4S, v29.4S // ....................*... + // gap // ........................ + // gap // ........................ + // gap // ........................ + mls v9.4S, v8.4S, v29.4S // ...................*.... + // gap // ........................ + // gap // ........................ + // gap // ........................ + // gap // ........................ + // gap // ........................ + // gap // ........................ + // gap // ........................ 
+ str q5, [x0, #-16] // .......................* + // gap // ........................ + // gap // ........................ + // gap // ........................ + str q9, [x0, #-48] // ......................*. + // gap // ........................ + // gap // ........................ + // gap // ........................ + + // original source code + // sub v19.4S, v9.4S, v8.4S // *....................... + // sub v30.4S, v2.4S, v0.4S // .*...................... + // ldr q5, [x4], #8 // ..*..................... + // ldr q10, [x4], #16 // ...*.................... + // sub v1.4S, v7.4S, v12.4S // ....*................... + // sqrdmulh v24.4S, v30.4S, v10.S[1] // .....*.................. + // mul v30.4S, v30.4S, v10.S[0] // ......*................. + // mul v31.4S, v1.4S, v10.S[2] // .......*................ + // sqrdmulh v16.4S, v1.4S, v10.S[3] // ........*............... + // mls v30.4S, v24.4S, v29.4S // ..........*............. + // mls v31.4S, v16.4S, v29.4S // ............*........... + // sub v20.4S, v30.4S, v31.4S // ..............*......... + // add v14.4S, v30.4S, v31.4S // ...............*........ + // mul v18.4S, v19.4S, v5.S[0] // .........*.............. + // sqrdmulh v19.4S, v19.4S, v5.S[1] // ...........*............ + // srshr v30.4S, v14.4S, #23 // ...................*.... + // mul v3.4S, v20.4S, v5.S[0] // .................*...... + // sqrdmulh v26.4S, v20.4S, v5.S[1] // ................*....... + // mls v18.4S, v19.4S, v29.4S // .............*.......... + // mls v14.4S, v30.4S, v29.4S // .....................*.. + // mls v3.4S, v26.4S, v29.4S // ....................*... + // str q18, [x0, #-32] // ..................*..... + // str q14, [x0, #-48] // .......................* + // str q3, [x0, #-16] // ......................*. 
+ + + .unreq root0_tw + .unreq root1_tw + .unreq root2_tw + .unreq root3_tw + .unreq qform_root0_tw + .unreq qform_root1_tw + .unreq qform_root2_tw + .unreq qform_root3_tw + .unreq t0 + .unreq t1 + + root4 .req v4 + root5 .req v5 + root6 .req v6 + root7 .req v7 + qform_root4 .req q4 + qform_root5 .req q5 + qform_root6 .req q6 + qform_root7 .req q7 + ninv .req v25 + ninv_tw .req v26 + modulus_half .req v30 + neg_modulus_half .req v31 + + + restore in, STACK0 + mov count, #4 + + ASM_LOAD(xtmp, ninv_addr) + ld1r {ninv.4s}, [xtmp] + ASM_LOAD(xtmp, ninv_tw_addr) + ld1r {ninv_tw.4s}, [xtmp] + + ushr modulus_half.4S, modulus.4S, #1 + neg neg_modulus_half.4S, modulus_half.4S + + load_roots_1234 r_ptr1 + + .p2align 2 + ldr q14, [x1, #320] // .....*.................................................................................................................................................................................................................................................................................. + ldr q28, [x1, #256] // ....*................................................................................................................................................................................................................................................................................... + ldr q22, [x1, #512] // ........*............................................................................................................................................................................................................................................................................... + ldr q19, [x1, #576] // .........*.............................................................................................................................................................................................................................................................................. 
+ ldr q27, [x1, #384] // ......*................................................................................................................................................................................................................................................................................. + ldr q12, [x1, #960] // ...............*........................................................................................................................................................................................................................................................................ + add v17.4S, v28.4S, v14.4S // ...........................*............................................................................................................................................................................................................................................................ + sub v9.4S, v28.4S, v14.4S // ..........................*............................................................................................................................................................................................................................................................. + ldr q23, [x1, #896] // ..............*......................................................................................................................................................................................................................................................................... + sub v8.4S, v22.4S, v19.4S // ....................................*................................................................................................................................................................................................................................................... 
+ ldr q15, [x1, #64] // .*...................................................................................................................................................................................................................................................................................... + add v21.4S, v22.4S, v19.4S // .....................................*.................................................................................................................................................................................................................................................. + ldr q20, [x1, #0] // *....................................................................................................................................................................................................................................................................................... + mul v24.4S, v9.4S, v4.S[2] // ............................*........................................................................................................................................................................................................................................................... + sqrdmulh v14.4S, v9.4S, v4.S[3] // .............................*.......................................................................................................................................................................................................................................................... + mul v9.4S, v8.4S, v5.S[2] // ......................................*................................................................................................................................................................................................................................................. 
+ sqrdmulh v28.4S, v8.4S, v5.S[3] // .......................................*................................................................................................................................................................................................................................................ + ldr q10, [x1, #192] // ...*.................................................................................................................................................................................................................................................................................... + sub v16.4S, v23.4S, v12.4S // ...................................................*.................................................................................................................................................................................................................................... + add v22.4S, v23.4S, v12.4S // ....................................................*................................................................................................................................................................................................................................... + ldr q19, [x1, #128] // ..*..................................................................................................................................................................................................................................................................................... + mls v24.4S, v14.4S, v29.4S // ..............................*......................................................................................................................................................................................................................................................... 
+ ldr q14, [x1, #448] // .......*................................................................................................................................................................................................................................................................................ + sub v11.4S, v20.4S, v15.4S // ................*....................................................................................................................................................................................................................................................................... + sqrdmulh v8.4S, v16.4S, v7.S[1] // ......................................................*................................................................................................................................................................................................................................. + mul v23.4S, v16.4S, v7.S[0] // .....................................................*.................................................................................................................................................................................................................................. + add v15.4S, v20.4S, v15.4S // .................*...................................................................................................................................................................................................................................................................... + ldr q12, [x1, #704] // ...........*............................................................................................................................................................................................................................................................................ 
+ add v13.4S, v27.4S, v14.4S // ................................*....................................................................................................................................................................................................................................................... + sub v14.4S, v27.4S, v14.4S // ...............................*........................................................................................................................................................................................................................................................ + sqrdmulh v20.4S, v11.4S, v3.S[3] // ...................*.................................................................................................................................................................................................................................................................... + mul v11.4S, v11.4S, v3.S[2] // ..................*..................................................................................................................................................................................................................................................................... + mul v16.4S, v14.4S, v5.S[0] // .................................*...................................................................................................................................................................................................................................................... + sqrdmulh v14.4S, v14.4S, v5.S[1] // ..................................*..................................................................................................................................................................................................................................................... 
+ sub v18.4S, v17.4S, v13.4S // ..................................................................*..................................................................................................................................................................................................................... + add v17.4S, v17.4S, v13.4S // ...................................................................*.................................................................................................................................................................................................................... + mls v11.4S, v20.4S, v29.4S // ....................*................................................................................................................................................................................................................................................................... + add v13.4S, v19.4S, v10.4S // ......................*................................................................................................................................................................................................................................................................. + mls v16.4S, v14.4S, v29.4S // ...................................*.................................................................................................................................................................................................................................................... + sqrdmulh v14.4S, v18.4S, v2.S[1] // .....................................................................*.................................................................................................................................................................................................................. 
+ sub v20.4S, v15.4S, v13.4S // ........................................................*............................................................................................................................................................................................................................... + mul v18.4S, v18.4S, v2.S[0] // ....................................................................*................................................................................................................................................................................................................... + mls v23.4S, v8.4S, v29.4S // .......................................................*................................................................................................................................................................................................................................ + add v27.4S, v15.4S, v13.4S // .........................................................*.............................................................................................................................................................................................................................. + ldr q13, [x1, #640] // ..........*............................................................................................................................................................................................................................................................................. + mul v8.4S, v20.4S, v1.S[2] // ..........................................................*............................................................................................................................................................................................................................. 
+ sqrdmulh v20.4S, v20.4S, v1.S[3] // ...........................................................*............................................................................................................................................................................................................................ + mls v18.4S, v14.4S, v29.4S // ......................................................................*................................................................................................................................................................................................................. + add v15.4S, v24.4S, v16.4S // ........................................................................*............................................................................................................................................................................................................... + mls v9.4S, v28.4S, v29.4S // ........................................*............................................................................................................................................................................................................................................... + sub v19.4S, v19.4S, v10.4S // .....................*.................................................................................................................................................................................................................................................................. + mls v8.4S, v20.4S, v29.4S // ............................................................*........................................................................................................................................................................................................................... 
+ sub v10.4S, v13.4S, v12.4S // .........................................*.............................................................................................................................................................................................................................................. + mul v28.4S, v19.4S, v4.S[0] // .......................*................................................................................................................................................................................................................................................................ + sqrdmulh v19.4S, v19.4S, v4.S[1] // ........................*............................................................................................................................................................................................................................................................... + add v12.4S, v13.4S, v12.4S // ..........................................*............................................................................................................................................................................................................................................. + sqrdmulh v13.4S, v10.4S, v6.S[1] // ............................................*........................................................................................................................................................................................................................................... + sub v20.4S, v8.4S, v18.4S // ..........................................................................................................*............................................................................................................................................................................. 
+ mls v28.4S, v19.4S, v29.4S // .........................*.............................................................................................................................................................................................................................................................. + sub v24.4S, v24.4S, v16.4S // .......................................................................*................................................................................................................................................................................................................ + mul v14.4S, v20.4S, v0.S[2] // ............................................................................................................*........................................................................................................................................................................... + mul v19.4S, v10.4S, v6.S[0] // ...........................................*............................................................................................................................................................................................................................................ + sqrdmulh v16.4S, v20.4S, v0.S[3] // .............................................................................................................*.......................................................................................................................................................................... + ldr q20, [x1, #832] // .............*.......................................................................................................................................................................................................................................................................... 
+ add v10.4S, v8.4S, v18.4S // ...........................................................................................................*............................................................................................................................................................................ + sub v18.4S, v21.4S, v12.4S // ............................................................................*........................................................................................................................................................................................................... + sub v8.4S, v11.4S, v28.4S // .............................................................*.......................................................................................................................................................................................................................... + add v28.4S, v11.4S, v28.4S // ..............................................................*......................................................................................................................................................................................................................... + mls v19.4S, v13.4S, v29.4S // .............................................*.......................................................................................................................................................................................................................................... + ldr q13, [x1, #768] // ............*........................................................................................................................................................................................................................................................................... 
+ mls v14.4S, v16.4S, v29.4S // ..............................................................................................................*......................................................................................................................................................................... + add v16.4S, v21.4S, v12.4S // .............................................................................*.......................................................................................................................................................................................................... + mul v11.4S, v8.4S, v1.S[2] // ...............................................................*........................................................................................................................................................................................................................ + sqrdmulh v8.4S, v8.4S, v1.S[3] // ................................................................*....................................................................................................................................................................................................................... + sqrdmulh v21.4S, v18.4S, v2.S[3] // ...............................................................................*........................................................................................................................................................................................................ + mul v18.4S, v18.4S, v2.S[2] // ..............................................................................*......................................................................................................................................................................................................... 
+ add v12.4S, v13.4S, v20.4S // ...............................................*........................................................................................................................................................................................................................................ + sub v20.4S, v13.4S, v20.4S // ..............................................*......................................................................................................................................................................................................................................... + mls v11.4S, v8.4S, v29.4S // .................................................................*...................................................................................................................................................................................................................... + add v8.4S, v27.4S, v17.4S // .................................................................................................*...................................................................................................................................................................................... + mls v18.4S, v21.4S, v29.4S // ................................................................................*....................................................................................................................................................................................................... + sub v21.4S, v12.4S, v22.4S // ......................................................................................*................................................................................................................................................................................................. 
+ sub v17.4S, v27.4S, v17.4S // ................................................................................................*....................................................................................................................................................................................... + add v12.4S, v12.4S, v22.4S // .......................................................................................*................................................................................................................................................................................................ + sqrdmulh v22.4S, v24.4S, v2.S[1] // ..........................................................................*............................................................................................................................................................................................................. + mul v13.4S, v24.4S, v2.S[0] // .........................................................................*.............................................................................................................................................................................................................. + sub v24.4S, v28.4S, v15.4S // .....................................................................................................*.................................................................................................................................................................................. + add v27.4S, v16.4S, v12.4S // .....................................................................................................................*.................................................................................................................................................................. 
+ add v15.4S, v28.4S, v15.4S // ......................................................................................................*................................................................................................................................................................................. + sub v28.4S, v9.4S, v19.4S // .................................................................................*...................................................................................................................................................................................................... + add v9.4S, v9.4S, v19.4S // ..................................................................................*..................................................................................................................................................................................................... + mls v13.4S, v22.4S, v29.4S // ...........................................................................*............................................................................................................................................................................................................ + mul v19.4S, v21.4S, v3.S[0] // ........................................................................................*............................................................................................................................................................................................... + sub v12.4S, v16.4S, v12.4S // ....................................................................................................................*................................................................................................................................................................... 
+ sqrdmulh v22.4S, v21.4S, v3.S[1] // .........................................................................................*.............................................................................................................................................................................................. + sqrdmulh v16.4S, v20.4S, v6.S[3] // .................................................*...................................................................................................................................................................................................................................... + add v21.4S, v11.4S, v13.4S // ................................................................................................................*....................................................................................................................................................................... + sub v13.4S, v11.4S, v13.4S // ...............................................................................................................*........................................................................................................................................................................ + mul v11.4S, v20.4S, v6.S[2] // ................................................*....................................................................................................................................................................................................................................... + mul v20.4S, v28.4S, v2.S[2] // ...................................................................................*.................................................................................................................................................................................................... 
+ mls v19.4S, v22.4S, v29.4S // ..........................................................................................*............................................................................................................................................................................................. + sqrdmulh v28.4S, v28.4S, v2.S[3] // ....................................................................................*................................................................................................................................................................................................... + sub v22.4S, v8.4S, v27.4S // ........................................................................................................................................*............................................................................................................................................... + add v8.4S, v8.4S, v27.4S // .........................................................................................................................................*.............................................................................................................................................. + mls v11.4S, v16.4S, v29.4S // ..................................................*..................................................................................................................................................................................................................................... + mul v16.4S, v13.4S, v0.S[2] // .................................................................................................................*...................................................................................................................................................................... 
+ mls v20.4S, v28.4S, v29.4S // .....................................................................................*.................................................................................................................................................................................................. + sqrdmulh v28.4S, v13.4S, v0.S[3] // ..................................................................................................................*..................................................................................................................................................................... + sqrdmulh v13.4S, v12.4S, v1.S[1] // .......................................................................................................................*................................................................................................................................................................ + mul v12.4S, v12.4S, v1.S[0] // ......................................................................................................................*................................................................................................................................................................. + add v27.4S, v11.4S, v23.4S // ............................................................................................*........................................................................................................................................................................................... + sub v23.4S, v11.4S, v23.4S // ...........................................................................................*............................................................................................................................................................................................ 
+ mls v12.4S, v13.4S, v29.4S // ........................................................................................................................*............................................................................................................................................................... + add v13.4S, v9.4S, v27.4S // ..........................................................................................................................*............................................................................................................................................................. + sub v27.4S, v9.4S, v27.4S // .........................................................................................................................*.............................................................................................................................................................. + mul v11.4S, v23.4S, v3.S[0] // .............................................................................................*.......................................................................................................................................................................................... + add v9.4S, v15.4S, v13.4S // ..............................................................................................................................................*......................................................................................................................................... + sub v13.4S, v15.4S, v13.4S // .............................................................................................................................................*.......................................................................................................................................... 
+ sqrdmulh v23.4S, v23.4S, v3.S[1] // ..............................................................................................*......................................................................................................................................................................................... + mul v15.4S, v27.4S, v1.S[0] // ...........................................................................................................................*............................................................................................................................................................ + sqrdmulh v27.4S, v27.4S, v1.S[1] // ............................................................................................................................*........................................................................................................................................................... + mls v16.4S, v28.4S, v29.4S // ...................................................................................................................*.................................................................................................................................................................... + mul v28.4S, v17.4S, v0.S[2] // ..................................................................................................*..................................................................................................................................................................................... + sqrdmulh v17.4S, v17.4S, v0.S[3] // ...................................................................................................*.................................................................................................................................................................................... 
+ mls v11.4S, v23.4S, v29.4S // ...............................................................................................*........................................................................................................................................................................................ + sqrdmulh v23.4S, v24.4S, v0.S[3] // ........................................................................................................*............................................................................................................................................................................... + mul v24.4S, v24.4S, v0.S[2] // .......................................................................................................*................................................................................................................................................................................ + mls v15.4S, v27.4S, v29.4S // .............................................................................................................................*.......................................................................................................................................................... + mls v28.4S, v17.4S, v29.4S // ....................................................................................................*................................................................................................................................................................................... + mul v27.4S, v13.4S, v0.S[0] // ...............................................................................................................................................*........................................................................................................................................ 
+ add v17.4S, v20.4S, v11.4S // ....................................................................................................................................*................................................................................................................................................... + sqrdmulh v13.4S, v13.4S, v0.S[1] // ................................................................................................................................................*....................................................................................................................................... + sub v11.4S, v20.4S, v11.4S // ...................................................................................................................................*.................................................................................................................................................... + mls v24.4S, v23.4S, v29.4S // .........................................................................................................*.............................................................................................................................................................................. + sub v23.4S, v28.4S, v12.4S // ............................................................................................................................................................*........................................................................................................................... + add v12.4S, v28.4S, v12.4S // .............................................................................................................................................................*.......................................................................................................................... 
+ sqrdmulh v28.4S, v11.4S, v1.S[1] // ......................................................................................................................................*................................................................................................................................................. + mul v20.4S, v11.4S, v1.S[0] // .....................................................................................................................................*.................................................................................................................................................. + mls v27.4S, v13.4S, v29.4S // .................................................................................................................................................*...................................................................................................................................... + sqrdmulh v13.4S, v23.4S, v0.S[1] // ...............................................................................................................................................................*........................................................................................................................ + sub count, count, #1 +layer1234_start: + mls v20.4S, v28.4S, v29.4S // .......................................................................................................................................*................................................................................................................................................ + mul v23.4S, v23.4S, v0.S[0] // ..............................................................................................................................................................*......................................................................................................................... 
+ add v11.4S, v21.4S, v17.4S // ........................................................................................................................................................*............................................................................................................................... + sub v28.4S, v21.4S, v17.4S // .......................................................................................................................................................*................................................................................................................................ + sub v17.4S, v18.4S, v19.4S // ..............................................................................................................................*......................................................................................................................................................... + sub v21.4S, v24.4S, v15.4S // .................................................................................................................................................................*...................................................................................................................... + add v19.4S, v18.4S, v19.4S // ...............................................................................................................................*........................................................................................................................................................ + mls v23.4S, v13.4S, v29.4S // ................................................................................................................................................................*....................................................................................................................... 
+ sqrdmulh v18.4S, v21.4S, v0.S[1] // ....................................................................................................................................................................*................................................................................................................... + mul v21.4S, v21.4S, v0.S[0] // ...................................................................................................................................................................*.................................................................................................................... + add v13.4S, v24.4S, v15.4S // ..................................................................................................................................................................*..................................................................................................................... + add v15.4S, v16.4S, v20.4S // ............................................................................................................................................................................*........................................................................................................... + sub v20.4S, v16.4S, v20.4S // ...........................................................................................................................................................................*............................................................................................................ + cmge v24.4S, v23.4S, v30.4S // .................................................................................................................................................................................................*...................................................................................... 
+ mls v21.4S, v18.4S, v29.4S // .....................................................................................................................................................................*.................................................................................................................. + cmge v18.4S, v31.4S, v23.4S // ................................................................................................................................................................................................*....................................................................................... + mul v16.4S, v22.4S, v0.S[0] // ..........................................................................................................................................*............................................................................................................................................. + sub v18.4S, v18.4S, v24.4S // ..................................................................................................................................................................................................*..................................................................................... + sqrdmulh v24.4S, v22.4S, v0.S[1] // ...........................................................................................................................................*............................................................................................................................................ + mul v22.4S, v28.4S, v0.S[0] // .........................................................................................................................................................*.............................................................................................................................. 
+ sqrdmulh v28.4S, v28.4S, v0.S[1] // ..........................................................................................................................................................*............................................................................................................................. + mls v23.4S, v18.4S, v29.4S // ...................................................................................................................................................................................................*.................................................................................... + mul v18.4S, v17.4S, v1.S[0] // ................................................................................................................................*....................................................................................................................................................... + mls v16.4S, v24.4S, v29.4S // ............................................................................................................................................*........................................................................................................................................... + cmge v24.4S, v31.4S, v27.4S // ....................................................................................................................................................................................*................................................................................................... + mls v22.4S, v28.4S, v29.4S // ...........................................................................................................................................................*............................................................................................................................ 
+ cmge v28.4S, v27.4S, v30.4S // .....................................................................................................................................................................................*.................................................................................................. + str q23, [x1, #768] // ....................................................................................................................................................................................................................*................................................................... + sub v24.4S, v24.4S, v28.4S // ......................................................................................................................................................................................*................................................................................................. + sqrdmulh v17.4S, v17.4S, v1.S[1] // .................................................................................................................................*...................................................................................................................................................... + cmge v28.4S, v21.4S, v30.4S // .....................................................................................................................................................................................................*.................................................................................. + cmge v23.4S, v31.4S, v21.4S // ....................................................................................................................................................................................................*................................................................................... 
+ mls v27.4S, v24.4S, v29.4S // .......................................................................................................................................................................................*................................................................................................ + cmge v24.4S, v31.4S, v16.4S // ................................................................................................................................................................................*....................................................................................................... + sub v23.4S, v23.4S, v28.4S // ......................................................................................................................................................................................................*................................................................................. + mls v18.4S, v17.4S, v29.4S // ..................................................................................................................................*..................................................................................................................................................... + mul v17.4S, v9.4S, v25.4S // ...........................................................................................................................................................................................................................*............................................................ + sqrdmulh v9.4S, v9.4S, v26.4S // ............................................................................................................................................................................................................................*........................................................... 
+ mls v21.4S, v23.4S, v29.4S // .......................................................................................................................................................................................................*................................................................................ + cmge v28.4S, v16.4S, v30.4S // .................................................................................................................................................................................*...................................................................................................... + str q27, [x1, #576] // .................................................................................................................................................................................................................*...................................................................... + sub v23.4S, v14.4S, v18.4S // ......................................................................................................................................................................*................................................................................................................. + add v27.4S, v14.4S, v18.4S // .......................................................................................................................................................................*................................................................................................................ + mls v17.4S, v9.4S, v29.4S // .............................................................................................................................................................................................................................*.......................................................... 
+ sub v14.4S, v24.4S, v28.4S // ..................................................................................................................................................................................*..................................................................................................... + sqrdmulh v28.4S, v23.4S, v0.S[1] // .........................................................................................................................................................................*.............................................................................................................. + sub v24.4S, v10.4S, v19.4S // ..................................................................................................................................................*..................................................................................................................................... + add v10.4S, v10.4S, v19.4S // ...................................................................................................................................................*.................................................................................................................................... + mls v16.4S, v14.4S, v29.4S // ...................................................................................................................................................................................*.................................................................................................... + mul v9.4S, v24.4S, v0.S[0] // ....................................................................................................................................................*................................................................................................................................... 
+ sqrdmulh v18.4S, v24.4S, v0.S[1] // .....................................................................................................................................................*.................................................................................................................................. + mul v24.4S, v23.4S, v0.S[0] // ........................................................................................................................................................................*............................................................................................................... + mul v19.4S, v20.4S, v0.S[0] // .............................................................................................................................................................................*.......................................................................................................... + str q16, [x1, #512] // ................................................................................................................................................................................................................*....................................................................... + cmge v14.4S, v31.4S, v22.4S // ............................................................................................................................................................................................*........................................................................................... + sqrdmulh v23.4S, v20.4S, v0.S[1] // ..............................................................................................................................................................................*......................................................................................................... 
+ str q21, [x1, #832] // .....................................................................................................................................................................................................................*.................................................................. + mls v24.4S, v28.4S, v29.4S // ..........................................................................................................................................................................*............................................................................................................. + mls v9.4S, v18.4S, v29.4S // ......................................................................................................................................................*................................................................................................................................. + mul v21.4S, v13.4S, v25.4S // .......................................................................................................................................................................................................................................*................................................ + sqrdmulh v20.4S, v13.4S, v26.4S // ........................................................................................................................................................................................................................................*............................................... + sqrdmulh v28.4S, v8.4S, v26.4S // .........................................................................................................................................................................................................................*.............................................................. 
+ cmge v13.4S, v17.4S, v30.4S // .....................................................................................................................................................................................................................................................*.................................. + cmge v18.4S, v9.4S, v30.4S // .........................................................................................................................................................................................*.............................................................................................. + cmge v16.4S, v31.4S, v9.4S // ........................................................................................................................................................................................*............................................................................................... + mls v21.4S, v20.4S, v29.4S // .........................................................................................................................................................................................................................................*.............................................. + cmge v20.4S, v31.4S, v17.4S // ....................................................................................................................................................................................................................................................*................................... + sub v18.4S, v16.4S, v18.4S // ..........................................................................................................................................................................................*............................................................................................. 
+ mls v19.4S, v23.4S, v29.4S // ...............................................................................................................................................................................*........................................................................................................ + sub v23.4S, v20.4S, v13.4S // ......................................................................................................................................................................................................................................................*................................. + cmge v20.4S, v31.4S, v24.4S // ........................................................................................................................................................................................................*............................................................................... + cmge v16.4S, v21.4S, v30.4S // .....................................................................................................................................................................................................................................................................*.................. + cmge v13.4S, v31.4S, v21.4S // ....................................................................................................................................................................................................................................................................*................... + mls v17.4S, v23.4S, v29.4S // .......................................................................................................................................................................................................................................................*................................ 
+ cmge v23.4S, v24.4S, v30.4S // .........................................................................................................................................................................................................*.............................................................................. + sub v13.4S, v13.4S, v16.4S // ......................................................................................................................................................................................................................................................................*................. + sqrdmulh v16.4S, v27.4S, v26.4S // ...........................................................................................................................................................................................................................................*............................................ + sub v20.4S, v20.4S, v23.4S // ..........................................................................................................................................................................................................*............................................................................. + cmge v23.4S, v31.4S, v19.4S // ............................................................................................................................................................................................................*........................................................................... + mls v21.4S, v13.4S, v29.4S // .......................................................................................................................................................................................................................................................................*................ 
+ str q17, [x1, #64] // .................................................................................................................................................................................................................................................................................*...... + mul v17.4S, v27.4S, v25.4S // ..........................................................................................................................................................................................................................................*............................................. + mls v24.4S, v20.4S, v29.4S // ...........................................................................................................................................................................................................*............................................................................ + sqrdmulh v27.4S, v12.4S, v26.4S // .....................................................................................................................................................................................................................................*.................................................. + str q21, [x1, #320] // .....................................................................................................................................................................................................................................................................................*.. + mls v17.4S, v16.4S, v29.4S // ............................................................................................................................................................................................................................................*........................................... 
+ cmge v13.4S, v22.4S, v30.4S // .............................................................................................................................................................................................*.......................................................................................... + str q24, [x1, #896] // ......................................................................................................................................................................................................................*................................................................. + cmge v20.4S, v19.4S, v30.4S // .............................................................................................................................................................................................................*.......................................................................... + sqrdmulh v21.4S, v10.4S, v26.4S // ...............................................................................................................................................................................................................................*........................................................ + sub v13.4S, v14.4S, v13.4S // ..............................................................................................................................................................................................*......................................................................................... + mul v14.4S, v10.4S, v25.4S // ..............................................................................................................................................................................................................................*......................................................... 
+ ldr q10, [x1, #336] // .....e.................................................................................................................................................................................................................................................................................. + mul v16.4S, v8.4S, v25.4S // ........................................................................................................................................................................................................................*............................................................... + sub v23.4S, v23.4S, v20.4S // ..............................................................................................................................................................................................................*......................................................................... + mul v8.4S, v15.4S, v25.4S // .............................................................................................................................................................................................................................................*.......................................... + sqrdmulh v20.4S, v15.4S, v26.4S // ..............................................................................................................................................................................................................................................*......................................... + mls v14.4S, v21.4S, v29.4S // ................................................................................................................................................................................................................................*....................................................... 
+ mls v19.4S, v23.4S, v29.4S // ...............................................................................................................................................................................................................*........................................................................ + mls v16.4S, v28.4S, v29.4S // ..........................................................................................................................................................................................................................*............................................................. + sqrdmulh v28.4S, v11.4S, v26.4S // ..................................................................................................................................................................................................................................*..................................................... + mls v8.4S, v20.4S, v29.4S // ...............................................................................................................................................................................................................................................*........................................ + str q19, [x1, #960] // .......................................................................................................................................................................................................................*................................................................ + mul v20.4S, v12.4S, v25.4S // ....................................................................................................................................................................................................................................*................................................... 
+ mul v19.4S, v11.4S, v25.4S // .................................................................................................................................................................................................................................*...................................................... + cmge v11.4S, v31.4S, v16.4S // ................................................................................................................................................................................................................................................*....................................... + cmge v21.4S, v17.4S, v30.4S // .........................................................................................................................................................................................................................................................................*.............. + cmge v23.4S, v8.4S, v30.4S // .............................................................................................................................................................................................................................................................................*.......... + mls v22.4S, v13.4S, v29.4S // ...............................................................................................................................................................................................*........................................................................................ + cmge v15.4S, v31.4S, v8.4S // ............................................................................................................................................................................................................................................................................*........... 
+ mls v19.4S, v28.4S, v29.4S // ...................................................................................................................................................................................................................................*.................................................... + mls v9.4S, v18.4S, v29.4S // ...........................................................................................................................................................................................*............................................................................................ + ldr q13, [x1, #272] // ....e................................................................................................................................................................................................................................................................................... + cmge v28.4S, v16.4S, v30.4S // .................................................................................................................................................................................................................................................*...................................... + cmge v18.4S, v19.4S, v30.4S // .............................................................................................................................................................................................................................................................*.......................... + str q9, [x1, #640] // ..................................................................................................................................................................................................................*..................................................................... 
+ cmge v12.4S, v31.4S, v19.4S // ............................................................................................................................................................................................................................................................*........................... + str q22, [x1, #704] // ...................................................................................................................................................................................................................*.................................................................... + cmge v24.4S, v31.4S, v17.4S // ........................................................................................................................................................................................................................................................................*............... + sub v15.4S, v15.4S, v23.4S // ..............................................................................................................................................................................................................................................................................*......... + ldr q23, [x1, #400] // ......e................................................................................................................................................................................................................................................................................. + sub v9.4S, v11.4S, v28.4S // ..................................................................................................................................................................................................................................................*..................................... 
+ ldr q28, [x1, #464] // .......e................................................................................................................................................................................................................................................................................ + mls v20.4S, v27.4S, v29.4S // ......................................................................................................................................................................................................................................*................................................. + sub v11.4S, v24.4S, v21.4S // ..........................................................................................................................................................................................................................................................................*............. + ldr q27, [x1, #528] // ........e............................................................................................................................................................................................................................................................................... + mls v16.4S, v9.4S, v29.4S // ...................................................................................................................................................................................................................................................*.................................... + add v9.4S, v13.4S, v10.4S // ...........................e............................................................................................................................................................................................................................................................ 
+ mls v17.4S, v11.4S, v29.4S // ...........................................................................................................................................................................................................................................................................*............ + sub v22.4S, v13.4S, v10.4S // ..........................e............................................................................................................................................................................................................................................................. + ldr q10, [x1, #592] // .........e.............................................................................................................................................................................................................................................................................. + cmge v13.4S, v31.4S, v20.4S // ................................................................................................................................................................................................................................................................*....................... + cmge v21.4S, v20.4S, v30.4S // .................................................................................................................................................................................................................................................................*...................... + add v11.4S, v23.4S, v28.4S // ................................e....................................................................................................................................................................................................................................................... 
+ str q16, [x1], #(16) // ................................................................................................................................................................................................................................................................................*....... + sub v24.4S, v23.4S, v28.4S // ...............................e........................................................................................................................................................................................................................................................ + sub v28.4S, v13.4S, v21.4S // ..................................................................................................................................................................................................................................................................*..................... + str q17, [x1, #368] // ......................................................................................................................................................................................................................................................................................*. + add v17.4S, v9.4S, v11.4S // ...................................................................e.................................................................................................................................................................................................................... + add v21.4S, v27.4S, v10.4S // .....................................e.................................................................................................................................................................................................................................................. 
+ sqrdmulh v23.4S, v22.4S, v4.S[3] // .............................e.......................................................................................................................................................................................................................................................... + mul v13.4S, v22.4S, v4.S[2] // ............................e........................................................................................................................................................................................................................................................... + ldr q16, [x1, #896] // ..............e......................................................................................................................................................................................................................................................................... + ldr q22, [x1, #960] // ...............e........................................................................................................................................................................................................................................................................ + mls v8.4S, v15.4S, v29.4S // ...............................................................................................................................................................................................................................................................................*........ + sub v15.4S, v12.4S, v18.4S // ..............................................................................................................................................................................................................................................................*......................... 
+ ldr q12, [x1, #128] // ..e..................................................................................................................................................................................................................................................................................... + mul v18.4S, v24.4S, v5.S[0] // .................................e...................................................................................................................................................................................................................................................... + sqrdmulh v24.4S, v24.4S, v5.S[1] // ..................................e..................................................................................................................................................................................................................................................... + mls v13.4S, v23.4S, v29.4S // ..............................e......................................................................................................................................................................................................................................................... + mls v19.4S, v15.4S, v29.4S // ...............................................................................................................................................................................................................................................................*........................ + sub v23.4S, v16.4S, v22.4S // ...................................................e.................................................................................................................................................................................................................................... 
+ add v22.4S, v16.4S, v22.4S // ....................................................e................................................................................................................................................................................................................................... + str q8, [x1, #432] // .......................................................................................................................................................................................................................................................................................* + ldr q16, [x1, #192] // ...e.................................................................................................................................................................................................................................................................................... + mls v20.4S, v28.4S, v29.4S // ...................................................................................................................................................................................................................................................................*.................... + mls v18.4S, v24.4S, v29.4S // ...................................e.................................................................................................................................................................................................................................................... + ldr q15, [x1, #64] // .e...................................................................................................................................................................................................................................................................................... 
+ sub v28.4S, v9.4S, v11.4S // ..................................................................e..................................................................................................................................................................................................................... + sub v11.4S, v27.4S, v10.4S // ....................................e................................................................................................................................................................................................................................................... + str q19, [x1, #176] // ...................................................................................................................................................................................................................................................................................*.... + ldr q9, [x1, #0] // e....................................................................................................................................................................................................................................................................................... + cmge v8.4S, v31.4S, v14.4S // ........................................................................................................................................................................................................................................................*............................... + cmge v24.4S, v14.4S, v30.4S // .........................................................................................................................................................................................................................................................*.............................. 
+ add v19.4S, v13.4S, v18.4S // ........................................................................e............................................................................................................................................................................................................... + sub v10.4S, v13.4S, v18.4S // .......................................................................e................................................................................................................................................................................................................ + str q20, [x1, #240] // ....................................................................................................................................................................................................................................................................................*... + sub v18.4S, v8.4S, v24.4S // ..........................................................................................................................................................................................................................................................*............................. + add v13.4S, v12.4S, v16.4S // ......................e................................................................................................................................................................................................................................................................. + ldr q20, [x1, #640] // ..........e............................................................................................................................................................................................................................................................................. 
+ ldr q8, [x1, #704] // ...........e............................................................................................................................................................................................................................................................................ + sub v24.4S, v9.4S, v15.4S // ................e....................................................................................................................................................................................................................................................................... + sqrdmulh v27.4S, v10.4S, v2.S[1] // ..........................................................................e............................................................................................................................................................................................................. + mls v14.4S, v18.4S, v29.4S // ...........................................................................................................................................................................................................................................................*............................ + add v15.4S, v9.4S, v15.4S // .................e...................................................................................................................................................................................................................................................................... + sqrdmulh v9.4S, v24.4S, v3.S[3] // ...................e.................................................................................................................................................................................................................................................................... 
+ mul v18.4S, v24.4S, v3.S[2] // ..................e..................................................................................................................................................................................................................................................................... + mul v24.4S, v10.4S, v2.S[0] // .........................................................................e.............................................................................................................................................................................................................. + str q14, [x1, #112] // ..................................................................................................................................................................................................................................................................................*..... + sub v10.4S, v20.4S, v8.4S // .........................................e.............................................................................................................................................................................................................................................. + sqrdmulh v14.4S, v28.4S, v2.S[1] // .....................................................................e.................................................................................................................................................................................................................. + add v8.4S, v20.4S, v8.4S // ..........................................e............................................................................................................................................................................................................................................. 
+ sub v20.4S, v15.4S, v13.4S // ........................................................e............................................................................................................................................................................................................................... + mls v24.4S, v27.4S, v29.4S // ...........................................................................e............................................................................................................................................................................................................ + add v27.4S, v15.4S, v13.4S // .........................................................e.............................................................................................................................................................................................................................. + mls v18.4S, v9.4S, v29.4S // ....................e................................................................................................................................................................................................................................................................... + sub v16.4S, v12.4S, v16.4S // .....................e.................................................................................................................................................................................................................................................................. + mul v13.4S, v28.4S, v2.S[0] // ....................................................................e................................................................................................................................................................................................................... 
+ sqrdmulh v15.4S, v10.4S, v6.S[1] // ............................................e........................................................................................................................................................................................................................................... + mul v28.4S, v16.4S, v4.S[0] // .......................e................................................................................................................................................................................................................................................................ + sqrdmulh v16.4S, v16.4S, v4.S[1] // ........................e............................................................................................................................................................................................................................................................... + mul v9.4S, v10.4S, v6.S[0] // ...........................................e............................................................................................................................................................................................................................................ + sqrdmulh v12.4S, v23.4S, v7.S[1] // ......................................................e................................................................................................................................................................................................................................. + mul v23.4S, v23.4S, v7.S[0] // .....................................................e.................................................................................................................................................................................................................................. 
+ mls v13.4S, v14.4S, v29.4S // ......................................................................e................................................................................................................................................................................................................. + mls v28.4S, v16.4S, v29.4S // .........................e.............................................................................................................................................................................................................................................................. + sub v16.4S, v18.4S, v28.4S // .............................................................e.......................................................................................................................................................................................................................... + add v28.4S, v18.4S, v28.4S // ..............................................................e......................................................................................................................................................................................................................... + ldr q18, [x1, #832] // .............e.......................................................................................................................................................................................................................................................................... + sqrdmulh v10.4S, v11.4S, v5.S[3] // .......................................e................................................................................................................................................................................................................................................ 
+ mls v23.4S, v12.4S, v29.4S // .......................................................e................................................................................................................................................................................................................................ + mul v14.4S, v11.4S, v5.S[2] // ......................................e................................................................................................................................................................................................................................................. + sub v12.4S, v28.4S, v19.4S // .....................................................................................................e.................................................................................................................................................................................. + mls v9.4S, v15.4S, v29.4S // .............................................e.......................................................................................................................................................................................................................................... + ldr q15, [x1, #768] // ............e........................................................................................................................................................................................................................................................................... + add v11.4S, v28.4S, v19.4S // ......................................................................................................e................................................................................................................................................................................. 
+ mul v19.4S, v20.4S, v1.S[2] // ..........................................................e............................................................................................................................................................................................................................. + sqrdmulh v20.4S, v20.4S, v1.S[3] // ...........................................................e............................................................................................................................................................................................................................ + mls v14.4S, v10.4S, v29.4S // ........................................e............................................................................................................................................................................................................................................... + sqrdmulh v28.4S, v16.4S, v1.S[3] // ................................................................e....................................................................................................................................................................................................................... + mul v16.4S, v16.4S, v1.S[2] // ...............................................................e........................................................................................................................................................................................................................ + sub v10.4S, v15.4S, v18.4S // ..............................................e......................................................................................................................................................................................................................................... 
+ mls v19.4S, v20.4S, v29.4S // ............................................................e........................................................................................................................................................................................................................... + sub v20.4S, v14.4S, v9.4S // .................................................................................e...................................................................................................................................................................................................... + add v9.4S, v14.4S, v9.4S // ..................................................................................e..................................................................................................................................................................................................... + mul v14.4S, v10.4S, v6.S[2] // ................................................e....................................................................................................................................................................................................................................... + sqrdmulh v10.4S, v10.4S, v6.S[3] // .................................................e...................................................................................................................................................................................................................................... + mls v16.4S, v28.4S, v29.4S // .................................................................e...................................................................................................................................................................................................................... 
+ add v28.4S, v27.4S, v17.4S // .................................................................................................e...................................................................................................................................................................................... + add v15.4S, v15.4S, v18.4S // ...............................................e........................................................................................................................................................................................................................................ + sub v17.4S, v27.4S, v17.4S // ................................................................................................e....................................................................................................................................................................................... + mls v14.4S, v10.4S, v29.4S // ..................................................e..................................................................................................................................................................................................................................... + add v10.4S, v19.4S, v13.4S // ...........................................................................................................e............................................................................................................................................................................ + sub v18.4S, v19.4S, v13.4S // ..........................................................................................................e............................................................................................................................................................................. 
+ add v13.4S, v14.4S, v23.4S // ............................................................................................e........................................................................................................................................................................................... + sub v27.4S, v14.4S, v23.4S // ...........................................................................................e............................................................................................................................................................................................ + sub v23.4S, v21.4S, v8.4S // ............................................................................e........................................................................................................................................................................................................... + add v8.4S, v21.4S, v8.4S // .............................................................................e.......................................................................................................................................................................................................... + sub v19.4S, v16.4S, v24.4S // ...............................................................................................................e........................................................................................................................................................................ + add v21.4S, v16.4S, v24.4S // ................................................................................................................e....................................................................................................................................................................... 
+ mul v14.4S, v18.4S, v0.S[2] // ............................................................................................................e........................................................................................................................................................................... + sqrdmulh v24.4S, v18.4S, v0.S[3] // .............................................................................................................e.......................................................................................................................................................................... + add v18.4S, v9.4S, v13.4S // ..........................................................................................................................e............................................................................................................................................................. + sub v9.4S, v9.4S, v13.4S // .........................................................................................................................e.............................................................................................................................................................. + mul v13.4S, v20.4S, v2.S[2] // ...................................................................................e.................................................................................................................................................................................................... + sqrdmulh v16.4S, v20.4S, v2.S[3] // ....................................................................................e................................................................................................................................................................................................... 
+ sub v20.4S, v15.4S, v22.4S // ......................................................................................e................................................................................................................................................................................................. + add v22.4S, v15.4S, v22.4S // .......................................................................................e................................................................................................................................................................................................ + mul v15.4S, v9.4S, v1.S[0] // ...........................................................................................................................e............................................................................................................................................................ + sqrdmulh v9.4S, v9.4S, v1.S[1] // ............................................................................................................................e........................................................................................................................................................... + mls v14.4S, v24.4S, v29.4S // ..............................................................................................................e......................................................................................................................................................................... + mul v24.4S, v12.4S, v0.S[2] // .......................................................................................................e................................................................................................................................................................................ 
+ mls v13.4S, v16.4S, v29.4S // .....................................................................................e.................................................................................................................................................................................................. + mul v16.4S, v19.4S, v0.S[2] // .................................................................................................................e...................................................................................................................................................................... + mls v15.4S, v9.4S, v29.4S // .............................................................................................................................e.......................................................................................................................................................... + sqrdmulh v12.4S, v12.4S, v0.S[3] // ........................................................................................................e............................................................................................................................................................................... + sqrdmulh v19.4S, v19.4S, v0.S[3] // ..................................................................................................................e..................................................................................................................................................................... + add v9.4S, v11.4S, v18.4S // ..............................................................................................................................................e......................................................................................................................................... 
+ sub v11.4S, v11.4S, v18.4S // .............................................................................................................................................e.......................................................................................................................................... + mul v18.4S, v23.4S, v2.S[2] // ..............................................................................e......................................................................................................................................................................................................... + sqrdmulh v23.4S, v23.4S, v2.S[3] // ...............................................................................e........................................................................................................................................................................................................ + mls v16.4S, v19.4S, v29.4S // ...................................................................................................................e.................................................................................................................................................................... + mls v24.4S, v12.4S, v29.4S // .........................................................................................................e.............................................................................................................................................................................. + mls v18.4S, v23.4S, v29.4S // ................................................................................e....................................................................................................................................................................................................... 
+ mul v12.4S, v17.4S, v0.S[2] // ..................................................................................................e..................................................................................................................................................................................... + sub v19.4S, v8.4S, v22.4S // ....................................................................................................................e................................................................................................................................................................... + add v8.4S, v8.4S, v22.4S // .....................................................................................................................e.................................................................................................................................................................. + sqrdmulh v22.4S, v17.4S, v0.S[3] // ...................................................................................................e.................................................................................................................................................................................... + sqrdmulh v23.4S, v19.4S, v1.S[1] // .......................................................................................................................e................................................................................................................................................................ + mul v19.4S, v19.4S, v1.S[0] // ......................................................................................................................e................................................................................................................................................................. 
+ mul v17.4S, v27.4S, v3.S[0] // .............................................................................................e.......................................................................................................................................................................................... + sqrdmulh v27.4S, v27.4S, v3.S[1] // ..............................................................................................e......................................................................................................................................................................................... + mls v12.4S, v22.4S, v29.4S // ....................................................................................................e................................................................................................................................................................................... + sub v22.4S, v28.4S, v8.4S // ........................................................................................................................................e............................................................................................................................................... + mls v19.4S, v23.4S, v29.4S // ........................................................................................................................e............................................................................................................................................................... + add v8.4S, v28.4S, v8.4S // .........................................................................................................................................e.............................................................................................................................................. 
+ sqrdmulh v28.4S, v11.4S, v0.S[1] // ................................................................................................................................................e....................................................................................................................................... + mls v17.4S, v27.4S, v29.4S // ...............................................................................................e........................................................................................................................................................................................ + mul v27.4S, v11.4S, v0.S[0] // ...............................................................................................................................................e........................................................................................................................................ + sqrdmulh v11.4S, v20.4S, v3.S[1] // .........................................................................................e.............................................................................................................................................................................................. + sub v23.4S, v12.4S, v19.4S // ............................................................................................................................................................e........................................................................................................................... + add v12.4S, v12.4S, v19.4S // .............................................................................................................................................................e.......................................................................................................................... 
+ mul v19.4S, v20.4S, v3.S[0] // ........................................................................................e............................................................................................................................................................................................... + sub v20.4S, v13.4S, v17.4S // ...................................................................................................................................e.................................................................................................................................................... + mls v27.4S, v28.4S, v29.4S // .................................................................................................................................................e...................................................................................................................................... + add v17.4S, v13.4S, v17.4S // ....................................................................................................................................e................................................................................................................................................... + sqrdmulh v13.4S, v23.4S, v0.S[1] // ...............................................................................................................................................................e........................................................................................................................ + sqrdmulh v28.4S, v20.4S, v1.S[1] // ......................................................................................................................................e................................................................................................................................................. 
+ mls v19.4S, v11.4S, v29.4S // ..........................................................................................e............................................................................................................................................................................................. + mul v20.4S, v20.4S, v1.S[0] // .....................................................................................................................................e.................................................................................................................................................. + + // original source code + // ldr q8, [x1, #0] // .....................................................................e......................................................................................................................|................................................................................................................................................................e................. + // ldr q9, [x1, #(1*(512/8))] // .................................................................e..........................................................................................................................|............................................................................................................................................................e..................... + // ldr q10, [x1, #(2*(512/8))] // ......................................................e.....................................................................................................................................|.................................................................................................................................................e................................ 
+ // ldr q11, [x1, #(3*(512/8))] // ..............................................................e.............................................................................................................................|.........................................................................................................................................................e........................ + // ldr q12, [x1, #(4*(512/8))] // ....................e.......................................................................................................................................................................|...............................................................................................................e.................................................................. + // ldr q13, [x1, #(5*(512/8))] // e...........................................................................................................................................................................................|...........................................................................................e...................................................................................... + // ldr q14, [x1, #(6*(512/8))] // ............................e...............................................................................................................................................................|.......................................................................................................................e.......................................................... 
+ // ldr q15, [x1, #(7*(512/8))] // ..............................e.............................................................................................................................................................|.........................................................................................................................e........................................................ + // ldr q16, [x1, #(8*(512/8))] // .................................e..........................................................................................................................................................|............................................................................................................................e..................................................... + // ldr q17, [x1, #(9*(512/8))] // ......................................e.....................................................................................................................................................|.................................................................................................................................e................................................ + // ldr q18, [x1, #(10*(512/8))] // .............................................................................e..............................................................................................................|........................................................................................................................................................................e......... 
+ // ldr q19, [x1, #(11*(512/8))] // ..............................................................................e.............................................................................................................|.........................................................................................................................................................................e........ + // ldr q20, [x1, #(12*(512/8))] // ................................................................................................................e...........................................................................|.................................................................................................................................................................................. + // ldr q21, [x1, #(13*(512/8))] // ..........................................................................................................e.................................................................................|.................................................................................................................................................................................. + // ldr q22, [x1, #(14*(512/8))] // ..................................................e.........................................................................................................................................|.............................................................................................................................................e.................................... 
+ // ldr q23, [x1, #(15*(512/8))] // ...................................................e........................................................................................................................................|..............................................................................................................................................e................................... + // sub v24.4s, v8.4s, v9.4s // ...............................................................................e............................................................................................................|..........................................................................................................................................................................e....... + // add v8.4s, v8.4s, v9.4s // ..................................................................................e.........................................................................................................|.............................................................................................................................................................................e.... + // mul v9.4s, v24.4s, v3.s[2] // ....................................................................................e.......................................................................................................|...............................................................................................................................................................................e.. 
+ // sqrdmulh v24.4s, v24.4s, v3.s[3] // ...................................................................................e........................................................................................................|..............................................................................................................................................................................e... + // mls v9.4s, v24.4s, v29.4s // .............................................................................................e..............................................................................................|.................................................................................................................................................................................. + // sub v24.4s, v10.4s, v11.4s // ..............................................................................................e.............................................................................................|.................................................................................................................................................................................. + // add v10.4s, v10.4s, v11.4s // ............................................................................e...............................................................................................................|.......................................................................................................................................................................e.......... 
+ // mul v11.4s, v24.4s, v4.s[0] // .................................................................................................e..........................................................................................|.................................................................................................................................................................................. + // sqrdmulh v24.4s, v24.4s, v4.s[1] // ..................................................................................................e.........................................................................................|.................................................................................................................................................................................. + // mls v11.4s, v24.4s, v29.4s // .......................................................................................................e....................................................................................|.................................................................................................................................................................................. + // sub v24.4s, v12.4s, v13.4s // .....................................e......................................................................................................................................................|................................................................................................................................e................................................. 
+ // add v12.4s, v12.4s, v13.4s // ...................................e........................................................................................................................................................|..............................................................................................................................e................................................... + // mul v13.4s, v24.4s, v4.s[2] // .................................................e..........................................................................................................................................|............................................................................................................................................e..................................... + // sqrdmulh v24.4s, v24.4s, v4.s[3] // ................................................e...........................................................................................................................................|...........................................................................................................................................e...................................... + // mls v13.4s, v24.4s, v29.4s // .........................................................e..................................................................................................................................|....................................................................................................................................................e............................. 
+ // sub v24.4s, v14.4s, v15.4s // ...........................................e................................................................................................................................................|......................................................................................................................................e........................................... + // add v14.4s, v14.4s, v15.4s // .........................................e..................................................................................................................................................|....................................................................................................................................e............................................. + // mul v15.4s, v24.4s, v5.s[0] // .......................................................e....................................................................................................................................|..................................................................................................................................................e............................... + // sqrdmulh v24.4s, v24.4s, v5.s[1] // ........................................................e...................................................................................................................................|...................................................................................................................................................e.............................. 
+ // mls v15.4s, v24.4s, v29.4s // ................................................................e...........................................................................................................................|...........................................................................................................................................................e...................... + // sub v24.4s, v16.4s, v17.4s // ...................................................................e........................................................................................................................|..............................................................................................................................................................e................... + // add v16.4s, v16.4s, v17.4s // ...............................................e............................................................................................................................................|..........................................................................................................................................e....................................... + // mul v17.4s, v24.4s, v5.s[2] // .............................................................................................................e..............................................................................|.................................................................................................................................................................................. 
+ // sqrdmulh v24.4s, v24.4s, v5.s[3] // ...........................................................................................................e................................................................................|.................................................................................................................................................................................. + // mls v17.4s, v24.4s, v29.4s // ....................................................................................................................e.......................................................................|.................................................................................................................................................................................. + // sub v24.4s, v18.4s, v19.4s // .......................................................................................e....................................................................................................|.................................................................................................................................................................................. + // add v18.4s, v18.4s, v19.4s // .........................................................................................e..................................................................................................|.................................................................................................................................................................................. 
+ // mul v19.4s, v24.4s, v6.s[0] // ...................................................................................................e........................................................................................|.................................................................................................................................................................................. + // sqrdmulh v24.4s, v24.4s, v6.s[1] // ................................................................................................e...........................................................................................|.................................................................................................................................................................................. + // mls v19.4s, v24.4s, v29.4s // ...............................................................................................................e............................................................................|.................................................................................................................................................................................. + // sub v24.4s, v20.4s, v21.4s // .......................................................................................................................e....................................................................|.................................................................................................................................................................................. 
+ // add v20.4s, v20.4s, v21.4s // ...............................................................................................................................e............................................................|.................................................................................................................................................................................. + // mul v21.4s, v24.4s, v6.s[2] // ...........................................................................................................................e................................................................|.................................................................................................................................................................................. + // sqrdmulh v24.4s, v24.4s, v6.s[3] // ............................................................................................................................e...............................................................|.................................................................................................................................................................................. + // mls v21.4s, v24.4s, v29.4s // .................................................................................................................................e..........................................................|.................................................................................................................................................................................. 
+ // sub v24.4s, v22.4s, v23.4s // ...........................................................e................................................................................................................................|......................................................................................................................................................e........................... + // add v22.4s, v22.4s, v23.4s // ............................................................e...............................................................................................................................|.......................................................................................................................................................e.......................... + // mul v23.4s, v24.4s, v7.s[0] // .....................................................................................................e......................................................................................|.................................................................................................................................................................................. + // sqrdmulh v24.4s, v24.4s, v7.s[1] // ....................................................................................................e.......................................................................................|.................................................................................................................................................................................. 
+ // mls v23.4s, v24.4s, v29.4s // ............................................................................................................e...............................................................................|.................................................................................................................................................................................. + // sub v24.4s, v8.4s, v10.4s // ..........................................................................................e.................................................................................................|.................................................................................................................................................................................. + // add v8.4s, v8.4s, v10.4s // ............................................................................................e...............................................................................................|.................................................................................................................................................................................. + // mul v10.4s, v24.4s, v1.s[2] // ..................................................................................................................e.........................................................................|.................................................................................................................................................................................. 
+ // sqrdmulh v24.4s, v24.4s, v1.s[3] // ...................................................................................................................e........................................................................|.................................................................................................................................................................................. + // mls v10.4s, v24.4s, v29.4s // ........................................................................................................................e...................................................................|.................................................................................................................................................................................. + // sub v24.4s, v9.4s, v11.4s // ........................................................................................................e...................................................................................|.................................................................................................................................................................................. + // add v9.4s, v9.4s, v11.4s // .........................................................................................................e..................................................................................|.................................................................................................................................................................................. 
+ // mul v11.4s, v24.4s, v1.s[2] // ......................................................................................................................e.....................................................................|.................................................................................................................................................................................. + // sqrdmulh v24.4s, v24.4s, v1.s[3] // .....................................................................................................................e......................................................................|.................................................................................................................................................................................. + // mls v11.4s, v24.4s, v29.4s // .............................................................................................................................e..............................................................|.................................................................................................................................................................................. + // sub v24.4s, v12.4s, v14.4s // ..................................................................e.........................................................................................................................|.............................................................................................................................................................e.................... 
+ // add v12.4s, v12.4s, v14.4s // ..............................................e.............................................................................................................................................|.........................................................................................................................................e........................................ + // mul v14.4s, v24.4s, v2.s[0] // ...............................................................................................e............................................................................................|.................................................................................................................................................................................. + // sqrdmulh v24.4s, v24.4s, v2.s[1] // ........................................................................................e...................................................................................................|.................................................................................................................................................................................. + // mls v14.4s, v24.4s, v29.4s // ......................................................................................................e.....................................................................................|.................................................................................................................................................................................. 
+ // sub v24.4s, v13.4s, v15.4s // .........................................................................e..................................................................................................................|....................................................................................................................................................................e............. + // add v13.4s, v13.4s, v15.4s // ........................................................................e...................................................................................................................|...................................................................................................................................................................e.............. + // mul v15.4s, v24.4s, v2.s[0] // .....................................................................................e......................................................................................................|................................................................................................................................................................................e. + // sqrdmulh v24.4s, v24.4s, v2.s[1] // ................................................................................e...........................................................................................................|...........................................................................................................................................................................e...... 
+ // mls v15.4s, v24.4s, v29.4s // ...........................................................................................e................................................................................................|.................................................................................................................................................................................. + // sub v24.4s, v16.4s, v18.4s // ......................................................................................................................................e.....................................................|.................................................................................................................................................................................. + // add v16.4s, v16.4s, v18.4s // .......................................................................................................................................e....................................................|.................................................................................................................................................................................. + // mul v18.4s, v24.4s, v2.s[2] // .............................................................................................................................................................e..............................|.................................................................................................................................................................................. 
+ // sqrdmulh v24.4s, v24.4s, v2.s[3] // ..............................................................................................................................................................e.............................|.................................................................................................................................................................................. + // mls v18.4s, v24.4s, v29.4s // .................................................................................................................................................................e..........................|.................................................................................................................................................................................. + // sub v24.4s, v17.4s, v19.4s // .........................................................................................................................e..................................................................|.................................................................................................................................................................................. + // add v17.4s, v17.4s, v19.4s // ..........................................................................................................................e.................................................................|.................................................................................................................................................................................. 
+ // mul v19.4s, v24.4s, v2.s[2] // ..............................................................................................................................................e.............................................|.................................................................................................................................................................................. + // sqrdmulh v24.4s, v24.4s, v2.s[3] // ...............................................................................................................................................e............................................|.................................................................................................................................................................................. + // mls v19.4s, v24.4s, v29.4s // ......................................................................................................................................................e.....................................|.................................................................................................................................................................................. + // sub v24.4s, v20.4s, v22.4s // ................................................................................................................................................e...........................................|.................................................................................................................................................................................. 
+ // add v20.4s, v20.4s, v22.4s // .................................................................................................................................................e..........................................|.................................................................................................................................................................................. + // mul v22.4s, v24.4s, v3.s[0] // ....................................................................................................................................................................................e.......|.................................................................................................................................................................................. + // sqrdmulh v24.4s, v24.4s, v3.s[1] // .................................................................................................................................................................................e..........|.................................................................................................................................................................................. + // mls v22.4s, v24.4s, v29.4s // ..........................................................................................................................................................................................e.|.................................................................................................................................................................................. 
+ // sub v24.4s, v21.4s, v23.4s // .....................................................................................................................................e......................................................|.................................................................................................................................................................................. + // add v21.4s, v21.4s, v23.4s // ....................................................................................................................................e.......................................................|.................................................................................................................................................................................. + // mul v23.4s, v24.4s, v3.s[0] // ........................................................................................................................................................................e...................|.................................................................................................................................................................................. + // sqrdmulh v24.4s, v24.4s, v3.s[1] // .........................................................................................................................................................................e..................|.................................................................................................................................................................................. 
+ // mls v23.4s, v24.4s, v29.4s // ...............................................................................................................................................................................e............|.................................................................................................................................................................................. + // sub v24.4s, v8.4s, v12.4s // ................................................................................................................................e...........................................................|.................................................................................................................................................................................. + // add v8.4s, v8.4s, v12.4s // ..............................................................................................................................e.............................................................|.................................................................................................................................................................................. + // mul v12.4s, v24.4s, v0.s[2] // ..................................................................................................................................................................e.........................|.................................................................................................................................................................................. 
+ // sqrdmulh v24.4s, v24.4s, v0.s[3] // .....................................................................................................................................................................e......................|.................................................................................................................................................................................. + // mls v12.4s, v24.4s, v29.4s // ..........................................................................................................................................................................e.................|.................................................................................................................................................................................. + // sub v24.4s, v9.4s, v13.4s // ..............................................................................................................e.............................................................................|.................................................................................................................................................................................. + // add v9.4s, v9.4s, v13.4s // .................................................................................................................e..........................................................................|.................................................................................................................................................................................. 
+ // mul v13.4s, v24.4s, v0.s[2] // .....................................................................................................................................................e......................................|.................................................................................................................................................................................. + // sqrdmulh v24.4s, v24.4s, v0.s[3] // .........................................................................................................................................................e..................................|.................................................................................................................................................................................. + // mls v13.4s, v24.4s, v29.4s // ................................................................................................................................................................e...........................|.................................................................................................................................................................................. + // sub v24.4s, v10.4s, v14.4s // ...................................................................................................................................e........................................................|.................................................................................................................................................................................. 
+ // add v10.4s, v10.4s, v14.4s // ..................................................................................................................................e.........................................................|.................................................................................................................................................................................. + // mul v14.4s, v24.4s, v0.s[2] // ..........................................................................................................................................e.................................................|.................................................................................................................................................................................. + // sqrdmulh v24.4s, v24.4s, v0.s[3] // ...........................................................................................................................................e................................................|.................................................................................................................................................................................. + // mls v14.4s, v24.4s, v29.4s // ....................................................................................................................................................e.......................................|.................................................................................................................................................................................. 
+ // sub v24.4s, v11.4s, v15.4s // ........................................................................................................................................e...................................................|.................................................................................................................................................................................. + // add v11.4s, v11.4s, v15.4s // .........................................................................................................................................e..................................................|.................................................................................................................................................................................. + // mul v15.4s, v24.4s, v0.s[2] // .......................................................................................................................................................e....................................|.................................................................................................................................................................................. + // sqrdmulh v24.4s, v24.4s, v0.s[3] // ..........................................................................................................................................................e.................................|.................................................................................................................................................................................. 
+ // mls v15.4s, v24.4s, v29.4s // ...............................................................................................................................................................e............................|.................................................................................................................................................................................. + // sub v24.4s, v16.4s, v20.4s // ...................................................................................................................................................................e........................|.................................................................................................................................................................................. + // add v16.4s, v16.4s, v20.4s // ....................................................................................................................................................................e.......................|.................................................................................................................................................................................. + // mul v20.4s, v24.4s, v1.s[0] // .......................................................................................................................................................................e....................|.................................................................................................................................................................................. 
+ // sqrdmulh v24.4s, v24.4s, v1.s[1] // ......................................................................................................................................................................e.....................|.................................................................................................................................................................................. + // mls v20.4s, v24.4s, v29.4s // ............................................................................................................................................................................e...............|.................................................................................................................................................................................. + // sub v24.4s, v17.4s, v21.4s // .............................................................................................................................................e..............................................|.................................................................................................................................................................................. + // add v17.4s, v17.4s, v21.4s // ............................................................................................................................................e...............................................|.................................................................................................................................................................................. 
+ // mul v21.4s, v24.4s, v1.s[0] // ..................................................................................................................................................e.........................................|.................................................................................................................................................................................. + // sqrdmulh v24.4s, v24.4s, v1.s[1] // ...................................................................................................................................................e........................................|.................................................................................................................................................................................. + // mls v21.4s, v24.4s, v29.4s // ........................................................................................................................................................e...................................|.................................................................................................................................................................................. + // sub v24.4s, v18.4s, v22.4s // ............................................................................................................................................................................................|...*.............................................................................................................................................................................. 
+ // add v18.4s, v18.4s, v22.4s // ............................................................................................................................................................................................|.....*............................................................................................................................................................................ + // mul v22.4s, v24.4s, v1.s[0] // ............................................................................................................................................................................................|.....................*............................................................................................................................................................ + // sqrdmulh v24.4s, v24.4s, v1.s[1] // ............................................................................................................................................................................................|............................*..................................................................................................................................................... + // mls v22.4s, v24.4s, v29.4s // ............................................................................................................................................................................................|..................................*............................................................................................................................................... 
+ // sub v24.4s, v19.4s, v23.4s // .....................................................................................................................................................................................e......|.................................................................................................................................................................................. + // add v19.4s, v19.4s, v23.4s // .......................................................................................................................................................................................e....|.................................................................................................................................................................................. + // mul v23.4s, v24.4s, v1.s[0] // ...........................................................................................................................................................................................e|.................................................................................................................................................................................. + // sqrdmulh v24.4s, v24.4s, v1.s[1] // .........................................................................................................................................................................................e..|.................................................................................................................................................................................. 
+ // mls v23.4s, v24.4s, v29.4s // ............................................................................................................................................................................................*.................................................................................................................................................................................. + // sub v24.4s, v8.4s, v16.4s // ...........................................................................................................................................................................e................|.................................................................................................................................................................................. + // add v8.4s, v8.4s, v16.4s // .............................................................................................................................................................................e..............|.................................................................................................................................................................................. + // mul v16.4s, v24.4s, v0.s[0] // ............................................................................................................................................................................................|...............*.................................................................................................................................................................. 
+ // sqrdmulh v24.4s, v24.4s, v0.s[1] // ............................................................................................................................................................................................|.................*................................................................................................................................................................ + // mls v16.4s, v24.4s, v29.4s // ............................................................................................................................................................................................|......................*........................................................................................................................................................... + // sub v24.4s, v9.4s, v17.4s // ............................................................................................................................................................e...............................|.................................................................................................................................................................................. + // add v9.4s, v9.4s, v17.4s // ...........................................................................................................................................................e................................|.................................................................................................................................................................................. 
+ // mul v17.4s, v24.4s, v0.s[0] // ................................................................................................................................................................................e...........|.................................................................................................................................................................................. + // sqrdmulh v24.4s, v24.4s, v0.s[1] // ..............................................................................................................................................................................e.............|.................................................................................................................................................................................. + // mls v17.4s, v24.4s, v29.4s // ......................................................................................................................................................................................e.....|.................................................................................................................................................................................. + // sub v24.4s, v10.4s, v18.4s // ............................................................................................................................................................................................|.............................................*.................................................................................................................................... 
+ // add v10.4s, v10.4s, v18.4s // ............................................................................................................................................................................................|..............................................*................................................................................................................................... + // mul v18.4s, v24.4s, v0.s[0] // ............................................................................................................................................................................................|................................................*................................................................................................................................. + // sqrdmulh v24.4s, v24.4s, v0.s[1] // ............................................................................................................................................................................................|.................................................*................................................................................................................................ + // mls v18.4s, v24.4s, v29.4s // ............................................................................................................................................................................................|.........................................................*........................................................................................................................ 
+ // sub v24.4s, v11.4s, v19.4s // ............................................................................................................................................................................................|..*............................................................................................................................................................................... + // add v11.4s, v11.4s, v19.4s // ............................................................................................................................................................................................|.*................................................................................................................................................................................ + // mul v19.4s, v24.4s, v0.s[0] // ............................................................................................................................................................................................|..................*............................................................................................................................................................... + // sqrdmulh v24.4s, v24.4s, v0.s[1] // ............................................................................................................................................................................................|...................*.............................................................................................................................................................. 
+ // mls v19.4s, v24.4s, v29.4s // ............................................................................................................................................................................................|........................*......................................................................................................................................................... + // sub v24.4s, v12.4s, v20.4s // ..................................................................................................................................................................................e.........|.................................................................................................................................................................................. + // add v12.4s, v12.4s, v20.4s // ...................................................................................................................................................................................e........|.................................................................................................................................................................................. + // mul v20.4s, v24.4s, v0.s[0] // ............................................................................................................................................................................................|*................................................................................................................................................................................. 
+ // sqrdmulh v24.4s, v24.4s, v0.s[1] // ........................................................................................................................................................................................e...|.................................................................................................................................................................................. + // mls v20.4s, v24.4s, v29.4s // ............................................................................................................................................................................................|......*........................................................................................................................................................................... + // sub v24.4s, v13.4s, v21.4s // ............................................................................................................................................................................................|....*............................................................................................................................................................................. + // add v13.4s, v13.4s, v21.4s // ............................................................................................................................................................................................|.........*........................................................................................................................................................................ 
+ // mul v21.4s, v24.4s, v0.s[0] // ............................................................................................................................................................................................|........*......................................................................................................................................................................... + // sqrdmulh v24.4s, v24.4s, v0.s[1] // ............................................................................................................................................................................................|.......*.......................................................................................................................................................................... + // mls v21.4s, v24.4s, v29.4s // ............................................................................................................................................................................................|.............*.................................................................................................................................................................... + // sub v24.4s, v14.4s, v22.4s // ............................................................................................................................................................................................|........................................*......................................................................................................................................... 
+ // add v14.4s, v14.4s, v22.4s // ............................................................................................................................................................................................|.........................................*........................................................................................................................................ + // mul v22.4s, v24.4s, v0.s[0] // ............................................................................................................................................................................................|..................................................*............................................................................................................................... + // sqrdmulh v24.4s, v24.4s, v0.s[1] // ............................................................................................................................................................................................|............................................*..................................................................................................................................... + // mls v22.4s, v24.4s, v29.4s // ............................................................................................................................................................................................|........................................................*......................................................................................................................... 
+ // sub v24.4s, v15.4s, v23.4s // ............................................................................................................................................................................................|...........*...................................................................................................................................................................... + // add v15.4s, v15.4s, v23.4s // ............................................................................................................................................................................................|..........*....................................................................................................................................................................... + // mul v23.4s, v24.4s, v0.s[0] // ............................................................................................................................................................................................|...................................................*.............................................................................................................................. + // sqrdmulh v24.4s, v24.4s, v0.s[1] // ............................................................................................................................................................................................|......................................................*........................................................................................................................... 
+ // mls v23.4s, v24.4s, v29.4s // ............................................................................................................................................................................................|...................................................................*.............................................................................................................. + // cmge v27.4s, v31.4s, v16.4s // ............................................................................................................................................................................................|................................*................................................................................................................................................. + // cmge v28.4s, v16.4s, v30.4s // ............................................................................................................................................................................................|......................................*........................................................................................................................................... + // sub v28.4s, v27.4s, v28.4s // ............................................................................................................................................................................................|...........................................*...................................................................................................................................... 
+ // mls v16.4s, v28.4s, v29.4s // ............................................................................................................................................................................................|...............................................*.................................................................................................................................. + // cmge v27.4s, v31.4s, v17.4s // ............................................................................................................................................................................................|.......................*.......................................................................................................................................................... + // cmge v28.4s, v17.4s, v30.4s // ............................................................................................................................................................................................|.........................*........................................................................................................................................................ + // sub v28.4s, v27.4s, v28.4s // ............................................................................................................................................................................................|...........................*...................................................................................................................................................... 
+ // mls v17.4s, v28.4s, v29.4s // ............................................................................................................................................................................................|...............................*.................................................................................................................................................. + // cmge v27.4s, v31.4s, v18.4s // ............................................................................................................................................................................................|...............................................................*.................................................................................................................. + // cmge v28.4s, v18.4s, v30.4s // ............................................................................................................................................................................................|..............................................................*................................................................................................................... + // sub v28.4s, v27.4s, v28.4s // ............................................................................................................................................................................................|..................................................................*............................................................................................................... 
+ // mls v18.4s, v28.4s, v29.4s // ...................*........................................................................................................................................................................|..............................................................................................................*................................................................... + // cmge v27.4s, v31.4s, v19.4s // ............................................................................................................................................................................................|.....................................................*............................................................................................................................ + // cmge v28.4s, v19.4s, v30.4s // ............................................................................................................................................................................................|.....................................................................................*............................................................................................ + // sub v28.4s, v27.4s, v28.4s // ............................................................................................................................................................................................|.........................................................................................*........................................................................................ 
+ // mls v19.4s, v28.4s, v29.4s // ................*...........................................................................................................................................................................|...........................................................................................................*...................................................................... + // cmge v27.4s, v31.4s, v20.4s // ............................................................................................................................................................................................|..............*................................................................................................................................................................... + // cmge v28.4s, v20.4s, v30.4s // ............................................................................................................................................................................................|............*..................................................................................................................................................................... + // sub v28.4s, v27.4s, v28.4s // ............................................................................................................................................................................................|................*................................................................................................................................................................. 
+ // mls v20.4s, v28.4s, v29.4s // ............................................................................................................................................................................................|....................*............................................................................................................................................................. + // cmge v27.4s, v31.4s, v21.4s // ............................................................................................................................................................................................|..............................*................................................................................................................................................... + // cmge v28.4s, v21.4s, v30.4s // ............................................................................................................................................................................................|.............................*.................................................................................................................................................... + // sub v28.4s, v27.4s, v28.4s // ............................................................................................................................................................................................|.................................*................................................................................................................................................ 
+ // mls v21.4s, v28.4s, v29.4s // ............................................................................................................................................................................................|.....................................*............................................................................................................................................ + // cmge v27.4s, v31.4s, v22.4s // ............................................................................................................................................................................................|.....................................................................*............................................................................................................ + // cmge v28.4s, v22.4s, v30.4s // ............................................................................................................................................................................................|.........................................................................*........................................................................................................ + // sub v28.4s, v27.4s, v28.4s // ............................................................................................................................................................................................|............................................................................*..................................................................................................... 
+ // mls v22.4s, v28.4s, v29.4s // ............................................................................................................................................................................................|.................................................................................*................................................................................................ + // cmge v27.4s, v31.4s, v23.4s // ............................................................................................................................................................................................|.............................................................................*.................................................................................................... + // cmge v28.4s, v23.4s, v30.4s // ............................................................................................................................................................................................|.......................................................................................*.......................................................................................... + // sub v28.4s, v27.4s, v28.4s // ..*.........................................................................................................................................................................................|.............................................................................................*.................................................................................... 
+ // mls v23.4s, v28.4s, v29.4s // ......*.....................................................................................................................................................................................|.................................................................................................*................................................................................ + // str q16, [x1, #(8*(512/8))] // ............................................................................................................................................................................................|....................................................*............................................................................................................................. + // str q17, [x1, #(9*(512/8))] // ............................................................................................................................................................................................|.......................................*.......................................................................................................................................... + // str q18, [x1, #(10*(512/8))] // .......................*....................................................................................................................................................................|..................................................................................................................*............................................................... 
+ // str q19, [x1, #(11*(512/8))] // .........................*..................................................................................................................................................................|....................................................................................................................*............................................................. + // str q20, [x1, #(12*(512/8))] // ............................................................................................................................................................................................|..........................*....................................................................................................................................................... + // str q21, [x1, #(13*(512/8))] // ............................................................................................................................................................................................|.......................................................*.......................................................................................................................... + // str q22, [x1, #(14*(512/8))] // ............................................................................................................................................................................................|......................................................................................*........................................................................................... 
+ // str q23, [x1, #(15*(512/8))] // ..........*.................................................................................................................................................................................|.....................................................................................................*............................................................................ + // mul v16.4s, v8.4s, v25.4s // .*..........................................................................................................................................................................................|............................................................................................*..................................................................................... + // sqrdmulh v8.4s, v8.4s, v26.4s // ............................................................................................................................................................................................|............................................................*..................................................................................................................... + // mls v16.4s, v8.4s, v29.4s // .......*....................................................................................................................................................................................|..................................................................................................*............................................................................... 
+ // mul v17.4s, v9.4s, v25.4s // ............................................................................................................................................................................................|...................................*.............................................................................................................................................. + // sqrdmulh v9.4s, v9.4s, v26.4s // ............................................................................................................................................................................................|....................................*............................................................................................................................................. + // mls v17.4s, v9.4s, v29.4s // ............................................................................................................................................................................................|..........................................*....................................................................................................................................... + // mul v18.4s, v10.4s, v25.4s // ............................................................................................................................................................................................|..........................................................................................*....................................................................................... 
+ // sqrdmulh v10.4s, v10.4s, v26.4s // ............................................................................................................................................................................................|........................................................................................*......................................................................................... + // mls v18.4s, v10.4s, v29.4s // .....*......................................................................................................................................................................................|................................................................................................*................................................................................. + // mul v19.4s, v11.4s, v25.4s // ............*...............................................................................................................................................................................|.......................................................................................................*.......................................................................... + // sqrdmulh v11.4s, v11.4s, v26.4s // ........*...................................................................................................................................................................................|...................................................................................................*.............................................................................. 
+ // mls v19.4s, v11.4s, v29.4s // ..................*.........................................................................................................................................................................|.............................................................................................................*.................................................................... + // mul v20.4s, v12.4s, v25.4s // ...........*................................................................................................................................................................................|......................................................................................................*........................................................................... + // sqrdmulh v12.4s, v12.4s, v26.4s // ............................................................................................................................................................................................|..................................................................................*............................................................................................... + // mls v20.4s, v12.4s, v29.4s // ...............................*............................................................................................................................................................|..........................................................................................................................*....................................................... 
+ // mul v21.4s, v13.4s, v25.4s // ............................................................................................................................................................................................|..........................................................*....................................................................................................................... + // sqrdmulh v13.4s, v13.4s, v26.4s // ............................................................................................................................................................................................|...........................................................*...................................................................................................................... + // mls v21.4s, v13.4s, v29.4s // ............................................................................................................................................................................................|................................................................*................................................................................................................. + // mul v22.4s, v14.4s, v25.4s // ............................................................................................................................................................................................|................................................................................*................................................................................................. 
+ // sqrdmulh v14.4s, v14.4s, v26.4s // ............................................................................................................................................................................................|...........................................................................*...................................................................................................... + // mls v22.4s, v14.4s, v29.4s // ............................................................................................................................................................................................|....................................................................................*............................................................................................. + // mul v23.4s, v15.4s, v25.4s // ...*........................................................................................................................................................................................|..............................................................................................*................................................................................... + // sqrdmulh v15.4s, v15.4s, v26.4s // ....*.......................................................................................................................................................................................|...............................................................................................*.................................................................................. 
+ // mls v23.4s, v15.4s, v29.4s // .........*..................................................................................................................................................................................|....................................................................................................*............................................................................. + // cmge v27.4s, v31.4s, v16.4s // .............*..............................................................................................................................................................................|........................................................................................................*......................................................................... + // cmge v28.4s, v16.4s, v30.4s // .....................*......................................................................................................................................................................|................................................................................................................*................................................................. + // sub v28.4s, v27.4s, v28.4s // .............................*..............................................................................................................................................................|........................................................................................................................*......................................................... 
+ // mls v16.4s, v28.4s, v29.4s // ..................................*.........................................................................................................................................................|.............................................................................................................................*.................................................... + // cmge v27.4s, v31.4s, v17.4s // ............................................................................................................................................................................................|.................................................................*................................................................................................................ + // cmge v28.4s, v17.4s, v30.4s // ............................................................................................................................................................................................|.............................................................*.................................................................................................................... + // sub v28.4s, v27.4s, v28.4s // ............................................................................................................................................................................................|....................................................................*............................................................................................................. 
+ // mls v17.4s, v28.4s, v29.4s // ............................................................................................................................................................................................|........................................................................*......................................................................................................... + // cmge v27.4s, v31.4s, v18.4s // ......................................................................*.....................................................................................................................|.................................................................................................................................................................*................ + // cmge v28.4s, v18.4s, v30.4s // .......................................................................*....................................................................................................................|..................................................................................................................................................................*............... + // sub v28.4s, v27.4s, v28.4s // ...........................................................................*................................................................................................................|......................................................................................................................................................................*........... 
+ // mls v18.4s, v28.4s, v29.4s // .................................................................................*..........................................................................................................|............................................................................................................................................................................*..... + // cmge v27.4s, v31.4s, v19.4s // ........................*...................................................................................................................................................................|...................................................................................................................*.............................................................. + // cmge v28.4s, v19.4s, v30.4s // ......................*.....................................................................................................................................................................|.................................................................................................................*................................................................ + // sub v28.4s, v27.4s, v28.4s // .....................................................*......................................................................................................................................|................................................................................................................................................*................................. 
+ // mls v19.4s, v28.4s, v29.4s // ..........................................................*.................................................................................................................................|.....................................................................................................................................................*............................ + // cmge v27.4s, v31.4s, v20.4s // .......................................*....................................................................................................................................................|..................................................................................................................................*............................................... + // cmge v28.4s, v20.4s, v30.4s // ........................................*...................................................................................................................................................|...................................................................................................................................*.............................................. + // sub v28.4s, v27.4s, v28.4s // ............................................*...............................................................................................................................................|.......................................................................................................................................*.......................................... 
+ // mls v20.4s, v28.4s, v29.4s // ...............................................................*............................................................................................................................|..........................................................................................................................................................*....................... + // cmge v27.4s, v31.4s, v21.4s // ............................................................................................................................................................................................|.......................................................................*.......................................................................................................... + // cmge v28.4s, v21.4s, v30.4s // ............................................................................................................................................................................................|......................................................................*........................................................................................................... + // sub v28.4s, v27.4s, v28.4s // ............................................................................................................................................................................................|..........................................................................*....................................................................................................... 
+ // mls v21.4s, v28.4s, v29.4s // ............................................................................................................................................................................................|..............................................................................*................................................................................................... + // cmge v27.4s, v31.4s, v22.4s // ..........................*.................................................................................................................................................................|.....................................................................................................................*............................................................ + // cmge v28.4s, v22.4s, v30.4s // ..............*.............................................................................................................................................................................|.........................................................................................................*........................................................................ + // sub v28.4s, v27.4s, v28.4s // ................................*...........................................................................................................................................................|...........................................................................................................................*...................................................... 
+ // mls v22.4s, v28.4s, v29.4s // ....................................*.......................................................................................................................................................|...............................................................................................................................*.................................................. + // cmge v27.4s, v31.4s, v23.4s // .................*..........................................................................................................................................................................|............................................................................................................*..................................................................... + // cmge v28.4s, v23.4s, v30.4s // ...............*............................................................................................................................................................................|..........................................................................................................*....................................................................... + // sub v28.4s, v27.4s, v28.4s // ...........................*................................................................................................................................................................|......................................................................................................................*........................................................... 
+ // mls v23.4s, v28.4s, v29.4s // ....................................................*.......................................................................................................................................|...............................................................................................................................................*.................................. + // str q16, [x1], #(16) // ..........................................*.................................................................................................................................................|.....................................................................................................................................*............................................ + // str q17, [x1, #(-16 + 1*(512/8))] // ............................................................................................................................................................................................|...............................................................................*.................................................................................................. 
+ // str q18, [x1, #(-16 + 2*(512/8))] // ......................................................................................*.....................................................................................................|.................................................................................................................................................................................* + // str q19, [x1, #(-16 + 3*(512/8))] // ....................................................................*.......................................................................................................................|...............................................................................................................................................................*.................. + // str q20, [x1, #(-16 + 4*(512/8))] // ..........................................................................*.................................................................................................................|.....................................................................................................................................................................*............ + // str q21, [x1, #(-16 + 5*(512/8))] // ............................................................................................................................................................................................|...................................................................................*.............................................................................................. 
+ // str q22, [x1, #(-16 + 6*(512/8))] // .............................................*..............................................................................................................................................|........................................................................................................................................*......................................... + // str q23, [x1, #(-16 + 7*(512/8))] // .............................................................*..............................................................................................................................|........................................................................................................................................................*......................... + + sub count, count, #1 + cbnz count, layer1234_start + mls v20.4S, v28.4S, v29.4S // .......................................................................................................................................*................................................................................................................................................ + mul v11.4S, v23.4S, v0.S[0] // ..............................................................................................................................................................*......................................................................................................................... + add v28.4S, v18.4S, v19.4S // ...............................................................................................................................*........................................................................................................................................................ 
+ sub v23.4S, v18.4S, v19.4S // ..............................................................................................................................*......................................................................................................................................................... + cmge v19.4S, v27.4S, v30.4S // .....................................................................................................................................................................................*.................................................................................................. + cmge v18.4S, v31.4S, v27.4S // ....................................................................................................................................................................................*................................................................................................... + mls v11.4S, v13.4S, v29.4S // ................................................................................................................................................................*....................................................................................................................... + sub v13.4S, v10.4S, v28.4S // ..................................................................................................................................................*..................................................................................................................................... + add v10.4S, v10.4S, v28.4S // ...................................................................................................................................................*.................................................................................................................................... 
+ sub v28.4S, v18.4S, v19.4S // ......................................................................................................................................................................................*................................................................................................. + mul v19.4S, v13.4S, v0.S[0] // ....................................................................................................................................................*................................................................................................................................... + sqrdmulh v13.4S, v13.4S, v0.S[1] // .....................................................................................................................................................*.................................................................................................................................. + sqrdmulh v18.4S, v23.4S, v1.S[1] // .................................................................................................................................*...................................................................................................................................................... + mul v23.4S, v23.4S, v1.S[0] // ................................................................................................................................*....................................................................................................................................................... + mls v27.4S, v28.4S, v29.4S // .......................................................................................................................................................................................*................................................................................................ 
+ sub v28.4S, v24.4S, v15.4S // .................................................................................................................................................................*...................................................................................................................... + mls v19.4S, v13.4S, v29.4S // ......................................................................................................................................................*................................................................................................................................. + add v13.4S, v24.4S, v15.4S // ..................................................................................................................................................................*..................................................................................................................... + cmge v24.4S, v31.4S, v11.4S // ................................................................................................................................................................................................*....................................................................................... + mls v23.4S, v18.4S, v29.4S // ..................................................................................................................................*..................................................................................................................................................... + sub v18.4S, v16.4S, v20.4S // ...........................................................................................................................................................................*............................................................................................................ 
+ add v15.4S, v16.4S, v20.4S // ............................................................................................................................................................................*........................................................................................................... + cmge v20.4S, v19.4S, v30.4S // .........................................................................................................................................................................................*.............................................................................................. + str q27, [x1, #576] // .................................................................................................................................................................................................................*...................................................................... + cmge v27.4S, v31.4S, v19.4S // ........................................................................................................................................................................................*............................................................................................... + mul v16.4S, v22.4S, v0.S[0] // ..........................................................................................................................................*............................................................................................................................................. + sqrdmulh v22.4S, v22.4S, v0.S[1] // ...........................................................................................................................................*............................................................................................................................................ 
+ sub v20.4S, v27.4S, v20.4S // ..........................................................................................................................................................................................*............................................................................................. + mls v16.4S, v22.4S, v29.4S // ............................................................................................................................................*........................................................................................................................................... + cmge v22.4S, v11.4S, v30.4S // .................................................................................................................................................................................................*...................................................................................... + mls v19.4S, v20.4S, v29.4S // ...........................................................................................................................................................................................*............................................................................................ + add v27.4S, v14.4S, v23.4S // .......................................................................................................................................................................*................................................................................................................ + sub v20.4S, v14.4S, v23.4S // ......................................................................................................................................................................*................................................................................................................. 
+ sub v23.4S, v24.4S, v22.4S // ..................................................................................................................................................................................................*..................................................................................... + cmge v14.4S, v16.4S, v30.4S // .................................................................................................................................................................................*...................................................................................................... + cmge v24.4S, v31.4S, v16.4S // ................................................................................................................................................................................*....................................................................................................... + str q19, [x1, #640] // ..................................................................................................................................................................................................................*..................................................................... + sub v22.4S, v21.4S, v17.4S // .......................................................................................................................................................*................................................................................................................................ + mls v11.4S, v23.4S, v29.4S // ...................................................................................................................................................................................................*.................................................................................... 
+ sub v24.4S, v24.4S, v14.4S // ..................................................................................................................................................................................*..................................................................................................... + sqrdmulh v14.4S, v20.4S, v0.S[1] // .........................................................................................................................................................................*.............................................................................................................. + mul v19.4S, v22.4S, v0.S[0] // .........................................................................................................................................................*.............................................................................................................................. + sqrdmulh v23.4S, v22.4S, v0.S[1] // ..........................................................................................................................................................*............................................................................................................................. + str q11, [x1, #768] // ....................................................................................................................................................................................................................*................................................................... + add v11.4S, v21.4S, v17.4S // ........................................................................................................................................................*............................................................................................................................... 
+ mls v16.4S, v24.4S, v29.4S // ...................................................................................................................................................................................*.................................................................................................... + mul v21.4S, v28.4S, v0.S[0] // ...................................................................................................................................................................*.................................................................................................................... + mul v24.4S, v20.4S, v0.S[0] // ........................................................................................................................................................................*............................................................................................................... + mul v20.4S, v12.4S, v25.4S // ....................................................................................................................................................................................................................................*................................................... + mls v19.4S, v23.4S, v29.4S // ...........................................................................................................................................................*............................................................................................................................ + str q16, [x1, #512] // ................................................................................................................................................................................................................*....................................................................... 
+ mul v23.4S, v18.4S, v0.S[0] // .............................................................................................................................................................................*.......................................................................................................... + sqrdmulh v22.4S, v18.4S, v0.S[1] // ..............................................................................................................................................................................*......................................................................................................... + sqrdmulh v18.4S, v12.4S, v26.4S // .....................................................................................................................................................................................................................................*.................................................. + mls v24.4S, v14.4S, v29.4S // ..........................................................................................................................................................................*............................................................................................................. + cmge v17.4S, v31.4S, v19.4S // ............................................................................................................................................................................................*........................................................................................... + sqrdmulh v16.4S, v28.4S, v0.S[1] // ....................................................................................................................................................................*................................................................................................................... 
+ cmge v28.4S, v19.4S, v30.4S // .............................................................................................................................................................................................*.......................................................................................... + cmge v14.4S, v24.4S, v30.4S // .........................................................................................................................................................................................................*.............................................................................. + cmge v12.4S, v31.4S, v24.4S // ........................................................................................................................................................................................................*............................................................................... + sub v28.4S, v17.4S, v28.4S // ..............................................................................................................................................................................................*......................................................................................... + mls v23.4S, v22.4S, v29.4S // ...............................................................................................................................................................................*........................................................................................................ + sub v17.4S, v12.4S, v14.4S // ..........................................................................................................................................................................................................*............................................................................. 
+ mls v21.4S, v16.4S, v29.4S // .....................................................................................................................................................................*.................................................................................................................. + mls v19.4S, v28.4S, v29.4S // ...............................................................................................................................................................................................*........................................................................................ + sqrdmulh v14.4S, v27.4S, v26.4S // ...........................................................................................................................................................................................................................................*............................................ + cmge v28.4S, v31.4S, v23.4S // ............................................................................................................................................................................................................*........................................................................... + cmge v12.4S, v23.4S, v30.4S // .............................................................................................................................................................................................................*.......................................................................... + cmge v16.4S, v21.4S, v30.4S // .....................................................................................................................................................................................................*.................................................................................. 
+ cmge v22.4S, v31.4S, v21.4S // ....................................................................................................................................................................................................*................................................................................... + mls v24.4S, v17.4S, v29.4S // ...........................................................................................................................................................................................................*............................................................................ + mul v17.4S, v9.4S, v25.4S // ...........................................................................................................................................................................................................................*............................................................ + str q19, [x1, #704] // ...................................................................................................................................................................................................................*.................................................................... + sub v28.4S, v28.4S, v12.4S // ..............................................................................................................................................................................................................*......................................................................... + sub v12.4S, v22.4S, v16.4S // ......................................................................................................................................................................................................*................................................................................. 
+ sqrdmulh v16.4S, v9.4S, v26.4S // ............................................................................................................................................................................................................................*........................................................... + mul v22.4S, v27.4S, v25.4S // ..........................................................................................................................................................................................................................................*............................................. + mls v21.4S, v12.4S, v29.4S // .......................................................................................................................................................................................................*................................................................................ + str q24, [x1, #896] // ......................................................................................................................................................................................................................*................................................................. + mls v23.4S, v28.4S, v29.4S // ...............................................................................................................................................................................................................*........................................................................ + mul v19.4S, v11.4S, v25.4S // .................................................................................................................................................................................................................................*...................................................... 
+ sqrdmulh v11.4S, v11.4S, v26.4S // ..................................................................................................................................................................................................................................*..................................................... + mls v22.4S, v14.4S, v29.4S // ............................................................................................................................................................................................................................................*........................................... + mls v17.4S, v16.4S, v29.4S // .............................................................................................................................................................................................................................*.......................................................... + str q21, [x1, #832] // .....................................................................................................................................................................................................................*.................................................................. + cmge v12.4S, v17.4S, v30.4S // .....................................................................................................................................................................................................................................................*.................................. + cmge v27.4S, v31.4S, v17.4S // ....................................................................................................................................................................................................................................................*................................... 
+ cmge v9.4S, v31.4S, v22.4S // ........................................................................................................................................................................................................................................................................*............... + cmge v24.4S, v22.4S, v30.4S // .........................................................................................................................................................................................................................................................................*.............. + str q23, [x1, #960] // .......................................................................................................................................................................................................................*................................................................ + sub v12.4S, v27.4S, v12.4S // ......................................................................................................................................................................................................................................................*................................. + sqrdmulh v27.4S, v8.4S, v26.4S // .........................................................................................................................................................................................................................*.............................................................. + sub v24.4S, v9.4S, v24.4S // ..........................................................................................................................................................................................................................................................................*............. 
+ mul v16.4S, v8.4S, v25.4S // ........................................................................................................................................................................................................................*............................................................... + mls v17.4S, v12.4S, v29.4S // .......................................................................................................................................................................................................................................................*................................ + mls v19.4S, v11.4S, v29.4S // ...................................................................................................................................................................................................................................*.................................................... + mul v8.4S, v15.4S, v25.4S // .............................................................................................................................................................................................................................................*.......................................... + sqrdmulh v9.4S, v15.4S, v26.4S // ..............................................................................................................................................................................................................................................*......................................... + mls v16.4S, v27.4S, v29.4S // ..........................................................................................................................................................................................................................*............................................................. 
+ mul v21.4S, v13.4S, v25.4S // .......................................................................................................................................................................................................................................*................................................ + str q17, [x1, #64] // .................................................................................................................................................................................................................................................................................*...... + sqrdmulh v13.4S, v13.4S, v26.4S // ........................................................................................................................................................................................................................................*............................................... + mul v11.4S, v10.4S, v25.4S // ..............................................................................................................................................................................................................................*......................................................... + mls v20.4S, v18.4S, v29.4S // ......................................................................................................................................................................................................................................*................................................. + mls v8.4S, v9.4S, v29.4S // ...............................................................................................................................................................................................................................................*........................................ 
+ cmge v9.4S, v16.4S, v30.4S // .................................................................................................................................................................................................................................................*...................................... + cmge v12.4S, v31.4S, v16.4S // ................................................................................................................................................................................................................................................*....................................... + mls v21.4S, v13.4S, v29.4S // .........................................................................................................................................................................................................................................*.............................................. + sqrdmulh v13.4S, v10.4S, v26.4S // ...............................................................................................................................................................................................................................*........................................................ + sub v9.4S, v12.4S, v9.4S // ..................................................................................................................................................................................................................................................*..................................... + cmge v12.4S, v31.4S, v8.4S // ............................................................................................................................................................................................................................................................................*........... 
+ cmge v18.4S, v19.4S, v30.4S // .............................................................................................................................................................................................................................................................*.......................... + mls v11.4S, v13.4S, v29.4S // ................................................................................................................................................................................................................................*....................................................... + mls v16.4S, v9.4S, v29.4S // ...................................................................................................................................................................................................................................................*.................................... + cmge v23.4S, v8.4S, v30.4S // .............................................................................................................................................................................................................................................................................*.......... + cmge v14.4S, v31.4S, v20.4S // ................................................................................................................................................................................................................................................................*....................... + cmge v9.4S, v20.4S, v30.4S // .................................................................................................................................................................................................................................................................*...................... 
+ mls v22.4S, v24.4S, v29.4S // ...........................................................................................................................................................................................................................................................................*............ + sub v28.4S, v12.4S, v23.4S // ..............................................................................................................................................................................................................................................................................*......... + str q16, [x1], #(16) // ................................................................................................................................................................................................................................................................................*....... + sub v15.4S, v14.4S, v9.4S // ..................................................................................................................................................................................................................................................................*..................... + cmge v12.4S, v31.4S, v21.4S // ....................................................................................................................................................................................................................................................................*................... + mls v8.4S, v28.4S, v29.4S // ...............................................................................................................................................................................................................................................................................*........ 
+ cmge v14.4S, v21.4S, v30.4S // .....................................................................................................................................................................................................................................................................*.................. + str q22, [x1, #368] // ......................................................................................................................................................................................................................................................................................*. + cmge v23.4S, v31.4S, v19.4S // ............................................................................................................................................................................................................................................................*........................... + cmge v28.4S, v11.4S, v30.4S // .........................................................................................................................................................................................................................................................*.............................. + sub v24.4S, v12.4S, v14.4S // ......................................................................................................................................................................................................................................................................*................. + cmge v9.4S, v31.4S, v11.4S // ........................................................................................................................................................................................................................................................*............................... 
+ mls v20.4S, v15.4S, v29.4S // ...................................................................................................................................................................................................................................................................*.................... + sub v13.4S, v23.4S, v18.4S // ..............................................................................................................................................................................................................................................................*......................... + str q8, [x1, #432] // .......................................................................................................................................................................................................................................................................................* + mls v21.4S, v24.4S, v29.4S // .......................................................................................................................................................................................................................................................................*................ + sub v22.4S, v9.4S, v28.4S // ..........................................................................................................................................................................................................................................................*............................. + mls v19.4S, v13.4S, v29.4S // ...............................................................................................................................................................................................................................................................*........................ 
+ mls v11.4S, v22.4S, v29.4S // ...........................................................................................................................................................................................................................................................*............................ + str q20, [x1, #240] // ....................................................................................................................................................................................................................................................................................*... + str q21, [x1, #304] // .....................................................................................................................................................................................................................................................................................*.. + str q19, [x1, #176] // ...................................................................................................................................................................................................................................................................................*.... + str q11, [x1, #112] // ..................................................................................................................................................................................................................................................................................*..... 
+ + pop_stack + ret \ No newline at end of file diff --git a/tests/ntt_dilithium/manual/intt_dilithium_123_45678.s b/tests/ntt_dilithium/manual/intt_dilithium_123_45678.s index e505eed..efd5336 100644 --- a/tests/ntt_dilithium/manual/intt_dilithium_123_45678.s +++ b/tests/ntt_dilithium/manual/intt_dilithium_123_45678.s @@ -67,7 +67,7 @@ xtmp1 .req x11 cmge \tmp1\().4s, \neg_modulus_half\().4s, \a\().4s cmge \tmp2\().4s, \a\().4s, \modulus_half\().4s sub \tmp2\().4s, \tmp1\().4s, \tmp2\().4s - vmls \a, \tmp2, modulus + vmls \a, \tmp2, consts .endm .macro gs_butterfly a, b, root, idx0, idx1 @@ -76,12 +76,6 @@ xtmp1 .req x11 mulmodq \b, tmp, \root, \idx0, \idx1 .endm -.macro mulmod_v dst, src, const, const_twisted - vmul \dst, \src, \const - vqrdmulh \src, \src, \const_twisted - vmls \dst, \src, modulus -.endm - .macro gs_butterfly_v a, b, root, root_twisted sub tmp.4s, \a\().4s, \b\().4s add \a\().4s, \a\().4s, \b\().4s @@ -193,7 +187,7 @@ xtmp1 .req x11 trn2 \data_out3\().4s, \data_in2\().4s, \data_in3\().4s .endm -.macro save_gprs // slothy:no-unfold +.macro save_gprs // @slothy:no-unfold sub sp, sp, #(16*6) stp x19, x20, [sp, #16*0] stp x19, x20, [sp, #16*0] @@ -204,7 +198,7 @@ xtmp1 .req x11 stp x29, x30, [sp, #16*5] .endm -.macro restore_gprs // slothy:no-unfold +.macro restore_gprs // @slothy:no-unfold ldp x19, x20, [sp, #16*0] ldp x21, x22, [sp, #16*1] ldp x23, x24, [sp, #16*2] @@ -214,7 +208,7 @@ xtmp1 .req x11 add sp, sp, #(16*6) .endm -.macro save_vregs // slothy:no-unfold +.macro save_vregs // @slothy:no-unfold sub sp, sp, #(16*4) stp d8, d9, [sp, #16*0] stp d10, d11, [sp, #16*1] @@ -222,7 +216,7 @@ xtmp1 .req x11 stp d14, d15, [sp, #16*3] .endm -.macro restore_vregs // slothy:no-unfold +.macro restore_vregs // @slothy:no-unfold ldp d8, d9, [sp, #16*0] ldp d10, d11, [sp, #16*1] ldp d12, d13, [sp, #16*2] @@ -233,19 +227,19 @@ xtmp1 .req x11 #define STACK_SIZE 16 #define STACK0 0 -.macro restore a, loc // slothy:no-unfold +.macro restore a, loc // 
@slothy:no-unfold ldr \a, [sp, #\loc\()] .endm -.macro save loc, a // slothy:no-unfold +.macro save loc, a // @slothy:no-unfold str \a, [sp, #\loc\()] .endm -.macro push_stack // slothy:no-unfold +.macro push_stack // @slothy:no-unfold save_gprs save_vregs sub sp, sp, #STACK_SIZE .endm -.macro pop_stack // slothy:no-unfold +.macro pop_stack // @slothy:no-unfold add sp, sp, #STACK_SIZE restore_vregs restore_gprs @@ -371,8 +365,6 @@ _intt_dilithium_123_45678: consts .req v8 qform_consts .req q8 - modulus .req v29 - ASM_LOAD(r_ptr0, roots_l345) ASM_LOAD(r_ptr1, roots_l67) @@ -477,7 +469,7 @@ layer45678_start: ASM_LOAD(xtmp, ninv_tw_addr) ld1r {ninv_tw.4s}, [xtmp] - ushr modulus_half.4S, modulus.4S, #1 + ushr modulus_half.4S, consts.4S, #1 neg neg_modulus_half.4S, modulus_half.4S mov count, #8 diff --git a/tests/ntt_dilithium/manual/intt_dilithium_123_45678_manual_ld4.s b/tests/ntt_dilithium/manual/intt_dilithium_123_45678_manual_ld4.s index 067c375..464e047 100644 --- a/tests/ntt_dilithium/manual/intt_dilithium_123_45678_manual_ld4.s +++ b/tests/ntt_dilithium/manual/intt_dilithium_123_45678_manual_ld4.s @@ -67,7 +67,7 @@ xtmp1 .req x11 cmge \tmp1\().4s, \neg_modulus_half\().4s, \a\().4s cmge \tmp2\().4s, \a\().4s, \modulus_half\().4s sub \tmp2\().4s, \tmp1\().4s, \tmp2\().4s - vmls \a, \tmp2, modulus + vmls \a, \tmp2, consts .endm .macro gs_butterfly a, b, root, idx0, idx1 @@ -76,12 +76,6 @@ xtmp1 .req x11 mulmodq \b, tmp, \root, \idx0, \idx1 .endm -.macro mulmod_v dst, src, const, const_twisted - vmul \dst, \src, \const - vqrdmulh \src, \src, \const_twisted - vmls \dst, \src, modulus -.endm - .macro gs_butterfly_v a, b, root, root_twisted sub tmp.4s, \a\().4s, \b\().4s add \a\().4s, \a\().4s, \b\().4s @@ -193,7 +187,7 @@ xtmp1 .req x11 trn2 \data_out3\().4s, \data_in2\().4s, \data_in3\().4s .endm -.macro save_gprs // slothy:no-unfold +.macro save_gprs // @slothy:no-unfold sub sp, sp, #(16*6) stp x19, x20, [sp, #16*0] stp x19, x20, [sp, #16*0] @@ -204,7 +198,7 @@ 
xtmp1 .req x11 stp x29, x30, [sp, #16*5] .endm -.macro restore_gprs // slothy:no-unfold +.macro restore_gprs // @slothy:no-unfold ldp x19, x20, [sp, #16*0] ldp x21, x22, [sp, #16*1] ldp x23, x24, [sp, #16*2] @@ -214,7 +208,7 @@ xtmp1 .req x11 add sp, sp, #(16*6) .endm -.macro save_vregs // slothy:no-unfold +.macro save_vregs // @slothy:no-unfold sub sp, sp, #(16*4) stp d8, d9, [sp, #16*0] stp d10, d11, [sp, #16*1] @@ -222,7 +216,7 @@ xtmp1 .req x11 stp d14, d15, [sp, #16*3] .endm -.macro restore_vregs // slothy:no-unfold +.macro restore_vregs // @slothy:no-unfold ldp d8, d9, [sp, #16*0] ldp d10, d11, [sp, #16*1] ldp d12, d13, [sp, #16*2] @@ -233,19 +227,19 @@ xtmp1 .req x11 #define STACK_SIZE 16 #define STACK0 0 -.macro restore a, loc // slothy:no-unfold +.macro restore a, loc // @slothy:no-unfold ldr \a, [sp, #\loc\()] .endm -.macro save loc, a // slothy:no-unfold +.macro save loc, a // @slothy:no-unfold str \a, [sp, #\loc\()] .endm -.macro push_stack // slothy:no-unfold +.macro push_stack // @slothy:no-unfold save_gprs save_vregs sub sp, sp, #STACK_SIZE .endm -.macro pop_stack // slothy:no-unfold +.macro pop_stack // @slothy:no-unfold add sp, sp, #STACK_SIZE restore_vregs restore_gprs @@ -371,8 +365,6 @@ _intt_dilithium_123_45678_manual_ld4: consts .req v8 qform_consts .req q8 - modulus .req v29 - ASM_LOAD(r_ptr0, roots_l345) ASM_LOAD(r_ptr1, roots_l67) @@ -486,7 +478,7 @@ layer45678_start: ASM_LOAD(xtmp, ninv_tw_addr) ld1r {ninv_tw.4s}, [xtmp] - ushr modulus_half.4S, modulus.4S, #1 + ushr modulus_half.4S, consts.4S, #1 neg neg_modulus_half.4S, modulus_half.4S mov count, #8 diff --git a/tests/ntt_dilithium/manual/intt_dilithium_123_45678_manual_ld4_opt_a55.s b/tests/ntt_dilithium/manual/intt_dilithium_123_45678_manual_ld4_opt_a55.s index 41059cf..cb0727d 100644 --- a/tests/ntt_dilithium/manual/intt_dilithium_123_45678_manual_ld4_opt_a55.s +++ b/tests/ntt_dilithium/manual/intt_dilithium_123_45678_manual_ld4_opt_a55.s @@ -67,7 +67,7 @@ xtmp1 .req x11 cmge 
\tmp1\().4s, \neg_modulus_half\().4s, \a\().4s cmge \tmp2\().4s, \a\().4s, \modulus_half\().4s sub \tmp2\().4s, \tmp1\().4s, \tmp2\().4s - vmls \a, \tmp2, modulus + vmls \a, \tmp2, consts .endm .macro gs_butterfly a, b, root, idx0, idx1 @@ -76,12 +76,6 @@ xtmp1 .req x11 mulmodq \b, tmp, \root, \idx0, \idx1 .endm -.macro mulmod_v dst, src, const, const_twisted - vmul \dst, \src, \const - vqrdmulh \src, \src, \const_twisted - vmls \dst, \src, modulus -.endm - .macro gs_butterfly_v a, b, root, root_twisted sub tmp.4s, \a\().4s, \b\().4s add \a\().4s, \a\().4s, \b\().4s @@ -393,1743 +387,1940 @@ _intt_dilithium_123_45678_manual_ld4_opt_a55: qform_root3_tw .req q7 .p2align 2 - ldr q22, [x4, #48] // ........................* - // gap // ......................... - // gap // ......................... - // gap // ......................... - ldr q23, [x1, #32] // .....*................... - // gap // ......................... - // gap // ......................... - // gap // ......................... - ldr q16, [x1, #0] // ...*..................... - // gap // ......................... - // gap // ......................... - // gap // ......................... - ldr q2, [x1, #16] // ....*.................... - // gap // ......................... - // gap // ......................... - // gap // ......................... - ldr q25, [x5, #32] // ..*...................... - // gap // ......................... - // gap // ......................... - // gap // ......................... - ldr q11, [x5, #48] // ............*............ - // gap // ......................... - // gap // ......................... - // gap // ......................... - ldr q10, [x5, #80] // ..............*.......... - // gap // ......................... - // gap // ......................... - // gap // ......................... - ldr q19, [x5, #96] // ...............*......... - // gap // ......................... - // gap // ......................... - // gap // ......................... 
- ldr q6, [x5, #128] // .................*....... - // gap // ......................... - // gap // ......................... - // gap // ......................... - ldr q18, [x5, #144] // ..................*...... - // gap // ......................... - // gap // ......................... - // gap // ......................... - ldr q27, [x4], #64 // .....................*... - // gap // ......................... - // gap // ......................... - // gap // ......................... - ldr q4, [x2, #48] // ...........*............. - // gap // ......................... - // gap // ......................... - // gap // ......................... - ldr q3, [x5, #64] // .............*........... - // gap // ......................... - // gap // ......................... - // gap // ......................... - ldr q21, [x1, #48] // ......*.................. - // gap // ......................... - // gap // ......................... - // gap // ......................... - ldr q1, [x5, #112] // ................*........ - // gap // ......................... - // gap // ......................... - // gap // ......................... - ldr q29, [x5, #160] // ...................*..... - // gap // ......................... - // gap // ......................... - // gap // ......................... - ldr q28, [x4, #-48] // ......................*.. - // gap // ......................... - // gap // ......................... - // gap // ......................... - ldr q5, [x4, #-32] // .......................*. - // gap // ......................... - // gap // ......................... - // gap // ......................... - ldr q26, [x2, #0] // .......*................. - // gap // ......................... - // gap // ......................... - // gap // ......................... - ldr q17, [x2, #16] // .........*............... - // gap // ......................... - // gap // ......................... - // gap // ......................... 
- ldr q30, [x2, #32] // ..........*.............. - // gap // ......................... - // gap // ......................... - // gap // ......................... - trn1 v20.4S, v23.4S, v21.4S // ........*................ - // gap // ......................... - ldr q24, [x5, #176] // ....................*.... - // gap // ......................... - // gap // ......................... - // gap // ......................... - ldr q7, [x5, #16] // .*....................... - // gap // ......................... - // gap // ......................... - // gap // ......................... - ldr q13, [x5], #(12*16) // *........................ - // gap // ......................... + ldr q3, [x4, #48] // ........................* + // gap // ......................... + // gap // ......................... + // gap // ......................... + ldr q21, [x5, #16] // .*....................... + // gap // ......................... + // gap // ......................... + // gap // ......................... + ldr q28, [x5, #32] // ..*...................... + // gap // ......................... + // gap // ......................... + // gap // ......................... + ldr q30, [x1, #0] // ...*..................... + // gap // ......................... + // gap // ......................... + // gap // ......................... + ldr q12, [x2, #16] // .........*............... + // gap // ......................... + // gap // ......................... + // gap // ......................... + ldr q17, [x2, #32] // ..........*.............. + // gap // ......................... + // gap // ......................... + // gap // ......................... + ldr q7, [x1, #32] // .....*................... + // gap // ......................... + // gap // ......................... + // gap // ......................... + ldr q2, [x5, #96] // ...............*......... + // gap // ......................... + // gap // ......................... + // gap // ......................... 
+ ldr q14, [x5, #144] // ..................*...... + // gap // ......................... + // gap // ......................... + // gap // ......................... + ldr q5, [x5, #160] // ...................*..... + // gap // ......................... + // gap // ......................... + // gap // ......................... + ldr q29, [x5, #176] // ....................*.... + // gap // ......................... + // gap // ......................... + // gap // ......................... + ldr q23, [x4], #64 // .....................*... + // gap // ......................... + // gap // ......................... + // gap // ......................... + ldr q31, [x4, #-32] // .......................*. + // gap // ......................... + // gap // ......................... + // gap // ......................... + ldr q25, [x5, #128] // .................*....... + // gap // ......................... + // gap // ......................... + // gap // ......................... + ldr q1, [x1, #48] // ......*.................. + // gap // ......................... + // gap // ......................... + // gap // ......................... + ldr q11, [x5, #64] // .............*........... + // gap // ......................... + // gap // ......................... + // gap // ......................... + ldr q9, [x4, #-48] // ......................*.. + // gap // ......................... + // gap // ......................... + // gap // ......................... + ldr q19, [x5, #80] // ..............*.......... + // gap // ......................... + // gap // ......................... + // gap // ......................... + ldr q24, [x1, #16] // ....*.................... + // gap // ......................... + // gap // ......................... + // gap // ......................... + ldr q15, [x5], #(12*16) // *........................ + // gap // ......................... + // gap // ......................... + // gap // ......................... 
+ trn1 v6.4S, v7.4S, v1.4S // ........*................ + // gap // ......................... + ldr q18, [x5, #-144] // ............*............ + // gap // ......................... + // gap // ......................... + // gap // ......................... + ldr q13, [x2, #0] // .......*................. + // gap // ......................... + // gap // ......................... + // gap // ......................... + ldr q27, [x2, #48] // ...........*............. + // gap // ......................... + // gap // ......................... + // gap // ......................... + ldr q4, [x5, #-80] // ................*........ + // gap // ......................... // original source code - // ldr q13, [x5], #(12*16) // ........................* - // ldr q7, [x5, #-176] // .......................*. - // ldr q25, [x5, #-160] // ....*.................... - // ldr q16, [x1, #0] // ..*...................... - // ldr q2, [x1, #16] // ...*..................... - // ldr q23, [x1, #32] // .*....................... - // ldr q21, [x1, #48] // .............*........... - // ldr q26, [x2, #0] // ..................*...... - // trn1 v20.4S, v23.4S, v21.4S // .....................*... - // ldr q17, [x2, #16] // ...................*..... - // ldr q30, [x2, #32] // ....................*.... - // ldr q4, [x2, #48] // ...........*............. - // ldr q11, [x5, #-144] // .....*................... - // ldr q3, [x5, #-128] // ............*............ - // ldr q10, [x5, #-112] // ......*.................. - // ldr q19, [x5, #-96] // .......*................. - // ldr q1, [x5, #-80] // ..............*.......... - // ldr q6, [x5, #-64] // ........*................ - // ldr q18, [x5, #-48] // .........*............... - // ldr q29, [x5, #-32] // ...............*......... - // ldr q24, [x5, #-16] // ......................*.. - // ldr q27, [x4], #64 // ..........*.............. - // ldr q28, [x4, #-48] // ................*........ - // ldr q5, [x4, #-32] // .................*....... 
- // ldr q22, [x4, #-16] // *........................ + // ldr q15, [x5], #(12*16) // ...................*..... + // ldr q21, [x5, #-176] // .*....................... + // ldr q28, [x5, #-160] // ..*...................... + // ldr q30, [x1, #0] // ...*..................... + // ldr q24, [x1, #16] // ..................*...... + // ldr q7, [x1, #32] // ......*.................. + // ldr q1, [x1, #48] // ..............*.......... + // ldr q13, [x2, #0] // ......................*.. + // trn1 v6.4S, v7.4S, v1.4S // ....................*.... + // ldr q12, [x2, #16] // ....*.................... + // ldr q17, [x2, #32] // .....*................... + // ldr q27, [x2, #48] // .......................*. + // ldr q18, [x5, #-144] // .....................*... + // ldr q11, [x5, #-128] // ...............*......... + // ldr q19, [x5, #-112] // .................*....... + // ldr q2, [x5, #-96] // .......*................. + // ldr q4, [x5, #-80] // ........................* + // ldr q25, [x5, #-64] // .............*........... + // ldr q14, [x5, #-48] // ........*................ + // ldr q5, [x5, #-32] // .........*............... + // ldr q29, [x5, #-16] // ..........*.............. + // ldr q23, [x4], #64 // ...........*............. + // ldr q9, [x4, #-48] // ................*........ + // ldr q31, [x4, #-32] // ............*............ + // ldr q3, [x4, #-16] // *........................ sub count, count, #1 layer45678_start: - trn1 v0.4S, v16.4S, v2.4S // ....*................................................................................................................................................................. - // gap // ...................................................................................................................................................................... 
- trn2 v2.4S, v16.4S, v2.4S // .....*................................................................................................................................................................ - // gap // ...................................................................................................................................................................... - trn2 v16.4S, v23.4S, v21.4S // .......*.............................................................................................................................................................. - // gap // ...................................................................................................................................................................... - trn2 v23.2D, v0.2D, v20.2D // ........*............................................................................................................................................................. - // gap // ...................................................................................................................................................................... - trn1 v0.2D, v0.2D, v20.2D // ..........*........................................................................................................................................................... - // gap // ...................................................................................................................................................................... - trn2 v21.2D, v2.2D, v16.2D // .........*............................................................................................................................................................ - // gap // ...................................................................................................................................................................... 
- trn1 v2.2D, v2.2D, v16.2D // ...........*.......................................................................................................................................................... - // gap // ...................................................................................................................................................................... - sub v16.4S, v23.4S, v21.4S // ...................................*.................................................................................................................................. - // gap // ...................................................................................................................................................................... - add v23.4S, v23.4S, v21.4S // ....................................*................................................................................................................................. - // gap // ...................................................................................................................................................................... - sub v21.4S, v0.4S, v2.4S // ..............................*....................................................................................................................................... - // gap // ...................................................................................................................................................................... - add v0.4S, v0.4S, v2.4S // ...............................*...................................................................................................................................... - // gap // ...................................................................................................................................................................... 
- trn1 v2.4S, v26.4S, v17.4S // ................*..................................................................................................................................................... - // gap // ...................................................................................................................................................................... - trn2 v26.4S, v26.4S, v17.4S // .................*.................................................................................................................................................... - // gap // ...................................................................................................................................................................... - trn1 v20.4S, v30.4S, v4.4S // ..................*................................................................................................................................................... - // gap // ...................................................................................................................................................................... - trn2 v17.4S, v30.4S, v4.4S // ...................*.................................................................................................................................................. - // gap // ...................................................................................................................................................................... - mul v30.4S, v16.4S, v3.4S // .....................................*................................................................................................................................ - // gap // ...................................................................................................................................................................... 
- sqrdmulh v16.4S, v16.4S, v10.4S // ......................................*............................................................................................................................... - // gap // ...................................................................................................................................................................... - sub v4.4S, v0.4S, v23.4S // ........................................*............................................................................................................................. - // gap // ...................................................................................................................................................................... - add v0.4S, v0.4S, v23.4S // .........................................*............................................................................................................................ - // gap // ...................................................................................................................................................................... - mul v23.4S, v21.4S, v25.4S // ................................*..................................................................................................................................... - // gap // ...................................................................................................................................................................... - sqrdmulh v21.4S, v21.4S, v11.4S // .................................*.................................................................................................................................... - // gap // ...................................................................................................................................................................... 
- trn2 v25.2D, v2.2D, v20.2D // ....................*................................................................................................................................................. - // gap // ...................................................................................................................................................................... - trn2 v11.2D, v26.2D, v17.2D // .....................*................................................................................................................................................ - // gap // ...................................................................................................................................................................... - trn1 v2.2D, v2.2D, v20.2D // ......................*............................................................................................................................................... - // gap // ...................................................................................................................................................................... - trn1 v26.2D, v26.2D, v17.2D // .......................*.............................................................................................................................................. - // gap // ...................................................................................................................................................................... - mls v23.4S, v21.4S, v8.S[0] // ..................................*................................................................................................................................... - // gap // ...................................................................................................................................................................... 
- mls v30.4S, v16.4S, v8.S[0] // .......................................*.............................................................................................................................. - // gap // ...................................................................................................................................................................... - mul v16.4S, v4.4S, v13.4S // ..........................................*........................................................................................................................... - // gap // ...................................................................................................................................................................... - sqrdmulh v21.4S, v4.4S, v7.4S // ...........................................*.......................................................................................................................... - // gap // ...................................................................................................................................................................... - sub v20.4S, v2.4S, v26.4S // ........................................................*............................................................................................................. - // gap // ...................................................................................................................................................................... - sub v17.4S, v23.4S, v30.4S // .............................................*........................................................................................................................ - // gap // ...................................................................................................................................................................... 
- add v23.4S, v23.4S, v30.4S // ..............................................*....................................................................................................................... - // gap // ...................................................................................................................................................................... - mls v16.4S, v21.4S, v8.S[0] // ............................................*......................................................................................................................... - // gap // ...................................................................................................................................................................... - mul v21.4S, v17.4S, v13.4S // ...............................................*...................................................................................................................... - // gap // ...................................................................................................................................................................... - sqrdmulh v17.4S, v17.4S, v7.4S // ................................................*..................................................................................................................... - // gap // ...................................................................................................................................................................... - add v2.4S, v2.4S, v26.4S // .........................................................*............................................................................................................ - // gap // ...................................................................................................................................................................... 
- mul v26.4S, v20.4S, v6.4S // ..........................................................*........................................................................................................... - // gap // ...................................................................................................................................................................... - trn1 v30.4S, v0.4S, v23.4S // ............................................................................*......................................................................................... - // gap // ...................................................................................................................................................................... - trn2 v0.4S, v0.4S, v23.4S // .............................................................................*........................................................................................ - // gap // ...................................................................................................................................................................... - mls v21.4S, v17.4S, v8.S[0] // .................................................*.................................................................................................................... - // gap // ...................................................................................................................................................................... - sqrdmulh v23.4S, v20.4S, v18.4S // ...........................................................*.......................................................................................................... - // gap // ...................................................................................................................................................................... 
- sub v20.4S, v25.4S, v11.4S // .............................................................*........................................................................................................ - // gap // ...................................................................................................................................................................... - add v17.4S, v25.4S, v11.4S // ..............................................................*....................................................................................................... - // gap // ...................................................................................................................................................................... - trn1 v4.4S, v16.4S, v21.4S // ..............................................................................*....................................................................................... - // gap // ...................................................................................................................................................................... - mls v26.4S, v23.4S, v8.S[0] // ............................................................*......................................................................................................... - // gap // ...................................................................................................................................................................... - mul v23.4S, v20.4S, v29.4S // ...............................................................*...................................................................................................... - // gap // ...................................................................................................................................................................... 
- sqrdmulh v20.4S, v20.4S, v24.4S // ................................................................*..................................................................................................... - // gap // ...................................................................................................................................................................... - sub v25.4S, v2.4S, v17.4S // ..................................................................*................................................................................................... - // gap // ...................................................................................................................................................................... - add v2.4S, v2.4S, v17.4S // ...................................................................*.................................................................................................. - // gap // ...................................................................................................................................................................... - trn2 v16.4S, v16.4S, v21.4S // ...............................................................................*...................................................................................... - // gap // ...................................................................................................................................................................... - mls v23.4S, v20.4S, v8.S[0] // .................................................................*.................................................................................................... - // gap // ...................................................................................................................................................................... 
- mul v21.4S, v25.4S, v19.4S // ....................................................................*................................................................................................. - // gap // ...................................................................................................................................................................... - sqrdmulh v20.4S, v25.4S, v1.4S // .....................................................................*................................................................................................ - // gap // ...................................................................................................................................................................... - trn2 v17.2D, v30.2D, v4.2D // ................................................................................*..................................................................................... - // gap // ...................................................................................................................................................................... - sub v25.4S, v26.4S, v23.4S // .......................................................................*.............................................................................................. - // gap // ...................................................................................................................................................................... - add v23.4S, v26.4S, v23.4S // ........................................................................*............................................................................................. - // gap // ...................................................................................................................................................................... 
- mls v21.4S, v20.4S, v8.S[0] // ......................................................................*............................................................................................... - // gap // ...................................................................................................................................................................... - mul v26.4S, v25.4S, v19.4S // .........................................................................*............................................................................................ - // gap // ...................................................................................................................................................................... - sqrdmulh v20.4S, v25.4S, v1.4S // ..........................................................................*........................................................................................... - // gap // ...................................................................................................................................................................... - trn2 v25.2D, v0.2D, v16.2D // .................................................................................*.................................................................................... - // gap // ...................................................................................................................................................................... - trn1 v30.2D, v30.2D, v4.2D // ..................................................................................*................................................................................... - // gap // ...................................................................................................................................................................... 
- trn1 v0.2D, v0.2D, v16.2D // ...................................................................................*.................................................................................. - // gap // ...................................................................................................................................................................... - mls v26.4S, v20.4S, v8.S[0] // ...........................................................................*.......................................................................................... - // gap // ...................................................................................................................................................................... - trn1 v16.4S, v2.4S, v23.4S // ....................................................................................*................................................................................. - // gap // ...................................................................................................................................................................... - trn2 v2.4S, v2.4S, v23.4S // .....................................................................................*................................................................................ - // gap // ...................................................................................................................................................................... - sub v23.4S, v30.4S, v0.4S // ................................................................................................*..................................................................... - // gap // ...................................................................................................................................................................... 
- trn1 v20.4S, v21.4S, v26.4S // ......................................................................................*............................................................................... - // gap // ...................................................................................................................................................................... - trn2 v21.4S, v21.4S, v26.4S // .......................................................................................*.............................................................................. - // gap // ...................................................................................................................................................................... - add v0.4S, v30.4S, v0.4S // .................................................................................................*.................................................................... - // gap // ...................................................................................................................................................................... - trn2 v26.2D, v16.2D, v20.2D // ........................................................................................*............................................................................. - // gap // ...................................................................................................................................................................... - trn2 v30.2D, v2.2D, v21.2D // .........................................................................................*............................................................................ - // gap // ...................................................................................................................................................................... 
- trn1 v16.2D, v16.2D, v20.2D // ..........................................................................................*........................................................................... - // gap // ...................................................................................................................................................................... - trn1 v2.2D, v2.2D, v21.2D // ...........................................................................................*.......................................................................... - // gap // ...................................................................................................................................................................... - mul v21.4S, v23.4S, v28.S[2] // ..................................................................................................*................................................................... - // gap // ...................................................................................................................................................................... - sqrdmulh v23.4S, v23.4S, v28.S[3] // ...................................................................................................*.................................................................. - // gap // ...................................................................................................................................................................... - sub v20.4S, v17.4S, v25.4S // .....................................................................................................*................................................................ - // gap // ...................................................................................................................................................................... 
- add v17.4S, v17.4S, v25.4S // ......................................................................................................*............................................................... - // gap // ...................................................................................................................................................................... - sub v4.4S, v16.4S, v2.4S // ..........................................................................................................*........................................................... - // gap // ...................................................................................................................................................................... - mls v21.4S, v23.4S, v8.S[0] // ....................................................................................................*................................................................. - // gap // ...................................................................................................................................................................... - mul v23.4S, v20.4S, v5.S[0] // .......................................................................................................*.............................................................. - // gap // ...................................................................................................................................................................... - sqrdmulh v20.4S, v20.4S, v5.S[1] // ........................................................................................................*............................................................. - // gap // ...................................................................................................................................................................... 
- add v2.4S, v16.4S, v2.4S // ...........................................................................................................*.......................................................... - // gap // ...................................................................................................................................................................... - mul v16.4S, v4.4S, v5.S[2] // ............................................................................................................*......................................................... - // gap // ...................................................................................................................................................................... - sqrdmulh v4.4S, v4.4S, v5.S[3] // .............................................................................................................*........................................................ - // gap // ...................................................................................................................................................................... - mls v23.4S, v20.4S, v8.S[0] // .........................................................................................................*............................................................ - // gap // ...................................................................................................................................................................... - sub v20.4S, v26.4S, v30.4S // ...............................................................................................................*...................................................... - // gap // ...................................................................................................................................................................... 
- add v26.4S, v26.4S, v30.4S // ................................................................................................................*..................................................... - // gap // ...................................................................................................................................................................... - mls v16.4S, v4.4S, v8.S[0] // ..............................................................................................................*....................................................... - // gap // ...................................................................................................................................................................... - mul v30.4S, v20.4S, v22.S[0] // .................................................................................................................*.................................................... - // gap // ...................................................................................................................................................................... - sqrdmulh v20.4S, v20.4S, v22.S[1] // ..................................................................................................................*................................................... - // gap // ...................................................................................................................................................................... - sub v4.4S, v0.4S, v17.4S // ....................................................................................................................*................................................. - // gap // ...................................................................................................................................................................... 
- add v0.4S, v0.4S, v17.4S // .....................................................................................................................*................................................ - // gap // ...................................................................................................................................................................... - sub v17.4S, v21.4S, v23.4S // .........................................................................................................................*............................................ - // gap // ...................................................................................................................................................................... - mls v30.4S, v20.4S, v8.S[0] // ...................................................................................................................*.................................................. - // gap // ...................................................................................................................................................................... - mul v20.4S, v4.4S, v27.S[2] // ......................................................................................................................*............................................... - // gap // ...................................................................................................................................................................... - sqrdmulh v4.4S, v4.4S, v27.S[3] // .......................................................................................................................*.............................................. - // gap // ...................................................................................................................................................................... 
- add v23.4S, v21.4S, v23.4S // ..........................................................................................................................*........................................... - // gap // ...................................................................................................................................................................... - mul v21.4S, v17.4S, v27.S[2] // ...........................................................................................................................*.......................................... - // gap // ...................................................................................................................................................................... - sqrdmulh v17.4S, v17.4S, v27.S[3] // ............................................................................................................................*......................................... - // gap // ...................................................................................................................................................................... - mls v20.4S, v4.4S, v8.S[0] // ........................................................................................................................*............................................. - // gap // ...................................................................................................................................................................... - sub v4.4S, v2.4S, v26.4S // ..............................................................................................................................*....................................... - // gap // ...................................................................................................................................................................... 
- add v2.4S, v2.4S, v26.4S // ...............................................................................................................................*...................................... - // gap // ...................................................................................................................................................................... - mls v21.4S, v17.4S, v8.S[0] // .............................................................................................................................*........................................ - // gap // ...................................................................................................................................................................... - mul v26.4S, v4.4S, v28.S[0] // ................................................................................................................................*..................................... - // gap // ...................................................................................................................................................................... - sqrdmulh v17.4S, v4.4S, v28.S[1] // .................................................................................................................................*.................................... - // gap // ...................................................................................................................................................................... - sub v4.4S, v16.4S, v30.4S // ...................................................................................................................................*.................................. - // gap // ...................................................................................................................................................................... 
- add v16.4S, v16.4S, v30.4S // ....................................................................................................................................*................................. - // gap // ...................................................................................................................................................................... - sub v30.4S, v0.4S, v2.4S // ........................................................................................................................................*............................. - // gap // ...................................................................................................................................................................... - mls v26.4S, v17.4S, v8.S[0] // ..................................................................................................................................*................................... - // gap // ...................................................................................................................................................................... - mul v17.4S, v4.4S, v28.S[0] // .....................................................................................................................................*................................ - // gap // ...................................................................................................................................................................... - sqrdmulh v4.4S, v4.4S, v28.S[1] // ......................................................................................................................................*............................... - // gap // ...................................................................................................................................................................... 
- add v0.4S, v0.4S, v2.4S // .........................................................................................................................................*............................ - // gap // ...................................................................................................................................................................... - mul v2.4S, v30.4S, v27.S[0] // ..........................................................................................................................................*........................... - // gap // ...................................................................................................................................................................... - sqrdmulh v30.4S, v30.4S, v27.S[1] // ...........................................................................................................................................*.......................... - // gap // ...................................................................................................................................................................... - mls v17.4S, v4.4S, v8.S[0] // .......................................................................................................................................*.............................. - // gap // ...................................................................................................................................................................... - sub v4.4S, v23.4S, v16.4S // .............................................................................................................................................*........................ - // gap // ...................................................................................................................................................................... 
- add v16.4S, v23.4S, v16.4S // ..............................................................................................................................................*....................... - // gap // ...................................................................................................................................................................... - mls v2.4S, v30.4S, v8.S[0] // ............................................................................................................................................*......................... - // gap // ...................................................................................................................................................................... - mul v23.4S, v4.4S, v27.S[0] // ...............................................................................................................................................*...................... - // gap // ...................................................................................................................................................................... - sqrdmulh v30.4S, v4.4S, v27.S[1] // ................................................................................................................................................*..................... - // gap // ...................................................................................................................................................................... - sub v4.4S, v20.4S, v26.4S // ..................................................................................................................................................*................... - // gap // ...................................................................................................................................................................... 
- add v26.4S, v20.4S, v26.4S // ...................................................................................................................................................*.................. - // gap // ...................................................................................................................................................................... - sub v20.4S, v21.4S, v17.4S // .......................................................................................................................................................*.............. - // gap // ...................................................................................................................................................................... - mls v23.4S, v30.4S, v8.S[0] // .................................................................................................................................................*.................... - // gap // ...................................................................................................................................................................... - mul v30.4S, v4.4S, v27.S[0] // ....................................................................................................................................................*................. - // gap // ...................................................................................................................................................................... - sqrdmulh v4.4S, v4.4S, v27.S[1] // .....................................................................................................................................................*................ - // gap // ...................................................................................................................................................................... 
- add v21.4S, v21.4S, v17.4S // ........................................................................................................................................................*............. - // gap // ...................................................................................................................................................................... - mul v17.4S, v20.4S, v27.S[0] // .........................................................................................................................................................*............ - // gap // ...................................................................................................................................................................... - sqrdmulh v20.4S, v20.4S, v27.S[1] // ..........................................................................................................................................................*........... - // gap // ...................................................................................................................................................................... - mls v30.4S, v4.4S, v8.S[0] // ......................................................................................................................................................*............... - // gap // ...................................................................................................................................................................... - str q0, [x1], #(16*4) // ............................................................................................................................................................*......... - // gap // ...................................................................................................................................................................... 
- ldr q13, [x5], #(12*16) // ........................e............................................................................................................................................. - // gap // ...................................................................................................................................................................... - // gap // ...................................................................................................................................................................... - // gap // ...................................................................................................................................................................... - mls v17.4S, v20.4S, v8.S[0] // ...........................................................................................................................................................*.......... - // gap // ...................................................................................................................................................................... - str q16, [x1, #-48] // .............................................................................................................................................................*........ - // gap // ...................................................................................................................................................................... - ldr q7, [x5, #-176] // .........................e............................................................................................................................................ - // gap // ...................................................................................................................................................................... 
- // gap // ...................................................................................................................................................................... - // gap // ...................................................................................................................................................................... - str q26, [x1, #-32] // ..............................................................................................................................................................*....... - // gap // ...................................................................................................................................................................... - ldr q25, [x5, #-160] // ..........................e........................................................................................................................................... - // gap // ...................................................................................................................................................................... - // gap // ...................................................................................................................................................................... - // gap // ...................................................................................................................................................................... - str q21, [x1, #-16] // ...............................................................................................................................................................*...... - add x1, x1, #64 // ....................................................................................................................................................................*. 
- ldr q16, [x1, #0] // e..................................................................................................................................................................... - // gap // ...................................................................................................................................................................... - // gap // ...................................................................................................................................................................... - // gap // ...................................................................................................................................................................... - str q2, [x2], #(16*4) // ................................................................................................................................................................*..... - // gap // ...................................................................................................................................................................... - ldr q2, [x1, #16] // .e.................................................................................................................................................................... - // gap // ...................................................................................................................................................................... - // gap // ...................................................................................................................................................................... - // gap // ...................................................................................................................................................................... 
- str q23, [x2, #-48] // .................................................................................................................................................................*.... - // gap // ...................................................................................................................................................................... - ldr q23, [x1, #32] // ..e................................................................................................................................................................... - // gap // ...................................................................................................................................................................... - // gap // ...................................................................................................................................................................... - // gap // ...................................................................................................................................................................... - str q30, [x2, #-32] // ..................................................................................................................................................................*... - // gap // ...................................................................................................................................................................... - ldr q21, [x1, #48] // ...e.................................................................................................................................................................. - // gap // ...................................................................................................................................................................... 
- // gap // ...................................................................................................................................................................... - // gap // ...................................................................................................................................................................... - str q17, [x2, #-16] // ...................................................................................................................................................................*.. - add x2, x2, #64 // .....................................................................................................................................................................* - ldr q26, [x2, #0] // ............e......................................................................................................................................................... - // gap // ...................................................................................................................................................................... - // gap // ...................................................................................................................................................................... - // gap // ...................................................................................................................................................................... - trn1 v20.4S, v23.4S, v21.4S // ......e............................................................................................................................................................... - // gap // ...................................................................................................................................................................... 
- ldr q17, [x2, #16] // .............e........................................................................................................................................................ - // gap // ...................................................................................................................................................................... - // gap // ...................................................................................................................................................................... - // gap // ...................................................................................................................................................................... - ldr q30, [x2, #32] // ..............e....................................................................................................................................................... - // gap // ...................................................................................................................................................................... - // gap // ...................................................................................................................................................................... - // gap // ...................................................................................................................................................................... - ldr q4, [x2, #48] // ...............e...................................................................................................................................................... - // gap // ...................................................................................................................................................................... 
- // gap // ...................................................................................................................................................................... - // gap // ...................................................................................................................................................................... - ldr q11, [x5, #-144] // ...........................e.......................................................................................................................................... - // gap // ...................................................................................................................................................................... - // gap // ...................................................................................................................................................................... - // gap // ...................................................................................................................................................................... - ldr q3, [x5, #-128] // ............................e......................................................................................................................................... - // gap // ...................................................................................................................................................................... - // gap // ...................................................................................................................................................................... - // gap // ...................................................................................................................................................................... 
- ldr q10, [x5, #-112] // .............................e........................................................................................................................................ - // gap // ...................................................................................................................................................................... - // gap // ...................................................................................................................................................................... - // gap // ...................................................................................................................................................................... - ldr q19, [x5, #-96] // ..................................................e................................................................................................................... - // gap // ...................................................................................................................................................................... - // gap // ...................................................................................................................................................................... - // gap // ...................................................................................................................................................................... - ldr q1, [x5, #-80] // ...................................................e.................................................................................................................. - // gap // ...................................................................................................................................................................... 
- // gap // ...................................................................................................................................................................... - // gap // ...................................................................................................................................................................... - ldr q6, [x5, #-64] // ....................................................e................................................................................................................. - // gap // ...................................................................................................................................................................... - // gap // ...................................................................................................................................................................... - // gap // ...................................................................................................................................................................... - ldr q18, [x5, #-48] // .....................................................e................................................................................................................ - // gap // ...................................................................................................................................................................... - // gap // ...................................................................................................................................................................... - // gap // ...................................................................................................................................................................... 
- ldr q29, [x5, #-32] // ......................................................e............................................................................................................... - // gap // ...................................................................................................................................................................... - // gap // ...................................................................................................................................................................... - // gap // ...................................................................................................................................................................... - ldr q24, [x5, #-16] // .......................................................e.............................................................................................................. - // gap // ...................................................................................................................................................................... - // gap // ...................................................................................................................................................................... - // gap // ...................................................................................................................................................................... - ldr q27, [x4], #64 // ............................................................................................e......................................................................... - // gap // ...................................................................................................................................................................... 
- // gap // ...................................................................................................................................................................... - // gap // ...................................................................................................................................................................... - ldr q28, [x4, #-48] // .............................................................................................e........................................................................ - // gap // ...................................................................................................................................................................... - // gap // ...................................................................................................................................................................... - // gap // ...................................................................................................................................................................... - ldr q5, [x4, #-32] // ..............................................................................................e....................................................................... - // gap // ...................................................................................................................................................................... - // gap // ...................................................................................................................................................................... - // gap // ...................................................................................................................................................................... 
- ldr q22, [x4, #-16] // ...............................................................................................e...................................................................... - // gap // ...................................................................................................................................................................... - // gap // ...................................................................................................................................................................... - // gap // ...................................................................................................................................................................... + trn1 v0.4S, v30.4S, v24.4S // ....*......................................................................................................................................................................... + // gap // .............................................................................................................................................................................. + trn2 v10.4S, v30.4S, v24.4S // .....*........................................................................................................................................................................ + // gap // .............................................................................................................................................................................. + trn2 v30.4S, v7.4S, v1.4S // .......*...................................................................................................................................................................... + // gap // .............................................................................................................................................................................. 
+ trn2 v24.2D, v0.2D, v6.2D // ........*..................................................................................................................................................................... + // gap // .............................................................................................................................................................................. + trn1 v0.2D, v0.2D, v6.2D // ..........*................................................................................................................................................................... + // gap // .............................................................................................................................................................................. + trn2 v7.2D, v10.2D, v30.2D // .........*.................................................................................................................................................................... + // gap // .............................................................................................................................................................................. + trn1 v10.2D, v10.2D, v30.2D // ...........*.................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + sub v30.4S, v24.4S, v7.4S // ...................................*.......................................................................................................................................... + // gap // .............................................................................................................................................................................. 
+ add v24.4S, v24.4S, v7.4S // ....................................*......................................................................................................................................... + // gap // .............................................................................................................................................................................. + sub v7.4S, v0.4S, v10.4S // ..............................*............................................................................................................................................... + // gap // .............................................................................................................................................................................. + add v0.4S, v0.4S, v10.4S // ...............................*.............................................................................................................................................. + // gap // .............................................................................................................................................................................. + trn1 v10.4S, v13.4S, v12.4S // ................*............................................................................................................................................................. + // gap // .............................................................................................................................................................................. + trn2 v1.4S, v13.4S, v12.4S // .................*............................................................................................................................................................ + // gap // .............................................................................................................................................................................. 
+ trn1 v13.4S, v17.4S, v27.4S // ..................*........................................................................................................................................................... + // gap // .............................................................................................................................................................................. + trn2 v6.4S, v17.4S, v27.4S // ...................*.......................................................................................................................................................... + // gap // .............................................................................................................................................................................. + mul v12.4S, v30.4S, v11.4S // .....................................*........................................................................................................................................ + // gap // .............................................................................................................................................................................. + sqrdmulh v30.4S, v30.4S, v19.4S // ......................................*....................................................................................................................................... + // gap // .............................................................................................................................................................................. + sub v17.4S, v0.4S, v24.4S // ........................................*..................................................................................................................................... + // gap // .............................................................................................................................................................................. 
+ add v0.4S, v0.4S, v24.4S // .........................................*.................................................................................................................................... + // gap // .............................................................................................................................................................................. + mul v24.4S, v7.4S, v28.4S // ................................*............................................................................................................................................. + // gap // .............................................................................................................................................................................. + sqrdmulh v7.4S, v7.4S, v18.4S // .................................*............................................................................................................................................ + // gap // .............................................................................................................................................................................. + trn2 v28.2D, v10.2D, v13.2D // ....................*......................................................................................................................................................... + // gap // .............................................................................................................................................................................. + trn2 v27.2D, v1.2D, v6.2D // .....................*........................................................................................................................................................ + // gap // .............................................................................................................................................................................. 
+ trn1 v10.2D, v10.2D, v13.2D // ......................*....................................................................................................................................................... + // gap // .............................................................................................................................................................................. + trn1 v1.2D, v1.2D, v6.2D // .......................*...................................................................................................................................................... + // gap // .............................................................................................................................................................................. + mls v24.4S, v7.4S, v8.S[0] // ..................................*........................................................................................................................................... + // gap // .............................................................................................................................................................................. + mls v12.4S, v30.4S, v8.S[0] // .......................................*...................................................................................................................................... + // gap // .............................................................................................................................................................................. + mul v30.4S, v17.4S, v15.4S // ..........................................*................................................................................................................................... + // gap // .............................................................................................................................................................................. 
+ sqrdmulh v7.4S, v17.4S, v21.4S // ...........................................*.................................................................................................................................. + // gap // .............................................................................................................................................................................. + sub v13.4S, v10.4S, v1.4S // ........................................................*..................................................................................................................... + // gap // .............................................................................................................................................................................. + sub v6.4S, v24.4S, v12.4S // .............................................*................................................................................................................................ + // gap // .............................................................................................................................................................................. + add v24.4S, v24.4S, v12.4S // ..............................................*............................................................................................................................... + // gap // .............................................................................................................................................................................. + mls v30.4S, v7.4S, v8.S[0] // ............................................*................................................................................................................................. + // gap // .............................................................................................................................................................................. 
+ mul v7.4S, v6.4S, v15.4S // ...............................................*.............................................................................................................................. + // gap // .............................................................................................................................................................................. + sqrdmulh v6.4S, v6.4S, v21.4S // ................................................*............................................................................................................................. + // gap // .............................................................................................................................................................................. + add v10.4S, v10.4S, v1.4S // .........................................................*.................................................................................................................... + // gap // .............................................................................................................................................................................. + mul v1.4S, v13.4S, v25.4S // ..........................................................*................................................................................................................... + // gap // .............................................................................................................................................................................. + trn1 v12.4S, v0.4S, v24.4S // ............................................................................*................................................................................................. + // gap // .............................................................................................................................................................................. 
+ trn2 v0.4S, v0.4S, v24.4S // .............................................................................*................................................................................................ + // gap // .............................................................................................................................................................................. + mls v7.4S, v6.4S, v8.S[0] // .................................................*............................................................................................................................ + // gap // .............................................................................................................................................................................. + sqrdmulh v24.4S, v13.4S, v14.4S // ...........................................................*.................................................................................................................. + // gap // .............................................................................................................................................................................. + sub v13.4S, v28.4S, v27.4S // .............................................................*................................................................................................................ + // gap // .............................................................................................................................................................................. + add v6.4S, v28.4S, v27.4S // ..............................................................*............................................................................................................... + // gap // .............................................................................................................................................................................. 
+ trn1 v17.4S, v30.4S, v7.4S // ..............................................................................*............................................................................................... + // gap // .............................................................................................................................................................................. + mls v1.4S, v24.4S, v8.S[0] // ............................................................*................................................................................................................. + // gap // .............................................................................................................................................................................. + mul v24.4S, v13.4S, v5.4S // ...............................................................*.............................................................................................................. + // gap // .............................................................................................................................................................................. + sqrdmulh v13.4S, v13.4S, v29.4S // ................................................................*............................................................................................................. + // gap // .............................................................................................................................................................................. + sub v28.4S, v10.4S, v6.4S // ..................................................................*........................................................................................................... + // gap // .............................................................................................................................................................................. 
+ add v10.4S, v10.4S, v6.4S // ...................................................................*.......................................................................................................... + // gap // .............................................................................................................................................................................. + trn2 v30.4S, v30.4S, v7.4S // ...............................................................................*.............................................................................................. + // gap // .............................................................................................................................................................................. + mls v24.4S, v13.4S, v8.S[0] // .................................................................*............................................................................................................ + // gap // .............................................................................................................................................................................. + mul v7.4S, v28.4S, v2.4S // ....................................................................*......................................................................................................... + // gap // .............................................................................................................................................................................. + sqrdmulh v13.4S, v28.4S, v4.4S // .....................................................................*........................................................................................................ + // gap // .............................................................................................................................................................................. 
+ trn2 v6.2D, v12.2D, v17.2D // ................................................................................*............................................................................................. + // gap // .............................................................................................................................................................................. + sub v28.4S, v1.4S, v24.4S // .......................................................................*...................................................................................................... + // gap // .............................................................................................................................................................................. + add v24.4S, v1.4S, v24.4S // ........................................................................*..................................................................................................... + // gap // .............................................................................................................................................................................. + mls v7.4S, v13.4S, v8.S[0] // ......................................................................*....................................................................................................... + // gap // .............................................................................................................................................................................. + mul v1.4S, v28.4S, v2.4S // .........................................................................*.................................................................................................... + // gap // .............................................................................................................................................................................. 
+ sqrdmulh v13.4S, v28.4S, v4.4S // ..........................................................................*................................................................................................... + // gap // .............................................................................................................................................................................. + trn2 v28.2D, v0.2D, v30.2D // .................................................................................*............................................................................................ + // gap // .............................................................................................................................................................................. + trn1 v12.2D, v12.2D, v17.2D // ..................................................................................*........................................................................................... + // gap // .............................................................................................................................................................................. + trn1 v0.2D, v0.2D, v30.2D // ...................................................................................*.......................................................................................... + // gap // .............................................................................................................................................................................. + mls v1.4S, v13.4S, v8.S[0] // ...........................................................................*.................................................................................................. + // gap // .............................................................................................................................................................................. 
+ trn1 v30.4S, v10.4S, v24.4S // ....................................................................................*......................................................................................... + // gap // .............................................................................................................................................................................. + trn2 v10.4S, v10.4S, v24.4S // .....................................................................................*........................................................................................ + // gap // .............................................................................................................................................................................. + sub v24.4S, v12.4S, v0.4S // ................................................................................................*............................................................................. + // gap // .............................................................................................................................................................................. + trn1 v13.4S, v7.4S, v1.4S // ......................................................................................*....................................................................................... + // gap // .............................................................................................................................................................................. + trn2 v7.4S, v7.4S, v1.4S // .......................................................................................*...................................................................................... + // gap // .............................................................................................................................................................................. 
+ add v0.4S, v12.4S, v0.4S // .................................................................................................*............................................................................ + // gap // .............................................................................................................................................................................. + trn2 v1.2D, v30.2D, v13.2D // ........................................................................................*..................................................................................... + // gap // .............................................................................................................................................................................. + trn2 v12.2D, v10.2D, v7.2D // .........................................................................................*.................................................................................... + // gap // .............................................................................................................................................................................. + trn1 v30.2D, v30.2D, v13.2D // ..........................................................................................*................................................................................... + // gap // .............................................................................................................................................................................. + trn1 v10.2D, v10.2D, v7.2D // ...........................................................................................*.................................................................................. + // gap // .............................................................................................................................................................................. 
+ mul v7.4S, v24.4S, v9.S[2] // ..................................................................................................*........................................................................... + // gap // .............................................................................................................................................................................. + sqrdmulh v24.4S, v24.4S, v9.S[3] // ...................................................................................................*.......................................................................... + // gap // .............................................................................................................................................................................. + sub v13.4S, v6.4S, v28.4S // .....................................................................................................*........................................................................ + // gap // .............................................................................................................................................................................. + add v6.4S, v6.4S, v28.4S // ......................................................................................................*....................................................................... + // gap // .............................................................................................................................................................................. + sub v17.4S, v30.4S, v10.4S // ..........................................................................................................*................................................................... + // gap // .............................................................................................................................................................................. 
+ mls v7.4S, v24.4S, v8.S[0] // ....................................................................................................*......................................................................... + // gap // .............................................................................................................................................................................. + mul v24.4S, v13.4S, v31.S[0] // .......................................................................................................*...................................................................... + // gap // .............................................................................................................................................................................. + sqrdmulh v13.4S, v13.4S, v31.S[1] // ........................................................................................................*..................................................................... + // gap // .............................................................................................................................................................................. + add v10.4S, v30.4S, v10.4S // ...........................................................................................................*.................................................................. + // gap // .............................................................................................................................................................................. + mul v30.4S, v17.4S, v31.S[2] // ............................................................................................................*................................................................. + // gap // .............................................................................................................................................................................. 
+ sqrdmulh v17.4S, v17.4S, v31.S[3] // .............................................................................................................*................................................................ + // gap // .............................................................................................................................................................................. + mls v24.4S, v13.4S, v8.S[0] // .........................................................................................................*.................................................................... + // gap // .............................................................................................................................................................................. + sub v13.4S, v1.4S, v12.4S // ...............................................................................................................*.............................................................. + // gap // .............................................................................................................................................................................. + add v1.4S, v1.4S, v12.4S // ................................................................................................................*............................................................. + // gap // .............................................................................................................................................................................. + mls v30.4S, v17.4S, v8.S[0] // ..............................................................................................................*............................................................... + // gap // .............................................................................................................................................................................. 
+ mul v12.4S, v13.4S, v3.S[0] // .................................................................................................................*............................................................ + // gap // .............................................................................................................................................................................. + sqrdmulh v13.4S, v13.4S, v3.S[1] // ..................................................................................................................*........................................................... + // gap // .............................................................................................................................................................................. + sub v17.4S, v0.4S, v6.4S // ....................................................................................................................*......................................................... + // gap // .............................................................................................................................................................................. + add v0.4S, v0.4S, v6.4S // .....................................................................................................................*........................................................ + // gap // .............................................................................................................................................................................. + sub v6.4S, v7.4S, v24.4S // .........................................................................................................................*.................................................... + // gap // .............................................................................................................................................................................. 
+ mls v12.4S, v13.4S, v8.S[0] // ...................................................................................................................*.......................................................... + // gap // .............................................................................................................................................................................. + mul v13.4S, v17.4S, v23.S[2] // ......................................................................................................................*....................................................... + // gap // .............................................................................................................................................................................. + sqrdmulh v17.4S, v17.4S, v23.S[3] // .......................................................................................................................*...................................................... + // gap // .............................................................................................................................................................................. + add v24.4S, v7.4S, v24.4S // ..........................................................................................................................*................................................... + // gap // .............................................................................................................................................................................. + mul v7.4S, v6.4S, v23.S[2] // ...........................................................................................................................*.................................................. + // gap // .............................................................................................................................................................................. 
+ sqrdmulh v6.4S, v6.4S, v23.S[3] // ............................................................................................................................*................................................. + // gap // .............................................................................................................................................................................. + mls v13.4S, v17.4S, v8.S[0] // ........................................................................................................................*..................................................... + // gap // .............................................................................................................................................................................. + sub v17.4S, v10.4S, v1.4S // ..............................................................................................................................*............................................... + // gap // .............................................................................................................................................................................. + add v10.4S, v10.4S, v1.4S // ...............................................................................................................................*.............................................. + // gap // .............................................................................................................................................................................. + mls v7.4S, v6.4S, v8.S[0] // .............................................................................................................................*................................................ + // gap // .............................................................................................................................................................................. 
+ mul v1.4S, v17.4S, v9.S[0] // ................................................................................................................................*............................................. + // gap // .............................................................................................................................................................................. + sqrdmulh v6.4S, v17.4S, v9.S[1] // .................................................................................................................................*............................................ + // gap // .............................................................................................................................................................................. + sub v17.4S, v30.4S, v12.4S // ...................................................................................................................................*.......................................... + // gap // .............................................................................................................................................................................. + add v30.4S, v30.4S, v12.4S // ....................................................................................................................................*......................................... + // gap // .............................................................................................................................................................................. + srshr v12.4S, v0.4S, #23 // ........................................................................................................................................*..................................... + // gap // .............................................................................................................................................................................. 
+ mls v1.4S, v6.4S, v8.S[0] // ..................................................................................................................................*........................................... + // gap // .............................................................................................................................................................................. + mul v6.4S, v17.4S, v9.S[0] // .....................................................................................................................................*........................................ + // gap // .............................................................................................................................................................................. + sqrdmulh v17.4S, v17.4S, v9.S[1] // ......................................................................................................................................*....................................... + // gap // .............................................................................................................................................................................. + mls v0.4S, v12.4S, v8.4S // .........................................................................................................................................*.................................... + // gap // .............................................................................................................................................................................. + srshr v12.4S, v24.4S, #23 // ..........................................................................................................................................*................................... + // gap // .............................................................................................................................................................................. 
+ srshr v28.4S, v10.4S, #23 // ............................................................................................................................................*................................. + // gap // .............................................................................................................................................................................. + mls v6.4S, v17.4S, v8.S[0] // .......................................................................................................................................*...................................... + // gap // .............................................................................................................................................................................. + mls v24.4S, v12.4S, v8.4S // ...........................................................................................................................................*.................................. + // gap // .............................................................................................................................................................................. + mls v10.4S, v28.4S, v8.4S // .............................................................................................................................................*................................ + // gap // .............................................................................................................................................................................. + srshr v12.4S, v30.4S, #23 // ..............................................................................................................................................*............................... + // gap // .............................................................................................................................................................................. 
+ sub v17.4S, v13.4S, v1.4S // ..........................................................................................................................................................*................... + // gap // .............................................................................................................................................................................. + add v1.4S, v13.4S, v1.4S // ...........................................................................................................................................................*.................. + // gap // .............................................................................................................................................................................. + mls v30.4S, v12.4S, v8.4S // ...............................................................................................................................................*.............................. + // gap // .............................................................................................................................................................................. + sub v13.4S, v0.4S, v10.4S // ................................................................................................................................................*............................. + // gap // .............................................................................................................................................................................. + add v0.4S, v0.4S, v10.4S // .................................................................................................................................................*............................ + // gap // .............................................................................................................................................................................. 
+ mul v10.4S, v17.4S, v23.S[0] // ............................................................................................................................................................*................. + // gap // .............................................................................................................................................................................. + mul v12.4S, v13.4S, v23.S[0] // ..................................................................................................................................................*........................... + // gap // .............................................................................................................................................................................. + sqrdmulh v13.4S, v13.4S, v23.S[1] // ...................................................................................................................................................*.......................... + // gap // .............................................................................................................................................................................. + sub v28.4S, v24.4S, v30.4S // .....................................................................................................................................................*........................ + // gap // .............................................................................................................................................................................. + add v30.4S, v24.4S, v30.4S // ......................................................................................................................................................*....................... + // gap // .............................................................................................................................................................................. 
+ sqrdmulh v24.4S, v17.4S, v23.S[1] // .............................................................................................................................................................*................ + // gap // .............................................................................................................................................................................. + mls v12.4S, v13.4S, v8.S[0] // ....................................................................................................................................................*......................... + // gap // .............................................................................................................................................................................. + mul v13.4S, v28.4S, v23.S[0] // .......................................................................................................................................................*...................... + // gap // .............................................................................................................................................................................. + sqrdmulh v17.4S, v28.4S, v23.S[1] // ........................................................................................................................................................*..................... + // gap // .............................................................................................................................................................................. + mls v10.4S, v24.4S, v8.S[0] // ..............................................................................................................................................................*............... + // gap // .............................................................................................................................................................................. 
+ sub v24.4S, v7.4S, v6.4S // ...............................................................................................................................................................*.............. + // gap // .............................................................................................................................................................................. + add v7.4S, v7.4S, v6.4S // ................................................................................................................................................................*............. + // gap // .............................................................................................................................................................................. + mls v13.4S, v17.4S, v8.S[0] // .........................................................................................................................................................*.................... + // gap // .............................................................................................................................................................................. + mul v6.4S, v24.4S, v23.S[0] // .................................................................................................................................................................*............ + // gap // .............................................................................................................................................................................. + sqrdmulh v24.4S, v24.4S, v23.S[1] // ..................................................................................................................................................................*........... + // gap // .............................................................................................................................................................................. 
+ str q0, [x1], #(16*4) // ....................................................................................................................................................................*......... + // gap // .............................................................................................................................................................................. + ldr q15, [x5], #(12*16) // ........................e..................................................................................................................................................... + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + mls v6.4S, v24.4S, v8.S[0] // ...................................................................................................................................................................*.......... + // gap // .............................................................................................................................................................................. + str q30, [x1, #-48] // .....................................................................................................................................................................*........ + // gap // .............................................................................................................................................................................. 
+ ldr q21, [x5, #-176] // .........................e.................................................................................................................................................... + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + str q1, [x1, #-32] // ......................................................................................................................................................................*....... + // gap // .............................................................................................................................................................................. + ldr q28, [x5, #-160] // ..........................e................................................................................................................................................... + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. 
+ str q7, [x1, #-16] // .......................................................................................................................................................................*...... + add x1, x1, #64 // ............................................................................................................................................................................*. + ldr q30, [x1, #0] // e............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + str q12, [x2], #(16*4) // ........................................................................................................................................................................*..... + // gap // .............................................................................................................................................................................. + ldr q24, [x1, #16] // .e............................................................................................................................................................................ + // gap // .............................................................................................................................................................................. 
+ // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + str q13, [x2, #-48] // .........................................................................................................................................................................*.... + // gap // .............................................................................................................................................................................. + ldr q7, [x1, #32] // ..e........................................................................................................................................................................... + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + str q10, [x2, #-32] // ..........................................................................................................................................................................*... + // gap // .............................................................................................................................................................................. 
+ ldr q1, [x1, #48] // ...e.......................................................................................................................................................................... + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + str q6, [x2, #-16] // ...........................................................................................................................................................................*.. + add x2, x2, #64 // .............................................................................................................................................................................* + ldr q13, [x2, #0] // ............e................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. 
+ trn1 v6.4S, v7.4S, v1.4S // ......e....................................................................................................................................................................... + // gap // .............................................................................................................................................................................. + ldr q12, [x2, #16] // .............e................................................................................................................................................................ + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + ldr q17, [x2, #32] // ..............e............................................................................................................................................................... + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. 
+ ldr q27, [x2, #48] // ...............e.............................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + ldr q18, [x5, #-144] // ...........................e.................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + ldr q11, [x5, #-128] // ............................e................................................................................................................................................. + // gap // .............................................................................................................................................................................. 
+ // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + ldr q19, [x5, #-112] // .............................e................................................................................................................................................ + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + ldr q2, [x5, #-96] // ..................................................e........................................................................................................................... + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. 
+ ldr q4, [x5, #-80] // ...................................................e.......................................................................................................................... + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + ldr q25, [x5, #-64] // ....................................................e......................................................................................................................... + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + ldr q14, [x5, #-48] // .....................................................e........................................................................................................................ + // gap // .............................................................................................................................................................................. 
+ // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + ldr q5, [x5, #-32] // ......................................................e....................................................................................................................... + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + ldr q29, [x5, #-16] // .......................................................e...................................................................................................................... + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. 
+ ldr q23, [x4], #64 // ............................................................................................e................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + ldr q9, [x4, #-48] // .............................................................................................e................................................................................ + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + ldr q31, [x4, #-32] // ..............................................................................................e............................................................................... + // gap // .............................................................................................................................................................................. 
+ // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + ldr q3, [x4, #-16] // ...............................................................................................e.............................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. // original source code - // ldr q9, [x1, #0] // ........e..........................|..........................................................................................................................................e........ - // ldr q10, [x1, #16] // ..........e........................|............................................................................................................................................e...... - // ldr q11, [x1, #32] // ............e......................|..............................................................................................................................................e.... - // ldr q12, [x1, #48] // ..............e....................|................................................................................................................................................e.. 
- // trn1 v25.4s, v9.4s, v10.4s // ...................................*................................................................................................................................................... - // trn2 v26.4s, v9.4s, v10.4s // ...................................|*.................................................................................................................................................. - // trn1 v27.4s, v11.4s, v12.4s // ..................e................|................................................................................................................................................... - // trn2 v28.4s, v11.4s, v12.4s // ...................................|.*................................................................................................................................................. - // trn2 v11.2d, v25.2d, v27.2d // ...................................|..*................................................................................................................................................ - // trn2 v12.2d, v26.2d, v28.2d // ...................................|....*.............................................................................................................................................. - // trn1 v9.2d, v25.2d, v27.2d // ...................................|...*............................................................................................................................................... - // trn1 v10.2d, v26.2d, v28.2d // ...................................|.....*............................................................................................................................................. - // ldr q13, [x2, #0] // .................e.................|................................................................................................................................................... 
- // ldr q14, [x2, #16] // ...................e...............|................................................................................................................................................... - // ldr q15, [x2, #32] // ....................e..............|................................................................................................................................................... - // ldr q16, [x2, #48] // .....................e.............|................................................................................................................................................... - // trn1 v25.4s, v13.4s, v14.4s // ...................................|..........*........................................................................................................................................ - // trn2 v26.4s, v13.4s, v14.4s // ...................................|...........*....................................................................................................................................... - // trn1 v27.4s, v15.4s, v16.4s // ...................................|............*...................................................................................................................................... - // trn2 v28.4s, v15.4s, v16.4s // ...................................|.............*..................................................................................................................................... - // trn2 v15.2d, v25.2d, v27.2d // ...................................|....................*.............................................................................................................................. - // trn2 v16.2d, v26.2d, v28.2d // ...................................|.....................*............................................................................................................................. 
- // trn1 v13.2d, v25.2d, v27.2d // ...................................|......................*............................................................................................................................ - // trn1 v14.2d, v26.2d, v28.2d // ...................................|.......................*........................................................................................................................... - // ldr q0, [x5], #(12*16) // e..................................|..................................................................................................................................e................ - // ldr q4, [x5, #(-12*16 + 1*16)] // ...e...............................|.....................................................................................................................................e............. - // ldr q1, [x5, #(-12*16 + 2*16)] // .....e.............................|.......................................................................................................................................e........... - // ldr q5, [x5, #(-12*16 + 3*16)] // ......................e............|................................................................................................................................................... - // ldr q2, [x5, #(-12*16 + 4*16)] // .......................e...........|................................................................................................................................................... - // ldr q6, [x5, #(-12*16 + 5*16)] // ........................e..........|................................................................................................................................................... - // sub v24.4s, v9.4s, v10.4s // ...................................|........*.......................................................................................................................................... 
- // add v9.4s, v9.4s, v10.4s // ...................................|.........*......................................................................................................................................... - // mul v10.4s, v24.4s, v1.4s // ...................................|..................*................................................................................................................................ - // sqrdmulh v24.4s, v24.4s, v5.4s // ...................................|...................*............................................................................................................................... - // mls v10.4s, v24.4s, v8.s[0] // ...................................|........................*.......................................................................................................................... - // sub v24.4s, v11.4s, v12.4s // ...................................|......*............................................................................................................................................ - // add v11.4s, v11.4s, v12.4s // ...................................|.......*........................................................................................................................................... - // mul v12.4s, v24.4s, v2.4s // ...................................|..............*.................................................................................................................................... - // sqrdmulh v24.4s, v24.4s, v6.4s // ...................................|...............*................................................................................................................................... - // mls v12.4s, v24.4s, v8.s[0] // ...................................|.........................*......................................................................................................................... 
- // sub v24.4s, v9.4s, v11.4s // ...................................|................*.................................................................................................................................. - // add v9.4s, v9.4s, v11.4s // ...................................|.................*................................................................................................................................. - // mul v11.4s, v24.4s, v0.4s // ...................................|..........................*........................................................................................................................ - // sqrdmulh v24.4s, v24.4s, v4.4s // ...................................|...........................*....................................................................................................................... - // mls v11.4s, v24.4s, v8.s[0] // ...................................|...............................*................................................................................................................... - // sub v24.4s, v10.4s, v12.4s // ...................................|.............................*..................................................................................................................... - // add v10.4s, v10.4s, v12.4s // ...................................|..............................*.................................................................................................................... - // mul v12.4s, v24.4s, v0.4s // ...................................|................................*.................................................................................................................. - // sqrdmulh v24.4s, v24.4s, v4.4s // ...................................|.................................*................................................................................................................. 
- // mls v12.4s, v24.4s, v8.s[0] // ...................................|......................................*............................................................................................................ - // ldr q0, [x5, #(-12*16 + 6*16)] // .........................e.........|................................................................................................................................................... - // ldr q4, [x5, #(-12*16 + 7*16)] // ..........................e........|................................................................................................................................................... - // ldr q1, [x5, #(-12*16 + 8*16)] // ...........................e.......|................................................................................................................................................... - // ldr q5, [x5, #(-12*16 + 9*16)] // ............................e......|................................................................................................................................................... - // ldr q2, [x5, #(-12*16 + 10*16)] // .............................e.....|................................................................................................................................................... - // ldr q6, [x5, #(-12*16 + 11*16)] // ..............................e....|................................................................................................................................................... - // sub v24.4s, v13.4s, v14.4s // ...................................|............................*...................................................................................................................... - // add v13.4s, v13.4s, v14.4s // ...................................|..................................*................................................................................................................ 
- // mul v14.4s, v24.4s, v1.4s // ...................................|...................................*............................................................................................................... - // sqrdmulh v24.4s, v24.4s, v5.4s // ...................................|.......................................*........................................................................................................... - // mls v14.4s, v24.4s, v8.s[0] // ...................................|...........................................*....................................................................................................... - // sub v24.4s, v15.4s, v16.4s // ...................................|........................................*.......................................................................................................... - // add v15.4s, v15.4s, v16.4s // ...................................|.........................................*......................................................................................................... - // mul v16.4s, v24.4s, v2.4s // ...................................|............................................*...................................................................................................... - // sqrdmulh v24.4s, v24.4s, v6.4s // ...................................|.............................................*..................................................................................................... - // mls v16.4s, v24.4s, v8.s[0] // ...................................|.................................................*................................................................................................. - // sub v24.4s, v13.4s, v15.4s // ...................................|..............................................*.................................................................................................... 
- // add v13.4s, v13.4s, v15.4s // ...................................|...............................................*................................................................................................... - // mul v15.4s, v24.4s, v0.4s // ...................................|..................................................*................................................................................................ - // sqrdmulh v24.4s, v24.4s, v4.4s // ...................................|...................................................*............................................................................................... - // mls v15.4s, v24.4s, v8.s[0] // ...................................|.......................................................*........................................................................................... - // sub v24.4s, v14.4s, v16.4s // ...................................|.....................................................*............................................................................................. - // add v14.4s, v14.4s, v16.4s // ...................................|......................................................*............................................................................................ - // mul v16.4s, v24.4s, v0.4s // ...................................|........................................................*.......................................................................................... - // sqrdmulh v24.4s, v24.4s, v4.4s // ...................................|.........................................................*......................................................................................... - // mls v16.4s, v24.4s, v8.s[0] // ...................................|.............................................................*..................................................................................... 
- // trn1 v25.4s, v9.4s, v10.4s // ...................................|....................................*.............................................................................................................. - // trn2 v26.4s, v9.4s, v10.4s // ...................................|.....................................*............................................................................................................. - // trn1 v27.4s, v11.4s, v12.4s // ...................................|..........................................*........................................................................................................ - // trn2 v28.4s, v11.4s, v12.4s // ...................................|................................................*.................................................................................................. - // trn2 v11.2d, v25.2d, v27.2d // ...................................|....................................................*.............................................................................................. - // trn2 v12.2d, v26.2d, v28.2d // ...................................|..........................................................*........................................................................................ - // trn1 v9.2d, v25.2d, v27.2d // ...................................|...........................................................*....................................................................................... - // trn1 v10.2d, v26.2d, v28.2d // ...................................|............................................................*...................................................................................... - // trn1 v25.4s, v13.4s, v14.4s // ...................................|..............................................................*.................................................................................... 
- // trn2 v26.4s, v13.4s, v14.4s // ...................................|...............................................................*................................................................................... - // trn1 v27.4s, v15.4s, v16.4s // ...................................|.................................................................*................................................................................. - // trn2 v28.4s, v15.4s, v16.4s // ...................................|..................................................................*................................................................................ - // trn2 v15.2d, v25.2d, v27.2d // ...................................|....................................................................*.............................................................................. - // trn2 v16.2d, v26.2d, v28.2d // ...................................|.....................................................................*............................................................................. - // trn1 v13.2d, v25.2d, v27.2d // ...................................|......................................................................*............................................................................ - // trn1 v14.2d, v26.2d, v28.2d // ...................................|.......................................................................*........................................................................... - // ldr q0, [x4], #64 // ...............................e...|................................................................................................................................................... - // ldr q1, [x4, #(-64 + 16)] // ................................e..|................................................................................................................................................... 
- // ldr q2, [x4, #(-64 + 32)] // .................................e.|................................................................................................................................................... - // ldr q3, [x4, #(-64 + 48)] // ..................................e|................................................................................................................................................... - // sub v24.4s, v9.4s, v10.4s // ...................................|................................................................*.................................................................................. - // add v9.4s, v9.4s, v10.4s // ...................................|...................................................................*............................................................................... - // mul v10.4s, v24.4s, v1.s[2] // ...................................|........................................................................*.......................................................................... - // sqrdmulh v24.4s, v24.4s, v1.s[3] // ...................................|.........................................................................*......................................................................... - // mls v10.4s, v24.4s, v8.s[0] // ...................................|.............................................................................*..................................................................... - // sub v24.4s, v11.4s, v12.4s // ...................................|..........................................................................*........................................................................ - // add v11.4s, v11.4s, v12.4s // ...................................|...........................................................................*....................................................................... 
- // mul v12.4s, v24.4s, v2.s[0] // ...................................|..............................................................................*.................................................................... - // sqrdmulh v24.4s, v24.4s, v2.s[1] // ...................................|...............................................................................*................................................................... - // mls v12.4s, v24.4s, v8.s[0] // ...................................|...................................................................................*............................................................... - // sub v24.4s, v13.4s, v14.4s // ...................................|............................................................................*...................................................................... - // add v13.4s, v13.4s, v14.4s // ...................................|................................................................................*.................................................................. - // mul v14.4s, v24.4s, v2.s[2] // ...................................|.................................................................................*................................................................. - // sqrdmulh v24.4s, v24.4s, v2.s[3] // ...................................|..................................................................................*................................................................ - // mls v14.4s, v24.4s, v8.s[0] // ...................................|......................................................................................*............................................................ - // sub v24.4s, v15.4s, v16.4s // ...................................|....................................................................................*.............................................................. 
- // add v15.4s, v15.4s, v16.4s // ...................................|.....................................................................................*............................................................. - // mul v16.4s, v24.4s, v3.s[0] // ...................................|.......................................................................................*........................................................... - // sqrdmulh v24.4s, v24.4s, v3.s[1] // ...................................|........................................................................................*.......................................................... - // mls v16.4s, v24.4s, v8.s[0] // ...................................|............................................................................................*...................................................... - // sub v24.4s, v9.4s, v11.4s // ...................................|.........................................................................................*......................................................... - // add v9.4s, v9.4s, v11.4s // ...................................|..........................................................................................*........................................................ - // mul v11.4s, v24.4s, v0.s[2] // ...................................|.............................................................................................*..................................................... - // sqrdmulh v24.4s, v24.4s, v0.s[3] // ...................................|..............................................................................................*.................................................... - // mls v11.4s, v24.4s, v8.s[0] // ...................................|..................................................................................................*................................................ 
- // sub v24.4s, v10.4s, v12.4s // ...................................|...........................................................................................*....................................................... - // add v10.4s, v10.4s, v12.4s // ...................................|...............................................................................................*................................................... - // mul v12.4s, v24.4s, v0.s[2] // ...................................|................................................................................................*.................................................. - // sqrdmulh v24.4s, v24.4s, v0.s[3] // ...................................|.................................................................................................*................................................. - // mls v12.4s, v24.4s, v8.s[0] // ...................................|.....................................................................................................*............................................. - // sub v24.4s, v13.4s, v15.4s // ...................................|...................................................................................................*............................................... - // add v13.4s, v13.4s, v15.4s // ...................................|....................................................................................................*.............................................. - // mul v15.4s, v24.4s, v1.s[0] // ...................................|......................................................................................................*............................................ - // sqrdmulh v24.4s, v24.4s, v1.s[1] // ...................................|.......................................................................................................*........................................... 
- // mls v15.4s, v24.4s, v8.s[0] // ...................................|...........................................................................................................*....................................... - // sub v24.4s, v14.4s, v16.4s // ...................................|........................................................................................................*.......................................... - // add v14.4s, v14.4s, v16.4s // ...................................|.........................................................................................................*......................................... - // mul v16.4s, v24.4s, v1.s[0] // ...................................|............................................................................................................*...................................... - // sqrdmulh v24.4s, v24.4s, v1.s[1] // ...................................|.............................................................................................................*..................................... - // mls v16.4s, v24.4s, v8.s[0] // ...................................|.................................................................................................................*................................. - // sub v24.4s, v9.4s, v13.4s // ...................................|..........................................................................................................*........................................ - // add v9.4s, v9.4s, v13.4s // ...................................|..............................................................................................................*.................................... - // mul v13.4s, v24.4s, v0.s[0] // ...................................|...............................................................................................................*................................... 
- // sqrdmulh v24.4s, v24.4s, v0.s[1] // ...................................|................................................................................................................*.................................. - // mls v13.4s, v24.4s, v8.s[0] // ...................................|....................................................................................................................*.............................. - // sub v24.4s, v10.4s, v14.4s // ...................................|..................................................................................................................*................................ - // add v10.4s, v10.4s, v14.4s // ...................................|...................................................................................................................*............................... - // mul v14.4s, v24.4s, v0.s[0] // ...................................|.....................................................................................................................*............................. - // sqrdmulh v24.4s, v24.4s, v0.s[1] // ...................................|......................................................................................................................*............................ - // mls v14.4s, v24.4s, v8.s[0] // ...................................|..........................................................................................................................*........................ - // sub v24.4s, v11.4s, v15.4s // ...................................|.......................................................................................................................*........................... - // add v11.4s, v11.4s, v15.4s // ...................................|........................................................................................................................*.......................... 
- // mul v15.4s, v24.4s, v0.s[0] // ...................................|...........................................................................................................................*....................... - // sqrdmulh v24.4s, v24.4s, v0.s[1] // ...................................|............................................................................................................................*...................... - // mls v15.4s, v24.4s, v8.s[0] // ...................................|................................................................................................................................*.................. - // sub v24.4s, v12.4s, v16.4s // ...................................|.........................................................................................................................*......................... - // add v12.4s, v12.4s, v16.4s // ...................................|.............................................................................................................................*..................... - // mul v16.4s, v24.4s, v0.s[0] // ...................................|..............................................................................................................................*.................... - // sqrdmulh v24.4s, v24.4s, v0.s[1] // ...................................|...............................................................................................................................*................... - // mls v16.4s, v24.4s, v8.s[0] // .*.................................|...................................................................................................................................*............... - // str q9, [x1], #(16*4) // ...................................|.................................................................................................................................*................. 
- // str q10, [x1, #(-16*4 + 1*16)] // ..*................................|....................................................................................................................................*.............. - // str q11, [x1, #(-16*4 + 2*16)] // ....*..............................|......................................................................................................................................*............ - // str q12, [x1, #(-16*4 + 3*16)] // ......*............................|........................................................................................................................................*.......... - // str q13, [x2], #(16*4) // .........*.........................|...........................................................................................................................................*....... - // str q14, [x2, #(-16*4 + 1*16)] // ...........*.......................|.............................................................................................................................................*..... - // str q15, [x2, #(-16*4 + 2*16)] // .............*.....................|...............................................................................................................................................*... - // str q16, [x2, #(-16*4 + 3*16)] // ...............*...................|.................................................................................................................................................*. - // add x1, x1, #64 // .......*...........................|.........................................................................................................................................*......... 
- // add x2, x2, #64 // ................*..................|..................................................................................................................................................* + // ldr q9, [x1, #0] // ........e..........................|..................................................................................................................................................e........ + // ldr q10, [x1, #16] // ..........e........................|....................................................................................................................................................e...... + // ldr q11, [x1, #32] // ............e......................|......................................................................................................................................................e.... + // ldr q12, [x1, #48] // ..............e....................|........................................................................................................................................................e.. + // trn1 v25.4s, v9.4s, v10.4s // ...................................*........................................................................................................................................................... + // trn2 v26.4s, v9.4s, v10.4s // ...................................|*.......................................................................................................................................................... + // trn1 v27.4s, v11.4s, v12.4s // ..................e................|........................................................................................................................................................... + // trn2 v28.4s, v11.4s, v12.4s // ...................................|.*......................................................................................................................................................... 
+ // trn2 v11.2d, v25.2d, v27.2d // ...................................|..*........................................................................................................................................................ + // trn2 v12.2d, v26.2d, v28.2d // ...................................|....*...................................................................................................................................................... + // trn1 v9.2d, v25.2d, v27.2d // ...................................|...*....................................................................................................................................................... + // trn1 v10.2d, v26.2d, v28.2d // ...................................|.....*..................................................................................................................................................... + // ldr q13, [x2, #0] // .................e.................|........................................................................................................................................................... + // ldr q14, [x2, #16] // ...................e...............|........................................................................................................................................................... + // ldr q15, [x2, #32] // ....................e..............|........................................................................................................................................................... + // ldr q16, [x2, #48] // .....................e.............|........................................................................................................................................................... 
+ // trn1 v25.4s, v13.4s, v14.4s // ...................................|..........*................................................................................................................................................ + // trn2 v26.4s, v13.4s, v14.4s // ...................................|...........*............................................................................................................................................... + // trn1 v27.4s, v15.4s, v16.4s // ...................................|............*.............................................................................................................................................. + // trn2 v28.4s, v15.4s, v16.4s // ...................................|.............*............................................................................................................................................. + // trn2 v15.2d, v25.2d, v27.2d // ...................................|....................*...................................................................................................................................... + // trn2 v16.2d, v26.2d, v28.2d // ...................................|.....................*..................................................................................................................................... + // trn1 v13.2d, v25.2d, v27.2d // ...................................|......................*.................................................................................................................................... + // trn1 v14.2d, v26.2d, v28.2d // ...................................|.......................*................................................................................................................................... 
+ // ldr q0, [x5], #(12*16) // e..................................|..........................................................................................................................................e................ + // ldr q4, [x5, #(-12*16 + 1*16)] // ...e...............................|.............................................................................................................................................e............. + // ldr q1, [x5, #(-12*16 + 2*16)] // .....e.............................|...............................................................................................................................................e........... + // ldr q5, [x5, #(-12*16 + 3*16)] // ......................e............|........................................................................................................................................................... + // ldr q2, [x5, #(-12*16 + 4*16)] // .......................e...........|........................................................................................................................................................... + // ldr q6, [x5, #(-12*16 + 5*16)] // ........................e..........|........................................................................................................................................................... + // sub v24.4s, v9.4s, v10.4s // ...................................|........*.................................................................................................................................................. + // add v9.4s, v9.4s, v10.4s // ...................................|.........*................................................................................................................................................. 
+ // mul v10.4s, v24.4s, v1.4s // ...................................|..................*........................................................................................................................................ + // sqrdmulh v24.4s, v24.4s, v5.4s // ...................................|...................*....................................................................................................................................... + // mls v10.4s, v24.4s, v8.s[0] // ...................................|........................*.................................................................................................................................. + // sub v24.4s, v11.4s, v12.4s // ...................................|......*.................................................................................................................................................... + // add v11.4s, v11.4s, v12.4s // ...................................|.......*................................................................................................................................................... + // mul v12.4s, v24.4s, v2.4s // ...................................|..............*............................................................................................................................................ + // sqrdmulh v24.4s, v24.4s, v6.4s // ...................................|...............*........................................................................................................................................... + // mls v12.4s, v24.4s, v8.s[0] // ...................................|.........................*................................................................................................................................. 
+ // sub v24.4s, v9.4s, v11.4s // ...................................|................*.......................................................................................................................................... + // add v9.4s, v9.4s, v11.4s // ...................................|.................*......................................................................................................................................... + // mul v11.4s, v24.4s, v0.4s // ...................................|..........................*................................................................................................................................ + // sqrdmulh v24.4s, v24.4s, v4.4s // ...................................|...........................*............................................................................................................................... + // mls v11.4s, v24.4s, v8.s[0] // ...................................|...............................*........................................................................................................................... + // sub v24.4s, v10.4s, v12.4s // ...................................|.............................*............................................................................................................................. + // add v10.4s, v10.4s, v12.4s // ...................................|..............................*............................................................................................................................ + // mul v12.4s, v24.4s, v0.4s // ...................................|................................*.......................................................................................................................... 
+ // sqrdmulh v24.4s, v24.4s, v4.4s // ...................................|.................................*......................................................................................................................... + // mls v12.4s, v24.4s, v8.s[0] // ...................................|......................................*.................................................................................................................... + // ldr q0, [x5, #(-12*16 + 6*16)] // .........................e.........|........................................................................................................................................................... + // ldr q4, [x5, #(-12*16 + 7*16)] // ..........................e........|........................................................................................................................................................... + // ldr q1, [x5, #(-12*16 + 8*16)] // ...........................e.......|........................................................................................................................................................... + // ldr q5, [x5, #(-12*16 + 9*16)] // ............................e......|........................................................................................................................................................... + // ldr q2, [x5, #(-12*16 + 10*16)] // .............................e.....|........................................................................................................................................................... + // ldr q6, [x5, #(-12*16 + 11*16)] // ..............................e....|........................................................................................................................................................... 
+ // sub v24.4s, v13.4s, v14.4s // ...................................|............................*.............................................................................................................................. + // add v13.4s, v13.4s, v14.4s // ...................................|..................................*........................................................................................................................ + // mul v14.4s, v24.4s, v1.4s // ...................................|...................................*....................................................................................................................... + // sqrdmulh v24.4s, v24.4s, v5.4s // ...................................|.......................................*................................................................................................................... + // mls v14.4s, v24.4s, v8.s[0] // ...................................|...........................................*............................................................................................................... + // sub v24.4s, v15.4s, v16.4s // ...................................|........................................*.................................................................................................................. + // add v15.4s, v15.4s, v16.4s // ...................................|.........................................*................................................................................................................. + // mul v16.4s, v24.4s, v2.4s // ...................................|............................................*.............................................................................................................. 
+ // sqrdmulh v24.4s, v24.4s, v6.4s // ...................................|.............................................*............................................................................................................. + // mls v16.4s, v24.4s, v8.s[0] // ...................................|.................................................*......................................................................................................... + // sub v24.4s, v13.4s, v15.4s // ...................................|..............................................*............................................................................................................ + // add v13.4s, v13.4s, v15.4s // ...................................|...............................................*........................................................................................................... + // mul v15.4s, v24.4s, v0.4s // ...................................|..................................................*........................................................................................................ + // sqrdmulh v24.4s, v24.4s, v4.4s // ...................................|...................................................*....................................................................................................... + // mls v15.4s, v24.4s, v8.s[0] // ...................................|.......................................................*................................................................................................... + // sub v24.4s, v14.4s, v16.4s // ...................................|.....................................................*..................................................................................................... 
+ // add v14.4s, v14.4s, v16.4s // ...................................|......................................................*.................................................................................................... + // mul v16.4s, v24.4s, v0.4s // ...................................|........................................................*.................................................................................................. + // sqrdmulh v24.4s, v24.4s, v4.4s // ...................................|.........................................................*................................................................................................. + // mls v16.4s, v24.4s, v8.s[0] // ...................................|.............................................................*............................................................................................. + // trn1 v25.4s, v9.4s, v10.4s // ...................................|....................................*...................................................................................................................... + // trn2 v26.4s, v9.4s, v10.4s // ...................................|.....................................*..................................................................................................................... + // trn1 v27.4s, v11.4s, v12.4s // ...................................|..........................................*................................................................................................................ + // trn2 v28.4s, v11.4s, v12.4s // ...................................|................................................*.......................................................................................................... 
+ // trn2 v11.2d, v25.2d, v27.2d // ...................................|....................................................*...................................................................................................... + // trn2 v12.2d, v26.2d, v28.2d // ...................................|..........................................................*................................................................................................ + // trn1 v9.2d, v25.2d, v27.2d // ...................................|...........................................................*............................................................................................... + // trn1 v10.2d, v26.2d, v28.2d // ...................................|............................................................*.............................................................................................. + // trn1 v25.4s, v13.4s, v14.4s // ...................................|..............................................................*............................................................................................ + // trn2 v26.4s, v13.4s, v14.4s // ...................................|...............................................................*........................................................................................... + // trn1 v27.4s, v15.4s, v16.4s // ...................................|.................................................................*......................................................................................... + // trn2 v28.4s, v15.4s, v16.4s // ...................................|..................................................................*........................................................................................ 
+ // trn2 v15.2d, v25.2d, v27.2d // ...................................|....................................................................*...................................................................................... + // trn2 v16.2d, v26.2d, v28.2d // ...................................|.....................................................................*..................................................................................... + // trn1 v13.2d, v25.2d, v27.2d // ...................................|......................................................................*.................................................................................... + // trn1 v14.2d, v26.2d, v28.2d // ...................................|.......................................................................*................................................................................... + // ldr q0, [x4], #64 // ...............................e...|........................................................................................................................................................... + // ldr q1, [x4, #(-64 + 16)] // ................................e..|........................................................................................................................................................... + // ldr q2, [x4, #(-64 + 32)] // .................................e.|........................................................................................................................................................... + // ldr q3, [x4, #(-64 + 48)] // ..................................e|........................................................................................................................................................... 
+ // sub v24.4s, v9.4s, v10.4s // ...................................|................................................................*.......................................................................................... + // add v9.4s, v9.4s, v10.4s // ...................................|...................................................................*....................................................................................... + // mul v10.4s, v24.4s, v1.s[2] // ...................................|........................................................................*.................................................................................. + // sqrdmulh v24.4s, v24.4s, v1.s[3] // ...................................|.........................................................................*................................................................................. + // mls v10.4s, v24.4s, v8.s[0] // ...................................|.............................................................................*............................................................................. + // sub v24.4s, v11.4s, v12.4s // ...................................|..........................................................................*................................................................................ + // add v11.4s, v11.4s, v12.4s // ...................................|...........................................................................*............................................................................... + // mul v12.4s, v24.4s, v2.s[0] // ...................................|..............................................................................*............................................................................ 
+ // sqrdmulh v24.4s, v24.4s, v2.s[1] // ...................................|...............................................................................*........................................................................... + // mls v12.4s, v24.4s, v8.s[0] // ...................................|...................................................................................*....................................................................... + // sub v24.4s, v13.4s, v14.4s // ...................................|............................................................................*.............................................................................. + // add v13.4s, v13.4s, v14.4s // ...................................|................................................................................*.......................................................................... + // mul v14.4s, v24.4s, v2.s[2] // ...................................|.................................................................................*......................................................................... + // sqrdmulh v24.4s, v24.4s, v2.s[3] // ...................................|..................................................................................*........................................................................ + // mls v14.4s, v24.4s, v8.s[0] // ...................................|......................................................................................*.................................................................... + // sub v24.4s, v15.4s, v16.4s // ...................................|....................................................................................*...................................................................... 
+ // add v15.4s, v15.4s, v16.4s // ...................................|.....................................................................................*..................................................................... + // mul v16.4s, v24.4s, v3.s[0] // ...................................|.......................................................................................*................................................................... + // sqrdmulh v24.4s, v24.4s, v3.s[1] // ...................................|........................................................................................*.................................................................. + // mls v16.4s, v24.4s, v8.s[0] // ...................................|............................................................................................*.............................................................. + // sub v24.4s, v9.4s, v11.4s // ...................................|.........................................................................................*................................................................. + // add v9.4s, v9.4s, v11.4s // ...................................|..........................................................................................*................................................................ + // mul v11.4s, v24.4s, v0.s[2] // ...................................|.............................................................................................*............................................................. + // sqrdmulh v24.4s, v24.4s, v0.s[3] // ...................................|..............................................................................................*............................................................ 
+ // mls v11.4s, v24.4s, v8.s[0] // ...................................|..................................................................................................*........................................................ + // sub v24.4s, v10.4s, v12.4s // ...................................|...........................................................................................*............................................................... + // add v10.4s, v10.4s, v12.4s // ...................................|...............................................................................................*........................................................... + // mul v12.4s, v24.4s, v0.s[2] // ...................................|................................................................................................*.......................................................... + // sqrdmulh v24.4s, v24.4s, v0.s[3] // ...................................|.................................................................................................*......................................................... + // mls v12.4s, v24.4s, v8.s[0] // ...................................|.....................................................................................................*..................................................... + // sub v24.4s, v13.4s, v15.4s // ...................................|...................................................................................................*....................................................... + // add v13.4s, v13.4s, v15.4s // ...................................|....................................................................................................*...................................................... 
+ // mul v15.4s, v24.4s, v1.s[0] // ...................................|......................................................................................................*.................................................... + // sqrdmulh v24.4s, v24.4s, v1.s[1] // ...................................|.......................................................................................................*................................................... + // mls v15.4s, v24.4s, v8.s[0] // ...................................|...........................................................................................................*............................................... + // sub v24.4s, v14.4s, v16.4s // ...................................|........................................................................................................*.................................................. + // add v14.4s, v14.4s, v16.4s // ...................................|.........................................................................................................*................................................. + // mul v16.4s, v24.4s, v1.s[0] // ...................................|............................................................................................................*.............................................. + // sqrdmulh v24.4s, v24.4s, v1.s[1] // ...................................|.............................................................................................................*............................................. + // mls v16.4s, v24.4s, v8.s[0] // ...................................|.................................................................................................................*......................................... 
+ // srshr v24.4S, v9.4S, #23 // ...................................|..........................................................................................................*................................................ + // mls v9.4s, v24.4s, v8.4s // ...................................|..............................................................................................................*............................................ + // srshr v24.4S, v10.4S, #23 // ...................................|...............................................................................................................*........................................... + // mls v10.4s, v24.4s, v8.4s // ...................................|..................................................................................................................*........................................ + // srshr v24.4S, v13.4S, #23 // ...................................|................................................................................................................*.......................................... + // mls v13.4s, v24.4s, v8.4s // ...................................|...................................................................................................................*....................................... + // srshr v24.4S, v14.4S, #23 // ...................................|....................................................................................................................*...................................... + // mls v14.4s, v24.4s, v8.4s // ...................................|.......................................................................................................................*................................... 
+ // sub v24.4s, v9.4s, v13.4s // ...................................|........................................................................................................................*.................................. + // add v9.4s, v9.4s, v13.4s // ...................................|.........................................................................................................................*................................. + // mul v13.4s, v24.4s, v0.s[0] // ...................................|...........................................................................................................................*............................... + // sqrdmulh v24.4s, v24.4s, v0.s[1] // ...................................|............................................................................................................................*.............................. + // mls v13.4s, v24.4s, v8.s[0] // ...................................|................................................................................................................................*.......................... + // sub v24.4s, v10.4s, v14.4s // ...................................|.............................................................................................................................*............................. + // add v10.4s, v10.4s, v14.4s // ...................................|..............................................................................................................................*............................ + // mul v14.4s, v24.4s, v0.s[0] // ...................................|.................................................................................................................................*......................... 
+ // sqrdmulh v24.4s, v24.4s, v0.s[1] // ...................................|..................................................................................................................................*........................ + // mls v14.4s, v24.4s, v8.s[0] // ...................................|......................................................................................................................................*.................... + // sub v24.4s, v11.4s, v15.4s // ...................................|.....................................................................................................................*..................................... + // add v11.4s, v11.4s, v15.4s // ...................................|......................................................................................................................*.................................... + // mul v15.4s, v24.4s, v0.s[0] // ...................................|..........................................................................................................................*................................ + // sqrdmulh v24.4s, v24.4s, v0.s[1] // ...................................|...............................................................................................................................*........................... + // mls v15.4s, v24.4s, v8.s[0] // ...................................|...................................................................................................................................*....................... + // sub v24.4s, v12.4s, v16.4s // ...................................|....................................................................................................................................*...................... 
+ // add v12.4s, v12.4s, v16.4s // ...................................|.....................................................................................................................................*..................... + // mul v16.4s, v24.4s, v0.s[0] // ...................................|.......................................................................................................................................*................... + // sqrdmulh v24.4s, v24.4s, v0.s[1] // ...................................|........................................................................................................................................*.................. + // mls v16.4s, v24.4s, v8.s[0] // .*.................................|...........................................................................................................................................*............... + // str q9, [x1], #(16*4) // ...................................|.........................................................................................................................................*................. + // str q10, [x1, #(-16*4 + 1*16)] // ..*................................|............................................................................................................................................*.............. + // str q11, [x1, #(-16*4 + 2*16)] // ....*..............................|..............................................................................................................................................*............ + // str q12, [x1, #(-16*4 + 3*16)] // ......*............................|................................................................................................................................................*.......... 
+ // str q13, [x2], #(16*4) // .........*.........................|...................................................................................................................................................*....... + // str q14, [x2, #(-16*4 + 1*16)] // ...........*.......................|.....................................................................................................................................................*..... + // str q15, [x2, #(-16*4 + 2*16)] // .............*.....................|.......................................................................................................................................................*... + // str q16, [x2, #(-16*4 + 3*16)] // ...............*...................|.........................................................................................................................................................*. + // add x1, x1, #64 // .......*...........................|.................................................................................................................................................*......... + // add x2, x2, #64 // ................*..................|..........................................................................................................................................................* sub count, count, #1 cbnz count, layer45678_start - trn2 v0.4S, v30.4S, v4.4S // ..............*.............................................................................................................................. - // gap // ............................................................................................................................................. - trn1 v9.4S, v30.4S, v4.4S // .............*............................................................................................................................... 
- // gap // ............................................................................................................................................. - trn2 v15.4S, v26.4S, v17.4S // ............*................................................................................................................................ - // gap // ............................................................................................................................................. - trn1 v14.4S, v26.4S, v17.4S // ...........*................................................................................................................................. - // gap // ............................................................................................................................................. - trn1 v30.4S, v16.4S, v2.4S // *............................................................................................................................................ - // gap // ............................................................................................................................................. - trn2 v4.4S, v23.4S, v21.4S // ..*.......................................................................................................................................... - // gap // ............................................................................................................................................. - trn2 v26.4S, v16.4S, v2.4S // .*........................................................................................................................................... - // gap // ............................................................................................................................................. - trn1 v23.2D, v30.2D, v20.2D // ....*........................................................................................................................................ 
- // gap // ............................................................................................................................................. - trn1 v2.2D, v15.2D, v0.2D // ........................*.................................................................................................................... - // gap // ............................................................................................................................................. - trn1 v12.2D, v26.2D, v4.2D // ......*...................................................................................................................................... - // gap // ............................................................................................................................................. - trn2 v16.2D, v15.2D, v0.2D // ......................*...................................................................................................................... - // gap // ............................................................................................................................................. - trn2 v0.2D, v14.2D, v9.2D // .....................*....................................................................................................................... - // gap // ............................................................................................................................................. - sub v21.4S, v23.4S, v12.4S // .........*................................................................................................................................... - // gap // ............................................................................................................................................. - sub v15.4S, v0.4S, v16.4S // .........................................*................................................................................................... 
- // gap // ............................................................................................................................................. - add v31.4S, v23.4S, v12.4S // ..........*.................................................................................................................................. - // gap // ............................................................................................................................................. - add v17.4S, v0.4S, v16.4S // ..........................................*.................................................................................................. - // gap // ............................................................................................................................................. - sqrdmulh v16.4S, v21.4S, v11.4S // ....................*........................................................................................................................ - // gap // ............................................................................................................................................. - mul v25.4S, v21.4S, v25.4S // ...................*......................................................................................................................... - // gap // ............................................................................................................................................. - sqrdmulh v23.4S, v15.4S, v24.4S // ..............................................*.............................................................................................. - // gap // ............................................................................................................................................. - trn1 v21.2D, v14.2D, v9.2D // .......................*..................................................................................................................... 
- // gap // ............................................................................................................................................. - trn2 v11.2D, v26.2D, v4.2D // .....*....................................................................................................................................... - // gap // ............................................................................................................................................. - add v0.4S, v21.4S, v2.4S // ...................................*......................................................................................................... - // gap // ............................................................................................................................................. - mls v25.4S, v16.4S, v8.S[0] // .........................*................................................................................................................... - // gap // ............................................................................................................................................. - sub v9.4S, v21.4S, v2.4S // .............................*............................................................................................................... - // gap // ............................................................................................................................................. - add v21.4S, v0.4S, v17.4S // ................................................*............................................................................................ - // gap // ............................................................................................................................................. - sub v0.4S, v0.4S, v17.4S // ...............................................*............................................................................................. 
- // gap // ............................................................................................................................................. - trn2 v12.2D, v30.2D, v20.2D // ...*......................................................................................................................................... - // gap // ............................................................................................................................................. - sqrdmulh v4.4S, v9.4S, v18.4S // ........................................*.................................................................................................... - // gap // ............................................................................................................................................. - mul v30.4S, v0.4S, v19.4S // ...................................................*......................................................................................... - // gap // ............................................................................................................................................. - sqrdmulh v16.4S, v0.4S, v1.4S // ....................................................*........................................................................................ - // gap // ............................................................................................................................................. - sub v18.4S, v12.4S, v11.4S // .......*..................................................................................................................................... - // gap // ............................................................................................................................................. - mul v20.4S, v9.4S, v6.4S // ....................................*........................................................................................................ 
- // gap // ............................................................................................................................................. - mul v26.4S, v15.4S, v29.4S // .............................................*............................................................................................... - // gap // ............................................................................................................................................. - mul v6.4S, v18.4S, v3.4S // ...............*............................................................................................................................. - // gap // ............................................................................................................................................. - sqrdmulh v17.4S, v18.4S, v10.4S // ................*............................................................................................................................ - // gap // ............................................................................................................................................. - mls v20.4S, v4.4S, v8.S[0] // ............................................*................................................................................................ - // gap // ............................................................................................................................................. - mls v26.4S, v23.4S, v8.S[0] // ..................................................*.......................................................................................... - // gap // ............................................................................................................................................. - add v4.4S, v12.4S, v11.4S // ........*.................................................................................................................................... 
- // gap // ............................................................................................................................................. - mls v6.4S, v17.4S, v8.S[0] // ..........................*.................................................................................................................. - // gap // ............................................................................................................................................. - mls v30.4S, v16.4S, v8.S[0] // ........................................................*.................................................................................... - // gap // ............................................................................................................................................. - sub v16.4S, v20.4S, v26.4S // ......................................................*...................................................................................... - // gap // ............................................................................................................................................. - sub v17.4S, v31.4S, v4.4S // .................*........................................................................................................................... - // gap // ............................................................................................................................................. - sub v23.4S, v25.4S, v6.4S // ..............................*.............................................................................................................. - // gap // ............................................................................................................................................. - mul v0.4S, v16.4S, v19.4S // .........................................................*................................................................................... 
- // gap // ............................................................................................................................................. - mul v19.4S, v17.4S, v13.4S // ...........................*................................................................................................................. - // gap // ............................................................................................................................................. - sqrdmulh v2.4S, v23.4S, v7.4S // ..................................*.......................................................................................................... - // gap // ............................................................................................................................................. - mul v3.4S, v23.4S, v13.4S // .................................*........................................................................................................... - // gap // ............................................................................................................................................. - sqrdmulh v16.4S, v16.4S, v1.4S // ..........................................................*.................................................................................. - // gap // ............................................................................................................................................. - add v26.4S, v20.4S, v26.4S // .......................................................*..................................................................................... - // gap // ............................................................................................................................................. - add v14.4S, v25.4S, v6.4S // ...............................*............................................................................................................. 
- // gap // ............................................................................................................................................. - mls v3.4S, v2.4S, v8.S[0] // .......................................*..................................................................................................... - // gap // ............................................................................................................................................. - mls v0.4S, v16.4S, v8.S[0] // ..............................................................*.............................................................................. - // gap // ............................................................................................................................................. - sqrdmulh v10.4S, v17.4S, v7.4S // ............................*................................................................................................................ - // gap // ............................................................................................................................................. - trn1 v2.4S, v21.4S, v26.4S // ...............................................................*............................................................................. - // gap // ............................................................................................................................................. - add v20.4S, v31.4S, v4.4S // ..................*.......................................................................................................................... - // gap // ............................................................................................................................................. - trn1 v6.4S, v30.4S, v0.4S // ..................................................................*.......................................................................... 
- // gap // ............................................................................................................................................. - mls v19.4S, v10.4S, v8.S[0] // ................................*............................................................................................................ - // gap // ............................................................................................................................................. - trn2 v23.4S, v30.4S, v0.4S // ...................................................................*......................................................................... - // gap // ............................................................................................................................................. - trn1 v11.2D, v2.2D, v6.2D // .......................................................................*..................................................................... - // gap // ............................................................................................................................................. - trn2 v12.4S, v21.4S, v26.4S // ................................................................*............................................................................ - // gap // ............................................................................................................................................. - trn1 v21.4S, v19.4S, v3.4S // ...........................................*................................................................................................. - // gap // ............................................................................................................................................. - trn2 v30.4S, v19.4S, v3.4S // .................................................*........................................................................................... 
- // gap // ............................................................................................................................................. - trn2 v16.4S, v20.4S, v14.4S // ......................................*...................................................................................................... - // gap // ............................................................................................................................................. - trn1 v26.2D, v12.2D, v23.2D // ........................................................................*.................................................................... - // gap // ............................................................................................................................................. - trn2 v17.2D, v2.2D, v6.2D // .....................................................................*....................................................................... - // gap // ............................................................................................................................................. - trn1 v13.2D, v16.2D, v30.2D // .............................................................*............................................................................... - // gap // ............................................................................................................................................. - sub v0.4S, v11.4S, v26.4S // .............................................................................*............................................................... - // gap // ............................................................................................................................................. - trn1 v25.4S, v20.4S, v14.4S // .....................................*....................................................................................................... 
- // gap // ............................................................................................................................................. - trn2 v7.2D, v16.2D, v30.2D // ...........................................................*................................................................................. - // gap // ............................................................................................................................................. - sqrdmulh v2.4S, v0.4S, v5.S[3] // ...................................................................................*......................................................... - // gap // ............................................................................................................................................. - mul v20.4S, v0.4S, v5.S[2] // ..................................................................................*.......................................................... - // gap // ............................................................................................................................................. - trn2 v3.2D, v25.2D, v21.2D // .....................................................*....................................................................................... - // gap // ............................................................................................................................................. - trn1 v10.2D, v25.2D, v21.2D // ............................................................*................................................................................ - // gap // ............................................................................................................................................. - sub v16.4S, v3.4S, v7.4S // ...........................................................................*................................................................. 
- // gap // ............................................................................................................................................. - mls v20.4S, v2.4S, v8.S[0] // .......................................................................................*..................................................... - // gap // ............................................................................................................................................. - sub v30.4S, v10.4S, v13.4S // .................................................................*........................................................................... - // gap // ............................................................................................................................................. - sqrdmulh v0.4S, v16.4S, v5.S[1] // ................................................................................*............................................................ - // gap // ............................................................................................................................................. - mul v16.4S, v16.4S, v5.S[0] // ...............................................................................*............................................................. - // gap // ............................................................................................................................................. - sqrdmulh v21.4S, v30.4S, v28.S[3] // ..........................................................................*.................................................................. - // gap // ............................................................................................................................................. - trn2 v2.2D, v12.2D, v23.2D // ......................................................................*...................................................................... 
- // gap // ............................................................................................................................................. - mul v25.4S, v30.4S, v28.S[2] // .........................................................................*................................................................... - // gap // ............................................................................................................................................. - sub v23.4S, v17.4S, v2.4S // .....................................................................................*....................................................... - // gap // ............................................................................................................................................. - add v17.4S, v17.4S, v2.4S // ......................................................................................*...................................................... - // gap // ............................................................................................................................................. - mls v16.4S, v0.4S, v8.S[0] // ....................................................................................*........................................................ - // gap // ............................................................................................................................................. - sqrdmulh v2.4S, v23.4S, v22.S[1] // .........................................................................................*................................................... - // gap // ............................................................................................................................................. - mul v23.4S, v23.4S, v22.S[0] // ........................................................................................*.................................................... 
- // gap // ............................................................................................................................................. - add v4.4S, v11.4S, v26.4S // .................................................................................*........................................................... - // gap // ............................................................................................................................................. - mls v25.4S, v21.4S, v8.S[0] // ..............................................................................*.............................................................. - // gap // ............................................................................................................................................. - add v3.4S, v3.4S, v7.4S // ............................................................................*................................................................ - // gap // ............................................................................................................................................. - mls v23.4S, v2.4S, v8.S[0] // .............................................................................................*............................................... - // gap // ............................................................................................................................................. - add v30.4S, v4.4S, v17.4S // .....................................................................................................*....................................... - // gap // ............................................................................................................................................. - sub v26.4S, v25.4S, v16.4S // ............................................................................................*................................................ 
- // gap // ............................................................................................................................................. - add v2.4S, v25.4S, v16.4S // ................................................................................................*............................................ - // gap // ............................................................................................................................................. - add v16.4S, v20.4S, v23.4S // ..........................................................................................................*.................................. - // gap // ............................................................................................................................................. - sqrdmulh v21.4S, v26.4S, v27.S[3] // ..................................................................................................*.......................................... - // gap // ............................................................................................................................................. - mul v26.4S, v26.4S, v27.S[2] // .................................................................................................*........................................... - // gap // ............................................................................................................................................. - add v0.4S, v2.4S, v16.4S // ....................................................................................................................*........................ - // gap // ............................................................................................................................................. - add v25.4S, v10.4S, v13.4S // ....................................................................*........................................................................ 
- // gap // ............................................................................................................................................. - sub v23.4S, v20.4S, v23.4S // .........................................................................................................*................................... - // gap // ............................................................................................................................................. - str q0, [x1, #16] // ....................................................................................................................................*........ - // gap // ............................................................................................................................................. - mls v26.4S, v21.4S, v8.S[0] // ......................................................................................................*...................................... - // gap // ............................................................................................................................................. - sub v20.4S, v25.4S, v3.4S // ..........................................................................................*.................................................. - // gap // ............................................................................................................................................. - sqrdmulh v0.4S, v23.4S, v28.S[1] // ..............................................................................................................*.............................. - // gap // ............................................................................................................................................. - mul v23.4S, v23.4S, v28.S[0] // .............................................................................................................*............................... 
- // gap // ............................................................................................................................................. - sqrdmulh v21.4S, v20.4S, v27.S[3] // ...............................................................................................*............................................. - // gap // ............................................................................................................................................. - sub v17.4S, v4.4S, v17.4S // ....................................................................................................*........................................ - // gap // ............................................................................................................................................. - mul v11.4S, v20.4S, v27.S[2] // ..............................................................................................*.............................................. - // gap // ............................................................................................................................................. - mls v23.4S, v0.4S, v8.S[0] // ..................................................................................................................*.......................... - // gap // ............................................................................................................................................. - sub v16.4S, v2.4S, v16.4S // ...................................................................................................................*......................... - // gap // ............................................................................................................................................. - sqrdmulh v20.4S, v17.4S, v28.S[1] // ........................................................................................................*.................................... 
- // gap // ............................................................................................................................................. - mul v17.4S, v17.4S, v28.S[0] // .......................................................................................................*..................................... - // gap // ............................................................................................................................................. - sub v4.4S, v26.4S, v23.4S // ..........................................................................................................................*.................. - // gap // ............................................................................................................................................. - add v10.4S, v26.4S, v23.4S // ..............................................................................................................................*.............. - // gap // ............................................................................................................................................. - mls v11.4S, v21.4S, v8.S[0] // ...................................................................................................*......................................... - // gap // ............................................................................................................................................. - sqrdmulh v2.4S, v4.4S, v27.S[1] // ................................................................................................................................*............ - // gap // ............................................................................................................................................. - mul v0.4S, v4.4S, v27.S[0] // ...............................................................................................................................*............. 
- // gap // ............................................................................................................................................. - mls v17.4S, v20.4S, v8.S[0] // ............................................................................................................*................................ - // gap // ............................................................................................................................................. - sqrdmulh v23.4S, v16.4S, v27.S[1] // .......................................................................................................................*..................... - // gap // ............................................................................................................................................. - mul v26.4S, v16.4S, v27.S[0] // ......................................................................................................................*...................... - // gap // ............................................................................................................................................. - add v25.4S, v25.4S, v3.4S // ...........................................................................................*................................................. - // gap // ............................................................................................................................................. - sub v21.4S, v11.4S, v17.4S // ........................................................................................................................*.................... - // gap // ............................................................................................................................................. - add v16.4S, v11.4S, v17.4S // .........................................................................................................................*................... 
- // gap // ............................................................................................................................................. - mls v26.4S, v23.4S, v8.S[0] // ...........................................................................................................................*................. - // gap // ............................................................................................................................................. - sqrdmulh v29.4S, v21.4S, v27.S[1] // .............................................................................................................................*............... - // gap // ............................................................................................................................................. - sub v4.4S, v25.4S, v30.4S // ...........................................................................................................*................................. - // gap // ............................................................................................................................................. - add v17.4S, v25.4S, v30.4S // ...............................................................................................................*............................. - // gap // ............................................................................................................................................. - str q16, [x1, #32] // .....................................................................................................................................*....... - // gap // ............................................................................................................................................. - sqrdmulh v20.4S, v4.4S, v27.S[1] // .................................................................................................................*........................... 
- // gap // ............................................................................................................................................. - mul v23.4S, v4.4S, v27.S[0] // ................................................................................................................*............................ - // gap // ............................................................................................................................................. - str q17, [x1], #(16*4) // ..................................................................................................................................*.......... - // gap // ............................................................................................................................................. - mul v16.4S, v21.4S, v27.S[0] // ............................................................................................................................*................ - // gap // ............................................................................................................................................. - str q10, [x1, #-16] // ......................................................................................................................................*...... - add x1, x1, #64 // .......................................................................................................................................*..... - mls v23.4S, v20.4S, v8.S[0] // .....................................................................................................................*....................... - // gap // ............................................................................................................................................. - str q26, [x2, #16] // .........................................................................................................................................*... 
- // gap // ............................................................................................................................................. - mls v16.4S, v29.4S, v8.S[0] // .................................................................................................................................*........... - // gap // ............................................................................................................................................. - mls v0.4S, v2.4S, v8.S[0] // ...................................................................................................................................*......... - // gap // ............................................................................................................................................. - str q23, [x2], #(16*4) // ........................................................................................................................................*.... - // gap // ............................................................................................................................................. - // gap // ............................................................................................................................................. - // gap // ............................................................................................................................................. - str q16, [x2, #-32] // ..........................................................................................................................................*.. - // gap // ............................................................................................................................................. - // gap // ............................................................................................................................................. 
- // gap // ............................................................................................................................................. - str q0, [x2, #-16] // ...........................................................................................................................................*. - add x2, x2, #64 // ............................................................................................................................................* + trn1 v0.4S, v30.4S, v24.4S // *.................................................................................................................................................... + // gap // ..................................................................................................................................................... + trn2 v10.4S, v30.4S, v24.4S // .*................................................................................................................................................... + // gap // ..................................................................................................................................................... + trn2 v7.4S, v7.4S, v1.4S // ..*.................................................................................................................................................. + // gap // ..................................................................................................................................................... + trn2 v20.2D, v0.2D, v6.2D // ...*................................................................................................................................................. + // gap // ..................................................................................................................................................... 
+ trn1 v22.2D, v0.2D, v6.2D // ....*................................................................................................................................................ + // gap // ..................................................................................................................................................... + trn1 v24.2D, v10.2D, v7.2D // ......*.............................................................................................................................................. + // gap // ..................................................................................................................................................... + trn2 v10.2D, v10.2D, v7.2D // .....*............................................................................................................................................... + // gap // ..................................................................................................................................................... + sub v1.4S, v22.4S, v24.4S // .........*........................................................................................................................................... + // gap // ..................................................................................................................................................... + sub v30.4S, v20.4S, v10.4S // .......*............................................................................................................................................. + // gap // ..................................................................................................................................................... + add v7.4S, v20.4S, v10.4S // ........*............................................................................................................................................ 
+ // gap // ..................................................................................................................................................... + sqrdmulh v10.4S, v1.4S, v18.4S // ....................*................................................................................................................................ + // gap // ..................................................................................................................................................... + mul v26.4S, v30.4S, v11.4S // ...............*..................................................................................................................................... + // gap // ..................................................................................................................................................... + sqrdmulh v6.4S, v30.4S, v19.4S // ................*.................................................................................................................................... + // gap // ..................................................................................................................................................... + mul v18.4S, v1.4S, v28.4S // ...................*................................................................................................................................. + // gap // ..................................................................................................................................................... + trn2 v16.4S, v13.4S, v12.4S // ............*........................................................................................................................................ + // gap // ..................................................................................................................................................... 
+ add v0.4S, v22.4S, v24.4S // ..........*.......................................................................................................................................... + // gap // ..................................................................................................................................................... + mls v26.4S, v6.4S, v8.S[0] // ..........................*.......................................................................................................................... + // gap // ..................................................................................................................................................... + mls v18.4S, v10.4S, v8.S[0] // .........................*........................................................................................................................... + // gap // ..................................................................................................................................................... + trn2 v28.4S, v17.4S, v27.4S // ..............*...................................................................................................................................... + // gap // ..................................................................................................................................................... + sub v11.4S, v0.4S, v7.4S // .................*................................................................................................................................... + // gap // ..................................................................................................................................................... + trn1 v30.4S, v17.4S, v27.4S // .............*....................................................................................................................................... 
+ // gap // ..................................................................................................................................................... + sub v1.4S, v18.4S, v26.4S // ..............................*...................................................................................................................... + // gap // ..................................................................................................................................................... + sqrdmulh v19.4S, v11.4S, v21.4S // ............................*........................................................................................................................ + // gap // ..................................................................................................................................................... + mul v24.4S, v11.4S, v15.4S // ...........................*......................................................................................................................... + // gap // ..................................................................................................................................................... + sqrdmulh v17.4S, v1.4S, v21.4S // ..................................*.................................................................................................................. + // gap // ..................................................................................................................................................... + mul v6.4S, v1.4S, v15.4S // .................................*................................................................................................................... + // gap // ..................................................................................................................................................... 
+ add v1.4S, v18.4S, v26.4S // ...............................*..................................................................................................................... + // gap // ..................................................................................................................................................... + add v27.4S, v0.4S, v7.4S // ..................*.................................................................................................................................. + // gap // ..................................................................................................................................................... + mls v24.4S, v19.4S, v8.S[0] // ................................*.................................................................................................................... + // gap // ..................................................................................................................................................... + mls v6.4S, v17.4S, v8.S[0] // .......................................*............................................................................................................. + // gap // ..................................................................................................................................................... + trn1 v15.4S, v27.4S, v1.4S // .....................................*............................................................................................................... + // gap // ..................................................................................................................................................... + trn2 v11.2D, v16.2D, v28.2D // ......................*.............................................................................................................................. 
+ // gap // ..................................................................................................................................................... + trn2 v7.4S, v27.4S, v1.4S // ......................................*.............................................................................................................. + // gap // ..................................................................................................................................................... + trn2 v10.4S, v24.4S, v6.4S // .................................................*................................................................................................... + // gap // ..................................................................................................................................................... + trn1 v24.4S, v24.4S, v6.4S // ...........................................*......................................................................................................... + // gap // ..................................................................................................................................................... + trn1 v1.4S, v13.4S, v12.4S // ...........*......................................................................................................................................... + // gap // ..................................................................................................................................................... + trn1 v0.2D, v7.2D, v10.2D // .............................................................*....................................................................................... + // gap // ..................................................................................................................................................... 
+ trn2 v22.2D, v7.2D, v10.2D // ...........................................................*......................................................................................... + // gap // ..................................................................................................................................................... + trn2 v27.2D, v15.2D, v24.2D // .....................................................*............................................................................................... + // gap // ..................................................................................................................................................... + trn1 v7.2D, v15.2D, v24.2D // ............................................................*........................................................................................ + // gap // ..................................................................................................................................................... + add v6.4S, v27.4S, v22.4S // ............................................................................*........................................................................ + // gap // ..................................................................................................................................................... + add v18.4S, v7.4S, v0.4S // ....................................................................*................................................................................ + // gap // ..................................................................................................................................................... + trn1 v28.2D, v16.2D, v28.2D // ........................*............................................................................................................................ 
+ // gap // ..................................................................................................................................................... + trn1 v15.2D, v1.2D, v30.2D // .......................*............................................................................................................................. + // gap // ..................................................................................................................................................... + add v17.4S, v18.4S, v6.4S // ...........................................................................................*......................................................... + // gap // ..................................................................................................................................................... + sub v10.4S, v15.4S, v28.4S // .............................*....................................................................................................................... + // gap // ..................................................................................................................................................... + trn2 v13.2D, v1.2D, v30.2D // .....................*............................................................................................................................... + // gap // ..................................................................................................................................................... + srshr v26.4S, v17.4S, #23 // ...........................................................................................................*......................................... + // gap // ..................................................................................................................................................... 
+ sqrdmulh v30.4S, v10.4S, v14.4S // ........................................*............................................................................................................ + // gap // ..................................................................................................................................................... + mul v21.4S, v10.4S, v25.4S // ....................................*................................................................................................................ + // gap // ..................................................................................................................................................... + mls v17.4S, v26.4S, v8.4S // ...............................................................................................................*..................................... + // gap // ..................................................................................................................................................... + sub v10.4S, v13.4S, v11.4S // .........................................*........................................................................................................... + // gap // ..................................................................................................................................................... + sub v1.4S, v7.4S, v0.4S // .................................................................*................................................................................... + // gap // ..................................................................................................................................................... + mls v21.4S, v30.4S, v8.S[0] // ............................................*........................................................................................................ 
+ // gap // ..................................................................................................................................................... + mul v7.4S, v10.4S, v5.4S // .............................................*....................................................................................................... + // gap // ..................................................................................................................................................... + sqrdmulh v24.4S, v10.4S, v29.4S // ..............................................*...................................................................................................... + // gap // ..................................................................................................................................................... + sqrdmulh v10.4S, v1.4S, v9.S[3] // ..........................................................................*.......................................................................... + // gap // ..................................................................................................................................................... + add v30.4S, v13.4S, v11.4S // ..........................................*.......................................................................................................... + // gap // ..................................................................................................................................................... + add v13.4S, v15.4S, v28.4S // ...................................*................................................................................................................. + // gap // ..................................................................................................................................................... 
+ mls v7.4S, v24.4S, v8.S[0] // ..................................................*.................................................................................................. + // gap // ..................................................................................................................................................... + sub v0.4S, v27.4S, v22.4S // ...........................................................................*......................................................................... + // gap // ..................................................................................................................................................... + sub v24.4S, v13.4S, v30.4S // ...............................................*..................................................................................................... + // gap // ..................................................................................................................................................... + add v15.4S, v13.4S, v30.4S // ................................................*.................................................................................................... + // gap // ..................................................................................................................................................... + sub v12.4S, v21.4S, v7.4S // ......................................................*.............................................................................................. + // gap // ..................................................................................................................................................... + sqrdmulh v13.4S, v24.4S, v4.4S // ....................................................*................................................................................................ 
+ // gap // ..................................................................................................................................................... + mul v27.4S, v24.4S, v2.4S // ...................................................*................................................................................................. + // gap // ..................................................................................................................................................... + sqrdmulh v14.4S, v12.4S, v4.4S // ..........................................................*.......................................................................................... + // gap // ..................................................................................................................................................... + mul v28.4S, v12.4S, v2.4S // .........................................................*........................................................................................... + // gap // ..................................................................................................................................................... + sqrdmulh v30.4S, v0.4S, v31.S[1] // ................................................................................*.................................................................... + // gap // ..................................................................................................................................................... + add v24.4S, v21.4S, v7.4S // .......................................................*............................................................................................. + // gap // ..................................................................................................................................................... 
+ mls v27.4S, v13.4S, v8.S[0] // ........................................................*............................................................................................ + // gap // ..................................................................................................................................................... + mls v28.4S, v14.4S, v8.S[0] // ..............................................................*...................................................................................... + // gap // ..................................................................................................................................................... + trn1 v13.4S, v15.4S, v24.4S // ...............................................................*..................................................................................... + // gap // ..................................................................................................................................................... + mul v7.4S, v0.4S, v31.S[0] // ...............................................................................*..................................................................... + // gap // ..................................................................................................................................................... + trn2 v24.4S, v15.4S, v24.4S // ................................................................*.................................................................................... + // gap // ..................................................................................................................................................... + trn2 v0.4S, v27.4S, v28.4S // ...................................................................*................................................................................. 
+ // gap // ..................................................................................................................................................... + trn1 v15.4S, v27.4S, v28.4S // ..................................................................*.................................................................................. + // gap // ..................................................................................................................................................... + mls v7.4S, v30.4S, v8.S[0] // ....................................................................................*................................................................ + // gap // ..................................................................................................................................................... + trn1 v12.2D, v24.2D, v0.2D // ........................................................................*............................................................................ + // gap // ..................................................................................................................................................... + trn2 v28.2D, v24.2D, v0.2D // ......................................................................*.............................................................................. + // gap // ..................................................................................................................................................... + trn2 v21.2D, v13.2D, v15.2D // .....................................................................*............................................................................... + // gap // ..................................................................................................................................................... 
+ trn1 v11.2D, v13.2D, v15.2D // .......................................................................*............................................................................. + // gap // ..................................................................................................................................................... + sub v30.4S, v21.4S, v28.4S // .....................................................................................*............................................................... + // gap // ..................................................................................................................................................... + sub v24.4S, v11.4S, v12.4S // .............................................................................*....................................................................... + // gap // ..................................................................................................................................................... + mul v15.4S, v1.4S, v9.S[2] // .........................................................................*........................................................................... + // gap // ..................................................................................................................................................... + mul v13.4S, v30.4S, v3.S[0] // ........................................................................................*............................................................ + // gap // ..................................................................................................................................................... + sqrdmulh v0.4S, v24.4S, v31.S[3] // ...................................................................................*................................................................. 
+ // gap // ..................................................................................................................................................... + mul v1.4S, v24.4S, v31.S[2] // ..................................................................................*.................................................................. + // gap // ..................................................................................................................................................... + mls v15.4S, v10.4S, v8.S[0] // ..............................................................................*...................................................................... + // gap // ..................................................................................................................................................... + add v27.4S, v21.4S, v28.4S // ......................................................................................*.............................................................. + // gap // ..................................................................................................................................................... + sqrdmulh v30.4S, v30.4S, v3.S[1] // .........................................................................................*........................................................... + // gap // ..................................................................................................................................................... + mls v1.4S, v0.4S, v8.S[0] // .......................................................................................*............................................................. + // gap // ..................................................................................................................................................... 
+ sub v10.4S, v15.4S, v7.4S // ............................................................................................*........................................................ + // gap // ..................................................................................................................................................... + sub v0.4S, v18.4S, v6.4S // ..........................................................................................*.......................................................... + // gap // ..................................................................................................................................................... + add v28.4S, v11.4S, v12.4S // .................................................................................*................................................................... + // gap // ..................................................................................................................................................... + mls v13.4S, v30.4S, v8.S[0] // .............................................................................................*....................................................... + // gap // ..................................................................................................................................................... + mul v6.4S, v0.4S, v23.S[2] // ..............................................................................................*...................................................... + // gap // ..................................................................................................................................................... + sqrdmulh v30.4S, v0.4S, v23.S[3] // ...............................................................................................*..................................................... 
+ // gap // ..................................................................................................................................................... + sqrdmulh v24.4S, v10.4S, v23.S[3] // ..................................................................................................*.................................................. + // gap // ..................................................................................................................................................... + sub v0.4S, v1.4S, v13.4S // .........................................................................................................*........................................... + // gap // ..................................................................................................................................................... + mul v12.4S, v10.4S, v23.S[2] // .................................................................................................*................................................... + // gap // ..................................................................................................................................................... + add v7.4S, v15.4S, v7.4S // ................................................................................................*.................................................... + // gap // ..................................................................................................................................................... + sqrdmulh v31.4S, v0.4S, v9.S[1] // ..............................................................................................................*...................................... + // gap // ..................................................................................................................................................... 
+ mls v6.4S, v30.4S, v8.S[0] // ...................................................................................................*................................................. + // gap // ..................................................................................................................................................... + mls v12.4S, v24.4S, v8.S[0] // ......................................................................................................*.............................................. + // gap // ..................................................................................................................................................... + mul v3.4S, v0.4S, v9.S[0] // .............................................................................................................*....................................... + // gap // ..................................................................................................................................................... + add v30.4S, v1.4S, v13.4S // ..........................................................................................................*.......................................... + // gap // ..................................................................................................................................................... + srshr v13.4S, v7.4S, #23 // ................................................................................................................*.................................... + // gap // ..................................................................................................................................................... + add v2.4S, v28.4S, v27.4S // .....................................................................................................*............................................... 
+ // gap // ..................................................................................................................................................... + mls v3.4S, v31.4S, v8.S[0] // ..................................................................................................................*.................................. + // gap // ..................................................................................................................................................... + mls v7.4S, v13.4S, v8.4S // ...................................................................................................................*................................. + // gap // ..................................................................................................................................................... + srshr v10.4S, v30.4S, #23 // .....................................................................................................................*............................... + // gap // ..................................................................................................................................................... + srshr v24.4S, v2.4S, #23 // .................................................................................................................*................................... + // gap // ..................................................................................................................................................... + add v0.4S, v12.4S, v3.4S // ......................................................................................................................................*.............. + // gap // ..................................................................................................................................................... 
+ mls v30.4S, v10.4S, v8.4S // ........................................................................................................................*............................ + // gap // ..................................................................................................................................................... + mls v2.4S, v24.4S, v8.4S // ....................................................................................................................*................................ + // gap // ..................................................................................................................................................... + str q0, [x1, #48] // ..............................................................................................................................................*...... + // gap // ..................................................................................................................................................... + sub v19.4S, v28.4S, v27.4S // ....................................................................................................*................................................ + // gap // ..................................................................................................................................................... + sub v0.4S, v7.4S, v30.4S // ..............................................................................................................................*...................... + // gap // ..................................................................................................................................................... + add v7.4S, v7.4S, v30.4S // ...............................................................................................................................*..................... 
+ // gap // ..................................................................................................................................................... + sub v24.4S, v17.4S, v2.4S // .........................................................................................................................*........................... + // gap // ..................................................................................................................................................... + sqrdmulh v30.4S, v0.4S, v23.S[1] // ...................................................................................................................................*................. + // gap // ..................................................................................................................................................... + mul v0.4S, v0.4S, v23.S[0] // ..................................................................................................................................*.................. + // gap // ..................................................................................................................................................... + sqrdmulh v10.4S, v24.4S, v23.S[1] // .............................................................................................................................*....................... + // gap // ..................................................................................................................................................... + str q7, [x1, #16] // ............................................................................................................................................*........ + // gap // ..................................................................................................................................................... 
+ mul v7.4S, v24.4S, v23.S[0] // ............................................................................................................................*........................ + // gap // ..................................................................................................................................................... + mls v0.4S, v30.4S, v8.S[0] // .......................................................................................................................................*............. + // gap // ..................................................................................................................................................... + mul v13.4S, v19.4S, v9.S[0] // .......................................................................................................*............................................. + // gap // ..................................................................................................................................................... + sqrdmulh v1.4S, v19.4S, v9.S[1] // ........................................................................................................*............................................ + // gap // ..................................................................................................................................................... + mls v7.4S, v10.4S, v8.S[0] // .................................................................................................................................*................... + // gap // ..................................................................................................................................................... + str q0, [x2, #16] // .................................................................................................................................................*... 
+ // gap // ..................................................................................................................................................... + sub v0.4S, v12.4S, v3.4S // .....................................................................................................................................*............... + // gap // ..................................................................................................................................................... + mls v13.4S, v1.4S, v8.S[0] // ............................................................................................................*........................................ + // gap // ..................................................................................................................................................... + add v24.4S, v17.4S, v2.4S // ..........................................................................................................................*.......................... + // gap // ..................................................................................................................................................... + str q7, [x2], #(16*4) // ................................................................................................................................................*.... + // gap // ..................................................................................................................................................... + sqrdmulh v30.4S, v0.4S, v23.S[1] // .........................................................................................................................................*........... + // gap // ..................................................................................................................................................... 
+ sub v7.4S, v6.4S, v13.4S // ......................................................................................................................*.............................. + // gap // ..................................................................................................................................................... + mul v0.4S, v0.4S, v23.S[0] // ........................................................................................................................................*............ + // gap // ..................................................................................................................................................... + add v1.4S, v6.4S, v13.4S // .......................................................................................................................*............................. + // gap // ..................................................................................................................................................... + sqrdmulh v10.4S, v7.4S, v23.S[1] // ................................................................................................................................*.................... + // gap // ..................................................................................................................................................... + mul v7.4S, v7.4S, v23.S[0] // ...........................................................................................................................*......................... + // gap // ..................................................................................................................................................... + str q1, [x1, #32] // .............................................................................................................................................*....... 
+ // gap // ..................................................................................................................................................... + mls v0.4S, v30.4S, v8.S[0] // ...........................................................................................................................................*......... + // gap // ..................................................................................................................................................... + str q24, [x1], #(16*4) // ..........................................................................................................................................*.......... + add x1, x1, #64 // ...............................................................................................................................................*..... + mls v7.4S, v10.4S, v8.S[0] // ....................................................................................................................................*................ + // gap // ..................................................................................................................................................... + // gap // ..................................................................................................................................................... + // gap // ..................................................................................................................................................... + str q0, [x2, #-16] // ...................................................................................................................................................*. + // gap // ..................................................................................................................................................... 
+ // gap // ..................................................................................................................................................... + // gap // ..................................................................................................................................................... + str q7, [x2, #-32] // ..................................................................................................................................................*.. + add x2, x2, #64 // ....................................................................................................................................................* // original source code - // trn1 v0.4S, v16.4S, v2.4S // ....*........................................................................................................................................ - // trn2 v2.4S, v16.4S, v2.4S // ......*...................................................................................................................................... - // trn2 v16.4S, v23.4S, v21.4S // .....*....................................................................................................................................... - // trn2 v23.2D, v0.2D, v20.2D // ..........................*.................................................................................................................. - // trn1 v0.2D, v0.2D, v20.2D // .......*..................................................................................................................................... - // trn2 v21.2D, v2.2D, v16.2D // ....................*........................................................................................................................ - // trn1 v2.2D, v2.2D, v16.2D // .........*................................................................................................................................... 
- // sub v16.4S, v23.4S, v21.4S // ..............................*.............................................................................................................. - // add v23.4S, v23.4S, v21.4S // .....................................*....................................................................................................... - // sub v21.4S, v0.4S, v2.4S // ............*................................................................................................................................ - // add v0.4S, v0.4S, v2.4S // ..............*.............................................................................................................................. - // trn1 v2.4S, v26.4S, v17.4S // ...*......................................................................................................................................... - // trn2 v26.4S, v26.4S, v17.4S // ..*.......................................................................................................................................... - // trn1 v20.4S, v30.4S, v4.4S // .*........................................................................................................................................... - // trn2 v17.4S, v30.4S, v4.4S // *............................................................................................................................................ - // mul v30.4S, v16.4S, v3.4S // .................................*........................................................................................................... - // sqrdmulh v16.4S, v16.4S, v10.4S // ..................................*.......................................................................................................... - // sub v4.4S, v0.4S, v23.4S // .........................................*................................................................................................... 
- // add v0.4S, v0.4S, v23.4S // ......................................................*...................................................................................... - // mul v23.4S, v21.4S, v25.4S // .................*........................................................................................................................... - // sqrdmulh v21.4S, v21.4S, v11.4S // ................*............................................................................................................................ - // trn2 v25.2D, v2.2D, v20.2D // ...........*................................................................................................................................. - // trn2 v11.2D, v26.2D, v17.2D // ..........*.................................................................................................................................. - // trn1 v2.2D, v2.2D, v20.2D // ...................*......................................................................................................................... - // trn1 v26.2D, v26.2D, v17.2D // ........*.................................................................................................................................... - // mls v23.4S, v21.4S, v8.S[0] // ......................*...................................................................................................................... - // mls v30.4S, v16.4S, v8.S[0] // ......................................*...................................................................................................... - // mul v16.4S, v4.4S, v13.4S // ............................................*................................................................................................ - // sqrdmulh v21.4S, v4.4S, v7.4S // ....................................................*........................................................................................ 
- // sub v20.4S, v2.4S, v26.4S // .......................*..................................................................................................................... - // sub v17.4S, v23.4S, v30.4S // ..........................................*.................................................................................................. - // add v23.4S, v23.4S, v30.4S // .................................................*........................................................................................... - // mls v16.4S, v21.4S, v8.S[0] // ........................................................*.................................................................................... - // mul v21.4S, v17.4S, v13.4S // ..............................................*.............................................................................................. - // sqrdmulh v17.4S, v17.4S, v7.4S // .............................................*............................................................................................... - // add v2.4S, v2.4S, v26.4S // .....................*....................................................................................................................... - // mul v26.4S, v20.4S, v6.4S // ...............................*............................................................................................................. - // trn1 v30.4S, v0.4S, v23.4S // ...................................................................*......................................................................... - // trn2 v0.4S, v0.4S, v23.4S // ..............................................................*.............................................................................. - // mls v21.4S, v17.4S, v8.S[0] // ..................................................*.......................................................................................... 
- // sqrdmulh v23.4S, v20.4S, v18.4S // ...........................*................................................................................................................. - // sub v20.4S, v25.4S, v11.4S // .............*............................................................................................................................... - // add v17.4S, v25.4S, v11.4S // ...............*............................................................................................................................. - // trn1 v4.4S, v16.4S, v21.4S // ............................................................*................................................................................ - // mls v26.4S, v23.4S, v8.S[0] // ...................................*......................................................................................................... - // mul v23.4S, v20.4S, v29.4S // ................................*............................................................................................................ - // sqrdmulh v20.4S, v20.4S, v24.4S // ..................*.......................................................................................................................... - // sub v25.4S, v2.4S, v17.4S // .........................*................................................................................................................... - // add v2.4S, v2.4S, v17.4S // ........................*.................................................................................................................... - // trn2 v16.4S, v16.4S, v21.4S // .............................................................*............................................................................... - // mls v23.4S, v20.4S, v8.S[0] // ....................................*........................................................................................................ 
- // mul v21.4S, v25.4S, v19.4S // ............................*................................................................................................................ - // sqrdmulh v20.4S, v25.4S, v1.4S // .............................*............................................................................................................... - // trn2 v17.2D, v30.2D, v4.2D // .......................................................................*..................................................................... - // sub v25.4S, v26.4S, v23.4S // ........................................*.................................................................................................... - // add v23.4S, v26.4S, v23.4S // ................................................*............................................................................................ - // mls v21.4S, v20.4S, v8.S[0] // .......................................*..................................................................................................... - // mul v26.4S, v25.4S, v19.4S // ...........................................*................................................................................................. - // sqrdmulh v20.4S, v25.4S, v1.4S // ...............................................*............................................................................................. - // trn2 v25.2D, v0.2D, v16.2D // ....................................................................*........................................................................ - // trn1 v30.2D, v30.2D, v4.2D // ........................................................................*.................................................................... - // trn1 v0.2D, v0.2D, v16.2D // .................................................................*........................................................................... 
- // mls v26.4S, v20.4S, v8.S[0] // ...................................................*......................................................................................... - // trn1 v16.4S, v2.4S, v23.4S // .....................................................*....................................................................................... - // trn2 v2.4S, v2.4S, v23.4S // ...........................................................*................................................................................. - // sub v23.4S, v30.4S, v0.4S // ...........................................................................*................................................................. - // trn1 v20.4S, v21.4S, v26.4S // .......................................................*..................................................................................... - // trn2 v21.4S, v21.4S, v26.4S // .........................................................*................................................................................... - // add v0.4S, v30.4S, v0.4S // .................................................................................................*........................................... - // trn2 v26.2D, v16.2D, v20.2D // ................................................................*............................................................................ - // trn2 v30.2D, v2.2D, v21.2D // ...............................................................................*............................................................. - // trn1 v16.2D, v16.2D, v20.2D // ..........................................................*.................................................................................. - // trn1 v2.2D, v2.2D, v21.2D // ...............................................................*............................................................................. 
- // mul v21.4S, v23.4S, v28.S[2] // ................................................................................*............................................................ - // sqrdmulh v23.4S, v23.4S, v28.S[3] // ..............................................................................*.............................................................. - // sub v20.4S, v17.4S, v25.4S // .........................................................................*................................................................... - // add v17.4S, v17.4S, v25.4S // ........................................................................................*.................................................... - // sub v4.4S, v16.4S, v2.4S // ..................................................................*.......................................................................... - // mls v21.4S, v23.4S, v8.S[0] // .......................................................................................*..................................................... - // mul v23.4S, v20.4S, v5.S[0] // .............................................................................*............................................................... - // sqrdmulh v20.4S, v20.4S, v5.S[1] // ............................................................................*................................................................ - // add v2.4S, v16.4S, v2.4S // ......................................................................................*...................................................... - // mul v16.4S, v4.4S, v5.S[2] // ......................................................................*...................................................................... - // sqrdmulh v4.4S, v4.4S, v5.S[3] // .....................................................................*....................................................................... 
- // mls v23.4S, v20.4S, v8.S[0] // ...................................................................................*......................................................... - // sub v20.4S, v26.4S, v30.4S // .................................................................................*........................................................... - // add v26.4S, v26.4S, v30.4S // ..................................................................................*.......................................................... - // mls v16.4S, v4.4S, v8.S[0] // ..........................................................................*.................................................................. - // mul v30.4S, v20.4S, v22.S[0] // .....................................................................................*....................................................... - // sqrdmulh v20.4S, v20.4S, v22.S[1] // ....................................................................................*........................................................ - // sub v4.4S, v0.4S, v17.4S // .....................................................................................................*....................................... - // add v0.4S, v0.4S, v17.4S // .......................................................................................................................*..................... - // sub v17.4S, v21.4S, v23.4S // ...........................................................................................*................................................. - // mls v30.4S, v20.4S, v8.S[0] // .........................................................................................*................................................... - // mul v20.4S, v4.4S, v27.S[2] // ..........................................................................................................*.................................. 
- // sqrdmulh v4.4S, v4.4S, v27.S[3] // ........................................................................................................*.................................... - // add v23.4S, v21.4S, v23.4S // ............................................................................................*................................................ - // mul v21.4S, v17.4S, v27.S[2] // ...............................................................................................*............................................. - // sqrdmulh v17.4S, v17.4S, v27.S[3] // ..............................................................................................*.............................................. - // mls v20.4S, v4.4S, v8.S[0] // .................................................................................................................*........................... - // sub v4.4S, v2.4S, v26.4S // .........................................................................................................*................................... - // add v2.4S, v2.4S, v26.4S // ..........................................................................................*.................................................. - // mls v21.4S, v17.4S, v8.S[0] // ....................................................................................................*........................................ - // mul v26.4S, v4.4S, v28.S[0] // ..............................................................................................................*.............................. - // sqrdmulh v17.4S, v4.4S, v28.S[1] // .............................................................................................................*............................... - // sub v4.4S, v16.4S, v30.4S // ..................................................................................................*.......................................... 
- // add v16.4S, v16.4S, v30.4S // .............................................................................................*............................................... - // sub v30.4S, v0.4S, v2.4S // ............................................................................................................................*................ - // mls v26.4S, v17.4S, v8.S[0] // ....................................................................................................................*........................ - // mul v17.4S, v4.4S, v28.S[0] // .......................................................................................................*..................................... - // sqrdmulh v4.4S, v4.4S, v28.S[1] // ......................................................................................................*...................................... - // add v0.4S, v0.4S, v2.4S // .............................................................................................................................*............... - // mul v2.4S, v30.4S, v27.S[0] // ................................................................................................................................*............ - // sqrdmulh v30.4S, v30.4S, v27.S[1] // ...............................................................................................................................*............. - // mls v17.4S, v4.4S, v8.S[0] // ...........................................................................................................*................................. - // sub v4.4S, v23.4S, v16.4S // ............................................................................................................*................................ - // add v16.4S, v23.4S, v16.4S // ................................................................................................*............................................ 
- // mls v2.4S, v30.4S, v8.S[0] // .....................................................................................................................................*....... - // mul v23.4S, v4.4S, v27.S[0] // ......................................................................................................................*...................... - // sqrdmulh v30.4S, v4.4S, v27.S[1] // .....................................................................................................................*....................... - // sub v4.4S, v20.4S, v26.4S // ........................................................................................................................*.................... - // add v26.4S, v20.4S, v26.4S // .........................................................................................................................*................... - // sub v20.4S, v21.4S, v17.4S // ...............................................................................................................*............................. - // mls v23.4S, v30.4S, v8.S[0] // ..........................................................................................................................*.................. - // mul v30.4S, v4.4S, v27.S[0] // ..................................................................................................................................*.......... - // sqrdmulh v4.4S, v4.4S, v27.S[1] // ...........................................................................................................................*................. - // add v21.4S, v21.4S, v17.4S // ................................................................................................................*............................ - // mul v17.4S, v20.4S, v27.S[0] // ...................................................................................................................*......................... 
- // sqrdmulh v20.4S, v20.4S, v27.S[1] // ..................................................................................................................*.......................... - // mls v30.4S, v4.4S, v8.S[0] // .......................................................................................................................................*..... - // str q0, [x1], #(16*4) // .................................................................................................................................*........... - // mls v17.4S, v20.4S, v8.S[0] // ........................................................................................................................................*.... - // str q16, [x1, #-48] // ...................................................................................................*......................................... - // str q26, [x1, #-32] // ..............................................................................................................................*.............. - // str q21, [x1, #-16] // ...................................................................................................................................*......... - // add x1, x1, #64 // ....................................................................................................................................*........ - // str q2, [x2], #(16*4) // .........................................................................................................................................*... - // str q23, [x2, #-48] // ......................................................................................................................................*...... - // str q30, [x2, #-32] // ..........................................................................................................................................*.. 
- // str q17, [x2, #-16] // ...........................................................................................................................................*. - // add x2, x2, #64 // ............................................................................................................................................* + // trn1 v0.4S, v30.4S, v24.4S // *.................................................................................................................................................... + // trn2 v10.4S, v30.4S, v24.4S // .*................................................................................................................................................... + // trn2 v30.4S, v7.4S, v1.4S // ..*.................................................................................................................................................. + // trn2 v24.2D, v0.2D, v6.2D // ...*................................................................................................................................................. + // trn1 v0.2D, v0.2D, v6.2D // ....*................................................................................................................................................ + // trn2 v7.2D, v10.2D, v30.2D // ......*.............................................................................................................................................. + // trn1 v10.2D, v10.2D, v30.2D // .....*............................................................................................................................................... + // sub v30.4S, v24.4S, v7.4S // ........*............................................................................................................................................ + // add v24.4S, v24.4S, v7.4S // .........*........................................................................................................................................... 
+ // sub v7.4S, v0.4S, v10.4S // .......*............................................................................................................................................. + // add v0.4S, v0.4S, v10.4S // ...............*..................................................................................................................................... + // trn1 v10.4S, v13.4S, v12.4S // ...................................*................................................................................................................. + // trn2 v1.4S, v13.4S, v12.4S // ..............*...................................................................................................................................... + // trn1 v13.4S, v17.4S, v27.4S // ....................*................................................................................................................................ + // trn2 v6.4S, v17.4S, v27.4S // ..................*.................................................................................................................................. + // mul v12.4S, v30.4S, v11.4S // ...........*......................................................................................................................................... + // sqrdmulh v30.4S, v30.4S, v19.4S // ............*........................................................................................................................................ + // sub v17.4S, v0.4S, v24.4S // ...................*................................................................................................................................. + // add v0.4S, v0.4S, v24.4S // ...........................*......................................................................................................................... 
+ // mul v24.4S, v7.4S, v28.4S // .............*....................................................................................................................................... + // sqrdmulh v7.4S, v7.4S, v18.4S // ..........*.......................................................................................................................................... + // trn2 v28.2D, v10.2D, v13.2D // ..............................................*...................................................................................................... + // trn2 v27.2D, v1.2D, v6.2D // ...............................*..................................................................................................................... + // trn1 v10.2D, v10.2D, v13.2D // ...........................................*......................................................................................................... + // trn1 v1.2D, v1.2D, v6.2D // ..........................................*.......................................................................................................... + // mls v24.4S, v7.4S, v8.S[0] // .................*................................................................................................................................... + // mls v12.4S, v30.4S, v8.S[0] // ................*.................................................................................................................................... + // mul v30.4S, v17.4S, v15.4S // .......................*............................................................................................................................. + // sqrdmulh v7.4S, v17.4S, v21.4S // ......................*.............................................................................................................................. 
+ // sub v13.4S, v10.4S, v1.4S // .............................................*....................................................................................................... + // sub v6.4S, v24.4S, v12.4S // .....................*............................................................................................................................... + // add v24.4S, v24.4S, v12.4S // ..........................*.......................................................................................................................... + // mls v30.4S, v7.4S, v8.S[0] // ............................*........................................................................................................................ + // mul v7.4S, v6.4S, v15.4S // .........................*........................................................................................................................... + // sqrdmulh v6.4S, v6.4S, v21.4S // ........................*............................................................................................................................ + // add v10.4S, v10.4S, v1.4S // ..........................................................*.......................................................................................... + // mul v1.4S, v13.4S, v25.4S // .................................................*................................................................................................... + // trn1 v12.4S, v0.4S, v24.4S // ..............................*...................................................................................................................... + // trn2 v0.4S, v0.4S, v24.4S // ................................*.................................................................................................................... 
+ // mls v7.4S, v6.4S, v8.S[0] // .............................*....................................................................................................................... + // sqrdmulh v24.4S, v13.4S, v14.4S // ................................................*.................................................................................................... + // sub v13.4S, v28.4S, v27.4S // ...................................................*................................................................................................. + // add v6.4S, v28.4S, v27.4S // .........................................................*........................................................................................... + // trn1 v17.4S, v30.4S, v7.4S // ..................................*.................................................................................................................. + // mls v1.4S, v24.4S, v8.S[0] // .....................................................*............................................................................................... + // mul v24.4S, v13.4S, v5.4S // ......................................................*.............................................................................................. + // sqrdmulh v13.4S, v13.4S, v29.4S // .......................................................*............................................................................................. + // sub v28.4S, v10.4S, v6.4S // .............................................................*....................................................................................... + // add v10.4S, v10.4S, v6.4S // ..............................................................*...................................................................................... 
+ // trn2 v30.4S, v30.4S, v7.4S // .................................*................................................................................................................... + // mls v24.4S, v13.4S, v8.S[0] // ...........................................................*......................................................................................... + // mul v7.4S, v28.4S, v2.4S // .................................................................*................................................................................... + // sqrdmulh v13.4S, v28.4S, v4.4S // ................................................................*.................................................................................... + // trn2 v6.2D, v12.2D, v17.2D // ......................................*.............................................................................................................. + // sub v28.4S, v1.4S, v24.4S // ...............................................................*..................................................................................... + // add v24.4S, v1.4S, v24.4S // .....................................................................*............................................................................... + // mls v7.4S, v13.4S, v8.S[0] // ......................................................................*.............................................................................. + // mul v1.4S, v28.4S, v2.4S // ...................................................................*................................................................................. + // sqrdmulh v13.4S, v28.4S, v4.4S // ..................................................................*.................................................................................. 
+ // trn2 v28.2D, v0.2D, v30.2D // .....................................*............................................................................................................... + // trn1 v12.2D, v12.2D, v17.2D // .......................................*............................................................................................................. + // trn1 v0.2D, v0.2D, v30.2D // ....................................*................................................................................................................ + // mls v1.4S, v13.4S, v8.S[0] // .......................................................................*............................................................................. + // trn1 v30.4S, v10.4S, v24.4S // ........................................................................*............................................................................ + // trn2 v10.4S, v10.4S, v24.4S // ..........................................................................*.......................................................................... + // sub v24.4S, v12.4S, v0.4S // ....................................................*................................................................................................ + // trn1 v13.4S, v7.4S, v1.4S // ............................................................................*........................................................................ + // trn2 v7.4S, v7.4S, v1.4S // ...........................................................................*......................................................................... + // add v0.4S, v12.4S, v0.4S // .........................................*........................................................................................................... 
+ // trn2 v1.2D, v30.2D, v13.2D // ................................................................................*.................................................................... + // trn2 v12.2D, v10.2D, v7.2D // ...............................................................................*..................................................................... + // trn1 v30.2D, v30.2D, v13.2D // .................................................................................*................................................................... + // trn1 v10.2D, v10.2D, v7.2D // ..............................................................................*...................................................................... + // mul v7.4S, v24.4S, v9.S[2] // ....................................................................................*................................................................ + // sqrdmulh v24.4S, v24.4S, v9.S[3] // ........................................................*............................................................................................ + // sub v13.4S, v6.4S, v28.4S // ............................................................*........................................................................................ + // add v6.4S, v6.4S, v28.4S // ........................................*............................................................................................................ + // sub v17.4S, v30.4S, v10.4S // ...................................................................................*................................................................. + // mls v7.4S, v24.4S, v8.S[0] // ........................................................................................*............................................................ 
+ // mul v24.4S, v13.4S, v31.S[0] // .........................................................................*........................................................................... + // sqrdmulh v13.4S, v13.4S, v31.S[1] // ....................................................................*................................................................................ + // add v10.4S, v30.4S, v10.4S // ..............................................................................................*...................................................... + // mul v30.4S, v17.4S, v31.S[2] // .......................................................................................*............................................................. + // sqrdmulh v17.4S, v17.4S, v31.S[3] // ......................................................................................*.............................................................. + // mls v24.4S, v13.4S, v8.S[0] // .............................................................................*....................................................................... + // sub v13.4S, v1.4S, v12.4S // ..................................................................................*.................................................................. + // add v1.4S, v1.4S, v12.4S // .........................................................................................*........................................................... + // mls v30.4S, v17.4S, v8.S[0] // ...........................................................................................*......................................................... + // mul v12.4S, v13.4S, v3.S[0] // .....................................................................................*............................................................... 
+ // sqrdmulh v13.4S, v13.4S, v3.S[1] // ..........................................................................................*.......................................................... + // sub v17.4S, v0.4S, v6.4S // .............................................................................................*....................................................... + // add v0.4S, v0.4S, v6.4S // ............................................*........................................................................................................ + // sub v6.4S, v7.4S, v24.4S // ............................................................................................*........................................................ + // mls v12.4S, v13.4S, v8.S[0] // ...............................................................................................*..................................................... + // mul v13.4S, v17.4S, v23.S[2] // ................................................................................................*.................................................... + // sqrdmulh v17.4S, v17.4S, v23.S[3] // .................................................................................................*................................................... + // add v24.4S, v7.4S, v24.4S // .....................................................................................................*............................................... + // mul v7.4S, v6.4S, v23.S[2] // ....................................................................................................*................................................ + // sqrdmulh v6.4S, v6.4S, v23.S[3] // ..................................................................................................*.................................................. 
+ // mls v13.4S, v17.4S, v8.S[0] // .......................................................................................................*............................................. + // sub v17.4S, v10.4S, v1.4S // .....................................................................................................................*............................... + // add v10.4S, v10.4S, v1.4S // ............................................................................................................*........................................ + // mls v7.4S, v6.4S, v8.S[0] // ........................................................................................................*............................................ + // mul v1.4S, v17.4S, v9.S[0] // ...............................................................................................................................*..................... + // sqrdmulh v6.4S, v17.4S, v9.S[1] // ................................................................................................................................*.................... + // sub v17.4S, v30.4S, v12.4S // ...................................................................................................*................................................. + // add v30.4S, v30.4S, v12.4S // ..........................................................................................................*.......................................... + // srshr v12.4S, v0.4S, #23 // ...............................................*..................................................................................................... + // mls v1.4S, v6.4S, v8.S[0] // ....................................................................................................................................*................ 
+ // mul v6.4S, v17.4S, v9.S[0] // .........................................................................................................*........................................... + // sqrdmulh v17.4S, v17.4S, v9.S[1] // ......................................................................................................*.............................................. + // mls v0.4S, v12.4S, v8.4S // ..................................................*.................................................................................................. + // srshr v12.4S, v24.4S, #23 // ...........................................................................................................*......................................... + // srshr v28.4S, v10.4S, #23 // ................................................................................................................*.................................... + // mls v6.4S, v17.4S, v8.S[0] // .............................................................................................................*....................................... + // mls v24.4S, v12.4S, v8.4S // ..............................................................................................................*...................................... + // mls v10.4S, v28.4S, v8.4S // ...................................................................................................................*................................. + // srshr v12.4S, v30.4S, #23 // ...............................................................................................................*..................................... + // sub v17.4S, v13.4S, v1.4S // ........................................................................................................................................*............ 
+ // add v1.4S, v13.4S, v1.4S // ..........................................................................................................................................*.......... + // mls v30.4S, v12.4S, v8.4S // ..................................................................................................................*.................................. + // sub v13.4S, v0.4S, v10.4S // ........................................................................................................................*............................ + // add v0.4S, v0.4S, v10.4S // .....................................................................................................................................*............... + // mul v10.4S, v17.4S, v23.S[0] // ............................................................................................................................................*........ + // mul v12.4S, v13.4S, v23.S[0] // .............................................................................................................................*....................... + // sqrdmulh v13.4S, v13.4S, v23.S[1] // ...........................................................................................................................*......................... + // sub v28.4S, v24.4S, v30.4S // ......................................................................................................................*.............................. + // add v30.4S, v24.4S, v30.4S // .......................................................................................................................*............................. + // sqrdmulh v24.4S, v17.4S, v23.S[1] // ...........................................................................................................................................*......... 
+ // mls v12.4S, v13.4S, v8.S[0] // .................................................................................................................................*................... + // mul v13.4S, v28.4S, v23.S[0] // ..........................................................................................................................*.......................... + // sqrdmulh v17.4S, v28.4S, v23.S[1] // .........................................................................................................................*........................... + // mls v10.4S, v24.4S, v8.S[0] // .................................................................................................................................................*... + // sub v24.4S, v7.4S, v6.4S // ...................................................................................................................................*................. + // add v7.4S, v7.4S, v6.4S // .................................................................................................................*................................... + // mls v13.4S, v17.4S, v8.S[0] // ..............................................................................................................................*...................... + // mul v6.4S, v24.4S, v23.S[0] // .........................................................................................................................................*........... + // sqrdmulh v24.4S, v24.4S, v23.S[1] // .......................................................................................................................................*............. + // str q0, [x1], #(16*4) // ...............................................................................................................................................*..... 
+ // mls v6.4S, v24.4S, v8.S[0] // ..............................................................................................................................................*...... + // str q30, [x1, #-48] // ............................................................................................................................*........................ + // str q1, [x1, #-32] // .............................................................................................................................................*....... + // str q7, [x1, #-16] // ....................................................................................................................*................................ + // add x1, x1, #64 // ................................................................................................................................................*.... + // str q12, [x2], #(16*4) // ......................................................................................................................................*.............. + // str q13, [x2, #-48] // ..................................................................................................................................*.................. + // str q10, [x2, #-32] // ...................................................................................................................................................*. + // str q6, [x2, #-16] // ..................................................................................................................................................*.. 
+ // add x2, x2, #64 // ....................................................................................................................................................* // ----------------------------------------------------------------------------- ninv .req v25 ninv_tw .req v26 + modulus_half .req v30 + neg_modulus_half .req v31 ASM_LOAD(xtmp, ninv_addr) ld1r {ninv.4s}, [xtmp] ASM_LOAD(xtmp, ninv_tw_addr) ld1r {ninv_tw.4s}, [xtmp] + ushr modulus_half.4S, consts.4S, #1 + neg neg_modulus_half.4S, modulus_half.4S + mov count, #8 ASM_LOAD(r_ptr0, roots_l012) load_roots_123 .p2align 2 - ldr q13, [x0, #768] // .....*...... + ldr q12, [x0, #256] // ..*......... // gap // ............ // gap // ............ // gap // ............ - ldr q7, [x0, #896] // .......*.... + ldr q21, [x0, #384] // ........*... // gap // ............ // gap // ............ // gap // ............ - ldr q4, [x0, #512] // ...*........ + ldr q15, [x0, #896] // .......*.... // gap // ............ // gap // ............ // gap // ............ - ldr q11, [x0, #640] // ....*....... + ldr q17, [x0, #512] // ...*........ // gap // ............ // gap // ............ // gap // ............ - ldr q20, [x0, #384] // ........*... + ldr q27, [x0, #768] // .....*...... // gap // ............ // gap // ............ // gap // ............ - ldr q21, [x0, #256] // ..*......... + ldr q28, [x0, #640] // ....*....... // gap // ............ // gap // ............ // gap // ............ - add v19.4S, v4.4S, v11.4S // ......*..... + add v19.4S, v12.4S, v21.4S // ..........*. // gap // ............ - add v6.4S, v13.4S, v7.4S // .........*.. + add v11.4S, v27.4S, v15.4S // .........*.. // gap // ............ - ldr q17, [x0, #0] // *........... + add v18.4S, v17.4S, v28.4S // ......*..... // gap // ............ + ldr q7, [x0, #0] // *........... // gap // ............ // gap // ............ - add v30.4S, v21.4S, v20.4S // ..........*. // gap // ............ 
- add v10.4S, v19.4S, v6.4S // ...........* + add v4.4S, v18.4S, v11.4S // ...........* // gap // ............ - ldr q23, [x0, #128] // .*.......... + ldr q6, [x0, #128] // .*.......... // gap // ............ // original source code - // ldr q17, [x0, #0] // ........*... - // ldr q23, [x0, #128] // ...........* - // ldr q21, [x0, #256] // .....*...... - // ldr q4, [x0, #512] // ..*......... - // ldr q11, [x0, #640] // ...*........ - // ldr q13, [x0, #768] // *........... - // add v19.4S, v4.4S, v11.4S // ......*..... - // ldr q7, [x0, #896] // .*.......... - // ldr q20, [x0, #384] // ....*....... - // add v6.4S, v13.4S, v7.4S // .......*.... - // add v30.4S, v21.4S, v20.4S // .........*.. - // add v10.4S, v19.4S, v6.4S // ..........*. + // ldr q7, [x0, #0] // .........*.. + // ldr q6, [x0, #128] // ...........* + // ldr q12, [x0, #256] // *........... + // ldr q17, [x0, #512] // ...*........ + // ldr q28, [x0, #640] // .....*...... + // ldr q27, [x0, #768] // ....*....... + // add v18.4S, v17.4S, v28.4S // ........*... + // ldr q15, [x0, #896] // ..*......... + // ldr q21, [x0, #384] // .*.......... + // add v11.4S, v27.4S, v15.4S // .......*.... + // add v19.4S, v12.4S, v21.4S // ......*..... + // add v4.4S, v18.4S, v11.4S // ..........*. sub count, count, #1 layer123_start: - sub v16.4S, v17.4S, v23.4S // ........*....................................................................................... - // gap // ................................................................................................ - add v23.4S, v17.4S, v23.4S // .........*...................................................................................... - // gap // ................................................................................................ - sub v21.4S, v21.4S, v20.4S // .............*.................................................................................. 
- // gap // ................................................................................................ - mul v20.4S, v16.4S, v1.S[2] // ..........*..................................................................................... - // gap // ................................................................................................ - sqrdmulh v16.4S, v16.4S, v1.S[3] // ...........*.................................................................................... - // gap // ................................................................................................ - sub v17.4S, v23.4S, v30.4S // ............................*................................................................... - // gap // ................................................................................................ - add v23.4S, v23.4S, v30.4S // .............................*.................................................................. - // gap // ................................................................................................ - mul v30.4S, v21.4S, v2.S[0] // ...............*................................................................................ - // gap // ................................................................................................ - sqrdmulh v21.4S, v21.4S, v2.S[1] // ................*............................................................................... - // gap // ................................................................................................ - mls v20.4S, v16.4S, v8.S[0] // ............*................................................................................... - // gap // ................................................................................................ - sub v16.4S, v4.4S, v11.4S // ..................*............................................................................. 
- // gap // ................................................................................................ - mul v4.4S, v17.4S, v0.S[2] // ..............................*................................................................. - // gap // ................................................................................................ - sqrdmulh v17.4S, v17.4S, v0.S[3] // ...............................*................................................................ - // gap // ................................................................................................ - sub v11.4S, v23.4S, v10.4S // ................................................*............................................... - // gap // ................................................................................................ - add v23.4S, v23.4S, v10.4S // .................................................*.............................................. - // gap // ................................................................................................ - mls v30.4S, v21.4S, v8.S[0] // .................*.............................................................................. - // gap // ................................................................................................ - mul v21.4S, v16.4S, v2.S[2] // ....................*........................................................................... - // gap // ................................................................................................ - sqrdmulh v16.4S, v16.4S, v2.S[3] // .....................*.......................................................................... - // gap // ................................................................................................ - sub v13.4S, v13.4S, v7.4S // .......................*........................................................................ 
- // gap // ................................................................................................ - sub v7.4S, v20.4S, v30.4S // .................................*.............................................................. - // gap // ................................................................................................ - add v20.4S, v20.4S, v30.4S // ..................................*............................................................. - // gap // ................................................................................................ - mls v21.4S, v16.4S, v8.S[0] // ......................*......................................................................... - // gap // ................................................................................................ - mul v16.4S, v13.4S, v3.S[0] // .........................*...................................................................... - // gap // ................................................................................................ - mls v4.4S, v17.4S, v8.S[0] // ................................*............................................................... - // gap // ................................................................................................ - sqrdmulh v17.4S, v13.4S, v3.S[1] // ..........................*..................................................................... - // gap // ................................................................................................ - mul v30.4S, v7.4S, v0.S[2] // ...................................*............................................................ - // gap // ................................................................................................ - sqrdmulh v13.4S, v7.4S, v0.S[3] // ....................................*........................................................... 
- // gap // ................................................................................................ - mul v7.4S, v11.4S, v0.S[0] // ..................................................*............................................. - // gap // ................................................................................................ - sqrdmulh v11.4S, v11.4S, v0.S[1] // ...................................................*............................................ - // gap // ................................................................................................ - mul v10.4S, v23.4S, v25.4S // ................................................................................*............... - // gap // ................................................................................................ - sqrdmulh v23.4S, v23.4S, v26.4S // .................................................................................*.............. - // gap // ................................................................................................ - mls v16.4S, v17.4S, v8.S[0] // ...........................*.................................................................... - // gap // ................................................................................................ - mls v30.4S, v13.4S, v8.S[0] // .....................................*.......................................................... - // gap // ................................................................................................ - sub v17.4S, v19.4S, v6.4S // ......................................*......................................................... - // gap // ................................................................................................ - mls v7.4S, v11.4S, v8.S[0] // ....................................................*........................................... 
- // gap // ................................................................................................ - sub v11.4S, v21.4S, v16.4S // ...........................................*.................................................... - // gap // ................................................................................................ - mul v13.4S, v17.4S, v1.S[0] // ........................................*....................................................... - // gap // ................................................................................................ - sqrdmulh v17.4S, v17.4S, v1.S[1] // .........................................*...................................................... - // gap // ................................................................................................ - add v16.4S, v21.4S, v16.4S // ............................................*................................................... - // gap // ................................................................................................ - mul v21.4S, v11.4S, v1.S[0] // .............................................*.................................................. - // gap // ................................................................................................ - sqrdmulh v11.4S, v11.4S, v1.S[1] // ..............................................*................................................. - // gap // ................................................................................................ - sub v19.4S, v20.4S, v16.4S // .....................................................*.......................................... - // gap // ................................................................................................ - add v16.4S, v20.4S, v16.4S // ......................................................*......................................... 
- // gap // ................................................................................................ - mls v13.4S, v17.4S, v8.S[0] // ..........................................*..................................................... - // gap // ................................................................................................ - mls v21.4S, v11.4S, v8.S[0] // ...............................................*................................................ - // gap // ................................................................................................ - mul v20.4S, v19.4S, v0.S[0] // .......................................................*........................................ - // gap // ................................................................................................ - sqrdmulh v17.4S, v19.4S, v0.S[1] // ........................................................*....................................... - // gap // ................................................................................................ - sub v11.4S, v4.4S, v13.4S // ..........................................................*..................................... - // gap // ................................................................................................ - add v4.4S, v4.4S, v13.4S // ...........................................................*.................................... - // gap // ................................................................................................ - sub v13.4S, v30.4S, v21.4S // ...............................................................*................................ - // gap // ................................................................................................ - mls v20.4S, v17.4S, v8.S[0] // .........................................................*...................................... 
- // gap // ................................................................................................ - mul v17.4S, v11.4S, v0.S[0] // ............................................................*................................... - // gap // ................................................................................................ - sqrdmulh v11.4S, v11.4S, v0.S[1] // .............................................................*.................................. - // gap // ................................................................................................ - add v21.4S, v30.4S, v21.4S // ................................................................*............................... - // gap // ................................................................................................ - mul v30.4S, v13.4S, v0.S[0] // .................................................................*.............................. - // gap // ................................................................................................ - sqrdmulh v13.4S, v13.4S, v0.S[1] // ..................................................................*............................. - // gap // ................................................................................................ - mls v17.4S, v11.4S, v8.S[0] // ..............................................................*................................. - // gap // ................................................................................................ - srshr v11.4S, v7.4S, #23 // ....................................................................*........................... - // gap // ................................................................................................ - srshr v19.4S, v20.4S, #23 // ......................................................................*......................... 
- // gap // ................................................................................................ - mls v10.4S, v23.4S, v8.S[0] // ..................................................................................*............. - // gap // ................................................................................................ - mls v30.4S, v13.4S, v8.S[0] // ...................................................................*............................ - // gap // ................................................................................................ - mls v7.4S, v11.4S, v8.4S // .....................................................................*.......................... - // gap // ................................................................................................ - mls v20.4S, v19.4S, v8.4S // .......................................................................*........................ - // gap // ................................................................................................ - srshr v23.4S, v17.4S, #23 // ........................................................................*....................... - // gap // ................................................................................................ - srshr v11.4S, v30.4S, #23 // ..........................................................................*..................... - // gap // ................................................................................................ - str q7, [x0, #512] // ............................................................................*................... - // gap // ................................................................................................ - mls v17.4S, v23.4S, v8.4S // .........................................................................*...................... 
- // gap // ................................................................................................ - mls v30.4S, v11.4S, v8.4S // ...........................................................................*.................... - // gap // ................................................................................................ - str q20, [x0, #640] // .............................................................................*.................. - // gap // ................................................................................................ - mul v23.4S, v16.4S, v25.4S // ...................................................................................*............ - // gap // ................................................................................................ - str q17, [x0, #768] // ..............................................................................*................. - // gap // ................................................................................................ - sqrdmulh v16.4S, v16.4S, v26.4S // ....................................................................................*........... - // gap // ................................................................................................ - str q30, [x0, #896] // ...............................................................................*................ - // gap // ................................................................................................ - mul v20.4S, v4.4S, v25.4S // ......................................................................................*......... - // gap // ................................................................................................ - sqrdmulh v17.4S, v4.4S, v26.4S // .......................................................................................*........ - // gap // ................................................................................................ 
- mls v23.4S, v16.4S, v8.S[0] // .....................................................................................*.......... - // gap // ................................................................................................ - mul v16.4S, v21.4S, v25.4S // .........................................................................................*...... - // gap // ................................................................................................ - sqrdmulh v21.4S, v21.4S, v26.4S // ..........................................................................................*..... - // gap // ................................................................................................ - mls v20.4S, v17.4S, v8.S[0] // ........................................................................................*....... - // gap // ................................................................................................ - str q10, [x0], #(16) // ............................................................................................*... - // gap // ................................................................................................ - ldr q17, [x0, #0] // e............................................................................................... - // gap // ................................................................................................ - // gap // ................................................................................................ - // gap // ................................................................................................ - mls v16.4S, v21.4S, v8.S[0] // ...........................................................................................*.... - // gap // ................................................................................................ - str q23, [x0, #112] // .............................................................................................*.. 
- // gap // ................................................................................................ - ldr q23, [x0, #128] // .e.............................................................................................. - // gap // ................................................................................................ - // gap // ................................................................................................ - // gap // ................................................................................................ - str q20, [x0, #240] // ..............................................................................................*. - // gap // ................................................................................................ - ldr q21, [x0, #256] // ..e............................................................................................. - // gap // ................................................................................................ - // gap // ................................................................................................ - // gap // ................................................................................................ - ldr q4, [x0, #512] // ....e........................................................................................... - // gap // ................................................................................................ - // gap // ................................................................................................ - // gap // ................................................................................................ - ldr q11, [x0, #640] // .....e.......................................................................................... - // gap // ................................................................................................ 
- // gap // ................................................................................................ - // gap // ................................................................................................ - ldr q13, [x0, #768] // ......e......................................................................................... - // gap // ................................................................................................ - // gap // ................................................................................................ - // gap // ................................................................................................ - add v19.4S, v4.4S, v11.4S // ...................e............................................................................ - // gap // ................................................................................................ - ldr q7, [x0, #896] // .......e........................................................................................ - // gap // ................................................................................................ - // gap // ................................................................................................ - // gap // ................................................................................................ - ldr q20, [x0, #384] // ...e............................................................................................ - // gap // ................................................................................................ - // gap // ................................................................................................ - // gap // ................................................................................................ - add v6.4S, v13.4S, v7.4S // ........................e....................................................................... 
- // gap // ................................................................................................ - str q16, [x0, #368] // ...............................................................................................* - // gap // ................................................................................................ - add v30.4S, v21.4S, v20.4S // ..............e................................................................................. - // gap // ................................................................................................ - add v10.4S, v19.4S, v6.4S // .......................................e........................................................ - // gap // ................................................................................................ + sub v10.4S, v7.4S, v6.4S // ........*............................................................................................................... + // gap // ........................................................................................................................ + add v24.4S, v7.4S, v6.4S // .........*.............................................................................................................. + // gap // ........................................................................................................................ + sub v7.4S, v12.4S, v21.4S // .............*.......................................................................................................... + // gap // ........................................................................................................................ + mul v13.4S, v10.4S, v1.S[2] // ..........*............................................................................................................. + // gap // ........................................................................................................................ 
+ sqrdmulh v10.4S, v10.4S, v1.S[3] // ...........*............................................................................................................ + // gap // ........................................................................................................................ + sub v6.4S, v24.4S, v19.4S // ............................*........................................................................................... + // gap // ........................................................................................................................ + add v24.4S, v24.4S, v19.4S // .............................*.......................................................................................... + // gap // ........................................................................................................................ + mul v12.4S, v7.4S, v2.S[0] // ...............*........................................................................................................ + // gap // ........................................................................................................................ + sqrdmulh v7.4S, v7.4S, v2.S[1] // ................*....................................................................................................... + // gap // ........................................................................................................................ + mls v13.4S, v10.4S, v8.S[0] // ............*........................................................................................................... + // gap // ........................................................................................................................ + sub v10.4S, v17.4S, v28.4S // ..................*..................................................................................................... 
+ // gap // ........................................................................................................................ + mul v17.4S, v6.4S, v0.S[2] // ..............................*......................................................................................... + // gap // ........................................................................................................................ + sqrdmulh v6.4S, v6.4S, v0.S[3] // ...............................*........................................................................................ + // gap // ........................................................................................................................ + sub v28.4S, v24.4S, v4.4S // ................................................*....................................................................... + // gap // ........................................................................................................................ + add v24.4S, v24.4S, v4.4S // .................................................*...................................................................... + // gap // ........................................................................................................................ + mls v12.4S, v7.4S, v8.S[0] // .................*...................................................................................................... + // gap // ........................................................................................................................ + mul v7.4S, v10.4S, v2.S[2] // ....................*................................................................................................... + // gap // ........................................................................................................................ 
+ sqrdmulh v10.4S, v10.4S, v2.S[3] // .....................*.................................................................................................. + // gap // ........................................................................................................................ + sub v27.4S, v27.4S, v15.4S // .......................*................................................................................................ + // gap // ........................................................................................................................ + sub v15.4S, v13.4S, v12.4S // .................................*...................................................................................... + // gap // ........................................................................................................................ + add v13.4S, v13.4S, v12.4S // ..................................*..................................................................................... + // gap // ........................................................................................................................ + mls v7.4S, v10.4S, v8.S[0] // ......................*................................................................................................. + // gap // ........................................................................................................................ + mul v10.4S, v27.4S, v3.S[0] // .........................*.............................................................................................. + // gap // ........................................................................................................................ + mls v17.4S, v6.4S, v8.S[0] // ................................*....................................................................................... 
+ // gap // ........................................................................................................................ + sqrdmulh v6.4S, v27.4S, v3.S[1] // ..........................*............................................................................................. + // gap // ........................................................................................................................ + mul v12.4S, v15.4S, v0.S[2] // ...................................*.................................................................................... + // gap // ........................................................................................................................ + sqrdmulh v27.4S, v15.4S, v0.S[3] // ....................................*................................................................................... + // gap // ........................................................................................................................ + mul v15.4S, v28.4S, v0.S[0] // ..................................................*..................................................................... + // gap // ........................................................................................................................ + sqrdmulh v28.4S, v28.4S, v0.S[1] // ...................................................*.................................................................... + // gap // ........................................................................................................................ + mul v21.4S, v24.4S, v25.4S // ........................................................................................*............................... + // gap // ........................................................................................................................ 
+ sqrdmulh v24.4S, v24.4S, v26.4S // .........................................................................................*.............................. + // gap // ........................................................................................................................ + mls v10.4S, v6.4S, v8.S[0] // ...........................*............................................................................................ + // gap // ........................................................................................................................ + mls v12.4S, v27.4S, v8.S[0] // .....................................*.................................................................................. + // gap // ........................................................................................................................ + sub v6.4S, v18.4S, v11.4S // ......................................*................................................................................. + // gap // ........................................................................................................................ + mls v15.4S, v28.4S, v8.S[0] // ....................................................*................................................................... + // gap // ........................................................................................................................ + sub v28.4S, v7.4S, v10.4S // ...........................................*............................................................................ + // gap // ........................................................................................................................ + mul v27.4S, v6.4S, v1.S[0] // ........................................*............................................................................... 
+ // gap // ........................................................................................................................ + sqrdmulh v6.4S, v6.4S, v1.S[1] // .........................................*.............................................................................. + // gap // ........................................................................................................................ + add v10.4S, v7.4S, v10.4S // ............................................*........................................................................... + // gap // ........................................................................................................................ + mul v7.4S, v28.4S, v1.S[0] // .............................................*.......................................................................... + // gap // ........................................................................................................................ + sqrdmulh v28.4S, v28.4S, v1.S[1] // ..............................................*......................................................................... + // gap // ........................................................................................................................ + sub v18.4S, v13.4S, v10.4S // .....................................................*.................................................................. + // gap // ........................................................................................................................ + add v10.4S, v13.4S, v10.4S // ......................................................*................................................................. + // gap // ........................................................................................................................ 
+ mls v27.4S, v6.4S, v8.S[0] // ..........................................*............................................................................. + // gap // ........................................................................................................................ + mls v7.4S, v28.4S, v8.S[0] // ...............................................*........................................................................ + // gap // ........................................................................................................................ + mul v13.4S, v18.4S, v0.S[0] // .......................................................*................................................................ + // gap // ........................................................................................................................ + sqrdmulh v6.4S, v18.4S, v0.S[1] // ........................................................*............................................................... + // gap // ........................................................................................................................ + sub v28.4S, v17.4S, v27.4S // ..........................................................*............................................................. + // gap // ........................................................................................................................ + add v17.4S, v17.4S, v27.4S // ...........................................................*............................................................ + // gap // ........................................................................................................................ + sub v27.4S, v12.4S, v7.4S // ...............................................................*........................................................ 
+ // gap // ........................................................................................................................ + mls v13.4S, v6.4S, v8.S[0] // .........................................................*.............................................................. + // gap // ........................................................................................................................ + mul v6.4S, v28.4S, v0.S[0] // ............................................................*........................................................... + // gap // ........................................................................................................................ + sqrdmulh v28.4S, v28.4S, v0.S[1] // .............................................................*.......................................................... + // gap // ........................................................................................................................ + add v7.4S, v12.4S, v7.4S // ................................................................*....................................................... + // gap // ........................................................................................................................ + mul v12.4S, v27.4S, v0.S[0] // .................................................................*...................................................... + // gap // ........................................................................................................................ + sqrdmulh v27.4S, v27.4S, v0.S[1] // ..................................................................*..................................................... + // gap // ........................................................................................................................ 
+ mls v6.4S, v28.4S, v8.S[0] // ..............................................................*......................................................... + // gap // ........................................................................................................................ + cmge v28.4S, v31.4S, v15.4S // ....................................................................*................................................... + // gap // ........................................................................................................................ + cmge v18.4S, v15.4S, v30.4S // .....................................................................*.................................................. + // gap // ........................................................................................................................ + mls v21.4S, v24.4S, v8.S[0] // ..........................................................................................*............................. + // gap // ........................................................................................................................ + mls v12.4S, v27.4S, v8.S[0] // ...................................................................*.................................................... + // gap // ........................................................................................................................ + sub v24.4S, v28.4S, v18.4S // ......................................................................*................................................. + // gap // ........................................................................................................................ + cmge v28.4S, v31.4S, v13.4S // ........................................................................*............................................... 
+ // gap // ........................................................................................................................ + cmge v27.4S, v13.4S, v30.4S // .........................................................................*.............................................. + // gap // ........................................................................................................................ + mls v15.4S, v24.4S, v8.4S // .......................................................................*................................................ + // gap // ........................................................................................................................ + sub v24.4S, v28.4S, v27.4S // ..........................................................................*............................................. + // gap // ........................................................................................................................ + cmge v28.4S, v31.4S, v6.4S // ............................................................................*........................................... + // gap // ........................................................................................................................ + cmge v27.4S, v6.4S, v30.4S // .............................................................................*.......................................... + // gap // ........................................................................................................................ + mls v13.4S, v24.4S, v8.4S // ...........................................................................*............................................ + // gap // ........................................................................................................................ 
+ sub v24.4S, v28.4S, v27.4S // ..............................................................................*......................................... + // gap // ........................................................................................................................ + cmge v28.4S, v31.4S, v12.4S // ................................................................................*....................................... + // gap // ........................................................................................................................ + cmge v27.4S, v12.4S, v30.4S // .................................................................................*...................................... + // gap // ........................................................................................................................ + mls v6.4S, v24.4S, v8.4S // ...............................................................................*........................................ + // gap // ........................................................................................................................ + sub v24.4S, v28.4S, v27.4S // ..................................................................................*..................................... + // gap // ........................................................................................................................ + str q15, [x0, #512] // ....................................................................................*................................... + // gap // ........................................................................................................................ + mul v28.4S, v10.4S, v25.4S // ...........................................................................................*............................ + // gap // ........................................................................................................................ 
+ mls v12.4S, v24.4S, v8.4S // ...................................................................................*.................................... + // gap // ........................................................................................................................ + str q13, [x0, #640] // .....................................................................................*.................................. + // gap // ........................................................................................................................ + sqrdmulh v10.4S, v10.4S, v26.4S // ............................................................................................*........................... + // gap // ........................................................................................................................ + str q6, [x0, #768] // ......................................................................................*................................. + // gap // ........................................................................................................................ + mul v24.4S, v17.4S, v25.4S // ..............................................................................................*......................... + // gap // ........................................................................................................................ + str q12, [x0, #896] // .......................................................................................*................................ + // gap // ........................................................................................................................ + mls v28.4S, v10.4S, v8.S[0] // .............................................................................................*.......................... + // gap // ........................................................................................................................ 
+ sqrdmulh v10.4S, v17.4S, v26.4S // ...............................................................................................*........................ + // gap // ........................................................................................................................ + mul v13.4S, v7.4S, v25.4S // .................................................................................................*...................... + // gap // ........................................................................................................................ + sqrdmulh v7.4S, v7.4S, v26.4S // ..................................................................................................*..................... + // gap // ........................................................................................................................ + cmge v6.4S, v31.4S, v21.4S // ....................................................................................................*................... + // gap // ........................................................................................................................ + mls v24.4S, v10.4S, v8.S[0] // ................................................................................................*....................... + // gap // ........................................................................................................................ + cmge v10.4S, v21.4S, v30.4S // .....................................................................................................*.................. + // gap // ........................................................................................................................ + mls v13.4S, v7.4S, v8.S[0] // ...................................................................................................*.................... 
+ // gap // ........................................................................................................................ + sub v10.4S, v6.4S, v10.4S // ......................................................................................................*................. + // gap // ........................................................................................................................ + cmge v7.4S, v31.4S, v28.4S // ........................................................................................................*............... + // gap // ........................................................................................................................ + cmge v6.4S, v28.4S, v30.4S // .........................................................................................................*.............. + // gap // ........................................................................................................................ + mls v21.4S, v10.4S, v8.4S // .......................................................................................................*................ + // gap // ........................................................................................................................ + sub v10.4S, v7.4S, v6.4S // ..........................................................................................................*............. + // gap // ........................................................................................................................ + cmge v7.4S, v31.4S, v24.4S // ............................................................................................................*........... + // gap // ........................................................................................................................ + cmge v6.4S, v24.4S, v30.4S // .............................................................................................................*.......... 
+ // gap // ........................................................................................................................ + mls v28.4S, v10.4S, v8.4S // ...........................................................................................................*............ + // gap // ........................................................................................................................ + sub v10.4S, v7.4S, v6.4S // ..............................................................................................................*......... + // gap // ........................................................................................................................ + cmge v7.4S, v31.4S, v13.4S // ................................................................................................................*....... + // gap // ........................................................................................................................ + cmge v6.4S, v13.4S, v30.4S // .................................................................................................................*...... + // gap // ........................................................................................................................ + mls v24.4S, v10.4S, v8.4S // ...............................................................................................................*........ + // gap // ........................................................................................................................ + sub v10.4S, v7.4S, v6.4S // ..................................................................................................................*..... + // gap // ........................................................................................................................ + str q21, [x0], #(16) // ....................................................................................................................*... 
+ // gap // ........................................................................................................................ + ldr q7, [x0, #0] // e....................................................................................................................... + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + mls v13.4S, v10.4S, v8.4S // ...................................................................................................................*.... + // gap // ........................................................................................................................ + str q28, [x0, #112] // .....................................................................................................................*.. + // gap // ........................................................................................................................ + ldr q6, [x0, #128] // .e...................................................................................................................... + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + str q24, [x0, #240] // ......................................................................................................................*. 
+ // gap // ........................................................................................................................ + ldr q12, [x0, #256] // ..e..................................................................................................................... + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + ldr q17, [x0, #512] // ....e................................................................................................................... + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + ldr q28, [x0, #640] // .....e.................................................................................................................. + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + ldr q27, [x0, #768] // ......e................................................................................................................. 
+ // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + add v18.4S, v17.4S, v28.4S // ...................e.................................................................................................... + // gap // ........................................................................................................................ + ldr q15, [x0, #896] // .......e................................................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + ldr q21, [x0, #384] // ...e.................................................................................................................... + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + add v11.4S, v27.4S, v15.4S // ........................e............................................................................................... 
+ // gap // ........................................................................................................................ + str q13, [x0, #368] // .......................................................................................................................* + // gap // ........................................................................................................................ + add v19.4S, v12.4S, v21.4S // ..............e......................................................................................................... + // gap // ........................................................................................................................ + add v4.4S, v18.4S, v11.4S // .......................................e................................................................................ + // gap // ........................................................................................................................ // original source code - // ldr q9, [x0, #0] // e...............|...............................................................................e............. - // ldr q10, [x0, #(1*(1024/8))] // ...e............|..................................................................................e.......... - // ldr q11, [x0, #(2*(1024/8))] // .....e..........|....................................................................................e........ - // ldr q12, [x0, #(3*(1024/8))] // ...........e....|..........................................................................................e.. - // ldr q13, [x0, #(4*(1024/8))] // ......e.........|.....................................................................................e....... - // ldr q14, [x0, #(5*(1024/8))] // .......e........|......................................................................................e...... 
- // ldr q15, [x0, #(6*(1024/8))] // ........e.......|.......................................................................................e..... - // ldr q16, [x0, #(7*(1024/8))] // ..........e.....|.........................................................................................e... - // sub v24.4s, v9.4s, v10.4s // ................*............................................................................................. - // add v9.4s, v9.4s, v10.4s // ................|*............................................................................................ - // mul v10.4s, v24.4s, v1.s[2] // ................|..*.......................................................................................... - // sqrdmulh v24.4s, v24.4s, v1.s[3] // ................|...*......................................................................................... - // mls v10.4s, v24.4s, v8.s[0] // ................|........*.................................................................................... - // sub v24.4s, v11.4s, v12.4s // ................|.*........................................................................................... - // add v11.4s, v11.4s, v12.4s // ..............e.|............................................................................................. - // mul v12.4s, v24.4s, v2.s[0] // ................|......*...................................................................................... - // sqrdmulh v24.4s, v24.4s, v2.s[1] // ................|.......*..................................................................................... - // mls v12.4s, v24.4s, v8.s[0] // ................|..............*.............................................................................. - // sub v24.4s, v13.4s, v14.4s // ................|.........*................................................................................... 
- // add v13.4s, v13.4s, v14.4s // .........e......|........................................................................................e.... - // mul v14.4s, v24.4s, v2.s[2] // ................|...............*............................................................................. - // sqrdmulh v24.4s, v24.4s, v2.s[3] // ................|................*............................................................................ - // mls v14.4s, v24.4s, v8.s[0] // ................|....................*........................................................................ - // sub v24.4s, v15.4s, v16.4s // ................|.................*........................................................................... - // add v15.4s, v15.4s, v16.4s // ............e...|...........................................................................................e. - // mul v16.4s, v24.4s, v3.s[0] // ................|.....................*....................................................................... - // sqrdmulh v24.4s, v24.4s, v3.s[1] // ................|.......................*..................................................................... - // mls v16.4s, v24.4s, v8.s[0] // ................|..............................*.............................................................. - // sub v24.4s, v9.4s, v11.4s // ................|....*........................................................................................ - // add v9.4s, v9.4s, v11.4s // ................|.....*....................................................................................... - // mul v11.4s, v24.4s, v0.s[2] // ................|..........*.................................................................................. - // sqrdmulh v24.4s, v24.4s, v0.s[3] // ................|...........*................................................................................. 
- // mls v11.4s, v24.4s, v8.s[0] // ................|......................*...................................................................... - // sub v24.4s, v10.4s, v12.4s // ................|..................*.......................................................................... - // add v10.4s, v10.4s, v12.4s // ................|...................*......................................................................... - // mul v12.4s, v24.4s, v0.s[2] // ................|........................*.................................................................... - // sqrdmulh v24.4s, v24.4s, v0.s[3] // ................|.........................*................................................................... - // mls v12.4s, v24.4s, v8.s[0] // ................|...............................*............................................................. - // sub v24.4s, v13.4s, v15.4s // ................|................................*............................................................ - // add v13.4s, v13.4s, v15.4s // ...............e|............................................................................................. - // mul v15.4s, v24.4s, v1.s[0] // ................|...................................*......................................................... - // sqrdmulh v24.4s, v24.4s, v1.s[1] // ................|....................................*........................................................ - // mls v15.4s, v24.4s, v8.s[0] // ................|..........................................*.................................................. - // sub v24.4s, v14.4s, v16.4s // ................|..................................*.......................................................... - // add v14.4s, v14.4s, v16.4s // ................|.....................................*....................................................... 
- // mul v16.4s, v24.4s, v1.s[0] // ................|......................................*...................................................... - // sqrdmulh v24.4s, v24.4s, v1.s[1] // ................|.......................................*..................................................... - // mls v16.4s, v24.4s, v8.s[0] // ................|...........................................*................................................. - // sub v24.4s, v9.4s, v13.4s // ................|............*................................................................................ - // add v9.4s, v9.4s, v13.4s // ................|.............*............................................................................... - // mul v13.4s, v24.4s, v0.s[0] // ................|..........................*.................................................................. - // sqrdmulh v24.4s, v24.4s, v0.s[1] // ................|...........................*................................................................. - // mls v13.4s, v24.4s, v8.s[0] // ................|.................................*........................................................... - // sub v24.4s, v10.4s, v14.4s // ................|........................................*.................................................... - // add v10.4s, v10.4s, v14.4s // ................|.........................................*................................................... - // mul v14.4s, v24.4s, v0.s[0] // ................|............................................*................................................ - // sqrdmulh v24.4s, v24.4s, v0.s[1] // ................|.............................................*............................................... - // mls v14.4s, v24.4s, v8.s[0] // ................|.................................................*........................................... 
- // sub v24.4s, v11.4s, v15.4s // ................|..............................................*.............................................. - // add v11.4s, v11.4s, v15.4s // ................|...............................................*............................................. - // mul v15.4s, v24.4s, v0.s[0] // ................|..................................................*.......................................... - // sqrdmulh v24.4s, v24.4s, v0.s[1] // ................|...................................................*......................................... - // mls v15.4s, v24.4s, v8.s[0] // ................|.......................................................*..................................... - // sub v24.4s, v12.4s, v16.4s // ................|................................................*............................................ - // add v12.4s, v12.4s, v16.4s // ................|....................................................*........................................ - // mul v16.4s, v24.4s, v0.s[0] // ................|.....................................................*....................................... - // sqrdmulh v24.4s, v24.4s, v0.s[1] // ................|......................................................*...................................... - // mls v16.4s, v24.4s, v8.s[0] // ................|...........................................................*................................. - // srshr v24.4S, v13.4S, #23 // ................|........................................................*.................................... - // mls v13.4s, v24.4s, v8.4s // ................|............................................................*................................ - // srshr v24.4S, v14.4S, #23 // ................|.........................................................*................................... 
- // mls v14.4s, v24.4s, v8.4s // ................|.............................................................*............................... - // srshr v24.4S, v15.4S, #23 // ................|..............................................................*.............................. - // mls v15.4s, v24.4s, v8.4s // ................|.................................................................*........................... - // srshr v24.4S, v16.4S, #23 // ................|...............................................................*............................. - // mls v16.4s, v24.4s, v8.4s // ................|..................................................................*.......................... - // str q13, [x0, #(4*(1024/8))] // ................|................................................................*............................ - // str q14, [x0, #(5*(1024/8))] // ................|...................................................................*......................... - // str q15, [x0, #(6*(1024/8))] // ................|.....................................................................*....................... - // str q16, [x0, #(7*(1024/8))] // ................|.......................................................................*..................... - // mul v13.4s, v9.4s, v25.4s // ................|............................*................................................................ - // sqrdmulh v9.4s, v9.4s, v26.4s // ................|.............................*............................................................... - // mls v13.4s, v9.4s, v8.s[0] // ................|..........................................................*.................................. - // mul v14.4s, v10.4s, v25.4s // ................|....................................................................*........................ 
- // sqrdmulh v10.4s, v10.4s, v26.4s // ................|......................................................................*...................... - // mls v14.4s, v10.4s, v8.s[0] // ................|..........................................................................*.................. - // mul v15.4s, v11.4s, v25.4s // ................|........................................................................*.................... - // sqrdmulh v11.4s, v11.4s, v26.4s // ................|.........................................................................*................... - // mls v15.4s, v11.4s, v8.s[0] // ................|.............................................................................*............... - // mul v16.4s, v12.4s, v25.4s // ................|...........................................................................*................. - // sqrdmulh v12.4s, v12.4s, v26.4s // ................|............................................................................*................ - // mls v16.4s, v12.4s, v8.s[0] // .*..............|................................................................................*............ - // str q13, [x0], #(16) // ................|..............................................................................*.............. - // str q14, [x0, #(-16 + 1*(1024/8))] // ..*.............|.................................................................................*........... - // str q15, [x0, #(-16 + 2*(1024/8))] // ....*...........|...................................................................................*......... - // str q16, [x0, #(-16 + 3*(1024/8))] // .............*..|............................................................................................* + // ldr q9, [x0, #0] // e...............|.......................................................................................................e............. 
+ // ldr q10, [x0, #(1*(1024/8))] // ...e............|..........................................................................................................e.......... + // ldr q11, [x0, #(2*(1024/8))] // .....e..........|............................................................................................................e........ + // ldr q12, [x0, #(3*(1024/8))] // ...........e....|..................................................................................................................e.. + // ldr q13, [x0, #(4*(1024/8))] // ......e.........|.............................................................................................................e....... + // ldr q14, [x0, #(5*(1024/8))] // .......e........|..............................................................................................................e...... + // ldr q15, [x0, #(6*(1024/8))] // ........e.......|...............................................................................................................e..... + // ldr q16, [x0, #(7*(1024/8))] // ..........e.....|.................................................................................................................e... + // sub v24.4s, v9.4s, v10.4s // ................*..................................................................................................................... + // add v9.4s, v9.4s, v10.4s // ................|*.................................................................................................................... + // mul v10.4s, v24.4s, v1.s[2] // ................|..*.................................................................................................................. + // sqrdmulh v24.4s, v24.4s, v1.s[3] // ................|...*................................................................................................................. 
+ // mls v10.4s, v24.4s, v8.s[0] // ................|........*............................................................................................................ + // sub v24.4s, v11.4s, v12.4s // ................|.*................................................................................................................... + // add v11.4s, v11.4s, v12.4s // ..............e.|..................................................................................................................... + // mul v12.4s, v24.4s, v2.s[0] // ................|......*.............................................................................................................. + // sqrdmulh v24.4s, v24.4s, v2.s[1] // ................|.......*............................................................................................................. + // mls v12.4s, v24.4s, v8.s[0] // ................|..............*...................................................................................................... + // sub v24.4s, v13.4s, v14.4s // ................|.........*........................................................................................................... + // add v13.4s, v13.4s, v14.4s // .........e......|................................................................................................................e.... + // mul v14.4s, v24.4s, v2.s[2] // ................|...............*..................................................................................................... + // sqrdmulh v24.4s, v24.4s, v2.s[3] // ................|................*.................................................................................................... + // mls v14.4s, v24.4s, v8.s[0] // ................|....................*................................................................................................ 
+ // sub v24.4s, v15.4s, v16.4s // ................|.................*................................................................................................... + // add v15.4s, v15.4s, v16.4s // ............e...|...................................................................................................................e. + // mul v16.4s, v24.4s, v3.s[0] // ................|.....................*............................................................................................... + // sqrdmulh v24.4s, v24.4s, v3.s[1] // ................|.......................*............................................................................................. + // mls v16.4s, v24.4s, v8.s[0] // ................|..............................*...................................................................................... + // sub v24.4s, v9.4s, v11.4s // ................|....*................................................................................................................ + // add v9.4s, v9.4s, v11.4s // ................|.....*............................................................................................................... + // mul v11.4s, v24.4s, v0.s[2] // ................|..........*.......................................................................................................... + // sqrdmulh v24.4s, v24.4s, v0.s[3] // ................|...........*......................................................................................................... + // mls v11.4s, v24.4s, v8.s[0] // ................|......................*.............................................................................................. + // sub v24.4s, v10.4s, v12.4s // ................|..................*.................................................................................................. 
+ // add v10.4s, v10.4s, v12.4s // ................|...................*................................................................................................. + // mul v12.4s, v24.4s, v0.s[2] // ................|........................*............................................................................................ + // sqrdmulh v24.4s, v24.4s, v0.s[3] // ................|.........................*........................................................................................... + // mls v12.4s, v24.4s, v8.s[0] // ................|...............................*..................................................................................... + // sub v24.4s, v13.4s, v15.4s // ................|................................*.................................................................................... + // add v13.4s, v13.4s, v15.4s // ...............e|..................................................................................................................... + // mul v15.4s, v24.4s, v1.s[0] // ................|...................................*................................................................................. + // sqrdmulh v24.4s, v24.4s, v1.s[1] // ................|....................................*................................................................................ + // mls v15.4s, v24.4s, v8.s[0] // ................|..........................................*.......................................................................... + // sub v24.4s, v14.4s, v16.4s // ................|..................................*.................................................................................. + // add v14.4s, v14.4s, v16.4s // ................|.....................................*............................................................................... 
+ // mul v16.4s, v24.4s, v1.s[0] // ................|......................................*.............................................................................. + // sqrdmulh v24.4s, v24.4s, v1.s[1] // ................|.......................................*............................................................................. + // mls v16.4s, v24.4s, v8.s[0] // ................|...........................................*......................................................................... + // sub v24.4s, v9.4s, v13.4s // ................|............*........................................................................................................ + // add v9.4s, v9.4s, v13.4s // ................|.............*....................................................................................................... + // mul v13.4s, v24.4s, v0.s[0] // ................|..........................*.......................................................................................... + // sqrdmulh v24.4s, v24.4s, v0.s[1] // ................|...........................*......................................................................................... + // mls v13.4s, v24.4s, v8.s[0] // ................|.................................*................................................................................... + // sub v24.4s, v10.4s, v14.4s // ................|........................................*............................................................................ + // add v10.4s, v10.4s, v14.4s // ................|.........................................*........................................................................... + // mul v14.4s, v24.4s, v0.s[0] // ................|............................................*........................................................................ 
+ // sqrdmulh v24.4s, v24.4s, v0.s[1] // ................|.............................................*....................................................................... + // mls v14.4s, v24.4s, v8.s[0] // ................|.................................................*................................................................... + // sub v24.4s, v11.4s, v15.4s // ................|..............................................*...................................................................... + // add v11.4s, v11.4s, v15.4s // ................|...............................................*..................................................................... + // mul v15.4s, v24.4s, v0.s[0] // ................|..................................................*.................................................................. + // sqrdmulh v24.4s, v24.4s, v0.s[1] // ................|...................................................*................................................................. + // mls v15.4s, v24.4s, v8.s[0] // ................|.......................................................*............................................................. + // sub v24.4s, v12.4s, v16.4s // ................|................................................*.................................................................... + // add v12.4s, v12.4s, v16.4s // ................|....................................................*................................................................ + // mul v16.4s, v24.4s, v0.s[0] // ................|.....................................................*............................................................... + // sqrdmulh v24.4s, v24.4s, v0.s[1] // ................|......................................................*.............................................................. 
+ // mls v16.4s, v24.4s, v8.s[0] // ................|...........................................................*......................................................... + // cmge v27.4s, v31.4s, v13.4s // ................|........................................................*............................................................ + // cmge v28.4s, v13.4s, v30.4s // ................|.........................................................*........................................................... + // sub v28.4s, v27.4s, v28.4s // ................|............................................................*........................................................ + // mls v13.4s, v28.4s, v8.4s // ................|...............................................................*..................................................... + // cmge v27.4s, v31.4s, v14.4s // ................|.............................................................*....................................................... + // cmge v28.4s, v14.4s, v30.4s // ................|..............................................................*...................................................... + // sub v28.4s, v27.4s, v28.4s // ................|................................................................*.................................................... + // mls v14.4s, v28.4s, v8.4s // ................|...................................................................*................................................. + // cmge v27.4s, v31.4s, v15.4s // ................|.................................................................*................................................... + // cmge v28.4s, v15.4s, v30.4s // ................|..................................................................*.................................................. 
+ // sub v28.4s, v27.4s, v28.4s // ................|....................................................................*................................................ + // mls v15.4s, v28.4s, v8.4s // ................|.......................................................................*............................................. + // cmge v27.4s, v31.4s, v16.4s // ................|.....................................................................*............................................... + // cmge v28.4s, v16.4s, v30.4s // ................|......................................................................*.............................................. + // sub v28.4s, v27.4s, v28.4s // ................|........................................................................*............................................ + // mls v16.4s, v28.4s, v8.4s // ................|...........................................................................*......................................... + // str q13, [x0, #(4*(1024/8))] // ................|.........................................................................*........................................... + // str q14, [x0, #(5*(1024/8))] // ................|............................................................................*........................................ + // str q15, [x0, #(6*(1024/8))] // ................|..............................................................................*...................................... + // str q16, [x0, #(7*(1024/8))] // ................|................................................................................*.................................... + // mul v13.4s, v9.4s, v25.4s // ................|............................*........................................................................................ 
+ // sqrdmulh v9.4s, v9.4s, v26.4s // ................|.............................*....................................................................................... + // mls v13.4s, v9.4s, v8.s[0] // ................|..........................................................*.......................................................... + // mul v14.4s, v10.4s, v25.4s // ................|..........................................................................*.......................................... + // sqrdmulh v10.4s, v10.4s, v26.4s // ................|.............................................................................*....................................... + // mls v14.4s, v10.4s, v8.s[0] // ................|.................................................................................*................................... + // mul v15.4s, v11.4s, v25.4s // ................|...............................................................................*..................................... + // sqrdmulh v11.4s, v11.4s, v26.4s // ................|..................................................................................*.................................. + // mls v15.4s, v11.4s, v8.s[0] // ................|......................................................................................*.............................. + // mul v16.4s, v12.4s, v25.4s // ................|...................................................................................*................................. + // sqrdmulh v12.4s, v12.4s, v26.4s // ................|....................................................................................*................................ + // mls v16.4s, v12.4s, v8.s[0] // ................|........................................................................................*............................ 
+ // cmge v27.4s, v31.4s, v13.4s // ................|.....................................................................................*............................... + // cmge v28.4s, v13.4s, v30.4s // ................|.......................................................................................*............................. + // sub v28.4s, v27.4s, v28.4s // ................|.........................................................................................*........................... + // mls v13.4s, v28.4s, v8.4s // ................|............................................................................................*........................ + // cmge v27.4s, v31.4s, v14.4s // ................|..........................................................................................*.......................... + // cmge v28.4s, v14.4s, v30.4s // ................|...........................................................................................*......................... + // sub v28.4s, v27.4s, v28.4s // ................|.............................................................................................*....................... + // mls v14.4s, v28.4s, v8.4s // ................|................................................................................................*.................... + // cmge v27.4s, v31.4s, v15.4s // ................|..............................................................................................*...................... + // cmge v28.4s, v15.4s, v30.4s // ................|...............................................................................................*..................... + // sub v28.4s, v27.4s, v28.4s // ................|.................................................................................................*................... 
+ // mls v15.4s, v28.4s, v8.4s // ................|....................................................................................................*................ + // cmge v27.4s, v31.4s, v16.4s // ................|..................................................................................................*.................. + // cmge v28.4s, v16.4s, v30.4s // ................|...................................................................................................*................. + // sub v28.4s, v27.4s, v28.4s // ................|.....................................................................................................*............... + // mls v16.4s, v28.4s, v8.4s // .*..............|........................................................................................................*............ + // str q13, [x0], #(16) // ................|......................................................................................................*.............. + // str q14, [x0, #(-16 + 1*(1024/8))] // ..*.............|.........................................................................................................*........... + // str q15, [x0, #(-16 + 2*(1024/8))] // ....*...........|...........................................................................................................*......... + // str q16, [x0, #(-16 + 3*(1024/8))] // .............*..|....................................................................................................................* sub count, count, #1 cbnz count, layer123_start - sub v16.4S, v4.4S, v11.4S // ..........*......................................................................... - // gap // .................................................................................... - add v12.4S, v17.4S, v23.4S // .*.................................................................................. 
- // gap // .................................................................................... - sub v29.4S, v21.4S, v20.4S // ..*................................................................................. - // gap // .................................................................................... - mul v21.4S, v16.4S, v2.S[2] // ................*................................................................... - // gap // .................................................................................... - sub v18.4S, v17.4S, v23.4S // *................................................................................... - // gap // .................................................................................... - mul v11.4S, v29.4S, v2.S[0] // .......*............................................................................ - // gap // .................................................................................... - sub v23.4S, v13.4S, v7.4S // ..................*................................................................. - // gap // .................................................................................... - sqrdmulh v16.4S, v16.4S, v2.S[3] // .................*.................................................................. - // gap // .................................................................................... - sqrdmulh v7.4S, v18.4S, v1.S[3] // ....*............................................................................... - // gap // .................................................................................... - mul v20.4S, v23.4S, v3.S[0] // ......................*............................................................. - // gap // .................................................................................... - sqrdmulh v17.4S, v23.4S, v3.S[1] // ........................*........................................................... 
- // gap // .................................................................................... - sqrdmulh v23.4S, v29.4S, v2.S[1] // ........*........................................................................... - // gap // .................................................................................... - mul v4.4S, v18.4S, v1.S[2] // ...*................................................................................ - // gap // .................................................................................... - mls v21.4S, v16.4S, v8.S[0] // .....................*.............................................................. - // gap // .................................................................................... - mls v20.4S, v17.4S, v8.S[0] // ...............................*.................................................... - // gap // .................................................................................... - mls v11.4S, v23.4S, v8.S[0] // ...............*.................................................................... - // gap // .................................................................................... - mls v4.4S, v7.4S, v8.S[0] // .........*.......................................................................... - // gap // .................................................................................... - sub v13.4S, v12.4S, v30.4S // .....*.............................................................................. - // gap // .................................................................................... - sub v23.4S, v21.4S, v20.4S // ...................................*................................................ - // gap // .................................................................................... - add v21.4S, v21.4S, v20.4S // ......................................*............................................. 
- // gap // .................................................................................... - sub v17.4S, v4.4S, v11.4S // ...................*................................................................ - // gap // .................................................................................... - sqrdmulh v16.4S, v23.4S, v1.S[1] // ........................................*........................................... - // gap // .................................................................................... - mul v23.4S, v23.4S, v1.S[0] // .......................................*............................................ - // gap // .................................................................................... - sqrdmulh v20.4S, v17.4S, v0.S[3] // ..........................*......................................................... - // gap // .................................................................................... - mul v17.4S, v17.4S, v0.S[2] // .........................*.......................................................... - // gap // .................................................................................... - mul v24.4S, v13.4S, v0.S[2] // ...........*........................................................................ - // gap // .................................................................................... - mls v23.4S, v16.4S, v8.S[0] // ............................................*....................................... - // gap // .................................................................................... - add v16.4S, v4.4S, v11.4S // ....................*............................................................... - // gap // .................................................................................... - mls v17.4S, v20.4S, v8.S[0] // ................................*................................................... 
- // gap // .................................................................................... - sqrdmulh v18.4S, v13.4S, v0.S[3] // ............*....................................................................... - // gap // .................................................................................... - add v20.4S, v16.4S, v21.4S // ..........................................*......................................... - // gap // .................................................................................... - sub v13.4S, v16.4S, v21.4S // .........................................*.......................................... - // gap // .................................................................................... - add v16.4S, v17.4S, v23.4S // .....................................................*.............................. - // gap // .................................................................................... - sub v23.4S, v17.4S, v23.4S // .................................................*.................................. - // gap // .................................................................................... - sqrdmulh v21.4S, v20.4S, v26.4S // .......................................................................*............ - // gap // .................................................................................... - mul v11.4S, v16.4S, v25.4S // ............................................................................*....... - // gap // .................................................................................... - mul v29.4S, v20.4S, v25.4S // .....................................................................*.............. - // gap // .................................................................................... - sqrdmulh v16.4S, v16.4S, v26.4S // .............................................................................*...... 
- // gap // .................................................................................... - sub v5.4S, v19.4S, v6.4S // .................................*.................................................. - // gap // .................................................................................... - sqrdmulh v20.4S, v23.4S, v0.S[1] // .......................................................*............................ - // gap // .................................................................................... - mls v29.4S, v21.4S, v8.S[0] // ...........................................................................*........ - // gap // .................................................................................... - mls v11.4S, v16.4S, v8.S[0] // ................................................................................*... - // gap // .................................................................................... - sqrdmulh v16.4S, v5.4S, v1.S[1] // .....................................*.............................................. - // gap // .................................................................................... - mul v7.4S, v5.4S, v1.S[0] // ....................................*............................................... - // gap // .................................................................................... - str q29, [x0, #128] // .................................................................................*.. - // gap // .................................................................................... - sqrdmulh v4.4S, v13.4S, v0.S[1] // ..............................................*..................................... - // gap // .................................................................................... - mls v24.4S, v18.4S, v8.S[0] // .......................*............................................................ 
- // gap // .................................................................................... - mls v7.4S, v16.4S, v8.S[0] // ...........................................*........................................ - // gap // .................................................................................... - mul v17.4S, v23.4S, v0.S[0] // ......................................................*............................. - // gap // .................................................................................... - add v22.4S, v12.4S, v30.4S // ......*............................................................................. - // gap // .................................................................................... - mul v13.4S, v13.4S, v0.S[0] // .............................................*...................................... - // gap // .................................................................................... - sub v21.4S, v24.4S, v7.4S // ...............................................*.................................... - // gap // .................................................................................... - sub v16.4S, v22.4S, v10.4S // .............*...................................................................... - // gap // .................................................................................... - add v18.4S, v24.4S, v7.4S // ................................................*................................... - // gap // .................................................................................... - sqrdmulh v30.4S, v21.4S, v0.S[1] // ....................................................*............................... - // gap // .................................................................................... - sqrdmulh v19.4S, v16.4S, v0.S[1] // ............................*....................................................... 
- // gap // .................................................................................... - mul v7.4S, v16.4S, v0.S[0] // ...........................*........................................................ - // gap // .................................................................................... - sqrdmulh v16.4S, v18.4S, v26.4S // ..........................................................................*......... - // gap // .................................................................................... - mul v23.4S, v18.4S, v25.4S // .........................................................................*.......... - // gap // .................................................................................... - mls v17.4S, v20.4S, v8.S[0] // ............................................................*....................... - // gap // .................................................................................... - mls v7.4S, v19.4S, v8.S[0] // ..................................*................................................. - // gap // .................................................................................... - mul v20.4S, v21.4S, v0.S[0] // ...................................................*................................ - // gap // .................................................................................... - mls v23.4S, v16.4S, v8.S[0] // ..............................................................................*..... - // gap // .................................................................................... - add v10.4S, v22.4S, v10.4S // ..............*..................................................................... - // gap // .................................................................................... - srshr v16.4S, v7.4S, #23 // .........................................................*.......................... 
- // gap // .................................................................................... - srshr v21.4S, v17.4S, #23 // ................................................................*................... - // gap // .................................................................................... - str q23, [x0, #256] // ..................................................................................*. - // gap // .................................................................................... - mls v7.4S, v16.4S, v8.4S // .............................................................*...................... - // gap // .................................................................................... - mls v17.4S, v21.4S, v8.4S // ...................................................................*................ - // gap // .................................................................................... - mls v20.4S, v30.4S, v8.S[0] // ........................................................*........................... - // gap // .................................................................................... - mul v22.4S, v10.4S, v25.4S // .............................*...................................................... - // gap // .................................................................................... - str q7, [x0, #512] // .................................................................*.................. - // gap // .................................................................................... - mls v13.4S, v4.4S, v8.S[0] // ..................................................*................................. - // gap // .................................................................................... - str q17, [x0, #896] // ........................................................................*........... - // gap // .................................................................................... 
- srshr v16.4S, v20.4S, #23 // ...............................................................*.................... - // gap // .................................................................................... - sqrdmulh v30.4S, v10.4S, v26.4S // ..............................*..................................................... - // gap // .................................................................................... - srshr v21.4S, v13.4S, #23 // ..........................................................*......................... - // gap // .................................................................................... - mls v20.4S, v16.4S, v8.4S // ..................................................................*................. - // gap // .................................................................................... - str q11, [x0, #384] // ...................................................................................* - // gap // .................................................................................... - mls v22.4S, v30.4S, v8.S[0] // ...........................................................*........................ - // gap // .................................................................................... - mls v13.4S, v21.4S, v8.4S // ..............................................................*..................... - // gap // .................................................................................... - str q20, [x0, #768] // ......................................................................*............. - // gap // .................................................................................... - // gap // .................................................................................... - // gap // .................................................................................... 
- str q22, [x0], #(16) // ...............................................................................*.... - // gap // .................................................................................... - // gap // .................................................................................... - // gap // .................................................................................... - str q13, [x0, #624] // ....................................................................*............... - // gap // .................................................................................... + sub v5.4S, v7.4S, v6.4S // *........................................................................................................... + // gap // ............................................................................................................ + sub v27.4S, v27.4S, v15.4S // ..................*......................................................................................... + // gap // ............................................................................................................ + sub v10.4S, v17.4S, v28.4S // ..........*................................................................................................. + // gap // ............................................................................................................ + sqrdmulh v14.4S, v5.4S, v1.S[3] // ....*....................................................................................................... + // gap // ............................................................................................................ + mul v13.4S, v27.4S, v3.S[0] // ......................*..................................................................................... + // gap // ............................................................................................................ 
+ sub v12.4S, v12.4S, v21.4S // ..*......................................................................................................... + // gap // ............................................................................................................ + mul v15.4S, v10.4S, v2.S[2] // ................*........................................................................................... + // gap // ............................................................................................................ + sqrdmulh v20.4S, v10.4S, v2.S[3] // .................*.......................................................................................... + // gap // ............................................................................................................ + sqrdmulh v10.4S, v27.4S, v3.S[1] // ........................*................................................................................... + // gap // ............................................................................................................ + sqrdmulh v27.4S, v12.4S, v2.S[1] // ........*................................................................................................... + // gap // ............................................................................................................ + mul v17.4S, v12.4S, v2.S[0] // .......*.................................................................................................... + // gap // ............................................................................................................ + mul v29.4S, v5.4S, v1.S[2] // ...*........................................................................................................ + // gap // ............................................................................................................ + mls v13.4S, v10.4S, v8.S[0] // ...............................*............................................................................ 
+ // gap // ............................................................................................................ + mls v15.4S, v20.4S, v8.S[0] // .....................*...................................................................................... + // gap // ............................................................................................................ + mls v17.4S, v27.4S, v8.S[0] // ...............*............................................................................................ + // gap // ............................................................................................................ + mls v29.4S, v14.4S, v8.S[0] // .........*.................................................................................................. + // gap // ............................................................................................................ + add v5.4S, v7.4S, v6.4S // .*.......................................................................................................... + // gap // ............................................................................................................ + sub v10.4S, v15.4S, v13.4S // ...................................*........................................................................ + // gap // ............................................................................................................ + add v28.4S, v15.4S, v13.4S // ......................................*..................................................................... + // gap // ............................................................................................................ + sub v7.4S, v29.4S, v17.4S // ...................*........................................................................................ + // gap // ............................................................................................................ 
+ sqrdmulh v24.4S, v10.4S, v1.S[1] // ........................................*................................................................... + // gap // ............................................................................................................ + mul v21.4S, v10.4S, v1.S[0] // .......................................*.................................................................... + // gap // ............................................................................................................ + sqrdmulh v13.4S, v7.4S, v0.S[3] // ..........................*................................................................................. + // gap // ............................................................................................................ + mul v22.4S, v7.4S, v0.S[2] // .........................*.................................................................................. + // gap // ............................................................................................................ + add v16.4S, v5.4S, v19.4S // ......*..................................................................................................... + // gap // ............................................................................................................ + mls v21.4S, v24.4S, v8.S[0] // ............................................*............................................................... + // gap // ............................................................................................................ + add v14.4S, v29.4S, v17.4S // ....................*....................................................................................... + // gap // ............................................................................................................ + mls v22.4S, v13.4S, v8.S[0] // ................................*........................................................................... 
+ // gap // ............................................................................................................ + add v15.4S, v16.4S, v4.4S // ..............*............................................................................................. + // gap // ............................................................................................................ + add v24.4S, v14.4S, v28.4S // ..........................................*................................................................. + // gap // ............................................................................................................ + sub v18.4S, v18.4S, v11.4S // .................................*.......................................................................... + // gap // ............................................................................................................ + add v10.4S, v22.4S, v21.4S // .....................................................*...................................................... + // gap // ............................................................................................................ + sqrdmulh v12.4S, v24.4S, v26.4S // ..............................................................................*............................. + // gap // ............................................................................................................ + mul v6.4S, v24.4S, v25.4S // ...........................................................................*................................ + // gap // ............................................................................................................ + sqrdmulh v13.4S, v10.4S, v26.4S // .....................................................................................*...................... + // gap // ............................................................................................................ 
+ mul v7.4S, v10.4S, v25.4S // ....................................................................................*....................... + // gap // ............................................................................................................ + mul v27.4S, v15.4S, v25.4S // .............................*.............................................................................. + // gap // ............................................................................................................ + sqrdmulh v10.4S, v15.4S, v26.4S // ..............................*............................................................................. + // gap // ............................................................................................................ + sub v20.4S, v5.4S, v19.4S // .....*...................................................................................................... + // gap // ............................................................................................................ + mls v7.4S, v13.4S, v8.S[0] // .........................................................................................*.................. + // gap // ............................................................................................................ + sqrdmulh v17.4S, v18.4S, v1.S[1] // .....................................*...................................................................... + // gap // ............................................................................................................ + mls v27.4S, v10.4S, v8.S[0] // ...........................................................*................................................ + // gap // ............................................................................................................ + sqrdmulh v11.4S, v20.4S, v0.S[3] // ............*............................................................................................... 
+ // gap // ............................................................................................................ + cmge v10.4S, v7.4S, v30.4S // ....................................................................................................*....... + // gap // ............................................................................................................ + cmge v24.4S, v31.4S, v7.4S // ...................................................................................................*........ + // gap // ............................................................................................................ + cmge v23.4S, v31.4S, v27.4S // ......................................................................................*..................... + // gap // ............................................................................................................ + sub v10.4S, v24.4S, v10.4S // ......................................................................................................*..... + // gap // ............................................................................................................ + mul v13.4S, v18.4S, v1.S[0] // ....................................*....................................................................... + // gap // ............................................................................................................ + cmge v15.4S, v27.4S, v30.4S // ........................................................................................*................... + // gap // ............................................................................................................ + mls v7.4S, v10.4S, v8.4S // ........................................................................................................*... + // gap // ............................................................................................................ 
+ sub v10.4S, v23.4S, v15.4S // ..........................................................................................*................. + // gap // ............................................................................................................ + mls v13.4S, v17.4S, v8.S[0] // ...........................................*................................................................ + // gap // ............................................................................................................ + mul v5.4S, v20.4S, v0.S[2] // ...........*................................................................................................ + // gap // ............................................................................................................ + str q7, [x0, #384] // ...........................................................................................................* + // gap // ............................................................................................................ + mls v27.4S, v10.4S, v8.4S // .............................................................................................*.............. + // gap // ............................................................................................................ + mls v6.4S, v12.4S, v8.S[0] // ..................................................................................*......................... + // gap // ............................................................................................................ + mls v5.4S, v11.4S, v8.S[0] // .......................*.................................................................................... + // gap // ............................................................................................................ + sub v17.4S, v22.4S, v21.4S // .................................................*.......................................................... 
+ // gap // ............................................................................................................ + str q27, [x0], #(16) // .......................................................................................................*.... + // gap // ............................................................................................................ + cmge v24.4S, v31.4S, v6.4S // ...........................................................................................*................ + // gap // ............................................................................................................ + add v7.4S, v5.4S, v13.4S // ................................................*........................................................... + // gap // ............................................................................................................ + sub v27.4S, v5.4S, v13.4S // ...............................................*............................................................ + // gap // ............................................................................................................ + cmge v10.4S, v6.4S, v30.4S // ............................................................................................*............... + // gap // ............................................................................................................ + mul v13.4S, v7.4S, v25.4S // ................................................................................*........................... + // gap // ............................................................................................................ + sqrdmulh v7.4S, v7.4S, v26.4S // ...................................................................................*........................ + // gap // ............................................................................................................ 
+ sub v10.4S, v24.4S, v10.4S // ..............................................................................................*............. + // gap // ............................................................................................................ + sqrdmulh v12.4S, v17.4S, v0.S[1] // .......................................................*.................................................... + // gap // ............................................................................................................ + mul v15.4S, v17.4S, v0.S[0] // ......................................................*..................................................... + // gap // ............................................................................................................ + mls v13.4S, v7.4S, v8.S[0] // .......................................................................................*.................... + // gap // ............................................................................................................ + mls v6.4S, v10.4S, v8.4S // .................................................................................................*.......... + // gap // ............................................................................................................ + sqrdmulh v11.4S, v27.4S, v0.S[1] // ....................................................*....................................................... + // gap // ............................................................................................................ + mls v15.4S, v12.4S, v8.S[0] // ............................................................*............................................... + // gap // ............................................................................................................ + cmge v24.4S, v31.4S, v13.4S // ...............................................................................................*............ 
+ // gap // ............................................................................................................ + cmge v10.4S, v13.4S, v30.4S // ................................................................................................*........... + // gap // ............................................................................................................ + mul v17.4S, v27.4S, v0.S[0] // ...................................................*........................................................ + // gap // ............................................................................................................ + sub v10.4S, v24.4S, v10.4S // ..................................................................................................*......... + // gap // ............................................................................................................ + sub v12.4S, v14.4S, v28.4S // .........................................*.................................................................. + // gap // ............................................................................................................ + cmge v7.4S, v31.4S, v15.4S // ......................................................................*..................................... + // gap // ............................................................................................................ + cmge v24.4S, v15.4S, v30.4S // .......................................................................*.................................... + // gap // ............................................................................................................ + mls v17.4S, v11.4S, v8.S[0] // ........................................................*................................................... + // gap // ............................................................................................................ 
+ sub v24.4S, v7.4S, v24.4S // .........................................................................*.................................. + // gap // ............................................................................................................ + sqrdmulh v28.4S, v12.4S, v0.S[1] // ..............................................*............................................................. + // gap // ............................................................................................................ + mul v27.4S, v12.4S, v0.S[0] // .............................................*.............................................................. + // gap // ............................................................................................................ + cmge v12.4S, v31.4S, v17.4S // ..................................................................*......................................... + // gap // ............................................................................................................ + cmge v7.4S, v17.4S, v30.4S // ...................................................................*........................................ + // gap // ............................................................................................................ + sub v19.4S, v16.4S, v4.4S // .............*.............................................................................................. + // gap // ............................................................................................................ + sub v7.4S, v12.4S, v7.4S // .....................................................................*...................................... + // gap // ............................................................................................................ + mls v27.4S, v28.4S, v8.S[0] // ..................................................*......................................................... 
+ // gap // ............................................................................................................ + sqrdmulh v11.4S, v19.4S, v0.S[1] // ............................*............................................................................... + // gap // ............................................................................................................ + mls v17.4S, v7.4S, v8.4S // ........................................................................*................................... + // gap // ............................................................................................................ + mul v28.4S, v19.4S, v0.S[0] // ...........................*................................................................................ + // gap // ............................................................................................................ + cmge v12.4S, v27.4S, v30.4S // ...............................................................*............................................ + // gap // ............................................................................................................ + cmge v7.4S, v31.4S, v27.4S // ..............................................................*............................................. + // gap // ............................................................................................................ + str q17, [x0, #752] // ...............................................................................*............................ + // gap // ............................................................................................................ + sub v7.4S, v7.4S, v12.4S // .................................................................*.......................................... + // gap // ............................................................................................................ 
+ mls v28.4S, v11.4S, v8.S[0] // ..................................*......................................................................... + // gap // ............................................................................................................ + str q6, [x0, #112] // .........................................................................................................*.. + // gap // ............................................................................................................ + mls v27.4S, v7.4S, v8.4S // ....................................................................*....................................... + // gap // ............................................................................................................ + mls v15.4S, v24.4S, v8.4S // ............................................................................*............................... + // gap // ............................................................................................................ + cmge v6.4S, v31.4S, v28.4S // .........................................................*.................................................. + // gap // ............................................................................................................ + cmge v24.4S, v28.4S, v30.4S // ..........................................................*................................................. + // gap // ............................................................................................................ + str q27, [x0, #624] // .............................................................................*.............................. + // gap // ............................................................................................................ + sub v24.4S, v6.4S, v24.4S // .............................................................*.............................................. 
+ // gap // ............................................................................................................ + mls v13.4S, v10.4S, v8.4S // .....................................................................................................*...... + // gap // ............................................................................................................ + str q15, [x0, #880] // .................................................................................*.......................... + // gap // ............................................................................................................ + mls v28.4S, v24.4S, v8.4S // ................................................................*........................................... + // gap // ............................................................................................................ + // gap // ............................................................................................................ + // gap // ............................................................................................................ + str q13, [x0, #240] // ..........................................................................................................*. + // gap // ............................................................................................................ + // gap // ............................................................................................................ + // gap // ............................................................................................................ + str q28, [x0, #496] // ..........................................................................*................................. + // gap // ............................................................................................................ 
// original source code - // sub v16.4S, v17.4S, v23.4S // ....*............................................................................... - // add v23.4S, v17.4S, v23.4S // .*.................................................................................. - // sub v21.4S, v21.4S, v20.4S // ..*................................................................................. - // mul v20.4S, v16.4S, v1.S[2] // ............*....................................................................... - // sqrdmulh v16.4S, v16.4S, v1.S[3] // ........*........................................................................... - // sub v17.4S, v23.4S, v30.4S // .................*.................................................................. - // add v23.4S, v23.4S, v30.4S // .................................................*.................................. - // mul v30.4S, v21.4S, v2.S[0] // .....*.............................................................................. - // sqrdmulh v21.4S, v21.4S, v2.S[1] // ...........*........................................................................ - // mls v20.4S, v16.4S, v8.S[0] // ................*................................................................... - // sub v16.4S, v4.4S, v11.4S // *................................................................................... - // mul v4.4S, v17.4S, v0.S[2] // .........................*.......................................................... - // sqrdmulh v17.4S, v17.4S, v0.S[3] // .............................*...................................................... - // sub v11.4S, v23.4S, v10.4S // ....................................................*............................... - // add v23.4S, v23.4S, v10.4S // ...............................................................*.................... - // mls v30.4S, v21.4S, v8.S[0] // ...............*.................................................................... 
- // mul v21.4S, v16.4S, v2.S[2] // ...*................................................................................ - // sqrdmulh v16.4S, v16.4S, v2.S[3] // .......*............................................................................ - // sub v13.4S, v13.4S, v7.4S // ......*............................................................................. - // sub v7.4S, v20.4S, v30.4S // ....................*............................................................... - // add v20.4S, v20.4S, v30.4S // ...........................*........................................................ - // mls v21.4S, v16.4S, v8.S[0] // .............*...................................................................... - // mul v16.4S, v13.4S, v3.S[0] // .........*.......................................................................... - // mls v4.4S, v17.4S, v8.S[0] // ..............................................*..................................... - // sqrdmulh v17.4S, v13.4S, v3.S[1] // ..........*......................................................................... - // mul v30.4S, v7.4S, v0.S[2] // ........................*........................................................... - // sqrdmulh v13.4S, v7.4S, v0.S[3] // .......................*............................................................ - // mul v7.4S, v11.4S, v0.S[0] // ........................................................*........................... - // sqrdmulh v11.4S, v11.4S, v0.S[1] // .......................................................*............................ - // mul v10.4S, v23.4S, v25.4S // ......................................................................*............. - // sqrdmulh v23.4S, v23.4S, v26.4S // ...........................................................................*........ - // mls v16.4S, v17.4S, v8.S[0] // ..............*..................................................................... 
- // mls v30.4S, v13.4S, v8.S[0] // ............................*....................................................... - // sub v17.4S, v19.4S, v6.4S // ......................................*............................................. - // mls v7.4S, v11.4S, v8.S[0] // ............................................................*....................... - // sub v11.4S, v21.4S, v16.4S // ..................*................................................................. - // mul v13.4S, v17.4S, v1.S[0] // ...........................................*........................................ - // sqrdmulh v17.4S, v17.4S, v1.S[1] // ..........................................*......................................... - // add v16.4S, v21.4S, v16.4S // ...................*................................................................ - // mul v21.4S, v11.4S, v1.S[0] // ......................*............................................................. - // sqrdmulh v11.4S, v11.4S, v1.S[1] // .....................*.............................................................. - // sub v19.4S, v20.4S, v16.4S // ...............................*.................................................... - // add v16.4S, v20.4S, v16.4S // ..............................*..................................................... - // mls v13.4S, v17.4S, v8.S[0] // ...............................................*.................................... - // mls v21.4S, v11.4S, v8.S[0] // ..........................*......................................................... - // mul v20.4S, v19.4S, v0.S[0] // ..................................................*................................. - // sqrdmulh v17.4S, v19.4S, v0.S[1] // .............................................*...................................... - // sub v11.4S, v4.4S, v13.4S // ...................................................*................................ 
- // add v4.4S, v4.4S, v13.4S // .....................................................*.............................. - // sub v13.4S, v30.4S, v21.4S // .................................*.................................................. - // mls v20.4S, v17.4S, v8.S[0] // ........................................................................*........... - // mul v17.4S, v11.4S, v0.S[0] // .............................................................*...................... - // sqrdmulh v11.4S, v11.4S, v0.S[1] // ......................................................*............................. - // add v21.4S, v30.4S, v21.4S // ................................*................................................... - // mul v30.4S, v13.4S, v0.S[0] // ................................................*................................... - // sqrdmulh v13.4S, v13.4S, v0.S[1] // .......................................*............................................ - // mls v17.4S, v11.4S, v8.S[0] // .....................................................................*.............. - // srshr v11.4S, v7.4S, #23 // ................................................................*................... - // srshr v19.4S, v20.4S, #23 // ............................................................................*....... - // mls v10.4S, v23.4S, v8.S[0] // ...............................................................................*.... - // mls v30.4S, v13.4S, v8.S[0] // ...........................................................*........................ - // mls v7.4S, v11.4S, v8.4S // ...................................................................*................ - // mls v20.4S, v19.4S, v8.4S // ................................................................................*... - // srshr v23.4S, v17.4S, #23 // ..........................................................................*......... 
- // srshr v11.4S, v30.4S, #23 // .................................................................*.................. - // str q7, [x0, #512] // .......................................................................*............ - // mls v17.4S, v23.4S, v8.4S // .............................................................................*...... - // mls v30.4S, v11.4S, v8.4S // ....................................................................*............... - // str q20, [x0, #640] // ...................................................................................* - // mul v23.4S, v16.4S, v25.4S // ....................................*............................................... - // str q17, [x0, #768] // .................................................................................*.. - // sqrdmulh v16.4S, v16.4S, v26.4S // ..................................*................................................. - // str q30, [x0, #896] // .........................................................................*.......... - // mul v20.4S, v4.4S, v25.4S // ..........................................................*......................... - // sqrdmulh v17.4S, v4.4S, v26.4S // .........................................................*.......................... - // mls v23.4S, v16.4S, v8.S[0] // ........................................*........................................... - // mul v16.4S, v21.4S, v25.4S // ...................................*................................................ - // sqrdmulh v21.4S, v21.4S, v26.4S // .....................................*.............................................. - // mls v20.4S, v17.4S, v8.S[0] // ..............................................................*..................... - // str q10, [x0], #(16) // ..................................................................................*. 
- // mls v16.4S, v21.4S, v8.S[0] // .........................................*.......................................... - // str q23, [x0, #112] // ............................................*....................................... - // str q20, [x0, #240] // ..................................................................*................. - // str q16, [x0, #368] // ..............................................................................*..... + // sub v10.4S, v7.4S, v6.4S // *........................................................................................................... + // add v24.4S, v7.4S, v6.4S // ................*........................................................................................... + // sub v7.4S, v12.4S, v21.4S // .....*...................................................................................................... + // mul v13.4S, v10.4S, v1.S[2] // ...........*................................................................................................ + // sqrdmulh v10.4S, v10.4S, v1.S[3] // ...*........................................................................................................ + // sub v6.4S, v24.4S, v19.4S // ......................................*..................................................................... + // add v24.4S, v24.4S, v19.4S // ........................*................................................................................... + // mul v12.4S, v7.4S, v2.S[0] // ..........*................................................................................................. + // sqrdmulh v7.4S, v7.4S, v2.S[1] // .........*.................................................................................................. + // mls v13.4S, v10.4S, v8.S[0] // ...............*............................................................................................ 
+ // sub v10.4S, v17.4S, v28.4S // ..*......................................................................................................... + // mul v17.4S, v6.4S, v0.S[2] // ....................................................*....................................................... + // sqrdmulh v6.4S, v6.4S, v0.S[3] // ..........................................*................................................................. + // sub v28.4S, v24.4S, v4.4S // .....................................................................................*...................... + // add v24.4S, v24.4S, v4.4S // ............................*............................................................................... + // mls v12.4S, v7.4S, v8.S[0] // ..............*............................................................................................. + // mul v7.4S, v10.4S, v2.S[2] // ......*..................................................................................................... + // sqrdmulh v10.4S, v10.4S, v2.S[3] // .......*.................................................................................................... + // sub v27.4S, v27.4S, v15.4S // .*.......................................................................................................... + // sub v15.4S, v13.4S, v12.4S // ...................*........................................................................................ + // add v13.4S, v13.4S, v12.4S // ..........................*................................................................................. + // mls v7.4S, v10.4S, v8.S[0] // .............*.............................................................................................. + // mul v10.4S, v27.4S, v3.S[0] // ....*....................................................................................................... 
+ // mls v17.4S, v6.4S, v8.S[0] // ........................................................*................................................... + // sqrdmulh v6.4S, v27.4S, v3.S[1] // ........*................................................................................................... + // mul v12.4S, v15.4S, v0.S[2] // .......................*.................................................................................... + // sqrdmulh v27.4S, v15.4S, v0.S[3] // ......................*..................................................................................... + // mul v15.4S, v28.4S, v0.S[0] // ..........................................................................................*................. + // sqrdmulh v28.4S, v28.4S, v0.S[1] // ........................................................................................*................... + // mul v21.4S, v24.4S, v25.4S // ....................................*....................................................................... + // sqrdmulh v24.4S, v24.4S, v26.4S // .....................................*...................................................................... + // mls v10.4S, v6.4S, v8.S[0] // ............*............................................................................................... + // mls v12.4S, v27.4S, v8.S[0] // ...........................*................................................................................ + // sub v6.4S, v18.4S, v11.4S // ..............................*............................................................................. + // mls v15.4S, v28.4S, v8.S[0] // ...............................................................................................*............ + // sub v28.4S, v7.4S, v10.4S // .................*.......................................................................................... 
+ // mul v27.4S, v6.4S, v1.S[0] // ...............................................*............................................................ + // sqrdmulh v6.4S, v6.4S, v1.S[1] // ........................................*................................................................... + // add v10.4S, v7.4S, v10.4S // ..................*......................................................................................... + // mul v7.4S, v28.4S, v1.S[0] // .....................*...................................................................................... + // sqrdmulh v28.4S, v28.4S, v1.S[1] // ....................*....................................................................................... + // sub v18.4S, v13.4S, v10.4S // ............................................................................*............................... + // add v10.4S, v13.4S, v10.4S // .............................*.............................................................................. + // mls v27.4S, v6.4S, v8.S[0] // ...................................................*........................................................ + // mls v7.4S, v28.4S, v8.S[0] // .........................*.................................................................................. + // mul v13.4S, v18.4S, v0.S[0] // ..................................................................................*......................... + // sqrdmulh v6.4S, v18.4S, v0.S[1] // .................................................................................*.......................... + // sub v28.4S, v17.4S, v27.4S // .............................................................*.............................................. + // add v17.4S, v17.4S, v27.4S // ............................................................*............................................... 
+ // sub v27.4S, v12.4S, v7.4S // .........................................................*.................................................. + // mls v13.4S, v6.4S, v8.S[0] // .......................................................................................*.................... + // mul v6.4S, v28.4S, v0.S[0] // ..........................................................................*................................. + // sqrdmulh v28.4S, v28.4S, v0.S[1] // ......................................................................*..................................... + // add v7.4S, v12.4S, v7.4S // ...............................*............................................................................ + // mul v12.4S, v27.4S, v0.S[0] // ...................................................................*........................................ + // sqrdmulh v27.4S, v27.4S, v0.S[1] // ..................................................................*......................................... + // mls v6.4S, v28.4S, v8.S[0] // ...............................................................................*............................ + // cmge v28.4S, v31.4S, v15.4S // ...................................................................................................*........ + // cmge v18.4S, v15.4S, v30.4S // ....................................................................................................*....... + // mls v21.4S, v24.4S, v8.S[0] // .........................................*.................................................................. + // mls v12.4S, v27.4S, v8.S[0] // .......................................................................*.................................... + // sub v24.4S, v28.4S, v18.4S // ......................................................................................................*..... 
+ // cmge v28.4S, v31.4S, v13.4S // ............................................................................................*............... + // cmge v27.4S, v13.4S, v30.4S // ...........................................................................................*................ + // mls v15.4S, v24.4S, v8.4S // .........................................................................................................*.. + // sub v24.4S, v28.4S, v27.4S // ..............................................................................................*............. + // cmge v28.4S, v31.4S, v6.4S // ...................................................................................*........................ + // cmge v27.4S, v6.4S, v30.4S // ....................................................................................*....................... + // mls v13.4S, v24.4S, v8.4S // .................................................................................................*.......... + // sub v24.4S, v28.4S, v27.4S // ......................................................................................*..................... + // cmge v28.4S, v31.4S, v12.4S // .............................................................................*.............................. + // cmge v27.4S, v12.4S, v30.4S // ..............................................................................*............................. + // mls v6.4S, v24.4S, v8.4S // .........................................................................................*.................. + // sub v24.4S, v28.4S, v27.4S // ................................................................................*........................... 
+ // str q15, [x0, #512] // ...........................................................................................................* + // mul v28.4S, v10.4S, v25.4S // .................................*.......................................................................... + // mls v12.4S, v24.4S, v8.4S // ..................................................................................................*......... + // str q13, [x0, #640] // .....................................................................................................*...... + // sqrdmulh v10.4S, v10.4S, v26.4S // ................................*........................................................................... + // str q6, [x0, #768] // .............................................................................................*.............. + // mul v24.4S, v17.4S, v25.4S // ...............................................................*............................................ + // str q12, [x0, #896] // ........................................................................................................*... + // mls v28.4S, v10.4S, v8.S[0] // .......................................................*.................................................... + // sqrdmulh v10.4S, v17.4S, v26.4S // ................................................................*........................................... + // mul v13.4S, v7.4S, v25.4S // ...................................*........................................................................ + // sqrdmulh v7.4S, v7.4S, v26.4S // ..................................*......................................................................... + // cmge v6.4S, v31.4S, v21.4S // .............................................*.............................................................. + // mls v24.4S, v10.4S, v8.S[0] // ....................................................................*....................................... 
+ // cmge v10.4S, v21.4S, v30.4S // ................................................*........................................................... + // mls v13.4S, v7.4S, v8.S[0] // .......................................*.................................................................... + // sub v10.4S, v6.4S, v10.4S // ..................................................*......................................................... + // cmge v7.4S, v31.4S, v28.4S // ...........................................................*................................................ + // cmge v6.4S, v28.4S, v30.4S // ..............................................................*............................................. + // mls v21.4S, v10.4S, v8.4S // ......................................................*..................................................... + // sub v10.4S, v7.4S, v6.4S // .................................................................*.......................................... + // cmge v7.4S, v31.4S, v24.4S // ........................................................................*................................... + // cmge v6.4S, v24.4S, v30.4S // .........................................................................*.................................. + // mls v28.4S, v10.4S, v8.4S // .....................................................................*...................................... + // sub v10.4S, v7.4S, v6.4S // ...........................................................................*................................ + // cmge v7.4S, v31.4S, v13.4S // ............................................*............................................................... + // cmge v6.4S, v13.4S, v30.4S // ...........................................*................................................................ 
+ // mls v24.4S, v10.4S, v8.4S // .......................................................................................................*.... + // sub v10.4S, v7.4S, v6.4S // ..............................................*............................................................. + // str q21, [x0], #(16) // ..........................................................*................................................. + // mls v13.4S, v10.4S, v8.4S // .................................................*.......................................................... + // str q28, [x0, #112] // ................................................................................................*........... + // str q24, [x0, #240] // ..........................................................................................................*. + // str q13, [x0, #368] // .....................................................*...................................................... pop_stack diff --git a/tests/ntt_dilithium/manual/intt_dilithium_123_45678_manual_ld4_opt_a72.s b/tests/ntt_dilithium/manual/intt_dilithium_123_45678_manual_ld4_opt_a72.s index 87a99b7..917ba79 100644 --- a/tests/ntt_dilithium/manual/intt_dilithium_123_45678_manual_ld4_opt_a72.s +++ b/tests/ntt_dilithium/manual/intt_dilithium_123_45678_manual_ld4_opt_a72.s @@ -371,6 +371,8 @@ _intt_dilithium_123_45678_manual_ld4_opt_a72: consts .req v8 qform_consts .req q8 + modulus .req v29 + ASM_LOAD(r_ptr0, roots_l345) ASM_LOAD(r_ptr1, roots_l67) @@ -393,2002 +395,2167 @@ _intt_dilithium_123_45678_manual_ld4_opt_a72: qform_root3_tw .req q7 .p2align 2 - ldr q19, [x1, #48] // .......*.............. - ldr q27, [x1, #32] // ......*............... - // gap // ...................... - ldr q5, [x1, #16] // .....*................ - ldr q20, [x1, #0] // ....*................. - // gap // ...................... - ldr q30, [x5, #64] // ..*................... - // gap // ...................... - // gap // ...................... 
- // gap // ...................... - // gap // ...................... - // gap // ...................... - trn2 v17.4S, v27.4S, v19.4S // ...........*.......... - trn1 v21.4S, v27.4S, v19.4S // ..........*........... - // gap // ...................... - trn2 v14.4S, v20.4S, v5.4S // .........*............ - trn1 v5.4S, v20.4S, v5.4S // ........*............. - // gap // ...................... - // gap // ...................... - // gap // ...................... - // gap // ...................... - ldr q20, [x5, #32] // *..................... - // gap // ...................... - // gap // ...................... - trn1 v24.2D, v5.2D, v21.2D // ............*......... - trn1 v6.2D, v14.2D, v17.2D // .............*........ - // gap // ...................... - trn2 v9.2D, v14.2D, v17.2D // ...............*...... - ldr q17, [x5, #80] // ...*.................. - // gap // ...................... - trn2 v5.2D, v5.2D, v21.2D // ..............*....... - // gap // ...................... - // gap // ...................... - sub v22.4S, v24.4S, v6.4S // ................*..... - // gap // ...................... - // gap // ...................... - // gap // ...................... - // gap // ...................... - // gap // ...................... - sub v26.4S, v5.4S, v9.4S // .................*.... - // gap // ...................... - // gap // ...................... - mul v2.4S, v22.4S, v20.4S // ....................*. - add v15.4S, v5.4S, v9.4S // ...................*.. - ldr q20, [x5, #48] // .*.................... - // gap // ...................... - // gap // ...................... - // gap // ...................... - sqrdmulh v13.4S, v26.4S, v17.4S // .....................* - // gap // ...................... - // gap // ...................... - // gap // ...................... - // gap // ...................... - // gap // ...................... - sqrdmulh v28.4S, v22.4S, v20.4S // ..................*... - // gap // ...................... - // gap // ...................... 
+ // gap // ............................................................................................................................................................ + ldr q26, [x1, #16] // ........*................................................................................................................................................... + ldr q10, [x1, #0] // .........*.................................................................................................................................................. + ldr q17, [x1, #32] // .......*.................................................................................................................................................... + // gap // ............................................................................................................................................................ + ldr q30, [x1, #48] // ......*..................................................................................................................................................... + ldr q22, [x2, #48] // ....................................*....................................................................................................................... + ldr q21, [x5, #96] // .............*.............................................................................................................................................. + // gap // ............................................................................................................................................................ + ldr q12, [x2, #16] // .......................................*.................................................................................................................... + ldr q25, [x2, #0] // ........................................*................................................................................................................... 
+ // gap // ............................................................................................................................................................ + ldr q11, [x2, #32] // .....................................*...................................................................................................................... + trn1 v18.4S, v10.4S, v26.4S // ................*........................................................................................................................................... + trn2 v1.4S, v10.4S, v26.4S // .................*.......................................................................................................................................... + trn2 v5.4S, v17.4S, v30.4S // ...................*........................................................................................................................................ + trn1 v7.4S, v17.4S, v30.4S // ..............*............................................................................................................................................. + ldr q24, [x5, #80] // ............*............................................................................................................................................... + ldr q20, [x5, #32] // ..*......................................................................................................................................................... + // gap // ............................................................................................................................................................ + // gap // ............................................................................................................................................................ + trn2 v16.4S, v25.4S, v12.4S // ............................................*............................................................................................................... 
+ trn1 v9.4S, v25.4S, v12.4S // ..............................................*............................................................................................................. + // gap // ............................................................................................................................................................ + trn2 v0.2D, v18.2D, v7.2D // ......................*..................................................................................................................................... + trn2 v23.2D, v1.2D, v5.2D // ........................*................................................................................................................................... + ldr q29, [x5, #160] // *........................................................................................................................................................... + trn2 v19.4S, v11.4S, v22.4S // .............................................*.............................................................................................................. + trn1 v31.4S, v11.4S, v22.4S // ..........................................*................................................................................................................. + ldr q2, [x5, #48] // ...............*............................................................................................................................................ + trn1 v4.2D, v1.2D, v5.2D // .........................*.................................................................................................................................. + trn1 v3.2D, v18.2D, v7.2D // ..........................*................................................................................................................................. 
+ // gap // ............................................................................................................................................................ + ldr q30, [x5, #64] // .......................*.................................................................................................................................... + sub v5.4S, v0.4S, v23.4S // ...........................*................................................................................................................................ + // gap // ............................................................................................................................................................ + trn1 v15.2D, v16.2D, v19.2D // ..................................................*......................................................................................................... + trn1 v12.2D, v9.2D, v31.2D // ...................................................*........................................................................................................ + // gap // ............................................................................................................................................................ + sub v25.4S, v3.4S, v4.4S // ............................*............................................................................................................................... + trn2 v27.2D, v16.2D, v19.2D // ....................................................*....................................................................................................... + // gap // ............................................................................................................................................................ + trn2 v22.2D, v9.2D, v31.2D // ......................................................*..................................................................................................... 
+ sqrdmulh v18.4S, v5.4S, v24.4S // ..............................*............................................................................................................................. + // gap // ............................................................................................................................................................ + sub v13.4S, v12.4S, v15.4S // ........................................................*................................................................................................... + // gap // ............................................................................................................................................................ + // gap // ............................................................................................................................................................ + add v9.4S, v12.4S, v15.4S // .........................................................*.................................................................................................. + sqrdmulh v7.4S, v25.4S, v2.4S // ................................*........................................................................................................................... + // gap // ............................................................................................................................................................ + add v14.4S, v22.4S, v27.4S // ............................................................*............................................................................................... + // gap // ............................................................................................................................................................ + // gap // ............................................................................................................................................................ 
+ mul v30.4S, v5.4S, v30.4S // ..................................*......................................................................................................................... + add v4.4S, v3.4S, v4.4S // ...............................*............................................................................................................................ + // gap // ............................................................................................................................................................ + sub v11.4S, v22.4S, v27.4S // ...........................................................*................................................................................................ + ldr q15, [x5, #112] // .*.......................................................................................................................................................... + // gap // ............................................................................................................................................................ + mul v26.4S, v25.4S, v20.4S // ......................................*..................................................................................................................... + add v31.4S, v0.4S, v23.4S // .............................*.............................................................................................................................. + // gap // ............................................................................................................................................................ + // gap // ............................................................................................................................................................ + // gap // ............................................................................................................................................................ 
+ // gap // ............................................................................................................................................................ + mls v26.4S, v7.4S, v8.S[0] // .........................................*.................................................................................................................. + ldr q25, [x5, #16] // .................................*.......................................................................................................................... + ldr q12, [x5], #(12*16) // ..................*......................................................................................................................................... + sub v20.4S, v4.4S, v31.4S // ...................................*........................................................................................................................ + // gap // ............................................................................................................................................................ + // gap // ............................................................................................................................................................ + mls v30.4S, v18.4S, v8.S[0] // ...........................................*................................................................................................................ + sub v23.4S, v9.4S, v14.4S // ................................................................*........................................................................................... + // gap // ............................................................................................................................................................ + // gap // ............................................................................................................................................................ 
+ // gap // ............................................................................................................................................................ + // gap // ............................................................................................................................................................ + add v7.4S, v4.4S, v31.4S // ................................................*........................................................................................................... + ldr q3, [x5, #-48] // ....................*....................................................................................................................................... + mul v2.4S, v20.4S, v12.4S // .................................................*.......................................................................................................... + // gap // ............................................................................................................................................................ + // gap // ............................................................................................................................................................ + // gap // ............................................................................................................................................................ + add v5.4S, v9.4S, v14.4S // .....................................................................................*...................................................................... + sqrdmulh v18.4S, v20.4S, v25.4S // ...............................................*............................................................................................................ + // gap // ............................................................................................................................................................ 
+ sub v10.4S, v26.4S, v30.4S // .....................................................*...................................................................................................... + // gap // ............................................................................................................................................................ + // gap // ............................................................................................................................................................ + sqrdmulh v4.4S, v13.4S, v3.4S // .....................................................................*...................................................................................... + // gap // ............................................................................................................................................................ + // gap // ............................................................................................................................................................ + ldr q14, [x5, #-64] // ..........*................................................................................................................................................. + // gap // ............................................................................................................................................................ + // gap // ............................................................................................................................................................ + sqrdmulh v19.4S, v10.4S, v25.4S // ..........................................................*................................................................................................. + // gap // ............................................................................................................................................................ 
+ // gap // ............................................................................................................................................................ + // gap // ............................................................................................................................................................ + // gap // ............................................................................................................................................................ + // gap // ............................................................................................................................................................ + mls v2.4S, v18.4S, v8.S[0] // .......................................................*.................................................................................................... + // gap // ............................................................................................................................................................ + // gap // ............................................................................................................................................................ + // gap // ............................................................................................................................................................ + // gap // ............................................................................................................................................................ + // gap // ............................................................................................................................................................ + mul v16.4S, v10.4S, v12.4S // ...............................................................*............................................................................................ 
+ // gap // ............................................................................................................................................................ + // gap // ............................................................................................................................................................ + // gap // ............................................................................................................................................................ + // gap // ............................................................................................................................................................ + // gap // ............................................................................................................................................................ + mls v16.4S, v19.4S, v8.S[0] // .................................................................*.......................................................................................... + // gap // ............................................................................................................................................................ + // gap // ............................................................................................................................................................ + add v0.4S, v26.4S, v30.4S // ..............................................................*............................................................................................. + ldr q30, [x4, #48] // ...*........................................................................................................................................................ + // gap // ............................................................................................................................................................ 
+ mul v1.4S, v11.4S, v29.4S // ...................................................................*........................................................................................ + ldr q29, [x5, #-16] // ...........*................................................................................................................................................ + // gap // ............................................................................................................................................................ + // gap // ............................................................................................................................................................ + // gap // ............................................................................................................................................................ + // gap // ............................................................................................................................................................ + trn2 v28.4S, v7.4S, v0.4S // ....................................................................*....................................................................................... + mul v31.4S, v23.4S, v21.4S // ............................................................................*............................................................................... + // gap // ............................................................................................................................................................ + trn2 v9.4S, v2.4S, v16.4S // ......................................................................*..................................................................................... + // gap // ............................................................................................................................................................ 
+ // gap // ............................................................................................................................................................ + mul v18.4S, v13.4S, v14.4S // .............................................................*.............................................................................................. + trn1 v16.4S, v2.4S, v16.4S // ........................................................................*................................................................................... + // gap // ............................................................................................................................................................ + trn1 v24.4S, v7.4S, v0.4S // ..................................................................*......................................................................................... + ldr q0, [x4, #16] // .....*...................................................................................................................................................... + // gap // ............................................................................................................................................................ + sqrdmulh v17.4S, v11.4S, v29.4S // .......................................................................*.................................................................................... + trn1 v12.2D, v28.2D, v9.2D // .........................................................................*.................................................................................. + // gap // ............................................................................................................................................................ + trn2 v20.2D, v28.2D, v9.2D // ...........................................................................*................................................................................ 
+ // gap // ............................................................................................................................................................ + // gap // ............................................................................................................................................................ + sqrdmulh v6.4S, v23.4S, v15.4S // ..........................................................................*................................................................................. + trn1 v14.2D, v24.2D, v16.2D // ..............................................................................*............................................................................. + // gap // ............................................................................................................................................................ + trn2 v26.2D, v24.2D, v16.2D // .............................................................................*.............................................................................. + // gap // ............................................................................................................................................................ + // gap // ............................................................................................................................................................ + mls v18.4S, v4.4S, v8.S[0] // .................................................................................*.......................................................................... + // gap // ............................................................................................................................................................ + // gap // ............................................................................................................................................................ 
+ add v7.4S, v14.4S, v12.4S // ..................................................................................*......................................................................... + // gap // ............................................................................................................................................................ + // gap // ............................................................................................................................................................ + mls v1.4S, v17.4S, v8.S[0] // ...............................................................................*............................................................................ + add v9.4S, v26.4S, v20.4S // ................................................................................*........................................................................... + // gap // ............................................................................................................................................................ + sub v23.4S, v14.4S, v12.4S // ...........................................................................................*................................................................ + // gap // ............................................................................................................................................................ + // gap // ............................................................................................................................................................ + mls v31.4S, v6.4S, v8.S[0] // ....................................................................................*....................................................................... + ldr q6, [x4], #64 // .....................*...................................................................................................................................... 
+ // gap // ............................................................................................................................................................ + sub v28.4S, v7.4S, v9.4S // ......................................................................................*..................................................................... + // gap // ............................................................................................................................................................ + // gap // ............................................................................................................................................................ + add v29.4S, v7.4S, v9.4S // ............................................................................................*............................................................... + sqrdmulh v16.4S, v23.4S, v0.S[3] // ...................................................................................................*........................................................ + // gap // ............................................................................................................................................................ + sub v3.4S, v18.4S, v1.4S // ........................................................................................*................................................................... + // gap // ............................................................................................................................................................ + // gap // ............................................................................................................................................................ + add v17.4S, v18.4S, v1.4S // .........................................................................................*.................................................................. 
+ mul v2.4S, v23.4S, v0.S[2] // ............................................................................................................*............................................... + // gap // ............................................................................................................................................................ + srshr v27.4S, v29.4S, #23 // ..................................................................................................*......................................................... + // gap // ............................................................................................................................................................ + // gap // ............................................................................................................................................................ + sqrdmulh v23.4S, v3.4S, v15.4S // .............................................................................................*.............................................................. + // gap // ............................................................................................................................................................ + // gap // ............................................................................................................................................................ + trn2 v1.4S, v5.4S, v17.4S // ................................................................................................*........................................................... + // gap // ............................................................................................................................................................ + // gap // ............................................................................................................................................................ 
+ trn1 v19.4S, v5.4S, v17.4S // ....................................................................................................*....................................................... + sqrdmulh v4.4S, v28.4S, v6.S[3] // ..........................................................................................*................................................................. + // gap // ............................................................................................................................................................ + // gap // ............................................................................................................................................................ + // gap // ............................................................................................................................................................ + // gap // ............................................................................................................................................................ + mul v7.4S, v3.4S, v21.4S // ...............................................................................................*............................................................ + // gap // ............................................................................................................................................................ + // gap // ............................................................................................................................................................ + // gap // ............................................................................................................................................................ + // gap // ............................................................................................................................................................ 
+ // gap // ............................................................................................................................................................ + mls v7.4S, v23.4S, v8.S[0] // .................................................................................................*.......................................................... + // gap // ............................................................................................................................................................ + // gap // ............................................................................................................................................................ + // gap // ............................................................................................................................................................ + // gap // ............................................................................................................................................................ + // gap // ............................................................................................................................................................ + sub v10.4S, v26.4S, v20.4S // ...................................................................................*........................................................................ + mul v15.4S, v28.4S, v6.S[2] // .........................................................................................................*.................................................. + // gap // ............................................................................................................................................................ + // gap // ............................................................................................................................................................ 
+ // gap // ............................................................................................................................................................ + // gap // ............................................................................................................................................................ + mls v2.4S, v16.4S, v8.S[0] // ...............................................................................................................*............................................ + ldr q9, [x4, #-32] // ....*....................................................................................................................................................... + // gap // ............................................................................................................................................................ + trn1 v14.4S, v31.4S, v7.4S // ......................................................................................................*..................................................... + // gap // ............................................................................................................................................................ + // gap // ............................................................................................................................................................ + trn2 v16.4S, v31.4S, v7.4S // ........................................................................................................*................................................... + mls v15.4S, v4.4S, v8.S[0] // ..................................................................................................................*......................................... + // gap // ............................................................................................................................................................ 
+ // gap // ............................................................................................................................................................ + // gap // ............................................................................................................................................................ + // gap // ............................................................................................................................................................ + sqrdmulh v4.4S, v10.4S, v9.S[1] // ..............................................................................................*............................................................. + trn1 v23.2D, v19.2D, v14.2D // .............................................................................................................*.............................................. + // gap // ............................................................................................................................................................ + trn2 v3.2D, v19.2D, v14.2D // ..........................................................................................................*................................................. + // gap // ............................................................................................................................................................ + // gap // ............................................................................................................................................................ + mls v29.4S, v27.4S, v8.4S // .....................................................................................................*...................................................... + trn2 v14.2D, v1.2D, v16.2D // ...........................................................................................................*................................................ 
+ // gap // ............................................................................................................................................................ + trn1 v25.2D, v1.2D, v16.2D // ..............................................................................................................*............................................. + // gap // ............................................................................................................................................................ + // gap // ............................................................................................................................................................ + mul v28.4S, v10.4S, v9.S[0] // .......................................................................................*.................................................................... + // gap // ............................................................................................................................................................ + // gap // ............................................................................................................................................................ + sub v13.4S, v3.4S, v14.4S // ................................................................................................................*........................................... + // gap // ............................................................................................................................................................ + // gap // ............................................................................................................................................................ + mls v28.4S, v4.4S, v8.S[0] // .......................................................................................................*.................................................... 
+ add v26.4S, v3.4S, v14.4S // .................................................................................................................*.......................................... + // gap // ............................................................................................................................................................ + add v12.4S, v23.4S, v25.4S // ...................................................................................................................*........................................ + // gap // ............................................................................................................................................................ + // gap // ............................................................................................................................................................ + sub v18.4S, v23.4S, v25.4S // ....................................................................................................................*....................................... + sqrdmulh v19.4S, v13.4S, v30.S[1] // .....................................................................................................................*...................................... + // gap // ............................................................................................................................................................ + // gap // ............................................................................................................................................................ + // gap // ............................................................................................................................................................ + // gap // ............................................................................................................................................................ 
+ mul v22.4S, v13.4S, v30.S[0] // .......................................................................................................................*.................................... + sub v17.4S, v12.4S, v26.4S // ...............................................................................................................................*............................ + // gap // ............................................................................................................................................................ + add v23.4S, v2.4S, v28.4S // ..................................................................................................................................*......................... + // gap // ............................................................................................................................................................ + // gap // ............................................................................................................................................................ + add v10.4S, v12.4S, v26.4S // ........................................................................................................................*................................... + mul v14.4S, v18.4S, v9.S[2] // ..............................................................................................................................*............................. + // gap // ............................................................................................................................................................ + // gap // ............................................................................................................................................................ + // gap // ............................................................................................................................................................ 
+ // gap // ............................................................................................................................................................ + sqrdmulh v4.4S, v18.4S, v9.S[3] // ............................................................................................................................*............................... + srshr v13.4S, v23.4S, #23 // .......................................................................................................................................*.................... + // gap // ............................................................................................................................................................ + srshr v27.4S, v10.4S, #23 // ...........................................................................................................................*................................ + // gap // ............................................................................................................................................................ + // gap // ............................................................................................................................................................ + mls v22.4S, v19.4S, v8.S[0] // ...................................................................................................................................*........................ + // gap // ............................................................................................................................................................ + // gap // ............................................................................................................................................................ + // gap // ............................................................................................................................................................ 
+ // gap // ............................................................................................................................................................ + // gap // ............................................................................................................................................................ + sqrdmulh v25.4S, v17.4S, v0.S[1] // ......................................................................................................................................*..................... + // gap // ............................................................................................................................................................ + // gap // ............................................................................................................................................................ + // gap // ............................................................................................................................................................ + // gap // ............................................................................................................................................................ + // gap // ............................................................................................................................................................ + mls v14.4S, v4.4S, v8.S[0] // .................................................................................................................................*.......................... + // gap // ............................................................................................................................................................ + // gap // ............................................................................................................................................................ 
+ // gap // ............................................................................................................................................................ + // gap // ............................................................................................................................................................ + // gap // ............................................................................................................................................................ + mls v23.4S, v13.4S, v8.4S // ............................................................................................................................................*............... + // gap // ............................................................................................................................................................ + // gap // ............................................................................................................................................................ + // gap // ............................................................................................................................................................ + // gap // ............................................................................................................................................................ + // gap // ............................................................................................................................................................ + mls v10.4S, v27.4S, v8.4S // ................................................................................................................................*........................... + // gap // ............................................................................................................................................................ 
+ // gap // ............................................................................................................................................................ + sub v7.4S, v14.4S, v22.4S // ..........................................................................................................................................*................. + // gap // ............................................................................................................................................................ + // gap // ............................................................................................................................................................ + mul v11.4S, v17.4S, v0.S[0] // ........................................................................................................................................*................... + add v21.4S, v14.4S, v22.4S // ...........................................................................................................................................*................ + // gap // ............................................................................................................................................................ + // gap // ............................................................................................................................................................ + // gap // ............................................................................................................................................................ + // gap // ............................................................................................................................................................ + sqrdmulh v5.4S, v7.4S, v0.S[1] // .............................................................................................................................................*.............. 
+ // gap // ............................................................................................................................................................ + // gap // ............................................................................................................................................................ + srshr v18.4S, v21.4S, #23 // ..............................................................................................................................................*............. + // gap // ............................................................................................................................................................ + // gap // ............................................................................................................................................................ + mls v11.4S, v25.4S, v8.S[0] // ...............................................................................................................................................*............ + // gap // ............................................................................................................................................................ + // gap // ............................................................................................................................................................ + sub v14.4S, v2.4S, v28.4S // ......................................................................................................................*..................................... + // gap // ............................................................................................................................................................ + // gap // ............................................................................................................................................................ 
+ mls v21.4S, v18.4S, v8.4S // ................................................................................................................................................*........... + // gap // ............................................................................................................................................................ + // gap // ............................................................................................................................................................ + // gap // ............................................................................................................................................................ + // gap // ............................................................................................................................................................ + // gap // ............................................................................................................................................................ + sqrdmulh v12.4S, v14.4S, v6.S[3] // .........................................................................................................................*.................................. + // gap // ............................................................................................................................................................ + // gap // ............................................................................................................................................................ + add v26.4S, v15.4S, v11.4S // ...................................................................................................................................................*........ + // gap // ............................................................................................................................................................ 
+ // gap // ............................................................................................................................................................ + mul v18.4S, v7.4S, v0.S[0] // .................................................................................................................................................*.......... + sub v0.4S, v15.4S, v11.4S // ..................................................................................................................................................*......... + // gap // ............................................................................................................................................................ + add v11.4S, v29.4S, v10.4S // .....................................................................................................................................*...................... + // gap // ............................................................................................................................................................ + // gap // ............................................................................................................................................................ + mls v18.4S, v5.4S, v8.S[0] // ....................................................................................................................................................*....... + sub v5.4S, v29.4S, v10.4S // ....................................................................................................................................*....................... + str q26, [x1, #32] // ........................................................................................................................................................*... + add v10.4S, v23.4S, v21.4S // ......................................................................................................................................................*..... 
+ // gap // ............................................................................................................................................................ + // gap // ............................................................................................................................................................ + mul v27.4S, v14.4S, v6.S[2] // ..........................................................................................................................*................................. + str q11, [x1], #(16*4) // .........................................................................................................................................*.................. + // gap // ............................................................................................................................................................ + // gap // ............................................................................................................................................................ + // gap // ............................................................................................................................................................ + // gap // ............................................................................................................................................................ + mls v27.4S, v12.4S, v8.S[0] // .............................................................................................................................*.............................. + str q10, [x1, #-48] // ..........................................................................................................................................................*. + // gap // ............................................................................................................................................................ 
+ // gap // ............................................................................................................................................................ + // gap // ............................................................................................................................................................ + // gap // ............................................................................................................................................................ + sub v2.4S, v23.4S, v21.4S // .....................................................................................................................................................*...... + // gap // ............................................................................................................................................................ + // gap // ............................................................................................................................................................ + sqrdmulh v19.4S, v5.4S, v6.S[1] // .......................................................................................................................................................*.... + // gap // ............................................................................................................................................................ + // gap // ............................................................................................................................................................ + // gap // ............................................................................................................................................................ + // gap // ............................................................................................................................................................ 
+ // gap // ............................................................................................................................................................ + mul v13.4S, v2.4S, v6.S[0] // .........................................................................................................................................................*.. + add v29.4S, v27.4S, v18.4S // ...........................................................................................................................................................* + // gap // ............................................................................................................................................................ // original source code - // ldr q3, [x5, #32] // .........*............ - // ldr q10, [x5, #48] // ...................*.. - // ldr q30, [x5, #64] // ....*................. - // ldr q22, [x5, #80] // .............*........ - // ldr q21, [x1, #0] // ...*.................. - // ldr q31, [x1, #16] // ..*................... - // ldr q15, [x1, #32] // .*.................... - // ldr q29, [x1, #48] // *..................... - // trn1 v13.4S, v21.4S, v31.4S // ........*............. - // trn2 v21.4S, v21.4S, v31.4S // .......*.............. - // trn1 v31.4S, v15.4S, v29.4S // ......*............... - // trn2 v26.4S, v15.4S, v29.4S // .....*................ - // trn1 v24.2D, v13.2D, v31.2D // ..........*........... - // trn1 v6.2D, v21.2D, v26.2D // ...........*.......... - // trn2 v31.2D, v13.2D, v31.2D // ..............*....... - // trn2 v21.2D, v21.2D, v26.2D // ............*......... - // sub v14.4S, v24.4S, v6.4S // ...............*...... - // sub v26.4S, v31.4S, v21.4S // ................*..... - // sqrdmulh v28.4S, v14.4S, v10.4S // .....................* - // add v15.4S, v31.4S, v21.4S // ..................*... - // mul v2.4S, v14.4S, v3.4S // .................*.... - // sqrdmulh v13.4S, v26.4S, v22.4S // ....................*. 
+ // ldr q16, [x5, #160] // ...................*........................................................................................................................................ + // ldr q7, [x5, #112] // ........................................*................................................................................................................... + // ldr q25, [x5, #32] // ..............*............................................................................................................................................. + // ldr q12, [x4, #48] // ..............................................................*............................................................................................. + // ldr q24, [x4, #32] // .....................................................................................................*...................................................... + // ldr q3, [x4, #16] // .......................................................................*.................................................................................... + // ldr q17, [x1, #48] // ...*........................................................................................................................................................ + // ldr q11, [x1, #32] // ..*......................................................................................................................................................... + // ldr q26, [x1, #16] // *........................................................................................................................................................... + // ldr q21, [x1, #0] // .*.......................................................................................................................................................... 
+ // ldr q31, [x5, #128] // ........................................................*................................................................................................... + // ldr q18, [x5, #176] // ................................................................*........................................................................................... + // ldr q20, [x5, #80] // .............*.............................................................................................................................................. + // ldr q22, [x5, #96] // .....*...................................................................................................................................................... + // trn1 v14.4S, v11.4S, v17.4S // ............*............................................................................................................................................... + // ldr q27, [x5, #48] // ......................*..................................................................................................................................... + // trn1 v30.4S, v21.4S, v26.4S // .........*.................................................................................................................................................. + // trn2 v23.4S, v21.4S, v26.4S // ..........*................................................................................................................................................. + // ldr q4, [x5], #(12*16) // .............................................*.............................................................................................................. + // trn2 v28.4S, v11.4S, v17.4S // ...........*................................................................................................................................................ 
+ // ldr q17, [x5, #-48] // ..................................................*......................................................................................................... + // ldr q6, [x4], #64 // ....................................................................................*....................................................................... + // trn2 v26.2D, v30.2D, v14.2D // .................*.......................................................................................................................................... + // ldr q10, [x5, #-128] // .........................*.................................................................................................................................. + // trn2 v1.2D, v23.2D, v28.2D // ..................*......................................................................................................................................... + // trn1 v0.2D, v23.2D, v28.2D // .......................*.................................................................................................................................... + // trn1 v23.2D, v30.2D, v14.2D // ........................*................................................................................................................................... + // sub v29.4S, v26.4S, v1.4S // ..........................*................................................................................................................................. + // sub v21.4S, v23.4S, v0.4S // .............................*.............................................................................................................................. + // add v14.4S, v26.4S, v1.4S // ..........................................*................................................................................................................. 
+ // sqrdmulh v9.4S, v29.4S, v20.4S // ................................*........................................................................................................................... + // add v30.4S, v23.4S, v0.4S // ......................................*..................................................................................................................... + // sqrdmulh v23.4S, v21.4S, v27.4S // ...................................*........................................................................................................................ + // ldr q13, [x5, #-176] // ............................................*............................................................................................................... + // mul v15.4S, v29.4S, v10.4S // .....................................*...................................................................................................................... + // sub v29.4S, v30.4S, v14.4S // ..............................................*............................................................................................................. + // ldr q19, [x2, #48] // ....*....................................................................................................................................................... + // ldr q5, [x2, #32] // ........*................................................................................................................................................... + // mul v28.4S, v21.4S, v25.4S // .........................................*.................................................................................................................. + // ldr q1, [x2, #16] // ......*..................................................................................................................................................... 
+ // ldr q0, [x2, #0] // .......*.................................................................................................................................................... + // mls v28.4S, v23.4S, v8.S[0] // ...........................................*................................................................................................................ + // trn1 v11.4S, v5.4S, v19.4S // .....................*...................................................................................................................................... + // mls v15.4S, v9.4S, v8.S[0] // ...............................................*............................................................................................................ + // trn2 v9.4S, v0.4S, v1.4S // ...............*............................................................................................................................................ + // trn2 v10.4S, v5.4S, v19.4S // ....................*....................................................................................................................................... + // trn1 v0.4S, v0.4S, v1.4S // ................*........................................................................................................................................... + // sqrdmulh v19.4S, v29.4S, v13.4S // .....................................................*...................................................................................................... + // add v27.4S, v30.4S, v14.4S // .................................................*.......................................................................................................... + // mul v25.4S, v29.4S, v4.4S // ...................................................*........................................................................................................ 
+ // trn1 v30.2D, v9.2D, v10.2D // ...........................*................................................................................................................................ + // trn1 v21.2D, v0.2D, v11.2D // ............................*............................................................................................................................... + // trn2 v26.2D, v9.2D, v10.2D // ..............................*............................................................................................................................. + // sub v5.4S, v28.4S, v15.4S // ......................................................*..................................................................................................... + // trn2 v2.2D, v0.2D, v11.2D // ...............................*............................................................................................................................ + // mls v25.4S, v19.4S, v8.S[0] // ..........................................................*................................................................................................. + // sub v0.4S, v21.4S, v30.4S // .................................*.......................................................................................................................... + // add v21.4S, v21.4S, v30.4S // ..................................*......................................................................................................................... + // sqrdmulh v29.4S, v5.4S, v13.4S // .........................................................*.................................................................................................. + // sub v23.4S, v2.4S, v26.4S // .......................................*.................................................................................................................... 
+ // add v2.4S, v2.4S, v26.4S // ....................................*....................................................................................................................... + // mul v14.4S, v0.4S, v31.4S // ....................................................................*....................................................................................... + // add v19.4S, v28.4S, v15.4S // .............................................................*.............................................................................................. + // mul v20.4S, v5.4S, v4.4S // ...........................................................*................................................................................................ + // sub v5.4S, v21.4S, v2.4S // ................................................*........................................................................................................... + // mls v20.4S, v29.4S, v8.S[0] // ............................................................*............................................................................................... + // trn1 v9.4S, v27.4S, v19.4S // ......................................................................*..................................................................................... + // mul v29.4S, v23.4S, v16.4S // ...............................................................*............................................................................................ + // trn2 v11.4S, v27.4S, v19.4S // .................................................................*.......................................................................................... + // sqrdmulh v1.4S, v0.4S, v17.4S // .......................................................*.................................................................................................... 
+ // trn2 v30.4S, v25.4S, v20.4S // ...................................................................*........................................................................................ + // sqrdmulh v26.4S, v23.4S, v18.4S // ........................................................................*................................................................................... + // trn1 v19.4S, v25.4S, v20.4S // .....................................................................*...................................................................................... + // trn1 v0.2D, v11.2D, v30.2D // .........................................................................*.................................................................................. + // sqrdmulh v28.4S, v5.4S, v7.4S // ...........................................................................*................................................................................ + // trn2 v30.2D, v11.2D, v30.2D // ..........................................................................*................................................................................. + // mul v5.4S, v5.4S, v22.4S // ..................................................................*......................................................................................... + // trn2 v18.2D, v9.2D, v19.2D // .............................................................................*.............................................................................. + // trn1 v4.2D, v9.2D, v19.2D // ............................................................................*............................................................................... + // mls v29.4S, v26.4S, v8.S[0] // ................................................................................*........................................................................... 
+ // add v11.4S, v18.4S, v30.4S // .................................................................................*.......................................................................... + // mls v14.4S, v1.4S, v8.S[0] // ..............................................................................*............................................................................. + // add v20.4S, v4.4S, v0.4S // ...............................................................................*............................................................................ + // sub v18.4S, v18.4S, v30.4S // ..................................................................................................*......................................................... + // mls v5.4S, v28.4S, v8.S[0] // ...................................................................................*........................................................................ + // add v26.4S, v21.4S, v2.4S // ....................................................*....................................................................................................... + // sub v2.4S, v20.4S, v11.4S // .....................................................................................*...................................................................... + // mul v25.4S, v18.4S, v24.S[0] // ...............................................................................................................*............................................ + // sub v1.4S, v14.4S, v29.4S // ........................................................................................*................................................................... + // add v9.4S, v14.4S, v29.4S // .........................................................................................*.................................................................. 
+ // sqrdmulh v21.4S, v2.4S, v6.S[3] // ...............................................................................................*............................................................ + // sub v0.4S, v4.4S, v0.4S // ..................................................................................*......................................................................... + // add v4.4S, v20.4S, v11.4S // ......................................................................................*..................................................................... + // sqrdmulh v10.4S, v1.4S, v7.4S // ............................................................................................*............................................................... + // sqrdmulh v29.4S, v18.4S, v24.S[1] // .........................................................................................................*.................................................. + // mul v23.4S, v1.4S, v22.4S // ................................................................................................*........................................................... + // trn2 v27.4S, v26.4S, v9.4S // .............................................................................................*.............................................................. + // mls v23.4S, v10.4S, v8.S[0] // .................................................................................................*.......................................................... + // srshr v19.4S, v4.4S, #23 // ...........................................................................................*................................................................ + // sqrdmulh v17.4S, v0.4S, v3.S[3] // .......................................................................................*.................................................................... 
+ // trn1 v10.4S, v26.4S, v9.4S // ..............................................................................................*............................................................. + // mls v4.4S, v19.4S, v8.4S // ............................................................................................................*............................................... + // trn1 v18.4S, v5.4S, v23.4S // ......................................................................................................*..................................................... + // mls v25.4S, v29.4S, v8.S[0] // .................................................................................................................*.......................................... + // trn2 v29.4S, v5.4S, v23.4S // .......................................................................................................*.................................................... + // mul v13.4S, v2.4S, v6.S[2] // ...................................................................................................*........................................................ + // trn2 v30.2D, v10.2D, v18.2D // ...........................................................................................................*................................................ + // trn2 v16.2D, v27.2D, v29.2D // .............................................................................................................*.............................................. + // mul v2.4S, v0.4S, v3.S[2] // ..........................................................................................*................................................................. + // trn1 v7.2D, v10.2D, v18.2D // ..........................................................................................................*................................................. 
+ // trn1 v28.2D, v27.2D, v29.2D // ..............................................................................................................*............................................. + // mls v2.4S, v17.4S, v8.S[0] // ....................................................................................................*....................................................... + // sub v10.4S, v30.4S, v16.4S // ................................................................................................................*........................................... + // add v19.4S, v30.4S, v16.4S // ..................................................................................................................*......................................... + // mls v13.4S, v21.4S, v8.S[0] // ........................................................................................................*................................................... + // add v5.4S, v7.4S, v28.4S // ...................................................................................................................*........................................ + // sub v14.4S, v7.4S, v28.4S // ....................................................................................................................*....................................... + // sqrdmulh v23.4S, v10.4S, v12.S[1] // .....................................................................................................................*...................................... + // sub v30.4S, v2.4S, v25.4S // .........................................................................................................................................*.................. + // mul v10.4S, v10.4S, v12.S[0] // ......................................................................................................................*..................................... 
+ // add v15.4S, v5.4S, v19.4S // .........................................................................................................................*.................................. + // sqrdmulh v9.4S, v30.4S, v6.S[3] // ...........................................................................................................................................*................ + // mul v27.4S, v30.4S, v6.S[2] // ....................................................................................................................................................*....... + // srshr v29.4S, v15.4S, #23 // .............................................................................................................................*.............................. + // sqrdmulh v0.4S, v14.4S, v24.S[3] // ...........................................................................................................................*................................ + // mls v27.4S, v9.4S, v8.S[0] // ......................................................................................................................................................*..... + // mul v11.4S, v14.4S, v24.S[2] // ..........................................................................................................................*................................. + // sub v9.4S, v5.4S, v19.4S // .......................................................................................................................*.................................... + // mls v15.4S, v29.4S, v8.4S // ..................................................................................................................................*......................... + // mls v11.4S, v0.4S, v8.S[0] // ................................................................................................................................*........................... 
+ // add v19.4S, v2.4S, v25.4S // ........................................................................................................................*................................... + // mls v10.4S, v23.4S, v8.S[0] // ..............................................................................................................................*............................. + // sub v5.4S, v4.4S, v15.4S // .................................................................................................................................................*.......... + // add v29.4S, v4.4S, v15.4S // ...............................................................................................................................................*............ + // sqrdmulh v1.4S, v9.4S, v3.S[1] // ...............................................................................................................................*............................ + // srshr v25.4S, v19.4S, #23 // ............................................................................................................................*............................... + // mul v16.4S, v9.4S, v3.S[0] // ....................................................................................................................................*....................... + // str q29, [x1], #(16*4) // .....................................................................................................................................................*...... + // sub v14.4S, v11.4S, v10.4S // ...................................................................................................................................*........................ + // add v29.4S, v11.4S, v10.4S // .....................................................................................................................................*...................... 
+ // mls v19.4S, v25.4S, v8.4S // .................................................................................................................................*.......................... + // sqrdmulh v10.4S, v14.4S, v3.S[1] // ......................................................................................................................................*..................... + // srshr v18.4S, v29.4S, #23 // .......................................................................................................................................*.................... + // mls v16.4S, v1.4S, v8.S[0] // ........................................................................................................................................*................... + // mls v29.4S, v18.4S, v8.4S // ..........................................................................................................................................*................. + // mul v18.4S, v14.4S, v3.S[0] // .............................................................................................................................................*.............. + // sub v0.4S, v13.4S, v16.4S // ..............................................................................................................................................*............. + // add v26.4S, v13.4S, v16.4S // ............................................................................................................................................*............... + // mls v18.4S, v10.4S, v8.S[0] // ................................................................................................................................................*........... + // sub v2.4S, v19.4S, v29.4S // ........................................................................................................................................................*... 
+ // add v29.4S, v19.4S, v29.4S // ...................................................................................................................................................*........ + // sqrdmulh v19.4S, v5.4S, v6.S[1] // .........................................................................................................................................................*.. + // str q26, [x1, #-32] // ..................................................................................................................................................*......... + // mul v13.4S, v2.4S, v6.S[0] // ..........................................................................................................................................................*. + // str q29, [x1, #-48] // .......................................................................................................................................................*.... + // add v29.4S, v27.4S, v18.4S // ...........................................................................................................................................................* sub count, count, #1 layer45678_start: - ldr q3, [x5, #224] // ..........................e........................................................................................................................................... - add v17.4S, v24.4S, v6.4S // ...............................*...................................................................................................................................... - ldr q21, [x2, #0] // ............*......................................................................................................................................................... - ldr q14, [x2, #16] // .............*........................................................................................................................................................ 
- mls v2.4S, v28.4S, v8.S[0] // ..................................*................................................................................................................................... - ldr q24, [x5, #16] // .........................*............................................................................................................................................ - ldr q28, [x2, #32] // ..............*....................................................................................................................................................... - ldr q6, [x5, #96] // ..................................................*................................................................................................................... - // gap // ...................................................................................................................................................................... - mul v31.4S, v26.4S, v30.4S // .....................................*................................................................................................................................ - ldr q26, [x2, #48] // ...............*...................................................................................................................................................... - sub v1.4S, v17.4S, v15.4S // ........................................*............................................................................................................................. - add v17.4S, v17.4S, v15.4S // .........................................*............................................................................................................................ - ldr q15, [x5], #(12*16) // ........................*............................................................................................................................................. 
- ldr q29, [x5, #-80] // ...................................................*.................................................................................................................. - mls v31.4S, v13.4S, v8.S[0] // .......................................*.............................................................................................................................. - trn1 v13.4S, v21.4S, v14.4S // ................*..................................................................................................................................................... - ldr q16, [x5, #-64] // ....................................................*................................................................................................................. - trn2 v21.4S, v21.4S, v14.4S // .................*.................................................................................................................................................... - ldr q14, [x5, #-16] // .......................................................*.............................................................................................................. - ldr q9, [x5, #-48] // .....................................................*................................................................................................................ - sqrdmulh v0.4S, v1.4S, v24.4S // ...........................................*.......................................................................................................................... - trn1 v30.4S, v28.4S, v26.4S // ..................*................................................................................................................................................... - ldr q7, [x5, #-32] // ......................................................*............................................................................................................... 
- trn2 v28.4S, v28.4S, v26.4S // ...................*.................................................................................................................................................. - ldr q26, [x4], #64 // ............................................................................................*......................................................................... - // gap // ...................................................................................................................................................................... - mul v1.4S, v1.4S, v15.4S // ..........................................*........................................................................................................................... - ldr q18, [x4, #-48] // .............................................................................................*........................................................................ - // gap // ...................................................................................................................................................................... - sub v25.4S, v2.4S, v31.4S // .............................................*........................................................................................................................ - ldr q12, [x4, #-32] // ..............................................................................................*....................................................................... - // gap // ...................................................................................................................................................................... - add v2.4S, v2.4S, v31.4S // ..............................................*....................................................................................................................... 
- trn2 v31.2D, v13.2D, v30.2D // ....................*................................................................................................................................................. - ldr q23, [x4, #-16] // ...............................................................................................*...................................................................... - mls v1.4S, v0.4S, v8.S[0] // ............................................*......................................................................................................................... - trn2 v0.2D, v21.2D, v28.2D // .....................*................................................................................................................................................ - ldr q10, [x5, #48] // ...........................e.......................................................................................................................................... - trn1 v13.2D, v13.2D, v30.2D // ......................*............................................................................................................................................... - ldr q30, [x5, #64] // ............................e......................................................................................................................................... - // gap // ...................................................................................................................................................................... - sqrdmulh v24.4S, v25.4S, v24.4S // ................................................*..................................................................................................................... - trn1 v4.4S, v17.4S, v2.4S // ............................................................................*......................................................................................... 
- ldr q22, [x5, #80] // .............................e........................................................................................................................................ - sub v11.4S, v31.4S, v0.4S // .............................................................*........................................................................................................ - // gap // ...................................................................................................................................................................... - // gap // ...................................................................................................................................................................... - trn2 v17.4S, v17.4S, v2.4S // .............................................................................*........................................................................................ - mul v2.4S, v25.4S, v15.4S // ...............................................*...................................................................................................................... - // gap // ...................................................................................................................................................................... - trn1 v21.2D, v21.2D, v28.2D // .......................*.............................................................................................................................................. - // gap // ...................................................................................................................................................................... - // gap // ...................................................................................................................................................................... 
- mul v28.4S, v11.4S, v7.4S // ...............................................................*...................................................................................................... - add v31.4S, v31.4S, v0.4S // ..............................................................*....................................................................................................... - // gap // ...................................................................................................................................................................... - // gap // ...................................................................................................................................................................... - // gap // ...................................................................................................................................................................... - // gap // ...................................................................................................................................................................... - mls v2.4S, v24.4S, v8.S[0] // .................................................*.................................................................................................................... - sub v24.4S, v13.4S, v21.4S // ........................................................*............................................................................................................. - // gap // ...................................................................................................................................................................... - add v21.4S, v13.4S, v21.4S // .........................................................*............................................................................................................ 
- // gap // ...................................................................................................................................................................... - // gap // ...................................................................................................................................................................... - sqrdmulh v14.4S, v11.4S, v14.4S // ................................................................*..................................................................................................... - // gap // ...................................................................................................................................................................... - // gap // ...................................................................................................................................................................... - // gap // ...................................................................................................................................................................... - // gap // ...................................................................................................................................................................... - // gap // ...................................................................................................................................................................... - sqrdmulh v15.4S, v24.4S, v9.4S // ...........................................................*.......................................................................................................... - sub v13.4S, v21.4S, v31.4S // ..................................................................*................................................................................................... 
- // gap // ...................................................................................................................................................................... - add v21.4S, v21.4S, v31.4S // ...................................................................*.................................................................................................. - // gap // ...................................................................................................................................................................... - // gap // ...................................................................................................................................................................... - mul v24.4S, v24.4S, v16.4S // ..........................................................*........................................................................................................... - trn2 v31.4S, v1.4S, v2.4S // ...............................................................................*...................................................................................... - // gap // ...................................................................................................................................................................... - trn1 v2.4S, v1.4S, v2.4S // ..............................................................................*....................................................................................... - // gap // ...................................................................................................................................................................... - // gap // ...................................................................................................................................................................... 
- mls v28.4S, v14.4S, v8.S[0] // .................................................................*.................................................................................................... - // gap // ...................................................................................................................................................................... - // gap // ...................................................................................................................................................................... - trn2 v14.2D, v17.2D, v31.2D // .................................................................................*.................................................................................... - // gap // ...................................................................................................................................................................... - // gap // ...................................................................................................................................................................... - mls v24.4S, v15.4S, v8.S[0] // ............................................................*......................................................................................................... - trn2 v1.2D, v4.2D, v2.2D // ................................................................................*..................................................................................... - // gap // ...................................................................................................................................................................... - trn1 v2.2D, v4.2D, v2.2D // ..................................................................................*................................................................................... 
- // gap // ...................................................................................................................................................................... - // gap // ...................................................................................................................................................................... - trn1 v17.2D, v17.2D, v31.2D // ...................................................................................*.................................................................................. - mul v31.4S, v13.4S, v6.4S // ....................................................................*................................................................................................. - // gap // ...................................................................................................................................................................... - sub v15.4S, v1.4S, v14.4S // .....................................................................................................*................................................................ - // gap // ...................................................................................................................................................................... - // gap // ...................................................................................................................................................................... - sqrdmulh v13.4S, v13.4S, v29.4S // .....................................................................*................................................................................................ - add v14.4S, v1.4S, v14.4S // ......................................................................................................*............................................................... 
- // gap // ...................................................................................................................................................................... - sub v1.4S, v24.4S, v28.4S // .......................................................................*.............................................................................................. - // gap // ...................................................................................................................................................................... - // gap // ...................................................................................................................................................................... - add v24.4S, v24.4S, v28.4S // ........................................................................*............................................................................................. - mul v28.4S, v15.4S, v12.S[0] // .......................................................................................................*.............................................................. - // gap // ...................................................................................................................................................................... - sub v16.4S, v2.4S, v17.4S // ................................................................................................*..................................................................... - // gap // ...................................................................................................................................................................... - // gap // ...................................................................................................................................................................... 
- mul v6.4S, v1.4S, v6.4S // .........................................................................*............................................................................................ - add v17.4S, v2.4S, v17.4S // .................................................................................................*.................................................................... - // gap // ...................................................................................................................................................................... - trn1 v2.4S, v21.4S, v24.4S // ....................................................................................*................................................................................. - // gap // ...................................................................................................................................................................... - // gap // ...................................................................................................................................................................... - sqrdmulh v1.4S, v1.4S, v29.4S // ..........................................................................*........................................................................................... - trn2 v21.4S, v21.4S, v24.4S // .....................................................................................*................................................................................ - // gap // ...................................................................................................................................................................... - sub v24.4S, v17.4S, v14.4S // ....................................................................................................................*................................................. 
- // gap // ...................................................................................................................................................................... - // gap // ...................................................................................................................................................................... - mls v31.4S, v13.4S, v8.S[0] // ......................................................................*............................................................................................... - add v17.4S, v17.4S, v14.4S // .....................................................................................................................*................................................ - // gap // ...................................................................................................................................................................... - // gap // ...................................................................................................................................................................... - // gap // ...................................................................................................................................................................... - // gap // ...................................................................................................................................................................... - sqrdmulh v14.4S, v16.4S, v18.S[3] // ...................................................................................................*.................................................................. - // gap // ...................................................................................................................................................................... 
- // gap // ...................................................................................................................................................................... - // gap // ...................................................................................................................................................................... - // gap // ...................................................................................................................................................................... - // gap // ...................................................................................................................................................................... - mls v6.4S, v1.4S, v8.S[0] // ...........................................................................*.......................................................................................... - // gap // ...................................................................................................................................................................... - // gap // ...................................................................................................................................................................... - // gap // ...................................................................................................................................................................... - // gap // ...................................................................................................................................................................... - // gap // ...................................................................................................................................................................... 
- mul v1.4S, v16.4S, v18.S[2] // ..................................................................................................*................................................................... - // gap // ...................................................................................................................................................................... - // gap // ...................................................................................................................................................................... - // gap // ...................................................................................................................................................................... - // gap // ...................................................................................................................................................................... - // gap // ...................................................................................................................................................................... - mls v1.4S, v14.4S, v8.S[0] // ....................................................................................................*................................................................. - // gap // ...................................................................................................................................................................... - // gap // ...................................................................................................................................................................... - trn1 v14.4S, v31.4S, v6.4S // ......................................................................................*............................................................................... 
- // gap // ...................................................................................................................................................................... - // gap // ...................................................................................................................................................................... - sqrdmulh v15.4S, v15.4S, v12.S[1] // ........................................................................................................*............................................................. - trn2 v6.4S, v31.4S, v6.4S // .......................................................................................*.............................................................................. - // gap // ...................................................................................................................................................................... - // gap // ...................................................................................................................................................................... - // gap // ...................................................................................................................................................................... - // gap // ...................................................................................................................................................................... - sqrdmulh v31.4S, v24.4S, v26.S[3] // .......................................................................................................................*.............................................. - trn2 v29.2D, v2.2D, v14.2D // ........................................................................................*............................................................................. 
- // gap // ...................................................................................................................................................................... - trn1 v14.2D, v2.2D, v14.2D // ..........................................................................................*........................................................................... - // gap // ...................................................................................................................................................................... - // gap // ...................................................................................................................................................................... - mul v2.4S, v24.4S, v26.S[2] // ......................................................................................................................*............................................... - trn1 v24.2D, v21.2D, v6.2D // ...........................................................................................*.......................................................................... - // gap // ...................................................................................................................................................................... - trn2 v21.2D, v21.2D, v6.2D // .........................................................................................*............................................................................ - // gap // ...................................................................................................................................................................... - // gap // ...................................................................................................................................................................... 
- mls v28.4S, v15.4S, v8.S[0] // .........................................................................................................*............................................................ - // gap // ...................................................................................................................................................................... - // gap // ...................................................................................................................................................................... - sub v6.4S, v14.4S, v24.4S // ..........................................................................................................*........................................................... - // gap // ...................................................................................................................................................................... - // gap // ...................................................................................................................................................................... - mls v2.4S, v31.4S, v8.S[0] // ........................................................................................................................*............................................. - add v14.4S, v14.4S, v24.4S // ...........................................................................................................*.......................................................... - // gap // ...................................................................................................................................................................... - sub v24.4S, v29.4S, v21.4S // ...............................................................................................................*...................................................... 
- // gap // ...................................................................................................................................................................... - // gap // ...................................................................................................................................................................... - add v21.4S, v29.4S, v21.4S // ................................................................................................................*..................................................... - mul v31.4S, v6.4S, v12.S[2] // ............................................................................................................*......................................................... - // gap // ...................................................................................................................................................................... - sub v15.4S, v1.4S, v28.4S // .........................................................................................................................*............................................ - // gap // ...................................................................................................................................................................... - // gap // ...................................................................................................................................................................... - sqrdmulh v6.4S, v6.4S, v12.S[3] // .............................................................................................................*........................................................ - add v28.4S, v1.4S, v28.4S // ..........................................................................................................................*........................................... 
- // gap // ...................................................................................................................................................................... - sub v1.4S, v14.4S, v21.4S // ..............................................................................................................................*....................................... - // gap // ...................................................................................................................................................................... - // gap // ...................................................................................................................................................................... - add v21.4S, v14.4S, v21.4S // ...............................................................................................................................*...................................... - sqrdmulh v14.4S, v24.4S, v23.S[1] // ..................................................................................................................*................................................... - // gap // ...................................................................................................................................................................... - // gap // ...................................................................................................................................................................... - // gap // ...................................................................................................................................................................... - // gap // ...................................................................................................................................................................... 
- mul v24.4S, v24.4S, v23.S[0] // .................................................................................................................*.................................................... - // gap // ...................................................................................................................................................................... - // gap // ...................................................................................................................................................................... - sub v29.4S, v17.4S, v21.4S // ........................................................................................................................................*............................. - // gap // ...................................................................................................................................................................... - // gap // ...................................................................................................................................................................... - add v17.4S, v17.4S, v21.4S // .........................................................................................................................................*............................ - mls v31.4S, v6.4S, v8.S[0] // ..............................................................................................................*....................................................... - // gap // ...................................................................................................................................................................... - // gap // ...................................................................................................................................................................... 
- // gap // ...................................................................................................................................................................... - // gap // ...................................................................................................................................................................... - mls v24.4S, v14.4S, v8.S[0] // ...................................................................................................................*.................................................. - // gap // ...................................................................................................................................................................... - // gap // ...................................................................................................................................................................... - str q17, [x1], #(16*4) // ............................................................................................................................................................*......... - // gap // ...................................................................................................................................................................... - // gap // ...................................................................................................................................................................... - mul v17.4S, v15.4S, v26.S[2] // ...........................................................................................................................*.......................................... - // gap // ...................................................................................................................................................................... 
- // gap // ...................................................................................................................................................................... - // gap // ...................................................................................................................................................................... - // gap // ...................................................................................................................................................................... - // gap // ...................................................................................................................................................................... - mul v21.4S, v1.4S, v18.S[0] // ................................................................................................................................*..................................... - // gap // ...................................................................................................................................................................... - // gap // ...................................................................................................................................................................... - sub v14.4S, v31.4S, v24.4S // ...................................................................................................................................*.................................. - // gap // ...................................................................................................................................................................... - // gap // ...................................................................................................................................................................... 
- sqrdmulh v6.4S, v1.4S, v18.S[1] // .................................................................................................................................*.................................... - add v24.4S, v31.4S, v24.4S // ....................................................................................................................................*................................. - // gap // ...................................................................................................................................................................... - // gap // ...................................................................................................................................................................... - // gap // ...................................................................................................................................................................... - // gap // ...................................................................................................................................................................... - mul v31.4S, v14.4S, v18.S[0] // .....................................................................................................................................*................................ - // gap // ...................................................................................................................................................................... - // gap // ...................................................................................................................................................................... - sub v1.4S, v28.4S, v24.4S // .............................................................................................................................................*........................ 
- // gap // ...................................................................................................................................................................... - // gap // ...................................................................................................................................................................... - sqrdmulh v14.4S, v14.4S, v18.S[1] // ......................................................................................................................................*............................... - add v24.4S, v28.4S, v24.4S // ..............................................................................................................................................*....................... - // gap // ...................................................................................................................................................................... - // gap // ...................................................................................................................................................................... - // gap // ...................................................................................................................................................................... - // gap // ...................................................................................................................................................................... - sqrdmulh v28.4S, v15.4S, v26.S[3] // ............................................................................................................................*......................................... - // gap // ...................................................................................................................................................................... 
- // gap // ...................................................................................................................................................................... - str q24, [x1, #-48] // .............................................................................................................................................................*........ - // gap // ...................................................................................................................................................................... - // gap // ...................................................................................................................................................................... - mls v21.4S, v6.4S, v8.S[0] // ..................................................................................................................................*................................... - // gap // ...................................................................................................................................................................... - // gap // ...................................................................................................................................................................... - // gap // ...................................................................................................................................................................... - // gap // ...................................................................................................................................................................... - // gap // ...................................................................................................................................................................... 
- mls v31.4S, v14.4S, v8.S[0] // .......................................................................................................................................*.............................. - // gap // ...................................................................................................................................................................... - // gap // ...................................................................................................................................................................... - // gap // ...................................................................................................................................................................... - // gap // ...................................................................................................................................................................... - // gap // ...................................................................................................................................................................... - mls v17.4S, v28.4S, v8.S[0] // .............................................................................................................................*........................................ - // gap // ...................................................................................................................................................................... - // gap // ...................................................................................................................................................................... - sub v14.4S, v2.4S, v21.4S // ..................................................................................................................................................*................... 
- // gap // ...................................................................................................................................................................... - // gap // ...................................................................................................................................................................... - add v21.4S, v2.4S, v21.4S // ...................................................................................................................................................*.................. - mul v2.4S, v29.4S, v26.S[0] // ..........................................................................................................................................*........................... - // gap // ...................................................................................................................................................................... - // gap // ...................................................................................................................................................................... - // gap // ...................................................................................................................................................................... - // gap // ...................................................................................................................................................................... - sqrdmulh v24.4S, v29.4S, v26.S[1] // ...........................................................................................................................................*.......................... - // gap // ...................................................................................................................................................................... 
- // gap // ...................................................................................................................................................................... - str q21, [x1, #-32] // ..............................................................................................................................................................*....... - add v21.4S, v17.4S, v31.4S // ........................................................................................................................................................*............. - // gap // ...................................................................................................................................................................... - sub v17.4S, v17.4S, v31.4S // .......................................................................................................................................................*.............. - mul v28.4S, v1.4S, v26.S[0] // ...............................................................................................................................................*...................... - // gap // ...................................................................................................................................................................... - // gap // ...................................................................................................................................................................... - // gap // ...................................................................................................................................................................... - // gap // ...................................................................................................................................................................... 
- sqrdmulh v6.4S, v1.4S, v26.S[1] // ................................................................................................................................................*..................... - str q21, [x1, #-16] // ...............................................................................................................................................................*...... - add x1, x1, #64 // ....................................................................................................................................................................*. - // gap // ...................................................................................................................................................................... - // gap // ...................................................................................................................................................................... - // gap // ...................................................................................................................................................................... - ldr q21, [x1, #0] // e..................................................................................................................................................................... - ldr q31, [x1, #16] // .e.................................................................................................................................................................... - mul v1.4S, v14.4S, v26.S[0] // ....................................................................................................................................................*................. - ldr q15, [x1, #32] // ..e................................................................................................................................................................... 
- ldr q29, [x1, #48] // ...e.................................................................................................................................................................. - // gap // ...................................................................................................................................................................... - sqrdmulh v14.4S, v14.4S, v26.S[1] // .....................................................................................................................................................*................ - // gap // ...................................................................................................................................................................... - // gap // ...................................................................................................................................................................... - // gap // ...................................................................................................................................................................... - // gap // ...................................................................................................................................................................... - // gap // ...................................................................................................................................................................... - trn1 v13.4S, v21.4S, v31.4S // ....e................................................................................................................................................................. - mul v16.4S, v17.4S, v26.S[0] // .........................................................................................................................................................*............ 
- // gap // ...................................................................................................................................................................... - trn2 v21.4S, v21.4S, v31.4S // .....e................................................................................................................................................................ - // gap // ...................................................................................................................................................................... - // gap // ...................................................................................................................................................................... - sqrdmulh v17.4S, v17.4S, v26.S[1] // ..........................................................................................................................................................*........... - trn1 v31.4S, v15.4S, v29.4S // ......e............................................................................................................................................................... - // gap // ...................................................................................................................................................................... - trn2 v26.4S, v15.4S, v29.4S // .......e.............................................................................................................................................................. - // gap // ...................................................................................................................................................................... - // gap // ...................................................................................................................................................................... 
- mls v2.4S, v24.4S, v8.S[0] // ............................................................................................................................................*......................... - // gap // ...................................................................................................................................................................... - // gap // ...................................................................................................................................................................... - trn1 v24.2D, v13.2D, v31.2D // ..........e........................................................................................................................................................... - // gap // ...................................................................................................................................................................... - // gap // ...................................................................................................................................................................... - mls v28.4S, v6.4S, v8.S[0] // .................................................................................................................................................*.................... - trn1 v6.2D, v21.2D, v26.2D // ...........e.......................................................................................................................................................... - // gap // ...................................................................................................................................................................... - trn2 v31.2D, v13.2D, v31.2D // ........e............................................................................................................................................................. 
- // gap // ...................................................................................................................................................................... - // gap // ...................................................................................................................................................................... - mls v1.4S, v14.4S, v8.S[0] // ......................................................................................................................................................*............... - trn2 v21.2D, v21.2D, v26.2D // .........e............................................................................................................................................................ - // gap // ...................................................................................................................................................................... - str q2, [x2], #(16*4) // ................................................................................................................................................................*..... - sub v14.4S, v24.4S, v6.4S // ..............................e....................................................................................................................................... - // gap // ...................................................................................................................................................................... - mls v16.4S, v17.4S, v8.S[0] // ...........................................................................................................................................................*.......... - // gap // ...................................................................................................................................................................... 
- // gap // ...................................................................................................................................................................... - str q28, [x2, #-48] // .................................................................................................................................................................*.... - sub v26.4S, v31.4S, v21.4S // ...................................e.................................................................................................................................. - // gap // ...................................................................................................................................................................... - sqrdmulh v28.4S, v14.4S, v10.4S // .................................e.................................................................................................................................... - add v15.4S, v31.4S, v21.4S // ....................................e................................................................................................................................. - // gap // ...................................................................................................................................................................... - str q1, [x2, #-32] // ..................................................................................................................................................................*... - // gap // ...................................................................................................................................................................... - // gap // ...................................................................................................................................................................... 
- mul v2.4S, v14.4S, v3.4S // ................................e..................................................................................................................................... - // gap // ...................................................................................................................................................................... - // gap // ...................................................................................................................................................................... - str q16, [x2, #-16] // ...................................................................................................................................................................*.. - add x2, x2, #64 // .....................................................................................................................................................................* - // gap // ...................................................................................................................................................................... - sqrdmulh v13.4S, v26.4S, v22.4S // ......................................e............................................................................................................................... - // gap // ...................................................................................................................................................................... - // gap // ...................................................................................................................................................................... + ldr q16, [x5, #160] // ......................................................e....................................................................................................................... 
+ sqrdmulh v9.4S, v0.4S, v6.S[1] // .............................................................................................................................................................*................ + ldr q7, [x5, #112] // ...................................................e.......................................................................................................................... + ldr q25, [x5, #32] // ..........................e................................................................................................................................................... + // gap // .............................................................................................................................................................................. + ldr q12, [x4, #48] // ...............................................................................................e.............................................................................. + mul v15.4S, v5.4S, v6.S[0] // ..................................................................................................................................................*........................... + str q29, [x1, #-16] // .......................................................................................................................................................................*...... + add x1, x1, #64 // ............................................................................................................................................................................*. + ldr q24, [x4, #32] // ..............................................................................................e............................................................................... 
+ sub v5.4S, v27.4S, v18.4S // ...............................................................................................................................................................*.............. + ldr q3, [x4, #16] // .............................................................................................e................................................................................ + ldr q17, [x1, #48] // ...e.......................................................................................................................................................................... + mls v15.4S, v19.4S, v8.S[0] // ....................................................................................................................................................*......................... + ldr q11, [x1, #32] // ..e........................................................................................................................................................................... + ldr q26, [x1, #16] // .e............................................................................................................................................................................ + ldr q21, [x1, #0] // e............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + sqrdmulh v2.4S, v2.4S, v6.S[1] // ........................................................................................................................................................*..................... 
+ ldr q31, [x5, #128] // ....................................................e......................................................................................................................... + // gap // .............................................................................................................................................................................. + ldr q18, [x5, #176] // .......................................................e...................................................................................................................... + ldr q20, [x5, #80] // .............................e................................................................................................................................................ + // gap // .............................................................................................................................................................................. + mul v19.4S, v5.4S, v6.S[0] // .................................................................................................................................................................*............ + ldr q22, [x5, #96] // ..................................................e........................................................................................................................... + trn1 v14.4S, v11.4S, v17.4S // ......e....................................................................................................................................................................... + ldr q27, [x5, #48] // ...........................e.................................................................................................................................................. + // gap // .............................................................................................................................................................................. 
+ trn1 v30.4S, v21.4S, v26.4S // ....e......................................................................................................................................................................... + sqrdmulh v29.4S, v5.4S, v6.S[1] // ..................................................................................................................................................................*........... + trn2 v23.4S, v21.4S, v26.4S // .....e........................................................................................................................................................................ + ldr q4, [x5], #(12*16) // ........................e..................................................................................................................................................... + trn2 v28.4S, v11.4S, v17.4S // .......e...................................................................................................................................................................... + ldr q17, [x5, #-48] // .....................................................e........................................................................................................................ + // gap // .............................................................................................................................................................................. + mul v5.4S, v0.4S, v6.S[0] // ............................................................................................................................................................*................. + ldr q6, [x4], #64 // ............................................................................................e................................................................................. 
+ // gap // .............................................................................................................................................................................. + trn2 v26.2D, v30.2D, v14.2D // ........e..................................................................................................................................................................... + ldr q10, [x5, #-128] // ............................e................................................................................................................................................. + // gap // .............................................................................................................................................................................. + trn2 v1.2D, v23.2D, v28.2D // .........e.................................................................................................................................................................... + mls v13.4S, v2.4S, v8.S[0] // .........................................................................................................................................................*.................... + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + trn1 v0.2D, v23.2D, v28.2D // ...........e.................................................................................................................................................................. + // gap // .............................................................................................................................................................................. 
+ mls v19.4S, v29.4S, v8.S[0] // ...................................................................................................................................................................*.......... + trn1 v23.2D, v30.2D, v14.2D // ..........e................................................................................................................................................................... + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + sub v29.4S, v26.4S, v1.4S // ...................................e.......................................................................................................................................... + // gap // .............................................................................................................................................................................. + mls v5.4S, v9.4S, v8.S[0] // ..............................................................................................................................................................*............... + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. 
+ // gap // .............................................................................................................................................................................. + sub v21.4S, v23.4S, v0.4S // ..............................e............................................................................................................................................... + add v14.4S, v26.4S, v1.4S // ....................................e......................................................................................................................................... + // gap // .............................................................................................................................................................................. + sqrdmulh v9.4S, v29.4S, v20.4S // ......................................e....................................................................................................................................... + str q15, [x2], #(16*4) // ........................................................................................................................................................................*..... + str q13, [x2, #-48] // .........................................................................................................................................................................*.... + add v30.4S, v23.4S, v0.4S // ...............................e.............................................................................................................................................. + sqrdmulh v23.4S, v21.4S, v27.4S // .................................e............................................................................................................................................ 
+ str q19, [x2, #-16] // ...........................................................................................................................................................................*.. + ldr q13, [x5, #-176] // .........................e.................................................................................................................................................... + str q5, [x2, #-32] // ..........................................................................................................................................................................*... + add x2, x2, #64 // .............................................................................................................................................................................* + // gap // .............................................................................................................................................................................. + mul v15.4S, v29.4S, v10.4S // .....................................e........................................................................................................................................ + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + sub v29.4S, v30.4S, v14.4S // ........................................e..................................................................................................................................... + ldr q19, [x2, #48] // ...............e.............................................................................................................................................................. 
+ ldr q5, [x2, #32] // ..............e............................................................................................................................................................... + mul v28.4S, v21.4S, v25.4S // ................................e............................................................................................................................................. + ldr q1, [x2, #16] // .............e................................................................................................................................................................ + ldr q0, [x2, #0] // ............e................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + mls v28.4S, v23.4S, v8.S[0] // ..................................e........................................................................................................................................... + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. 
+ trn1 v11.4S, v5.4S, v19.4S // ..................e........................................................................................................................................................... + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + mls v15.4S, v9.4S, v8.S[0] // .......................................e...................................................................................................................................... + // gap // .............................................................................................................................................................................. + trn2 v9.4S, v0.4S, v1.4S // .................e............................................................................................................................................................ + trn2 v10.4S, v5.4S, v19.4S // ...................e.......................................................................................................................................................... + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + trn1 v0.4S, v0.4S, v1.4S // ................e............................................................................................................................................................. 
+ // gap // .............................................................................................................................................................................. + sqrdmulh v19.4S, v29.4S, v13.4S // ...........................................e.................................................................................................................................. + add v27.4S, v30.4S, v14.4S // .........................................e.................................................................................................................................... + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + mul v25.4S, v29.4S, v4.4S // ..........................................e................................................................................................................................... + trn1 v30.2D, v9.2D, v10.2D // .......................e...................................................................................................................................................... + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. 
+ trn1 v21.2D, v0.2D, v11.2D // ......................e....................................................................................................................................................... + trn2 v26.2D, v9.2D, v10.2D // .....................e........................................................................................................................................................ + // gap // .............................................................................................................................................................................. + sub v5.4S, v28.4S, v15.4S // .............................................e................................................................................................................................ + trn2 v2.2D, v0.2D, v11.2D // ....................e......................................................................................................................................................... + mls v25.4S, v19.4S, v8.S[0] // ............................................e................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + sub v0.4S, v21.4S, v30.4S // ........................................................e..................................................................................................................... 
+ add v21.4S, v21.4S, v30.4S // .........................................................e.................................................................................................................... + // gap // .............................................................................................................................................................................. + sqrdmulh v29.4S, v5.4S, v13.4S // ................................................e............................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + sub v23.4S, v2.4S, v26.4S // .............................................................e................................................................................................................ + add v2.4S, v2.4S, v26.4S // ..............................................................e............................................................................................................... + // gap // .............................................................................................................................................................................. + mul v14.4S, v0.4S, v31.4S // ..........................................................e................................................................................................................... + add v19.4S, v28.4S, v15.4S // ..............................................e............................................................................................................................... 
+ // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + mul v20.4S, v5.4S, v4.4S // ...............................................e.............................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + sub v5.4S, v21.4S, v2.4S // ..................................................................e........................................................................................................... + mls v20.4S, v29.4S, v8.S[0] // .................................................e............................................................................................................................ + // gap // .............................................................................................................................................................................. 
+ trn1 v9.4S, v27.4S, v19.4S // ............................................................................e................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + mul v29.4S, v23.4S, v16.4S // ...............................................................e.............................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. 
+ trn2 v11.4S, v27.4S, v19.4S // .............................................................................e................................................................................................ + sqrdmulh v1.4S, v0.4S, v17.4S // ...........................................................e.................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + trn2 v30.4S, v25.4S, v20.4S // ...............................................................................e.............................................................................................. + sqrdmulh v26.4S, v23.4S, v18.4S // ................................................................e............................................................................................................. + trn1 v19.4S, v25.4S, v20.4S // ..............................................................................e............................................................................................... + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. 
+ // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + trn1 v0.2D, v11.2D, v30.2D // ...................................................................................e.......................................................................................... + // gap // .............................................................................................................................................................................. + sqrdmulh v28.4S, v5.4S, v7.4S // .....................................................................e........................................................................................................ + trn2 v30.2D, v11.2D, v30.2D // .................................................................................e............................................................................................ + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + mul v5.4S, v5.4S, v22.4S // ....................................................................e......................................................................................................... + // gap // .............................................................................................................................................................................. 
+ trn2 v18.2D, v9.2D, v19.2D // ................................................................................e............................................................................................. + trn1 v4.2D, v9.2D, v19.2D // ..................................................................................e........................................................................................... + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + mls v29.4S, v26.4S, v8.S[0] // .................................................................e............................................................................................................ + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + add v11.4S, v18.4S, v30.4S // ......................................................................................................e....................................................................... + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. 
+ mls v14.4S, v1.4S, v8.S[0] // ............................................................e................................................................................................................. + // gap // .............................................................................................................................................................................. + add v20.4S, v4.4S, v0.4S // .................................................................................................e............................................................................ + sub v18.4S, v18.4S, v30.4S // .....................................................................................................e........................................................................ + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + mls v5.4S, v28.4S, v8.S[0] // ......................................................................e....................................................................................................... + add v26.4S, v21.4S, v2.4S // ...................................................................e.......................................................................................................... + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. 
+ // gap // .............................................................................................................................................................................. + sub v2.4S, v20.4S, v11.4S // ....................................................................................................................e......................................................... + mul v25.4S, v18.4S, v24.S[0] // .......................................................................................................e...................................................................... + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + sub v1.4S, v14.4S, v29.4S // .......................................................................e...................................................................................................... + add v9.4S, v14.4S, v29.4S // ........................................................................e..................................................................................................... + // gap // .............................................................................................................................................................................. 
+ sqrdmulh v21.4S, v2.4S, v6.S[3] // .......................................................................................................................e...................................................... + sub v0.4S, v4.4S, v0.4S // ................................................................................................e............................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + add v4.4S, v20.4S, v11.4S // .....................................................................................................................e........................................................ + // gap // .............................................................................................................................................................................. + sqrdmulh v10.4S, v1.4S, v7.4S // ..........................................................................e................................................................................................... + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. 
+ sqrdmulh v29.4S, v18.4S, v24.S[1] // ........................................................................................................e..................................................................... + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + mul v23.4S, v1.4S, v22.4S // .........................................................................e.................................................................................................... + // gap // .............................................................................................................................................................................. + trn2 v27.4S, v26.4S, v9.4S // .....................................................................................e........................................................................................ + // gap // .............................................................................................................................................................................. 
+ // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + mls v23.4S, v10.4S, v8.S[0] // ...........................................................................e.................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + srshr v19.4S, v4.4S, #23 // ........................................................................................................................................e..................................... + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. 
+ sqrdmulh v17.4S, v0.4S, v3.S[3] // ...................................................................................................e.......................................................................... + trn1 v10.4S, v26.4S, v9.4S // ....................................................................................e......................................................................................... + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + mls v4.4S, v19.4S, v8.4S // .........................................................................................................................................e.................................... + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + trn1 v18.4S, v5.4S, v23.4S // ......................................................................................e....................................................................................... 
+ mls v25.4S, v29.4S, v8.S[0] // .........................................................................................................e.................................................................... + trn2 v29.4S, v5.4S, v23.4S // .......................................................................................e...................................................................................... + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + mul v13.4S, v2.4S, v6.S[2] // ......................................................................................................................e....................................................... + // gap // .............................................................................................................................................................................. + trn2 v30.2D, v10.2D, v18.2D // ........................................................................................e..................................................................................... + // gap // .............................................................................................................................................................................. 
+ // gap // .............................................................................................................................................................................. + trn2 v16.2D, v27.2D, v29.2D // .........................................................................................e.................................................................................... + mul v2.4S, v0.4S, v3.S[2] // ..................................................................................................e........................................................................... + trn1 v7.2D, v10.2D, v18.2D // ..........................................................................................e................................................................................... + // gap // .............................................................................................................................................................................. + trn1 v28.2D, v27.2D, v29.2D // ...........................................................................................e.................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + mls v2.4S, v17.4S, v8.S[0] // ....................................................................................................e......................................................................... + sub v10.4S, v30.4S, v16.4S // ...............................................................................................................e.............................................................. 
+ // gap // .............................................................................................................................................................................. + add v19.4S, v30.4S, v16.4S // ................................................................................................................e............................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + mls v13.4S, v21.4S, v8.S[0] // ........................................................................................................................e..................................................... + add v5.4S, v7.4S, v28.4S // ...........................................................................................................e.................................................................. + // gap // .............................................................................................................................................................................. + sub v14.4S, v7.4S, v28.4S // ..........................................................................................................e................................................................... + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. 
+ // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + sqrdmulh v23.4S, v10.4S, v12.S[1] // ..................................................................................................................e........................................................... + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + sub v30.4S, v2.4S, v25.4S // .........................................................................................................................e.................................................... + mul v10.4S, v10.4S, v12.S[0] // .................................................................................................................e............................................................ + // gap // .............................................................................................................................................................................. + add v15.4S, v5.4S, v19.4S // ...............................................................................................................................e.............................................. + // gap // .............................................................................................................................................................................. 
+ // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + sqrdmulh v9.4S, v30.4S, v6.S[3] // ............................................................................................................................e................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + mul v27.4S, v30.4S, v6.S[2] // ...........................................................................................................................e.................................................. + // gap // .............................................................................................................................................................................. 
+ // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + srshr v29.4S, v15.4S, #23 // ............................................................................................................................................e................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + sqrdmulh v0.4S, v14.4S, v24.S[3] // .............................................................................................................e................................................................ + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. 
+ mls v27.4S, v9.4S, v8.S[0] // .............................................................................................................................e................................................ + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + mul v11.4S, v14.4S, v24.S[2] // ............................................................................................................e................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + sub v9.4S, v5.4S, v19.4S // ..............................................................................................................................e............................................... 
+ // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + mls v15.4S, v29.4S, v8.4S // .............................................................................................................................................e................................ + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + mls v11.4S, v0.4S, v8.S[0] // ..............................................................................................................e............................................................... + // gap // .............................................................................................................................................................................. 
+ // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + add v19.4S, v2.4S, v25.4S // ..........................................................................................................................e................................................... + mls v10.4S, v23.4S, v8.S[0] // ...................................................................................................................e.......................................................... + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + sub v5.4S, v4.4S, v15.4S // ................................................................................................................................................e............................. 
+ add v29.4S, v4.4S, v15.4S // .................................................................................................................................................e............................ + // gap // .............................................................................................................................................................................. + sqrdmulh v1.4S, v9.4S, v3.S[1] // .................................................................................................................................e............................................ + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + srshr v25.4S, v19.4S, #23 // ..........................................................................................................................................e................................... + mul v16.4S, v9.4S, v3.S[0] // ................................................................................................................................e............................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + str q29, [x1], #(16*4) // ....................................................................................................................................................................e......... 
+ // gap // .............................................................................................................................................................................. + sub v14.4S, v11.4S, v10.4S // ...................................................................................................................................e.......................................... + add v29.4S, v11.4S, v10.4S // ....................................................................................................................................e......................................... + mls v19.4S, v25.4S, v8.4S // ...........................................................................................................................................e.................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. 
+ sqrdmulh v10.4S, v14.4S, v3.S[1] // ......................................................................................................................................e....................................... + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + srshr v18.4S, v29.4S, #23 // ..............................................................................................................................................e............................... + mls v16.4S, v1.4S, v8.S[0] // ..................................................................................................................................e........................................... + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. 
+ // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + mls v29.4S, v18.4S, v8.4S // ...............................................................................................................................................e.............................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + mul v18.4S, v14.4S, v3.S[0] // .....................................................................................................................................e........................................ + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. 
+ // gap // .............................................................................................................................................................................. + sub v0.4S, v13.4S, v16.4S // ..........................................................................................................................................................e................... + add v26.4S, v13.4S, v16.4S // ...........................................................................................................................................................e.................. + mls v18.4S, v10.4S, v8.S[0] // .......................................................................................................................................e...................................... + // gap // .............................................................................................................................................................................. + sub v2.4S, v19.4S, v29.4S // .....................................................................................................................................................e........................ + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + add v29.4S, v19.4S, v29.4S // ......................................................................................................................................................e....................... + // gap // .............................................................................................................................................................................. 
+ sqrdmulh v19.4S, v5.4S, v6.S[1] // ...................................................................................................................................................e.......................... + str q26, [x1, #-32] // ......................................................................................................................................................................e....... + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + mul v13.4S, v2.4S, v6.S[0] // .......................................................................................................................................................e...................... + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + str q29, [x1, #-48] // .....................................................................................................................................................................e........ + // gap // .............................................................................................................................................................................. + add v29.4S, v27.4S, v18.4S // ................................................................................................................................................................e............. 
// original source code - // ldr q9, [x1, #0] // .......................................................................................................................................e..............................|......................................................................................................................................e............................. - // ldr q10, [x1, #16] // ........................................................................................................................................e.............................|.......................................................................................................................................e............................ - // ldr q11, [x1, #32] // ..........................................................................................................................................e...........................|.........................................................................................................................................e.......................... - // ldr q12, [x1, #48] // ...........................................................................................................................................e..........................|..........................................................................................................................................e......................... - // trn1 v25.4s, v9.4s, v10.4s // .............................................................................................................................................e........................|............................................................................................................................................e....................... 
- // trn2 v26.4s, v9.4s, v10.4s // ...............................................................................................................................................e......................|..............................................................................................................................................e..................... - // trn1 v27.4s, v11.4s, v12.4s // .................................................................................................................................................e....................|................................................................................................................................................e................... - // trn2 v28.4s, v11.4s, v12.4s // ..................................................................................................................................................e...................|.................................................................................................................................................e.................. - // trn2 v11.2d, v25.2d, v27.2d // .......................................................................................................................................................e..............|......................................................................................................................................................e............. - // trn2 v12.2d, v26.2d, v28.2d // .........................................................................................................................................................e............|........................................................................................................................................................e........... 
- // trn1 v9.2d, v25.2d, v27.2d // ....................................................................................................................................................e.................|...................................................................................................................................................e................ - // trn1 v10.2d, v26.2d, v28.2d // ......................................................................................................................................................e...............|.....................................................................................................................................................e.............. - // ldr q13, [x2, #0] // ..*...................................................................................................................................................................|.*.................................................................................................................................................................. - // ldr q14, [x2, #16] // ...*..................................................................................................................................................................|..*................................................................................................................................................................. - // ldr q15, [x2, #32] // ......*...............................................................................................................................................................|.....*.............................................................................................................................................................. 
- // ldr q16, [x2, #48] // .........*............................................................................................................................................................|........*........................................................................................................................................................... - // trn1 v25.4s, v13.4s, v14.4s // ...............*......................................................................................................................................................|..............*..................................................................................................................................................... - // trn2 v26.4s, v13.4s, v14.4s // .................*....................................................................................................................................................|................*................................................................................................................................................... - // trn1 v27.4s, v15.4s, v16.4s // .....................*................................................................................................................................................|....................*............................................................................................................................................... - // trn2 v28.4s, v15.4s, v16.4s // .......................*..............................................................................................................................................|......................*............................................................................................................................................. 
- // trn2 v15.2d, v25.2d, v27.2d // ..............................*.......................................................................................................................................|.............................*...................................................................................................................................... - // trn2 v16.2d, v26.2d, v28.2d // .................................*....................................................................................................................................|................................*................................................................................................................................... - // trn1 v13.2d, v25.2d, v27.2d // ...................................*..................................................................................................................................|..................................*................................................................................................................................. - // trn1 v14.2d, v26.2d, v28.2d // ...........................................*..........................................................................................................................|..........................................*......................................................................................................................... - // ldr q0, [x5], #(12*16) // ............*.........................................................................................................................................................|...........*........................................................................................................................................................ 
- // ldr q4, [x5, #(-12*16 + 1*16)] // .....*................................................................................................................................................................|....*............................................................................................................................................................... - // ldr q1, [x5, #(-12*16 + 2*16)] // e.....................................................................................................................................................................e.................................................................................................................................................................... - // ldr q5, [x5, #(-12*16 + 3*16)] // ..................................e...................................................................................................................................|.................................e.................................................................................................................................. - // ldr q2, [x5, #(-12*16 + 4*16)] // ....................................e.................................................................................................................................|...................................e................................................................................................................................ - // ldr q6, [x5, #(-12*16 + 5*16)] // .......................................e..............................................................................................................................|......................................e............................................................................................................................. 
- // sub v24.4s, v9.4s, v10.4s // ...........................................................................................................................................................e..........|..........................................................................................................................................................e......... - // add v9.4s, v9.4s, v10.4s // .*....................................................................................................................................................................|*................................................................................................................................................................... - // mul v10.4s, v24.4s, v1.4s // ..................................................................................................................................................................e...|.................................................................................................................................................................e.. - // sqrdmulh v24.4s, v24.4s, v5.4s // ...............................................................................................................................................................e......|..............................................................................................................................................................e..... - // mls v10.4s, v24.4s, v8.s[0] // ....*.................................................................................................................................................................|...*................................................................................................................................................................ 
- // sub v24.4s, v11.4s, v12.4s // ..............................................................................................................................................................e.......|.............................................................................................................................................................e...... - // add v11.4s, v11.4s, v12.4s // ................................................................................................................................................................e.....|...............................................................................................................................................................e.... - // mul v12.4s, v24.4s, v2.4s // ........*.............................................................................................................................................................|.......*............................................................................................................................................................ - // sqrdmulh v24.4s, v24.4s, v6.4s // .....................................................................................................................................................................e|.................................................................................................................................................................... - // mls v12.4s, v24.4s, v8.s[0] // ..............*.......................................................................................................................................................|.............*...................................................................................................................................................... 
- // sub v24.4s, v9.4s, v11.4s // ..........*...........................................................................................................................................................|.........*.......................................................................................................................................................... - // add v9.4s, v9.4s, v11.4s // ...........*..........................................................................................................................................................|..........*......................................................................................................................................................... - // mul v11.4s, v24.4s, v0.4s // .........................*............................................................................................................................................|........................*........................................................................................................................................... - // sqrdmulh v24.4s, v24.4s, v4.4s // ....................*.................................................................................................................................................|...................*................................................................................................................................................ - // mls v11.4s, v24.4s, v8.s[0] // ................................*.....................................................................................................................................|...............................*.................................................................................................................................... 
- // sub v24.4s, v10.4s, v12.4s // ...........................*..........................................................................................................................................|..........................*......................................................................................................................................... - // add v10.4s, v10.4s, v12.4s // .............................*........................................................................................................................................|............................*....................................................................................................................................... - // mul v12.4s, v24.4s, v0.4s // ..........................................*...........................................................................................................................|.........................................*.......................................................................................................................... - // sqrdmulh v24.4s, v24.4s, v4.4s // .....................................*................................................................................................................................|....................................*............................................................................................................................... - // mls v12.4s, v24.4s, v8.s[0] // ..............................................*.......................................................................................................................|.............................................*...................................................................................................................... 
- // ldr q0, [x5, #(-12*16 + 6*16)] // .......*..............................................................................................................................................................|......*............................................................................................................................................................. - // ldr q4, [x5, #(-12*16 + 7*16)] // .............*........................................................................................................................................................|............*....................................................................................................................................................... - // ldr q1, [x5, #(-12*16 + 8*16)] // ................*.....................................................................................................................................................|...............*.................................................................................................................................................... - // ldr q5, [x5, #(-12*16 + 9*16)] // ...................*..................................................................................................................................................|..................*................................................................................................................................................. - // ldr q2, [x5, #(-12*16 + 10*16)] // ......................*...............................................................................................................................................|.....................*.............................................................................................................................................. 
- // ldr q6, [x5, #(-12*16 + 11*16)] // ..................*...................................................................................................................................................|.................*.................................................................................................................................................. - // sub v24.4s, v13.4s, v14.4s // ...............................................*......................................................................................................................|..............................................*..................................................................................................................... - // add v13.4s, v13.4s, v14.4s // ................................................*.....................................................................................................................|...............................................*.................................................................................................................... - // mul v14.4s, v24.4s, v1.4s // .....................................................*................................................................................................................|....................................................*............................................................................................................... - // sqrdmulh v24.4s, v24.4s, v5.4s // ..................................................*...................................................................................................................|.................................................*.................................................................................................................. 
- // mls v14.4s, v24.4s, v8.s[0] // ..........................................................*...........................................................................................................|.........................................................*.......................................................................................................... - // sub v24.4s, v15.4s, v16.4s // ........................................*.............................................................................................................................|.......................................*............................................................................................................................ - // add v15.4s, v15.4s, v16.4s // .............................................*........................................................................................................................|............................................*....................................................................................................................... - // mul v16.4s, v24.4s, v2.4s // ............................................*.........................................................................................................................|...........................................*........................................................................................................................ - // sqrdmulh v24.4s, v24.4s, v6.4s // .................................................*....................................................................................................................|................................................*................................................................................................................... 
- // mls v16.4s, v24.4s, v8.s[0] // ........................................................*.............................................................................................................|.......................................................*............................................................................................................ - // sub v24.4s, v13.4s, v15.4s // ...................................................*..................................................................................................................|..................................................*................................................................................................................. - // add v13.4s, v13.4s, v15.4s // ....................................................*.................................................................................................................|...................................................*................................................................................................................ - // mul v15.4s, v24.4s, v0.4s // ..............................................................*.......................................................................................................|.............................................................*...................................................................................................... - // sqrdmulh v24.4s, v24.4s, v4.4s // ................................................................*.....................................................................................................|...............................................................*.................................................................................................... 
- // mls v15.4s, v24.4s, v8.s[0] // ............................................................................*.........................................................................................|...........................................................................*........................................................................................ - // sub v24.4s, v14.4s, v16.4s // ..................................................................*...................................................................................................|.................................................................*.................................................................................................. - // add v14.4s, v14.4s, v16.4s // ...................................................................*..................................................................................................|..................................................................*................................................................................................. - // mul v16.4s, v24.4s, v0.4s // ......................................................................*...............................................................................................|.....................................................................*.............................................................................................. - // sqrdmulh v24.4s, v24.4s, v4.4s // .........................................................................*............................................................................................|........................................................................*........................................................................................... 
- // mls v16.4s, v24.4s, v8.s[0] // ...............................................................................*......................................................................................|..............................................................................*..................................................................................... - // trn1 v25.4s, v9.4s, v10.4s // ......................................*...............................................................................................................................|.....................................*.............................................................................................................................. - // trn2 v26.4s, v9.4s, v10.4s // .........................................*............................................................................................................................|........................................*........................................................................................................................... - // trn1 v27.4s, v11.4s, v12.4s // .......................................................*..............................................................................................................|......................................................*............................................................................................................. - // trn2 v28.4s, v11.4s, v12.4s // ......................................................*...............................................................................................................|.....................................................*.............................................................................................................. 
- // trn2 v11.2d, v25.2d, v27.2d // ...........................................................*..........................................................................................................|..........................................................*......................................................................................................... - // trn2 v12.2d, v26.2d, v28.2d // .........................................................*............................................................................................................|........................................................*........................................................................................................... - // trn1 v9.2d, v25.2d, v27.2d // ............................................................*.........................................................................................................|...........................................................*........................................................................................................ - // trn1 v10.2d, v26.2d, v28.2d // .............................................................*........................................................................................................|............................................................*....................................................................................................... - // trn1 v25.4s, v13.4s, v14.4s // ........................................................................*.............................................................................................|.......................................................................*............................................................................................ 
- // trn2 v26.4s, v13.4s, v14.4s // ..........................................................................*...........................................................................................|.........................................................................*.......................................................................................... - // trn1 v27.4s, v15.4s, v16.4s // ..................................................................................*...................................................................................|.................................................................................*.................................................................................. - // trn2 v28.4s, v15.4s, v16.4s // ....................................................................................*.................................................................................|...................................................................................*................................................................................ - // trn2 v15.2d, v25.2d, v27.2d // ......................................................................................*...............................................................................|.....................................................................................*.............................................................................. - // trn2 v16.2d, v26.2d, v28.2d // ..........................................................................................*...........................................................................|.........................................................................................*.......................................................................... 
- // trn1 v13.2d, v25.2d, v27.2d // .......................................................................................*..............................................................................|......................................................................................*............................................................................. - // trn1 v14.2d, v26.2d, v28.2d // .........................................................................................*............................................................................|........................................................................................*........................................................................... - // ldr q0, [x4], #64 // ........................*.............................................................................................................................................|.......................*............................................................................................................................................ - // ldr q1, [x4, #(-64 + 16)] // ..........................*...........................................................................................................................................|.........................*.......................................................................................................................................... - // ldr q2, [x4, #(-64 + 32)] // ............................*.........................................................................................................................................|...........................*........................................................................................................................................ 
- // ldr q3, [x4, #(-64 + 48)] // ...............................*......................................................................................................................................|..............................*..................................................................................................................................... - // sub v24.4s, v9.4s, v10.4s // .....................................................................*................................................................................................|....................................................................*............................................................................................... - // add v9.4s, v9.4s, v10.4s // .......................................................................*..............................................................................................|......................................................................*............................................................................................. - // mul v10.4s, v24.4s, v1.s[2] // ................................................................................*.....................................................................................|...............................................................................*.................................................................................... - // sqrdmulh v24.4s, v24.4s, v1.s[3] // ..............................................................................*.......................................................................................|.............................................................................*...................................................................................... 
- // mls v10.4s, v24.4s, v8.s[0] // .................................................................................*....................................................................................|................................................................................*................................................................................... - // sub v24.4s, v11.4s, v12.4s // ...............................................................*......................................................................................................|..............................................................*..................................................................................................... - // add v11.4s, v11.4s, v12.4s // .................................................................*....................................................................................................|................................................................*................................................................................................... - // mul v12.4s, v24.4s, v2.s[0] // ....................................................................*.................................................................................................|...................................................................*................................................................................................ - // sqrdmulh v24.4s, v24.4s, v2.s[1] // ...................................................................................*..................................................................................|..................................................................................*................................................................................. 
- // mls v12.4s, v24.4s, v8.s[0] // ...........................................................................................*..........................................................................|..........................................................................................*......................................................................... - // sub v24.4s, v13.4s, v14.4s // ............................................................................................*.........................................................................|...........................................................................................*........................................................................ - // add v13.4s, v13.4s, v14.4s // ..............................................................................................*.......................................................................|.............................................................................................*...................................................................... - // mul v14.4s, v24.4s, v2.s[2] // .................................................................................................*....................................................................|................................................................................................*................................................................... - // sqrdmulh v24.4s, v24.4s, v2.s[3] // ...................................................................................................*..................................................................|..................................................................................................*................................................................. 
- // mls v14.4s, v24.4s, v8.s[0] // ...........................................................................................................*..........................................................|..........................................................................................................*......................................................... - // sub v24.4s, v15.4s, v16.4s // ...............................................................................................*......................................................................|..............................................................................................*..................................................................... - // add v15.4s, v15.4s, v16.4s // ................................................................................................*.....................................................................|...............................................................................................*.................................................................... - // mul v16.4s, v24.4s, v3.s[0] // ........................................................................................................*.............................................................|.......................................................................................................*............................................................ - // sqrdmulh v24.4s, v24.4s, v3.s[1] // .......................................................................................................*..............................................................|......................................................................................................*............................................................. 
- // mls v16.4s, v24.4s, v8.s[0] // ............................................................................................................*.........................................................|...........................................................................................................*........................................................ - // sub v24.4s, v9.4s, v11.4s // ...........................................................................*..........................................................................................|..........................................................................*......................................................................................... - // add v9.4s, v9.4s, v11.4s // .............................................................................*........................................................................................|............................................................................*....................................................................................... - // mul v11.4s, v24.4s, v0.s[2] // ........................................................................................*.............................................................................|.......................................................................................*............................................................................ - // sqrdmulh v24.4s, v24.4s, v0.s[3] // .....................................................................................*................................................................................|....................................................................................*............................................................................... 
- // mls v11.4s, v24.4s, v8.s[0] // .............................................................................................*........................................................................|............................................................................................*....................................................................... - // sub v24.4s, v10.4s, v12.4s // ..................................................................................................*...................................................................|.................................................................................................*.................................................................. - // add v10.4s, v10.4s, v12.4s // ....................................................................................................*.................................................................|...................................................................................................*................................................................ - // mul v12.4s, v24.4s, v0.s[2] // ..............................................................................................................*.......................................................|.............................................................................................................*...................................................... - // sqrdmulh v24.4s, v24.4s, v0.s[3] // .......................................................................................................................*..............................................|......................................................................................................................*............................................. 
- // mls v12.4s, v24.4s, v8.s[0] // ...........................................................................................................................*..........................................|..........................................................................................................................*......................................... - // sub v24.4s, v13.4s, v15.4s // .....................................................................................................*................................................................|....................................................................................................*............................................................... - // add v13.4s, v13.4s, v15.4s // ......................................................................................................*...............................................................|.....................................................................................................*.............................................................. - // mul v15.4s, v24.4s, v1.s[0] // ...............................................................................................................*......................................................|..............................................................................................................*..................................................... - // sqrdmulh v24.4s, v24.4s, v1.s[1] // .................................................................................................................*....................................................|................................................................................................................*................................................... 
- // mls v15.4s, v24.4s, v8.s[0] // .........................................................................................................................*............................................|........................................................................................................................*........................................... - // sub v24.4s, v14.4s, v16.4s // ................................................................................................................*.....................................................|...............................................................................................................*.................................................... - // add v14.4s, v14.4s, v16.4s // ..................................................................................................................*...................................................|.................................................................................................................*.................................................. - // mul v16.4s, v24.4s, v1.s[0] // ...................................................................................................................*..................................................|..................................................................................................................*................................................. - // sqrdmulh v24.4s, v24.4s, v1.s[1] // .....................................................................................................................*................................................|....................................................................................................................*............................................... 
- // mls v16.4s, v24.4s, v8.s[0] // ..........................................................................................................................*...........................................|.........................................................................................................................*.......................................... - // sub v24.4s, v9.4s, v13.4s // .........................................................................................................*............................................................|........................................................................................................*........................................................... - // add v9.4s, v9.4s, v13.4s // ..........................................................................................................*...........................................................|.........................................................................................................*.......................................................... - // mul v13.4s, v24.4s, v0.s[0] // ..............................................................................................................................*.......................................|.............................................................................................................................*...................................... - // sqrdmulh v24.4s, v24.4s, v0.s[1] // ...............................................................................................................................*......................................|..............................................................................................................................*..................................... 
- // mls v13.4s, v24.4s, v8.s[0] // ...................................................................................................................................................*..................|..................................................................................................................................................*................. - // sub v24.4s, v10.4s, v14.4s // ....................................................................................................................*.................................................|...................................................................................................................*................................................ - // add v10.4s, v10.4s, v14.4s // ......................................................................................................................*...............................................|.....................................................................................................................*.............................................. - // mul v14.4s, v24.4s, v0.s[0] // ...................................................................................................................................*..................................|..................................................................................................................................*................................. - // sqrdmulh v24.4s, v24.4s, v0.s[1] // ....................................................................................................................................*.................................|...................................................................................................................................*................................ 
- // mls v14.4s, v24.4s, v8.s[0] // .....................................................................................................................................................*................|....................................................................................................................................................*............... - // sub v24.4s, v11.4s, v15.4s // ............................................................................................................................*.........................................|...........................................................................................................................*........................................ - // add v11.4s, v11.4s, v15.4s // .............................................................................................................................*........................................|............................................................................................................................*....................................... - // mul v15.4s, v24.4s, v0.s[0] // .........................................................................................................................................*............................|........................................................................................................................................*........................... - // sqrdmulh v24.4s, v24.4s, v0.s[1] // ............................................................................................................................................*.........................|...........................................................................................................................................*........................ 
- // mls v15.4s, v24.4s, v8.s[0] // ........................................................................................................................................................*.............|.......................................................................................................................................................*............ - // sub v24.4s, v12.4s, v16.4s // ..................................................................................................................................*...................................|.................................................................................................................................*.................................. - // add v12.4s, v12.4s, v16.4s // .................................................................................................................................*....................................|................................................................................................................................*................................... - // mul v16.4s, v24.4s, v0.s[0] // ..............................................................................................................................................*.......................|.............................................................................................................................................*...................... - // sqrdmulh v24.4s, v24.4s, v0.s[1] // ................................................................................................................................................*.....................|...............................................................................................................................................*.................... 
- // mls v16.4s, v24.4s, v8.s[0] // ............................................................................................................................................................*.........|...........................................................................................................................................................*........ - // str q9, [x1], #(16*4) // .............................................................................................................*........................................................|............................................................................................................*....................................................... - // str q10, [x1, #(-16*4 + 1*16)] // ........................................................................................................................*.............................................|.......................................................................................................................*............................................ - // str q11, [x1, #(-16*4 + 2*16)] // ................................................................................................................................*.....................................|...............................................................................................................................*.................................... - // str q12, [x1, #(-16*4 + 3*16)] // .....................................................................................................................................*................................|....................................................................................................................................*............................... 
- // str q13, [x2], #(16*4) // ..........................................................................................................................................................*...........|.........................................................................................................................................................*.......... - // str q14, [x2, #(-16*4 + 1*16)] // .............................................................................................................................................................*........|............................................................................................................................................................*....... - // str q15, [x2, #(-16*4 + 2*16)] // .................................................................................................................................................................*....|................................................................................................................................................................*... - // str q16, [x2, #(-16*4 + 3*16)] // ...................................................................................................................................................................*..|..................................................................................................................................................................*. - // add x1, x1, #64 // ......................................................................................................................................*...............................|.....................................................................................................................................*.............................. 
- // add x2, x2, #64 // ....................................................................................................................................................................*.|...................................................................................................................................................................* + // ldr q9, [x1, #0] // ...............e..............................................................................................................................................................|..............e.................................... + // ldr q10, [x1, #16] // ..............e...............................................................................................................................................................|.............e..................................... + // ldr q11, [x1, #32] // .............e................................................................................................................................................................|............e...................................... + // ldr q12, [x1, #48] // ...........e..................................................................................................................................................................|..........e........................................ + // trn1 v25.4s, v9.4s, v10.4s // ........................e.....................................................................................................................................................|.......................e........................... + // trn2 v26.4s, v9.4s, v10.4s // ..........................e...................................................................................................................................................|.........................e......................... 
+ // trn1 v27.4s, v11.4s, v12.4s // ......................e.......................................................................................................................................................|.....................e............................. + // trn2 v28.4s, v11.4s, v12.4s // ............................e.................................................................................................................................................|...........................e....................... + // trn2 v11.2d, v25.2d, v27.2d // ................................e.............................................................................................................................................|...............................e................... + // trn2 v12.2d, v26.2d, v28.2d // ..................................e...........................................................................................................................................|.................................e................. + // trn1 v9.2d, v25.2d, v27.2d // ......................................e.......................................................................................................................................|.....................................e............. + // trn1 v10.2d, v26.2d, v28.2d // ....................................e.........................................................................................................................................|...................................e............... + // ldr q13, [x2, #0] // ..........................................................e...................................................................................................................|................................................... 
+ // ldr q14, [x2, #16] // .........................................................e....................................................................................................................|................................................... + // ldr q15, [x2, #32] // .......................................................e......................................................................................................................|................................................... + // ldr q16, [x2, #48] // ......................................................e.......................................................................................................................|................................................... + // trn1 v25.4s, v13.4s, v14.4s // ................................................................e.............................................................................................................|................................................... + // trn2 v26.4s, v13.4s, v14.4s // ..............................................................e...............................................................................................................|................................................... + // trn1 v27.4s, v15.4s, v16.4s // ............................................................e.................................................................................................................|................................................... + // trn2 v28.4s, v15.4s, v16.4s // ...............................................................e..............................................................................................................|................................................... 
+ // trn2 v15.2d, v25.2d, v27.2d // ........................................................................e.....................................................................................................|................................................... + // trn2 v16.2d, v26.2d, v28.2d // ......................................................................e.......................................................................................................|................................................... + // trn1 v13.2d, v25.2d, v27.2d // .....................................................................e........................................................................................................|................................................... + // trn1 v14.2d, v26.2d, v28.2d // ....................................................................e.........................................................................................................|................................................... + // ldr q0, [x5], #(12*16) // ...........................e..................................................................................................................................................|..........................e........................ + // ldr q4, [x5, #(-12*16 + 1*16)] // .................................................e............................................................................................................................|................................................e.. + // ldr q1, [x5, #(-12*16 + 2*16)] // ...e..........................................................................................................................................................................|..e................................................ 
+ // ldr q5, [x5, #(-12*16 + 3*16)] // .......................e......................................................................................................................................................|......................e............................ + // ldr q2, [x5, #(-12*16 + 4*16)] // .................................e............................................................................................................................................|................................e.................. + // ldr q6, [x5, #(-12*16 + 5*16)] // ...................e..........................................................................................................................................................|..................e................................ + // sub v24.4s, v9.4s, v10.4s // .........................................e....................................................................................................................................|........................................e.......... + // add v9.4s, v9.4s, v10.4s // ..............................................e...............................................................................................................................|.............................................e..... + // mul v10.4s, v24.4s, v1.4s // ........................................................e.....................................................................................................................|................................................... + // sqrdmulh v24.4s, v24.4s, v5.4s // ...............................................e..............................................................................................................................|..............................................e.... 
+ // mls v10.4s, v24.4s, v8.s[0] // ...........................................................e..................................................................................................................|................................................... + // sub v24.4s, v11.4s, v12.4s // .......................................e......................................................................................................................................|......................................e............ + // add v11.4s, v11.4s, v12.4s // ..........................................e...................................................................................................................................|.........................................e......... + // mul v12.4s, v24.4s, v2.4s // ....................................................e.........................................................................................................................|................................................... + // sqrdmulh v24.4s, v24.4s, v6.4s // ...........................................e..................................................................................................................................|..........................................e........ + // mls v12.4s, v24.4s, v8.s[0] // .............................................................e................................................................................................................|................................................... + // sub v24.4s, v9.4s, v11.4s // .....................................................e........................................................................................................................|................................................... 
+ // add v9.4s, v9.4s, v11.4s // ..................................................................e...........................................................................................................|................................................... + // mul v11.4s, v24.4s, v0.4s // ...................................................................e..........................................................................................................|................................................... + // sqrdmulh v24.4s, v24.4s, v4.4s // .................................................................e............................................................................................................|................................................... + // mls v11.4s, v24.4s, v8.s[0] // .........................................................................e....................................................................................................|................................................... + // sub v24.4s, v10.4s, v12.4s // .......................................................................e......................................................................................................|................................................... + // add v10.4s, v10.4s, v12.4s // ................................................................................e.............................................................................................|................................................... + // mul v12.4s, v24.4s, v0.4s // .................................................................................e............................................................................................|................................................... 
+ // sqrdmulh v24.4s, v24.4s, v4.4s // ............................................................................e.................................................................................................|................................................... + // mls v12.4s, v24.4s, v8.s[0] // ...................................................................................e..........................................................................................|................................................... + // ldr q0, [x5, #(-12*16 + 6*16)] // .....................e........................................................................................................................................................|....................e.............................. + // ldr q4, [x5, #(-12*16 + 7*16)] // ..e...........................................................................................................................................................................|.e................................................. + // ldr q1, [x5, #(-12*16 + 8*16)] // .................e............................................................................................................................................................|................e.................................. + // ldr q5, [x5, #(-12*16 + 9*16)] // .............................e................................................................................................................................................|............................e...................... + // ldr q2, [x5, #(-12*16 + 10*16)] // e.............................................................................................................................................................................e................................................... 
+ // ldr q6, [x5, #(-12*16 + 11*16)] // ..................e...........................................................................................................................................................|.................e................................. + // sub v24.4s, v13.4s, v14.4s // ..........................................................................e...................................................................................................|................................................... + // add v13.4s, v13.4s, v14.4s // ...........................................................................e..................................................................................................|................................................... + // mul v14.4s, v24.4s, v1.4s // ...............................................................................e..............................................................................................|................................................... + // sqrdmulh v24.4s, v24.4s, v5.4s // .......................................................................................e......................................................................................|................................................... + // mls v14.4s, v24.4s, v8.s[0] // ...................................................................................................e..........................................................................|................................................... + // sub v24.4s, v15.4s, v16.4s // .............................................................................e................................................................................................|................................................... 
+ // add v15.4s, v15.4s, v16.4s // ..............................................................................e...............................................................................................|................................................... + // mul v16.4s, v24.4s, v2.4s // .....................................................................................e........................................................................................|................................................... + // sqrdmulh v24.4s, v24.4s, v6.4s // .........................................................................................e....................................................................................|................................................... + // mls v16.4s, v24.4s, v8.s[0] // .................................................................................................e............................................................................|................................................... + // sub v24.4s, v13.4s, v15.4s // ..................................................................................e...........................................................................................|................................................... + // add v13.4s, v13.4s, v15.4s // .......................................................................................................e......................................................................|................................................... + // mul v15.4s, v24.4s, v0.4s // ..............................................................................................e...............................................................................|................................................... 
+ // sqrdmulh v24.4s, v24.4s, v4.4s // ............................................................................................e.................................................................................|................................................... + // mls v15.4s, v24.4s, v8.s[0] // ......................................................................................................e.......................................................................|................................................... + // sub v24.4s, v14.4s, v16.4s // ..........................................................................................................e...................................................................|................................................... + // add v14.4s, v14.4s, v16.4s // ...........................................................................................................e..................................................................|................................................... + // mul v16.4s, v24.4s, v0.4s // .................................................................................................................e............................................................|................................................... + // sqrdmulh v24.4s, v24.4s, v4.4s // ...............................................................................................................e..............................................................|................................................... + // mls v16.4s, v24.4s, v8.s[0] // ...................................................................................................................e..........................................................|................................................... 
+ // trn1 v25.4s, v9.4s, v10.4s // ....................................................................................e.........................................................................................|................................................... + // trn2 v26.4s, v9.4s, v10.4s // ......................................................................................e.......................................................................................|................................................... + // trn1 v27.4s, v11.4s, v12.4s // ..........................................................................................e...................................................................................|................................................... + // trn2 v28.4s, v11.4s, v12.4s // ........................................................................................e.....................................................................................|................................................... + // trn2 v11.2d, v25.2d, v27.2d // ...............................................................................................e..............................................................................|................................................... + // trn2 v12.2d, v26.2d, v28.2d // .............................................................................................e................................................................................|................................................... + // trn1 v9.2d, v25.2d, v27.2d // ................................................................................................e.............................................................................|................................................... 
+ // trn1 v10.2d, v26.2d, v28.2d // ...........................................................................................e..................................................................................|................................................... + // trn1 v25.4s, v13.4s, v14.4s // ......................................................................................................................e.......................................................|................................................... + // trn2 v26.4s, v13.4s, v14.4s // ..................................................................................................................e...........................................................|................................................... + // trn1 v27.4s, v15.4s, v16.4s // ........................................................................................................................e.....................................................|................................................... + // trn2 v28.4s, v15.4s, v16.4s // ..........................................................................................................................e...................................................|................................................... + // trn2 v15.2d, v25.2d, v27.2d // ............................................................................................................................e.................................................|................................................... + // trn2 v16.2d, v26.2d, v28.2d // .............................................................................................................................e................................................|................................................... 
+ // trn1 v13.2d, v25.2d, v27.2d // ...............................................................................................................................e..............................................|................................................... + // trn1 v14.2d, v26.2d, v28.2d // ................................................................................................................................e.............................................|................................................... + // ldr q0, [x4], #64 // ...............................e..............................................................................................................................................|..............................e.................... + // ldr q1, [x4, #(-64 + 16)] // ..........e...................................................................................................................................................................|.........e......................................... + // ldr q2, [x4, #(-64 + 32)] // ........e.....................................................................................................................................................................|.......e........................................... + // ldr q3, [x4, #(-64 + 48)] // ....e.........................................................................................................................................................................|...e............................................... + // sub v24.4s, v9.4s, v10.4s // .............................................................................................................e................................................................|................................................... 
+ // add v9.4s, v9.4s, v10.4s // ....................................................................................................e.........................................................................|................................................... + // mul v10.4s, v24.4s, v1.s[2] // ..............................................................................................................................e...............................................|................................................... + // sqrdmulh v24.4s, v24.4s, v1.s[3] // .....................................................................................................................e........................................................|................................................... + // mls v10.4s, v24.4s, v8.s[0] // .................................................................................................................................e............................................|................................................... + // sub v24.4s, v11.4s, v12.4s // .....................................................................................................e........................................................................|................................................... + // add v11.4s, v11.4s, v12.4s // ..................................................................................................e...........................................................................|................................................... + // mul v12.4s, v24.4s, v2.s[0] // .........................................................................................................e....................................................................|................................................... 
+ // sqrdmulh v24.4s, v24.4s, v2.s[1] // ................................................................................................................e.............................................................|................................................... + // mls v12.4s, v24.4s, v8.s[0] // .........................................................................................................................e....................................................|................................................... + // sub v24.4s, v13.4s, v14.4s // ......................................................................................................................................e.......................................|................................................... + // add v13.4s, v13.4s, v14.4s // .....................................................................................................................................e........................................|................................................... + // mul v14.4s, v24.4s, v2.s[2] // ................................................................................................................................................e.............................|................................................... + // sqrdmulh v24.4s, v24.4s, v2.s[3] // ..............................................................................................................................................e...............................|................................................... + // mls v14.4s, v24.4s, v8.s[0] // ...................................................................................................................................................e..........................|................................................... 
+ // sub v24.4s, v15.4s, v16.4s // ..................................................................................................................................e...........................................|................................................... + // add v15.4s, v15.4s, v16.4s // ...................................................................................................................................e..........................................|................................................... + // mul v16.4s, v24.4s, v3.s[0] // .........................................................................................................................................e....................................|................................................... + // sqrdmulh v24.4s, v24.4s, v3.s[1] // .......................................................................................................................................e......................................|................................................... + // mls v16.4s, v24.4s, v8.s[0] // .....................................................................................................................................................e........................|................................................... + // sub v24.4s, v9.4s, v11.4s // ........................................................................................................e.....................................................................|................................................... + // add v9.4s, v9.4s, v11.4s // ..............................................................................................................e...............................................................|................................................... 
+ // mul v11.4s, v24.4s, v0.s[2] // ...........................................................................................................................e..................................................|................................................... + // sqrdmulh v24.4s, v24.4s, v0.s[3] // ............................................................................................................e.................................................................|................................................... + // mls v11.4s, v24.4s, v8.s[0] // ....................................................................................................................................e.........................................|................................................... + // sub v24.4s, v10.4s, v12.4s // ........................................................................................................................................e.....................................|................................................... + // add v10.4s, v10.4s, v12.4s // ....................................................................................................................................................e.........................|................................................... + // mul v12.4s, v24.4s, v0.s[2] // ............................................................................................................................................e.................................|................................................... + // sqrdmulh v24.4s, v24.4s, v0.s[3] // ...........................................................................................................................................e..................................|................................................... 
+ // mls v12.4s, v24.4s, v8.s[0] // ...............................................................................................................................................e..............................|................................................... + // sub v24.4s, v13.4s, v15.4s // .................................................................................................................................................e............................|................................................... + // add v13.4s, v13.4s, v15.4s // ..........................................................................................................................................e...................................|................................................... + // mul v15.4s, v24.4s, v1.s[0] // ..........................................................................................................................................................e...................|................................................... + // sqrdmulh v24.4s, v24.4s, v1.s[1] // ........................................................................................................................................................e.....................|................................................... + // mls v15.4s, v24.4s, v8.s[0] // .................................................................................................................................................................e............|................................................... + // sub v24.4s, v14.4s, v16.4s // ............................................................................................................................................................e.................|................................................... 
+ // add v14.4s, v14.4s, v16.4s // .............................................................................................................................................................e................|................................................... + // mul v16.4s, v24.4s, v1.s[0] // ...................................................................................................................................................................e..........|................................................... + // sqrdmulh v24.4s, v24.4s, v1.s[1] // ...............................................................................................................................................................e..............|................................................... + // mls v16.4s, v24.4s, v8.s[0] // ......................................................................................................................................................................e.......|................................................... + // srshr v24.4S, v9.4S, #23 // ....................................................................................................................e.........................................................|................................................... + // mls v9.4s, v24.4s, v8.4s // .......................................................................................................................e......................................................|................................................... + // srshr v24.4S, v10.4S, #23 // .........................................................................................................................................................e....................|................................................... 
+ // mls v10.4s, v24.4s, v8.4s // ..............................................................................................................................................................e...............|................................................... + // srshr v24.4S, v13.4S, #23 // .............................................................................................................................................e................................|................................................... + // mls v13.4s, v24.4s, v8.4s // ..................................................................................................................................................e...........................|................................................... + // srshr v24.4S, v14.4S, #23 // ................................................................................................................................................................e.............|................................................... + // mls v14.4s, v24.4s, v8.4s // ..................................................................................................................................................................e...........|................................................... + // sub v24.4s, v9.4s, v13.4s // ......................................................................................................................................................e.......................|................................................... + // add v9.4s, v9.4s, v13.4s // .......................................................................................................................................................e......................|................................................... 
+ // mul v13.4s, v24.4s, v0.s[0] // .....*........................................................................................................................................................................|....*.............................................. + // sqrdmulh v24.4s, v24.4s, v0.s[1] // .........................................................................................................................................................................e....|................................................... + // mls v13.4s, v24.4s, v8.s[0] // ............*.................................................................................................................................................................|...........*....................................... + // sub v24.4s, v10.4s, v14.4s // .......................................................................................................................................................................e......|................................................... + // add v10.4s, v10.4s, v14.4s // ........................................................................................................................................................................e.....|................................................... + // mul v14.4s, v24.4s, v0.s[0] // ...........................................................................................................................................................................e..|................................................... + // sqrdmulh v24.4s, v24.4s, v0.s[1] // ................*.............................................................................................................................................................|...............*................................... 
+ // mls v14.4s, v24.4s, v8.s[0] // ...................................*..........................................................................................................................................|..................................*................ + // sub v24.4s, v11.4s, v15.4s // ....................................................................................................................................................................e.........|................................................... + // add v11.4s, v11.4s, v15.4s // .....................................................................................................................................................................e........|................................................... + // mul v15.4s, v24.4s, v0.s[0] // ..............................*...............................................................................................................................................|.............................*..................... + // sqrdmulh v24.4s, v24.4s, v0.s[1] // .*............................................................................................................................................................................|*.................................................. + // mls v15.4s, v24.4s, v8.s[0] // ........................................*.....................................................................................................................................|.......................................*........... + // sub v24.4s, v12.4s, v16.4s // .........*....................................................................................................................................................................|........*.......................................... 
+ // add v12.4s, v12.4s, v16.4s // .............................................................................................................................................................................e|................................................... + // mul v16.4s, v24.4s, v0.s[0] // ....................*.........................................................................................................................................................|...................*............................... + // sqrdmulh v24.4s, v24.4s, v0.s[1] // .........................*....................................................................................................................................................|........................*.......................... + // mls v16.4s, v24.4s, v8.s[0] // .....................................*........................................................................................................................................|....................................*.............. + // str q9, [x1], #(16*4) // ...........................................................................................................................................................e..................|................................................... + // str q10, [x1, #(-16*4 + 1*16)] // ............................................................................................................................................................................e.|................................................... + // str q11, [x1, #(-16*4 + 2*16)] // ..........................................................................................................................................................................e...|................................................... 
+ // str q12, [x1, #(-16*4 + 3*16)] // ......*.......................................................................................................................................................................|.....*............................................. + // str q13, [x2], #(16*4) // ............................................*.................................................................................................................................|...........................................*....... + // str q14, [x2, #(-16*4 + 1*16)] // .............................................*................................................................................................................................|............................................*...... + // str q15, [x2, #(-16*4 + 2*16)] // ..................................................*...........................................................................................................................|.................................................*. + // str q16, [x2, #(-16*4 + 3*16)] // ................................................*.............................................................................................................................|...............................................*... + // add x1, x1, #64 // .......*......................................................................................................................................................................|......*............................................ 
+ // add x2, x2, #64 // ...................................................*..........................................................................................................................|..................................................* sub count, count, #1 cbnz count, layer45678_start - add v17.4S, v24.4S, v6.4S // *............................................................................................................................................... - mls v2.4S, v28.4S, v8.S[0] // ...*............................................................................................................................................ - ldr q21, [x2, #0] // .*.............................................................................................................................................. - ldr q14, [x2, #16] // ..*............................................................................................................................................. - ldr q3, [x5, #16] // ....*........................................................................................................................................... - // gap // ................................................................................................................................................ - mul v24.4S, v26.4S, v30.4S // .......*........................................................................................................................................ - ldr q28, [x2, #32] // .....*.......................................................................................................................................... - ldr q6, [x5, #96] // ......*......................................................................................................................................... - ldr q31, [x2, #48] // ........*....................................................................................................................................... 
- sub v26.4S, v17.4S, v15.4S // .........*...................................................................................................................................... - ldr q1, [x5], #(12*16) // ...........*.................................................................................................................................... - add v17.4S, v17.4S, v15.4S // ..........*..................................................................................................................................... - mls v24.4S, v13.4S, v8.S[0] // .............*.................................................................................................................................. - ldr q15, [x5, #-80] // ............*................................................................................................................................... - trn1 v29.4S, v21.4S, v14.4S // ..............*................................................................................................................................. - ldr q13, [x5, #-64] // ...............*................................................................................................................................ - // gap // ................................................................................................................................................ - trn2 v21.4S, v21.4S, v14.4S // ................*............................................................................................................................... - sqrdmulh v14.4S, v26.4S, v3.4S // ...................*............................................................................................................................ - ldr q16, [x5, #-16] // .................*.............................................................................................................................. 
- trn1 v9.4S, v28.4S, v31.4S // ....................*........................................................................................................................... - ldr q0, [x5, #-48] // ..................*............................................................................................................................. - // gap // ................................................................................................................................................ - trn2 v28.4S, v28.4S, v31.4S // ......................*......................................................................................................................... - mul v31.4S, v26.4S, v1.4S // ........................*....................................................................................................................... - ldr q26, [x5, #-32] // .....................*.......................................................................................................................... - sub v30.4S, v2.4S, v24.4S // ..........................*..................................................................................................................... - ldr q7, [x4], #64 // .......................*........................................................................................................................ - // gap // ................................................................................................................................................ - add v2.4S, v2.4S, v24.4S // ............................*................................................................................................................... - trn2 v24.2D, v29.2D, v9.2D // .............................*.................................................................................................................. 
- ldr q18, [x4, #-48] // .........................*...................................................................................................................... - mls v31.4S, v14.4S, v8.S[0] // ...............................*................................................................................................................ - ldr q25, [x4, #-32] // ...........................*.................................................................................................................... - trn2 v14.2D, v21.2D, v28.2D // ................................*............................................................................................................... - trn1 v29.2D, v29.2D, v9.2D // .................................*.............................................................................................................. - ldr q9, [x4, #-16] // ..............................*................................................................................................................. - // gap // ................................................................................................................................................ - sqrdmulh v3.4S, v30.4S, v3.4S // ..................................*............................................................................................................. - trn1 v12.4S, v17.4S, v2.4S // ...................................*............................................................................................................ - // gap // ................................................................................................................................................ - sub v23.4S, v24.4S, v14.4S // ....................................*........................................................................................................... 
- // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - mul v1.4S, v30.4S, v1.4S // ......................................*......................................................................................................... - trn2 v17.4S, v17.4S, v2.4S // .....................................*.......................................................................................................... - // gap // ................................................................................................................................................ - trn1 v21.2D, v21.2D, v28.2D // .......................................*........................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - mul v2.4S, v23.4S, v26.4S // ........................................*....................................................................................................... - add v14.4S, v24.4S, v14.4S // .........................................*...................................................................................................... - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ 
- // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - mls v1.4S, v3.4S, v8.S[0] // ..........................................*..................................................................................................... - sub v3.4S, v29.4S, v21.4S // ...........................................*.................................................................................................... - // gap // ................................................................................................................................................ - add v21.4S, v29.4S, v21.4S // ............................................*................................................................................................... - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - sqrdmulh v24.4S, v23.4S, v16.4S // .............................................*.................................................................................................. - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ 
- // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - sqrdmulh v28.4S, v3.4S, v0.4S // ..............................................*................................................................................................. - sub v26.4S, v21.4S, v14.4S // ...............................................*................................................................................................ - // gap // ................................................................................................................................................ - trn1 v29.4S, v31.4S, v1.4S // ...................................................*............................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - mul v3.4S, v3.4S, v13.4S // .................................................*.............................................................................................. - add v21.4S, v21.4S, v14.4S // ................................................*............................................................................................... - // gap // ................................................................................................................................................ 
- trn2 v14.4S, v31.4S, v1.4S // ..................................................*............................................................................................. - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - mls v2.4S, v24.4S, v8.S[0] // ....................................................*........................................................................................... - trn1 v24.2D, v12.2D, v29.2D // ........................................................*....................................................................................... - // gap // ................................................................................................................................................ - trn2 v31.2D, v12.2D, v29.2D // .......................................................*........................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - mls v3.4S, v28.4S, v8.S[0] // ......................................................*......................................................................................... - trn2 v28.2D, v17.2D, v14.2D // .....................................................*.......................................................................................... 
- // gap // ................................................................................................................................................ - trn1 v17.2D, v17.2D, v14.2D // .........................................................*...................................................................................... - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - mul v14.4S, v26.4S, v6.4S // ..........................................................*..................................................................................... - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - sub v1.4S, v31.4S, v28.4S // ...........................................................*.................................................................................... - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - sqrdmulh v26.4S, v26.4S, v15.4S // ............................................................*................................................................................... - add v28.4S, v31.4S, v28.4S // .............................................................*.................................................................................. 
- // gap // ................................................................................................................................................ - sub v31.4S, v3.4S, v2.4S // ..............................................................*................................................................................. - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - add v2.4S, v3.4S, v2.4S // ...............................................................*................................................................................ - mul v3.4S, v1.4S, v25.S[0] // ................................................................*............................................................................... - // gap // ................................................................................................................................................ - sub v29.4S, v24.4S, v17.4S // .................................................................*.............................................................................. - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - mul v6.4S, v31.4S, v6.4S // ..................................................................*............................................................................. - add v17.4S, v24.4S, v17.4S // ...................................................................*............................................................................ 
- // gap // ................................................................................................................................................ - trn1 v24.4S, v21.4S, v2.4S // ....................................................................*........................................................................... - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - sqrdmulh v31.4S, v31.4S, v15.4S // .....................................................................*.......................................................................... - trn2 v21.4S, v21.4S, v2.4S // ......................................................................*......................................................................... - // gap // ................................................................................................................................................ - sub v2.4S, v17.4S, v28.4S // .......................................................................*........................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - mls v14.4S, v26.4S, v8.S[0] // ........................................................................*....................................................................... 
- add v17.4S, v17.4S, v28.4S // .........................................................................*...................................................................... - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - sqrdmulh v28.4S, v29.4S, v18.S[3] // ..........................................................................*..................................................................... - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - mls v6.4S, v31.4S, v8.S[0] // ...........................................................................*.................................................................... 
- // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - mul v31.4S, v29.4S, v18.S[2] // ............................................................................*................................................................... - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - mls v31.4S, v28.4S, v8.S[0] // .............................................................................*.................................................................. 
- // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - trn1 v28.4S, v14.4S, v6.4S // ..............................................................................*................................................................. - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - sqrdmulh v26.4S, v1.4S, v25.S[1] // ...............................................................................*................................................................ - trn2 v14.4S, v14.4S, v6.4S // ................................................................................*............................................................... - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - sqrdmulh v6.4S, v2.4S, v7.S[3] // .................................................................................*.............................................................. 
- trn2 v1.2D, v24.2D, v28.2D // ..................................................................................*............................................................. - // gap // ................................................................................................................................................ - trn1 v24.2D, v24.2D, v28.2D // ...................................................................................*............................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - mul v2.4S, v2.4S, v7.S[2] // ....................................................................................*........................................................... - trn1 v28.2D, v21.2D, v14.2D // .....................................................................................*.......................................................... - // gap // ................................................................................................................................................ - trn2 v21.2D, v21.2D, v14.2D // ......................................................................................*......................................................... - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ 
- mls v3.4S, v26.4S, v8.S[0] // .......................................................................................*........................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - sub v14.4S, v24.4S, v28.4S // ........................................................................................*....................................................... - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - mls v2.4S, v6.4S, v8.S[0] // .........................................................................................*...................................................... - add v24.4S, v24.4S, v28.4S // ..........................................................................................*..................................................... - // gap // ................................................................................................................................................ - sub v28.4S, v1.4S, v21.4S // ...........................................................................................*.................................................... - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ 
- add v21.4S, v1.4S, v21.4S // ............................................................................................*................................................... - mul v6.4S, v14.4S, v25.S[2] // .............................................................................................*.................................................. - // gap // ................................................................................................................................................ - sub v26.4S, v31.4S, v3.4S // ..............................................................................................*................................................. - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - sqrdmulh v14.4S, v14.4S, v25.S[3] // ...............................................................................................*................................................ - add v3.4S, v31.4S, v3.4S // ................................................................................................*............................................... - // gap // ................................................................................................................................................ - sub v31.4S, v24.4S, v21.4S // .................................................................................................*.............................................. - // gap // ................................................................................................................................................ 
- // gap // ................................................................................................................................................ - add v21.4S, v24.4S, v21.4S // ..................................................................................................*............................................. - sqrdmulh v24.4S, v28.4S, v9.S[1] // ...................................................................................................*............................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - mul v28.4S, v28.4S, v9.S[0] // ....................................................................................................*........................................... - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - sub v1.4S, v17.4S, v21.4S // .....................................................................................................*.......................................... - // gap // ................................................................................................................................................ 
- // gap // ................................................................................................................................................ - add v17.4S, v17.4S, v21.4S // ......................................................................................................*......................................... - mls v6.4S, v14.4S, v8.S[0] // .......................................................................................................*........................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - mls v28.4S, v24.4S, v8.S[0] // ........................................................................................................*....................................... - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - str q17, [x1], #(16*4) // .........................................................................................................*...................................... - // gap // ................................................................................................................................................ 
- // gap // ................................................................................................................................................ - mul v17.4S, v26.4S, v7.S[2] // ..........................................................................................................*..................................... - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - mul v21.4S, v31.4S, v18.S[0] // ...........................................................................................................*.................................... - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - sub v14.4S, v6.4S, v28.4S // ............................................................................................................*................................... - // gap // ................................................................................................................................................ 
- // gap // ................................................................................................................................................ - sqrdmulh v24.4S, v31.4S, v18.S[1] // .............................................................................................................*.................................. - add v28.4S, v6.4S, v28.4S // ..............................................................................................................*................................. - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - mul v6.4S, v14.4S, v18.S[0] // ...............................................................................................................*................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - sub v31.4S, v3.4S, v28.4S // ................................................................................................................*............................... - // gap // ................................................................................................................................................ 
- // gap // ................................................................................................................................................ - sqrdmulh v14.4S, v14.4S, v18.S[1] // .................................................................................................................*.............................. - add v3.4S, v3.4S, v28.4S // ..................................................................................................................*............................. - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - sqrdmulh v28.4S, v26.4S, v7.S[3] // ...................................................................................................................*............................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - str q3, [x1, #-48] // ....................................................................................................................*........................... - // gap // ................................................................................................................................................ 
- // gap // ................................................................................................................................................ - mls v21.4S, v24.4S, v8.S[0] // .....................................................................................................................*.......................... - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - mls v6.4S, v14.4S, v8.S[0] // ......................................................................................................................*......................... - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ 
- // gap // ................................................................................................................................................ - mls v17.4S, v28.4S, v8.S[0] // .......................................................................................................................*........................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - sub v14.4S, v2.4S, v21.4S // ........................................................................................................................*....................... - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - add v21.4S, v2.4S, v21.4S // .........................................................................................................................*...................... - mul v2.4S, v1.4S, v7.S[0] // ..........................................................................................................................*..................... - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ 
- // gap // ................................................................................................................................................ - sqrdmulh v3.4S, v1.4S, v7.S[1] // ...........................................................................................................................*.................... - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - str q21, [x1, #-32] // ............................................................................................................................*................... - add v21.4S, v17.4S, v6.4S // .............................................................................................................................*.................. - // gap // ................................................................................................................................................ - sub v17.4S, v17.4S, v6.4S // ..............................................................................................................................*................. - mul v24.4S, v31.4S, v7.S[0] // ...............................................................................................................................*................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ 
- // gap // ................................................................................................................................................ - str q21, [x1, #-16] // .................................................................................................................................*.............. - add x1, x1, #64 // ..................................................................................................................................*............. - sqrdmulh v21.4S, v31.4S, v7.S[1] // ................................................................................................................................*............... - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - mul v28.4S, v14.4S, v7.S[0] // ...................................................................................................................................*............ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ 
- // gap // ................................................................................................................................................ - sqrdmulh v14.4S, v14.4S, v7.S[1] // ....................................................................................................................................*........... - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - mul v6.4S, v17.4S, v7.S[0] // .....................................................................................................................................*.......... - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ 
- // gap // ................................................................................................................................................ - sqrdmulh v17.4S, v17.4S, v7.S[1] // ......................................................................................................................................*......... - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - mls v2.4S, v3.4S, v8.S[0] // .......................................................................................................................................*........ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ 
- // gap // ................................................................................................................................................ - mls v24.4S, v21.4S, v8.S[0] // ........................................................................................................................................*....... - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - mls v28.4S, v14.4S, v8.S[0] // .........................................................................................................................................*...... - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - str q2, [x2], #(16*4) // ..........................................................................................................................................*..... - // gap // ................................................................................................................................................ 
- // gap // ................................................................................................................................................ - mls v6.4S, v17.4S, v8.S[0] // ...........................................................................................................................................*.... - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - str q24, [x2, #-48] // ............................................................................................................................................*... - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - str q28, [x2, #-32] // .............................................................................................................................................*.. - // gap // ................................................................................................................................................ 
- // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - str q6, [x2, #-16] // ..............................................................................................................................................*. - add x2, x2, #64 // ...............................................................................................................................................* - // gap // ................................................................................................................................................ + str q29, [x1, #-16] // ..*............... + add x1, x1, #64 // ...*.............. + sqrdmulh v21.4S, v0.4S, v6.S[1] // *................. + // gap // .................. + // gap // .................. + // gap // .................. + mul v26.4S, v0.4S, v6.S[0] // .........*........ + // gap // .................. + // gap // .................. + // gap // .................. + // gap // .................. + // gap // .................. + mul v1.4S, v5.4S, v6.S[0] // .*................ + sub v5.4S, v27.4S, v18.4S // ....*............. + // gap // .................. + // gap // .................. + // gap // .................. + // gap // .................. + mls v1.4S, v19.4S, v8.S[0] // .....*............ + // gap // .................. + // gap // .................. + // gap // .................. + // gap // .................. + // gap // .................. 
+ // gap // .................. + // gap // .................. + mul v0.4S, v5.4S, v6.S[0] // .......*.......... + // gap // .................. + // gap // .................. + // gap // .................. + sqrdmulh v28.4S, v2.4S, v6.S[1] // ......*........... + // gap // .................. + // gap // .................. + str q1, [x2], #(16*4) // .............*.... + // gap // .................. + // gap // .................. + sqrdmulh v5.4S, v5.4S, v6.S[1] // ........*......... + // gap // .................. + // gap // .................. + // gap // .................. + // gap // .................. + // gap // .................. + mls v26.4S, v21.4S, v8.S[0] // ............*..... + // gap // .................. + // gap // .................. + // gap // .................. + // gap // .................. + // gap // .................. + mls v13.4S, v28.4S, v8.S[0] // ..........*....... + // gap // .................. + // gap // .................. + // gap // .................. + // gap // .................. + // gap // .................. + mls v0.4S, v5.4S, v8.S[0] // ...........*...... + // gap // .................. + // gap // .................. + str q26, [x2, #-32] // ................*. + // gap // .................. + // gap // .................. + // gap // .................. + // gap // .................. + // gap // .................. + str q13, [x2, #-48] // ..............*... + // gap // .................. + // gap // .................. + // gap // .................. + // gap // .................. + // gap // .................. + str q0, [x2, #-16] // ...............*.. + add x2, x2, #64 // .................* + // gap // .................. // original source code - // add v17.4S, v24.4S, v6.4S // *............................................................................................................................................... 
- // ldr q21, [x2, #0] // ..*............................................................................................................................................. - // ldr q14, [x2, #16] // ...*............................................................................................................................................ - // mls v2.4S, v28.4S, v8.S[0] // .*.............................................................................................................................................. - // ldr q24, [x5, #16] // ....*........................................................................................................................................... - // ldr q28, [x2, #32] // ......*......................................................................................................................................... - // ldr q6, [x5, #96] // .......*........................................................................................................................................ - // mul v31.4S, v26.4S, v30.4S // .....*.......................................................................................................................................... - // ldr q26, [x2, #48] // ........*....................................................................................................................................... - // sub v1.4S, v17.4S, v15.4S // .........*...................................................................................................................................... - // add v17.4S, v17.4S, v15.4S // ...........*.................................................................................................................................... - // ldr q15, [x5], #(12*16) // ..........*..................................................................................................................................... 
- // ldr q29, [x5, #-80] // .............*.................................................................................................................................. - // mls v31.4S, v13.4S, v8.S[0] // ............*................................................................................................................................... - // trn1 v13.4S, v21.4S, v14.4S // ..............*................................................................................................................................. - // ldr q16, [x5, #-64] // ...............*................................................................................................................................ - // trn2 v21.4S, v21.4S, v14.4S // ................*............................................................................................................................... - // ldr q14, [x5, #-16] // ..................*............................................................................................................................. - // ldr q9, [x5, #-48] // ....................*........................................................................................................................... - // sqrdmulh v0.4S, v1.4S, v24.4S // .................*.............................................................................................................................. - // trn1 v30.4S, v28.4S, v26.4S // ...................*............................................................................................................................ - // ldr q7, [x5, #-32] // .......................*........................................................................................................................ - // trn2 v28.4S, v28.4S, v26.4S // .....................*.......................................................................................................................... 
- // ldr q26, [x4], #64 // .........................*...................................................................................................................... - // mul v1.4S, v1.4S, v15.4S // ......................*......................................................................................................................... - // ldr q18, [x4, #-48] // ............................*................................................................................................................... - // sub v25.4S, v2.4S, v31.4S // ........................*....................................................................................................................... - // ldr q12, [x4, #-32] // ..............................*................................................................................................................. - // add v2.4S, v2.4S, v31.4S // ..........................*..................................................................................................................... - // trn2 v31.2D, v13.2D, v30.2D // ...........................*.................................................................................................................... - // ldr q23, [x4, #-16] // .................................*.............................................................................................................. - // mls v1.4S, v0.4S, v8.S[0] // .............................*.................................................................................................................. - // trn2 v0.2D, v21.2D, v28.2D // ...............................*................................................................................................................ - // trn1 v13.2D, v13.2D, v30.2D // ................................*............................................................................................................... 
- // sqrdmulh v24.4S, v25.4S, v24.4S // ..................................*............................................................................................................. - // trn1 v4.4S, v17.4S, v2.4S // ...................................*............................................................................................................ - // sub v11.4S, v31.4S, v0.4S // ....................................*........................................................................................................... - // trn2 v17.4S, v17.4S, v2.4S // ......................................*......................................................................................................... - // mul v2.4S, v25.4S, v15.4S // .....................................*.......................................................................................................... - // trn1 v21.2D, v21.2D, v28.2D // .......................................*........................................................................................................ - // mul v28.4S, v11.4S, v7.4S // ........................................*....................................................................................................... - // add v31.4S, v31.4S, v0.4S // .........................................*...................................................................................................... - // mls v2.4S, v24.4S, v8.S[0] // ..........................................*..................................................................................................... - // sub v24.4S, v13.4S, v21.4S // ...........................................*.................................................................................................... - // add v21.4S, v13.4S, v21.4S // ............................................*................................................................................................... 
- // sqrdmulh v14.4S, v11.4S, v14.4S // .............................................*.................................................................................................. - // sqrdmulh v15.4S, v24.4S, v9.4S // ..............................................*................................................................................................. - // sub v13.4S, v21.4S, v31.4S // ...............................................*................................................................................................ - // add v21.4S, v21.4S, v31.4S // ..................................................*............................................................................................. - // mul v24.4S, v24.4S, v16.4S // .................................................*.............................................................................................. - // trn2 v31.4S, v1.4S, v2.4S // ...................................................*............................................................................................ - // trn1 v2.4S, v1.4S, v2.4S // ................................................*............................................................................................... - // mls v28.4S, v14.4S, v8.S[0] // ....................................................*........................................................................................... - // trn2 v14.2D, v17.2D, v31.2D // ........................................................*....................................................................................... - // mls v24.4S, v15.4S, v8.S[0] // .......................................................*........................................................................................ - // trn2 v1.2D, v4.2D, v2.2D // ......................................................*......................................................................................... 
- // trn1 v2.2D, v4.2D, v2.2D // .....................................................*.......................................................................................... - // trn1 v17.2D, v17.2D, v31.2D // .........................................................*...................................................................................... - // mul v31.4S, v13.4S, v6.4S // ..........................................................*..................................................................................... - // sub v15.4S, v1.4S, v14.4S // ...........................................................*.................................................................................... - // sqrdmulh v13.4S, v13.4S, v29.4S // ............................................................*................................................................................... - // add v14.4S, v1.4S, v14.4S // .............................................................*.................................................................................. - // sub v1.4S, v24.4S, v28.4S // ..............................................................*................................................................................. - // add v24.4S, v24.4S, v28.4S // ...............................................................*................................................................................ - // mul v28.4S, v15.4S, v12.S[0] // ................................................................*............................................................................... - // sub v16.4S, v2.4S, v17.4S // .................................................................*.............................................................................. - // mul v6.4S, v1.4S, v6.4S // ..................................................................*............................................................................. 
- // add v17.4S, v2.4S, v17.4S // ...................................................................*............................................................................ - // trn1 v2.4S, v21.4S, v24.4S // ....................................................................*........................................................................... - // sqrdmulh v1.4S, v1.4S, v29.4S // .....................................................................*.......................................................................... - // trn2 v21.4S, v21.4S, v24.4S // ......................................................................*......................................................................... - // sub v24.4S, v17.4S, v14.4S // .......................................................................*........................................................................ - // mls v31.4S, v13.4S, v8.S[0] // ........................................................................*....................................................................... - // add v17.4S, v17.4S, v14.4S // .........................................................................*...................................................................... - // sqrdmulh v14.4S, v16.4S, v18.S[3] // ..........................................................................*..................................................................... - // mls v6.4S, v1.4S, v8.S[0] // ...........................................................................*.................................................................... - // mul v1.4S, v16.4S, v18.S[2] // ............................................................................*................................................................... - // mls v1.4S, v14.4S, v8.S[0] // .............................................................................*.................................................................. 
- // trn1 v14.4S, v31.4S, v6.4S // ..............................................................................*................................................................. - // sqrdmulh v15.4S, v15.4S, v12.S[1] // ...............................................................................*................................................................ - // trn2 v6.4S, v31.4S, v6.4S // ................................................................................*............................................................... - // sqrdmulh v31.4S, v24.4S, v26.S[3] // .................................................................................*.............................................................. - // trn2 v29.2D, v2.2D, v14.2D // ..................................................................................*............................................................. - // trn1 v14.2D, v2.2D, v14.2D // ...................................................................................*............................................................ - // mul v2.4S, v24.4S, v26.S[2] // ....................................................................................*........................................................... - // trn1 v24.2D, v21.2D, v6.2D // .....................................................................................*.......................................................... - // trn2 v21.2D, v21.2D, v6.2D // ......................................................................................*......................................................... - // mls v28.4S, v15.4S, v8.S[0] // .......................................................................................*........................................................ - // sub v6.4S, v14.4S, v24.4S // ........................................................................................*....................................................... 
- // mls v2.4S, v31.4S, v8.S[0] // .........................................................................................*...................................................... - // add v14.4S, v14.4S, v24.4S // ..........................................................................................*..................................................... - // sub v24.4S, v29.4S, v21.4S // ...........................................................................................*.................................................... - // add v21.4S, v29.4S, v21.4S // ............................................................................................*................................................... - // mul v31.4S, v6.4S, v12.S[2] // .............................................................................................*.................................................. - // sub v15.4S, v1.4S, v28.4S // ..............................................................................................*................................................. - // sqrdmulh v6.4S, v6.4S, v12.S[3] // ...............................................................................................*................................................ - // add v28.4S, v1.4S, v28.4S // ................................................................................................*............................................... - // sub v1.4S, v14.4S, v21.4S // .................................................................................................*.............................................. - // add v21.4S, v14.4S, v21.4S // ..................................................................................................*............................................. - // sqrdmulh v14.4S, v24.4S, v23.S[1] // ...................................................................................................*............................................ 
- // mul v24.4S, v24.4S, v23.S[0] // ....................................................................................................*........................................... - // sub v29.4S, v17.4S, v21.4S // .....................................................................................................*.......................................... - // add v17.4S, v17.4S, v21.4S // ......................................................................................................*......................................... - // mls v31.4S, v6.4S, v8.S[0] // .......................................................................................................*........................................ - // mls v24.4S, v14.4S, v8.S[0] // ........................................................................................................*....................................... - // str q17, [x1], #(16*4) // .........................................................................................................*...................................... - // mul v17.4S, v15.4S, v26.S[2] // ..........................................................................................................*..................................... - // mul v21.4S, v1.4S, v18.S[0] // ...........................................................................................................*.................................... - // sub v14.4S, v31.4S, v24.4S // ............................................................................................................*................................... - // sqrdmulh v6.4S, v1.4S, v18.S[1] // .............................................................................................................*.................................. - // add v24.4S, v31.4S, v24.4S // ..............................................................................................................*................................. 
- // mul v31.4S, v14.4S, v18.S[0] // ...............................................................................................................*................................ - // sub v1.4S, v28.4S, v24.4S // ................................................................................................................*............................... - // sqrdmulh v14.4S, v14.4S, v18.S[1] // .................................................................................................................*.............................. - // add v24.4S, v28.4S, v24.4S // ..................................................................................................................*............................. - // sqrdmulh v28.4S, v15.4S, v26.S[3] // ...................................................................................................................*............................ - // str q24, [x1, #-48] // ....................................................................................................................*........................... - // mls v21.4S, v6.4S, v8.S[0] // .....................................................................................................................*.......................... - // mls v31.4S, v14.4S, v8.S[0] // ......................................................................................................................*......................... - // mls v17.4S, v28.4S, v8.S[0] // .......................................................................................................................*........................ - // sub v14.4S, v2.4S, v21.4S // ........................................................................................................................*....................... - // add v21.4S, v2.4S, v21.4S // .........................................................................................................................*...................... 
- // mul v2.4S, v29.4S, v26.S[0] // ..........................................................................................................................*..................... - // sqrdmulh v24.4S, v29.4S, v26.S[1] // ...........................................................................................................................*.................... - // str q21, [x1, #-32] // ............................................................................................................................*................... - // add v21.4S, v17.4S, v31.4S // .............................................................................................................................*.................. - // sub v17.4S, v17.4S, v31.4S // ..............................................................................................................................*................. - // mul v28.4S, v1.4S, v26.S[0] // ...............................................................................................................................*................ - // sqrdmulh v6.4S, v1.4S, v26.S[1] // ..................................................................................................................................*............. - // str q21, [x1, #-16] // ................................................................................................................................*............... - // add x1, x1, #64 // .................................................................................................................................*.............. - // mul v1.4S, v14.4S, v26.S[0] // ...................................................................................................................................*............ - // sqrdmulh v14.4S, v14.4S, v26.S[1] // ....................................................................................................................................*........... 
- // mul v16.4S, v17.4S, v26.S[0] // .....................................................................................................................................*.......... - // sqrdmulh v17.4S, v17.4S, v26.S[1] // ......................................................................................................................................*......... - // mls v2.4S, v24.4S, v8.S[0] // .......................................................................................................................................*........ - // mls v28.4S, v6.4S, v8.S[0] // ........................................................................................................................................*....... - // mls v1.4S, v14.4S, v8.S[0] // .........................................................................................................................................*...... - // str q2, [x2], #(16*4) // ..........................................................................................................................................*..... - // mls v16.4S, v17.4S, v8.S[0] // ...........................................................................................................................................*.... - // str q28, [x2, #-48] // ............................................................................................................................................*... - // str q1, [x2, #-32] // .............................................................................................................................................*.. - // str q16, [x2, #-16] // ..............................................................................................................................................*. 
- // add x2, x2, #64 // ...............................................................................................................................................* + // sqrdmulh v9.4S, v0.4S, v6.S[1] // ..*............... + // mul v15.4S, v5.4S, v6.S[0] // ....*............. + // str q29, [x1, #-16] // *................. + // add x1, x1, #64 // .*................ + // sub v5.4S, v27.4S, v18.4S // .....*............ + // mls v15.4S, v19.4S, v8.S[0] // ......*........... + // sqrdmulh v2.4S, v2.4S, v6.S[1] // ........*......... + // mul v19.4S, v5.4S, v6.S[0] // .......*.......... + // sqrdmulh v29.4S, v5.4S, v6.S[1] // ..........*....... + // mul v5.4S, v0.4S, v6.S[0] // ...*.............. + // mls v13.4S, v2.4S, v8.S[0] // ............*..... + // mls v19.4S, v29.4S, v8.S[0] // .............*.... + // mls v5.4S, v9.4S, v8.S[0] // ...........*...... + // str q15, [x2], #(16*4) // .........*........ + // str q13, [x2, #-48] // ...............*.. + // str q19, [x2, #-16] // ................*. + // str q5, [x2, #-32] // ..............*... + // add x2, x2, #64 // .................* // ----------------------------------------------------------------------------- ninv .req v25 ninv_tw .req v26 + modulus_half .req v30 + neg_modulus_half .req v31 ASM_LOAD(xtmp, ninv_addr) ld1r {ninv.4s}, [xtmp] ASM_LOAD(xtmp, ninv_tw_addr) ld1r {ninv_tw.4s}, [xtmp] + ushr modulus_half.4S, modulus.4S, #1 + neg neg_modulus_half.4S, modulus_half.4S + mov count, #8 ASM_LOAD(r_ptr0, roots_l012) load_roots_123 .p2align 2 - ldr q21, [x0, #768] // .*........... - ldr q17, [x0, #896] // ...*......... + ldr q5, [x0, #768] // .*........... + ldr q19, [x0, #896] // .....*....... // gap // ............. - ldr q6, [x0, #256] // *............ + ldr q13, [x0, #256] // *............ // gap // ............. // gap // ............. - ldr q29, [x0, #384] // ..*.......... + ldr q17, [x0, #384] // ..*.......... // gap // ............. // gap // ............. - ldr q9, [x0, #512] // ....*........ 
+ ldr q22, [x0, #512] // ...*......... // gap // ............. // gap // ............. - add v23.4S, v21.4S, v17.4S // .......*..... - sub v21.4S, v21.4S, v17.4S // ......*...... - ldr q7, [x0, #640] // .....*....... + add v23.4S, v5.4S, v19.4S // .......*..... + sub v5.4S, v5.4S, v19.4S // ......*...... + ldr q4, [x0, #640] // ....*........ // gap // ............. // gap // ............. // gap // ............. // gap // ............. // gap // ............. // gap // ............. - sqrdmulh v17.4S, v21.4S, v3.S[1] // .........*... + sqrdmulh v19.4S, v5.4S, v3.S[1] // .........*... // gap // ............. // gap // ............. - sub v30.4S, v9.4S, v7.4S // ........*.... + sub v20.4S, v22.4S, v4.4S // ........*.... // gap // ............. // gap // ............. - mul v10.4S, v21.4S, v3.S[0] // ..........*.. + mul v21.4S, v5.4S, v3.S[0] // ..........*.. // gap // ............. // gap // ............. // gap // ............. // gap // ............. // gap // ............. - sqrdmulh v18.4S, v30.4S, v2.S[3] // ...........*. + sqrdmulh v11.4S, v20.4S, v2.S[3] // ...........*. // gap // ............. // gap // ............. // gap // ............. // gap // ............. // gap // ............. - mls v10.4S, v17.4S, v8.S[0] // ............* + mls v21.4S, v19.4S, v8.S[0] // ............* // gap // ............. // gap // ............. // original source code - // ldr q6, [x0, #256] // ..*.......... - // ldr q15, [x0, #768] // *............ - // ldr q29, [x0, #384] // ...*......... - // ldr q13, [x0, #896] // .*........... - // ldr q9, [x0, #512] // ....*........ - // ldr q7, [x0, #640] // .......*..... - // sub v18.4S, v15.4S, v13.4S // ......*...... - // add v23.4S, v15.4S, v13.4S // .....*....... - // sub v30.4S, v9.4S, v7.4S // .........*... - // sqrdmulh v21.4S, v18.4S, v3.S[1] // ........*.... - // mul v10.4S, v18.4S, v3.S[0] // ..........*.. - // sqrdmulh v18.4S, v30.4S, v2.S[3] // ...........*. 
- // mls v10.4S, v21.4S, v8.S[0] // ............* + // ldr q13, [x0, #256] // ..*.......... + // ldr q27, [x0, #768] // *............ + // ldr q17, [x0, #384] // ...*......... + // ldr q22, [x0, #512] // ....*........ + // ldr q4, [x0, #640] // .......*..... + // ldr q20, [x0, #896] // .*........... + // sub v6.4S, v27.4S, v20.4S // ......*...... + // add v23.4S, v27.4S, v20.4S // .....*....... + // sub v20.4S, v22.4S, v4.4S // .........*... + // sqrdmulh v15.4S, v6.4S, v3.S[1] // ........*.... + // mul v21.4S, v6.4S, v3.S[0] // ..........*.. + // sqrdmulh v11.4S, v20.4S, v2.S[3] // ...........*. + // mls v21.4S, v15.4S, v8.S[0] // ............* sub count, count, #1 layer123_start: - ldr q17, [x0, #0] // *............................................................................................... - ldr q21, [x0, #128] // .*.............................................................................................. - sub v14.4S, v6.4S, v29.4S // .............*.................................................................................. - mul v24.4S, v30.4S, v2.S[2] // ....................*........................................................................... - add v28.4S, v6.4S, v29.4S // ..............*................................................................................. - ldr q6, [x0, #272] // ..e............................................................................................. - add v31.4S, v9.4S, v7.4S // ...................*............................................................................ - ldr q15, [x0, #784] // ......e......................................................................................... - ldr q29, [x0, #400] // ...e............................................................................................ - ldr q13, [x0, #912] // .......e........................................................................................ 
- mul v16.4S, v14.4S, v2.S[0] // ...............*................................................................................ - ldr q9, [x0, #528] // ....e........................................................................................... - sub v30.4S, v17.4S, v21.4S // ........*....................................................................................... - ldr q7, [x0, #656] // .....e.......................................................................................... - // gap // ................................................................................................ - add v17.4S, v17.4S, v21.4S // .........*...................................................................................... - sqrdmulh v21.4S, v14.4S, v2.S[1] // ................*............................................................................... - // gap // ................................................................................................ - sub v14.4S, v31.4S, v23.4S // ......................................*......................................................... - // gap // ................................................................................................ - // gap // ................................................................................................ - mls v24.4S, v18.4S, v8.S[0] // ......................*......................................................................... - sub v18.4S, v15.4S, v13.4S // .......................e........................................................................ - // gap // ................................................................................................ - sub v12.4S, v17.4S, v28.4S // ............................*................................................................... - // gap // ................................................................................................ 
- // gap // ................................................................................................ - add v17.4S, v17.4S, v28.4S // .............................*.................................................................. - mul v28.4S, v30.4S, v1.S[2] // ..........*..................................................................................... - // gap // ................................................................................................ - add v31.4S, v31.4S, v23.4S // .......................................*........................................................ - // gap // ................................................................................................ - // gap // ................................................................................................ - add v23.4S, v15.4S, v13.4S // ........................e....................................................................... - mls v16.4S, v21.4S, v8.S[0] // .................*.............................................................................. - // gap // ................................................................................................ - sub v21.4S, v24.4S, v10.4S // ...........................................*.................................................... - // gap // ................................................................................................ - // gap // ................................................................................................ - sqrdmulh v15.4S, v30.4S, v1.S[3] // ...........*.................................................................................... - add v24.4S, v24.4S, v10.4S // ............................................*................................................... - // gap // ................................................................................................ 
- sub v13.4S, v17.4S, v31.4S // ................................................*............................................... - // gap // ................................................................................................ - // gap // ................................................................................................ - add v17.4S, v17.4S, v31.4S // .................................................*.............................................. - mul v31.4S, v12.4S, v0.S[2] // ..............................*................................................................. - // gap // ................................................................................................ - sub v30.4S, v9.4S, v7.4S // ..................e............................................................................. - // gap // ................................................................................................ - // gap // ................................................................................................ - sqrdmulh v12.4S, v12.4S, v0.S[3] // ...............................*................................................................ - // gap // ................................................................................................ - // gap // ................................................................................................ - // gap // ................................................................................................ - // gap // ................................................................................................ - // gap // ................................................................................................ - mul v10.4S, v14.4S, v1.S[0] // ........................................*....................................................... - // gap // ................................................................................................ 
- // gap // ................................................................................................ - // gap // ................................................................................................ - // gap // ................................................................................................ - // gap // ................................................................................................ - sqrdmulh v14.4S, v14.4S, v1.S[1] // .........................................*...................................................... - // gap // ................................................................................................ - // gap // ................................................................................................ - // gap // ................................................................................................ - // gap // ................................................................................................ - // gap // ................................................................................................ - mls v28.4S, v15.4S, v8.S[0] // ............*................................................................................... - // gap // ................................................................................................ - // gap // ................................................................................................ - // gap // ................................................................................................ - // gap // ................................................................................................ - // gap // ................................................................................................ - mls v31.4S, v12.4S, v8.S[0] // ................................*............................................................... 
- // gap // ................................................................................................ - // gap // ................................................................................................ - // gap // ................................................................................................ - // gap // ................................................................................................ - // gap // ................................................................................................ - mls v10.4S, v14.4S, v8.S[0] // ..........................................*..................................................... - // gap // ................................................................................................ - // gap // ................................................................................................ - sub v14.4S, v28.4S, v16.4S // .................................*.............................................................. - // gap // ................................................................................................ - // gap // ................................................................................................ - add v28.4S, v28.4S, v16.4S // ..................................*............................................................. - mul v15.4S, v21.4S, v1.S[0] // .............................................*.................................................. - // gap // ................................................................................................ - // gap // ................................................................................................ - // gap // ................................................................................................ - // gap // ................................................................................................ 
- mul v16.4S, v14.4S, v0.S[2] // ...................................*............................................................ - // gap // ................................................................................................ - // gap // ................................................................................................ - sub v12.4S, v28.4S, v24.4S // .....................................................*.......................................... - // gap // ................................................................................................ - // gap // ................................................................................................ - add v24.4S, v28.4S, v24.4S // ......................................................*......................................... - sqrdmulh v14.4S, v14.4S, v0.S[3] // ....................................*........................................................... - // gap // ................................................................................................ - sub v28.4S, v31.4S, v10.4S // ..........................................................*..................................... - // gap // ................................................................................................ - // gap // ................................................................................................ - sqrdmulh v21.4S, v21.4S, v1.S[1] // ..............................................*................................................. - add v31.4S, v31.4S, v10.4S // ...........................................................*.................................... - // gap // ................................................................................................ - // gap // ................................................................................................ 
- // gap // ................................................................................................ - // gap // ................................................................................................ - mul v10.4S, v13.4S, v0.S[0] // ..................................................*............................................. - // gap // ................................................................................................ - // gap // ................................................................................................ - // gap // ................................................................................................ - // gap // ................................................................................................ - // gap // ................................................................................................ - mls v16.4S, v14.4S, v8.S[0] // .....................................*.......................................................... - // gap // ................................................................................................ - // gap // ................................................................................................ - // gap // ................................................................................................ - // gap // ................................................................................................ - // gap // ................................................................................................ - mls v15.4S, v21.4S, v8.S[0] // ...............................................*................................................ - // gap // ................................................................................................ - // gap // ................................................................................................ 
- // gap // ................................................................................................ - // gap // ................................................................................................ - // gap // ................................................................................................ - sqrdmulh v21.4S, v13.4S, v0.S[1] // ...................................................*............................................ - // gap // ................................................................................................ - // gap // ................................................................................................ - // gap // ................................................................................................ - // gap // ................................................................................................ - // gap // ................................................................................................ - mul v14.4S, v12.4S, v0.S[0] // .......................................................*........................................ - // gap // ................................................................................................ - // gap // ................................................................................................ - sub v13.4S, v16.4S, v15.4S // ...............................................................*................................ - // gap // ................................................................................................ - // gap // ................................................................................................ - add v15.4S, v16.4S, v15.4S // ................................................................*............................... - mul v16.4S, v17.4S, v25.4S // ................................................................................*............... 
- // gap // ................................................................................................ - // gap // ................................................................................................ - // gap // ................................................................................................ - // gap // ................................................................................................ - sqrdmulh v17.4S, v17.4S, v26.4S // .................................................................................*.............. - // gap // ................................................................................................ - // gap // ................................................................................................ - // gap // ................................................................................................ - // gap // ................................................................................................ - // gap // ................................................................................................ - mls v10.4S, v21.4S, v8.S[0] // ....................................................*........................................... - // gap // ................................................................................................ - // gap // ................................................................................................ - // gap // ................................................................................................ - // gap // ................................................................................................ - // gap // ................................................................................................ - sqrdmulh v21.4S, v12.4S, v0.S[1] // ........................................................*....................................... 
- // gap // ................................................................................................ - // gap // ................................................................................................ - // gap // ................................................................................................ - // gap // ................................................................................................ - // gap // ................................................................................................ - mul v12.4S, v28.4S, v0.S[0] // ............................................................*................................... - // gap // ................................................................................................ - // gap // ................................................................................................ - srshr v4.4S, v10.4S, #23 // ....................................................................*........................... - // gap // ................................................................................................ - // gap // ................................................................................................ - sqrdmulh v28.4S, v28.4S, v0.S[1] // .............................................................*.................................. - // gap // ................................................................................................ - // gap // ................................................................................................ - // gap // ................................................................................................ - // gap // ................................................................................................ - // gap // ................................................................................................ 
- mls v14.4S, v21.4S, v8.S[0] // .........................................................*...................................... - // gap // ................................................................................................ - // gap // ................................................................................................ - // gap // ................................................................................................ - // gap // ................................................................................................ - // gap // ................................................................................................ - mul v21.4S, v13.4S, v0.S[0] // .................................................................*.............................. - // gap // ................................................................................................ - // gap // ................................................................................................ - // gap // ................................................................................................ - // gap // ................................................................................................ - // gap // ................................................................................................ - mls v12.4S, v28.4S, v8.S[0] // ..............................................................*................................. - // gap // ................................................................................................ - // gap // ................................................................................................ - srshr v28.4S, v14.4S, #23 // ......................................................................*......................... - // gap // ................................................................................................ 
- // gap // ................................................................................................ - sqrdmulh v13.4S, v13.4S, v0.S[1] // ..................................................................*............................. - // gap // ................................................................................................ - // gap // ................................................................................................ - // gap // ................................................................................................ - // gap // ................................................................................................ - // gap // ................................................................................................ - mls v10.4S, v4.4S, v8.4S // .....................................................................*.......................... - // gap // ................................................................................................ - // gap // ................................................................................................ - srshr v4.4S, v12.4S, #23 // ........................................................................*....................... - // gap // ................................................................................................ - // gap // ................................................................................................ - mls v14.4S, v28.4S, v8.4S // .......................................................................*........................ - // gap // ................................................................................................ - // gap // ................................................................................................ - // gap // ................................................................................................ 
- // gap // ................................................................................................ - // gap // ................................................................................................ - mls v21.4S, v13.4S, v8.S[0] // ...................................................................*............................ - // gap // ................................................................................................ - // gap // ................................................................................................ - str q10, [x0, #512] // ............................................................................*................... - // gap // ................................................................................................ - // gap // ................................................................................................ - mls v12.4S, v4.4S, v8.4S // .........................................................................*...................... - // gap // ................................................................................................ - // gap // ................................................................................................ - str q14, [x0, #640] // .............................................................................*.................. - // gap // ................................................................................................ - // gap // ................................................................................................ - mls v16.4S, v17.4S, v8.S[0] // ..................................................................................*............. - // gap // ................................................................................................ - // gap // ................................................................................................ 
- srshr v17.4S, v21.4S, #23 // ..........................................................................*..................... - // gap // ................................................................................................ - // gap // ................................................................................................ - mul v14.4S, v24.4S, v25.4S // ...................................................................................*............ - // gap // ................................................................................................ - // gap // ................................................................................................ - str q12, [x0, #768] // ..............................................................................*................. - // gap // ................................................................................................ - // gap // ................................................................................................ - mls v21.4S, v17.4S, v8.4S // ...........................................................................*.................... - // gap // ................................................................................................ - // gap // ................................................................................................ - str q16, [x0], #(16) // ............................................................................................*... - // gap // ................................................................................................ - // gap // ................................................................................................ - sqrdmulh v17.4S, v24.4S, v26.4S // ....................................................................................*........... - // gap // ................................................................................................ 
- // gap // ................................................................................................ - // gap // ................................................................................................ - // gap // ................................................................................................ - // gap // ................................................................................................ - mul v24.4S, v31.4S, v25.4S // ......................................................................................*......... - // gap // ................................................................................................ - // gap // ................................................................................................ - str q21, [x0, #880] // ...............................................................................*................ - // gap // ................................................................................................ - // gap // ................................................................................................ - sqrdmulh v21.4S, v31.4S, v26.4S // .......................................................................................*........ - // gap // ................................................................................................ - // gap // ................................................................................................ - // gap // ................................................................................................ - // gap // ................................................................................................ - // gap // ................................................................................................ - mls v14.4S, v17.4S, v8.S[0] // .....................................................................................*.......... 
- // gap // ................................................................................................ - // gap // ................................................................................................ - // gap // ................................................................................................ - // gap // ................................................................................................ - // gap // ................................................................................................ - mul v17.4S, v15.4S, v25.4S // .........................................................................................*...... - // gap // ................................................................................................ - // gap // ................................................................................................ - // gap // ................................................................................................ - // gap // ................................................................................................ - // gap // ................................................................................................ - sqrdmulh v28.4S, v15.4S, v26.4S // ..........................................................................................*..... - // gap // ................................................................................................ - // gap // ................................................................................................ - str q14, [x0, #112] // .............................................................................................*.. - // gap // ................................................................................................ - // gap // ................................................................................................ 
- mls v24.4S, v21.4S, v8.S[0] // ........................................................................................*....... - // gap // ................................................................................................ - // gap // ................................................................................................ - // gap // ................................................................................................ - // gap // ................................................................................................ - // gap // ................................................................................................ - sqrdmulh v21.4S, v18.4S, v3.S[1] // ..........................e..................................................................... - // gap // ................................................................................................ - // gap // ................................................................................................ - // gap // ................................................................................................ - // gap // ................................................................................................ - // gap // ................................................................................................ - mls v17.4S, v28.4S, v8.S[0] // ...........................................................................................*.... - // gap // ................................................................................................ - // gap // ................................................................................................ - str q24, [x0, #240] // ..............................................................................................*. - // gap // ................................................................................................ 
- // gap // ................................................................................................ - mul v10.4S, v18.4S, v3.S[0] // .........................e...................................................................... - // gap // ................................................................................................ - // gap // ................................................................................................ - // gap // ................................................................................................ - // gap // ................................................................................................ - // gap // ................................................................................................ - sqrdmulh v18.4S, v30.4S, v2.S[3] // .....................e.......................................................................... - // gap // ................................................................................................ - // gap // ................................................................................................ - str q17, [x0, #368] // ...............................................................................................* - // gap // ................................................................................................ - // gap // ................................................................................................ - mls v10.4S, v21.4S, v8.S[0] // ...........................e.................................................................... - // gap // ................................................................................................ - // gap // ................................................................................................ + ldr q19, [x0, #0] // *....................................................................................................................... 
+ ldr q5, [x0, #128] // .*...................................................................................................................... + sub v9.4S, v13.4S, v17.4S // .............*.......................................................................................................... + add v15.4S, v13.4S, v17.4S // ..............*......................................................................................................... + mul v12.4S, v20.4S, v2.S[2] // ....................*................................................................................................... + ldr q13, [x0, #272] // ..e..................................................................................................................... + add v18.4S, v22.4S, v4.4S // ...................*.................................................................................................... + ldr q27, [x0, #784] // ......e................................................................................................................. + ldr q17, [x0, #400] // ...e.................................................................................................................... + mul v28.4S, v9.4S, v2.S[0] // ...............*........................................................................................................ + ldr q22, [x0, #528] // ....e................................................................................................................... + ldr q4, [x0, #656] // .....e.................................................................................................................. + add v16.4S, v19.4S, v5.4S // .........*.............................................................................................................. + ldr q20, [x0, #912] // .......e................................................................................................................ 
+ // gap // ........................................................................................................................ + sub v19.4S, v19.4S, v5.4S // ........*............................................................................................................... + sqrdmulh v5.4S, v9.4S, v2.S[1] // ................*....................................................................................................... + // gap // ........................................................................................................................ + sub v9.4S, v18.4S, v23.4S // ......................................*................................................................................. + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + mls v12.4S, v11.4S, v8.S[0] // ......................*................................................................................................. + sub v11.4S, v16.4S, v15.4S // ............................*........................................................................................... + // gap // ........................................................................................................................ + add v15.4S, v16.4S, v15.4S // .............................*.......................................................................................... + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ 
+ add v18.4S, v18.4S, v23.4S // .......................................*................................................................................ + mul v16.4S, v19.4S, v1.S[2] // ..........*............................................................................................................. + // gap // ........................................................................................................................ + sub v6.4S, v27.4S, v20.4S // .......................e................................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + add v23.4S, v27.4S, v20.4S // ........................e............................................................................................... + mls v28.4S, v5.4S, v8.S[0] // .................*...................................................................................................... + // gap // ........................................................................................................................ + sub v5.4S, v12.4S, v21.4S // ...........................................*............................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + sqrdmulh v19.4S, v19.4S, v1.S[3] // ...........*............................................................................................................ 
+ add v12.4S, v12.4S, v21.4S // ............................................*........................................................................... + // gap // ........................................................................................................................ + sub v27.4S, v15.4S, v18.4S // ................................................*....................................................................... + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + add v15.4S, v15.4S, v18.4S // .................................................*...................................................................... + mul v18.4S, v11.4S, v0.S[2] // ..............................*......................................................................................... + // gap // ........................................................................................................................ + sub v20.4S, v22.4S, v4.4S // ..................e..................................................................................................... + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + sqrdmulh v11.4S, v11.4S, v0.S[3] // ...............................*........................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ 
+ // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + mul v21.4S, v9.4S, v1.S[0] // ........................................*............................................................................... + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + sqrdmulh v9.4S, v9.4S, v1.S[1] // .........................................*.............................................................................. + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ 
+ // gap // ........................................................................................................................ + mls v16.4S, v19.4S, v8.S[0] // ............*........................................................................................................... + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + mls v18.4S, v11.4S, v8.S[0] // ................................*....................................................................................... + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + mls v21.4S, v9.4S, v8.S[0] // ..........................................*............................................................................. 
+ // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + sub v19.4S, v16.4S, v28.4S // .................................*...................................................................................... + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + add v9.4S, v16.4S, v28.4S // ..................................*..................................................................................... + mul v28.4S, v5.4S, v1.S[0] // .............................................*.......................................................................... + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + mul v16.4S, v19.4S, v0.S[2] // ...................................*.................................................................................... + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ 
+ sub v11.4S, v9.4S, v12.4S // .....................................................*.................................................................. + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + add v9.4S, v9.4S, v12.4S // ......................................................*................................................................. + sqrdmulh v19.4S, v19.4S, v0.S[3] // ....................................*................................................................................... + // gap // ........................................................................................................................ + sub v12.4S, v18.4S, v21.4S // ..........................................................*............................................................. + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + sqrdmulh v5.4S, v5.4S, v1.S[1] // ..............................................*......................................................................... + add v18.4S, v18.4S, v21.4S // ...........................................................*............................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ 
+ // gap // ........................................................................................................................ + mul v21.4S, v27.4S, v0.S[0] // ..................................................*..................................................................... + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + mls v16.4S, v19.4S, v8.S[0] // .....................................*.................................................................................. + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + mls v28.4S, v5.4S, v8.S[0] // ...............................................*........................................................................ 
+ // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + sqrdmulh v19.4S, v27.4S, v0.S[1] // ...................................................*.................................................................... + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + mul v5.4S, v11.4S, v0.S[0] // .......................................................*................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ 
+ sub v27.4S, v16.4S, v28.4S // ...............................................................*........................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + add v28.4S, v16.4S, v28.4S // ................................................................*....................................................... + mul v16.4S, v15.4S, v25.4S // ........................................................................................*............................... + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + sqrdmulh v15.4S, v15.4S, v26.4S // .........................................................................................*.............................. + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ 
+ // gap // ........................................................................................................................ + mls v21.4S, v19.4S, v8.S[0] // ....................................................*................................................................... + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + sqrdmulh v19.4S, v11.4S, v0.S[1] // ........................................................*............................................................... + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + mul v11.4S, v12.4S, v0.S[0] // ............................................................*........................................................... 
+ // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + cmge v24.4S, v31.4S, v21.4S // ....................................................................*................................................... + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + sqrdmulh v12.4S, v12.4S, v0.S[1] // .............................................................*.......................................................... + cmge v7.4S, v21.4S, v30.4S // .....................................................................*.................................................. + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + mls v5.4S, v19.4S, v8.S[0] // .........................................................*.............................................................. + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ 
+ sub v19.4S, v24.4S, v7.4S // ......................................................................*................................................. + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + mul v24.4S, v27.4S, v0.S[0] // .................................................................*...................................................... + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + mls v11.4S, v12.4S, v8.S[0] // ..............................................................*......................................................... + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + cmge v12.4S, v31.4S, v5.4S // ........................................................................*............................................... + // gap // ........................................................................................................................ 
+ // gap // ........................................................................................................................ + sqrdmulh v27.4S, v27.4S, v0.S[1] // ..................................................................*..................................................... + cmge v7.4S, v5.4S, v30.4S // .........................................................................*.............................................. + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + mls v21.4S, v19.4S, v29.4S // .......................................................................*................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + sub v19.4S, v12.4S, v7.4S // ..........................................................................*............................................. + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + mls v16.4S, v15.4S, v8.S[0] // ..........................................................................................*............................. 
+ cmge v12.4S, v31.4S, v11.4S // ............................................................................*........................................... + // gap // ........................................................................................................................ + cmge v15.4S, v11.4S, v30.4S // .............................................................................*.......................................... + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + mls v24.4S, v27.4S, v8.S[0] // ...................................................................*.................................................... + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + str q21, [x0, #512] // ....................................................................................*................................... + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + mls v5.4S, v19.4S, v29.4S // ...........................................................................*............................................ + sub v19.4S, v12.4S, v15.4S // ..............................................................................*......................................... + // gap // ........................................................................................................................ 
+ cmge v12.4S, v31.4S, v16.4S // ....................................................................................................*................... + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + cmge v15.4S, v16.4S, v30.4S // .....................................................................................................*.................. + mul v27.4S, v9.4S, v25.4S // ...........................................................................................*............................ + // gap // ........................................................................................................................ + cmge v21.4S, v31.4S, v24.4S // ................................................................................*....................................... + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + mls v11.4S, v19.4S, v29.4S // ...............................................................................*........................................ + cmge v19.4S, v24.4S, v30.4S // .................................................................................*...................................... + // gap // ........................................................................................................................ + str q5, [x0, #640] // .....................................................................................*.................................. 
+ sub v5.4S, v12.4S, v15.4S // ......................................................................................................*................. + // gap // ........................................................................................................................ + sqrdmulh v9.4S, v9.4S, v26.4S // ............................................................................................*........................... + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + sub v19.4S, v21.4S, v19.4S // ..................................................................................*..................................... + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + mul v12.4S, v18.4S, v25.4S // ..............................................................................................*......................... + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + str q11, [x0, #768] // ......................................................................................*................................. + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ 
+ sqrdmulh v15.4S, v18.4S, v26.4S // ...............................................................................................*........................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + mls v27.4S, v9.4S, v8.S[0] // .............................................................................................*.......................... + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + sqrdmulh v9.4S, v28.4S, v26.4S // ..................................................................................................*..................... + // gap // ........................................................................................................................ 
+ // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + mls v12.4S, v15.4S, v8.S[0] // ................................................................................................*....................... + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + cmge v15.4S, v31.4S, v27.4S // ........................................................................................................*............... + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + mul v18.4S, v28.4S, v25.4S // .................................................................................................*...................... + cmge v28.4S, v27.4S, v30.4S // .........................................................................................................*.............. + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ 
+ // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + mls v18.4S, v9.4S, v8.S[0] // ...................................................................................................*.................... + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + sub v9.4S, v15.4S, v28.4S // ..........................................................................................................*............. + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + mls v24.4S, v19.4S, v29.4S // ...................................................................................*.................................... + cmge v19.4S, v31.4S, v12.4S // ............................................................................................................*........... + // gap // ........................................................................................................................ + cmge v15.4S, v12.4S, v30.4S // .............................................................................................................*.......... + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ 
+ mls v16.4S, v5.4S, v29.4S // .......................................................................................................*................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + cmge v5.4S, v31.4S, v18.4S // ................................................................................................................*....... + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + mls v27.4S, v9.4S, v29.4S // ...........................................................................................................*............ + cmge v9.4S, v18.4S, v30.4S // .................................................................................................................*...... + // gap // ........................................................................................................................ + str q24, [x0, #896] // .......................................................................................*................................ + sub v19.4S, v19.4S, v15.4S // ..............................................................................................................*......... + // gap // ........................................................................................................................ + sqrdmulh v15.4S, v6.4S, v3.S[1] // ..........................e............................................................................................. + // gap // ........................................................................................................................ 
+ // gap // ........................................................................................................................ + str q16, [x0], #(16) // ....................................................................................................................*... + sub v5.4S, v5.4S, v9.4S // ..................................................................................................................*..... + // gap // ........................................................................................................................ + mls v12.4S, v19.4S, v29.4S // ...............................................................................................................*........ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + str q27, [x0, #112] // .....................................................................................................................*.. + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + mls v18.4S, v5.4S, v29.4S // ...................................................................................................................*.... + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ 
+ // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + mul v21.4S, v6.4S, v3.S[0] // .........................e.............................................................................................. + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + str q12, [x0, #240] // ......................................................................................................................*. + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + sqrdmulh v11.4S, v20.4S, v2.S[3] // .....................e.................................................................................................. + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + str q18, [x0, #368] // .......................................................................................................................* + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ 
+ mls v21.4S, v15.4S, v8.S[0] // ...........................e............................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ // original source code - // ldr q9, [x0, #0] // ...........................................................................................*.............................................................................................. - // ldr q10, [x0, #(1*(1024/8))] // ...........................................................................................|*............................................................................................. - // ldr q11, [x0, #(2*(1024/8))] // e..........................................................................................|....e......................................................................................... - // ldr q12, [x0, #(3*(1024/8))] // ...e.......................................................................................|.......e...................................................................................... - // ldr q13, [x0, #(4*(1024/8))] // ......e....................................................................................|..........e................................................................................... - // ldr q14, [x0, #(5*(1024/8))] // ........e..................................................................................|............e................................................................................. - // ldr q15, [x0, #(6*(1024/8))] // ..e........................................................................................|......e....................................................................................... 
- // ldr q16, [x0, #(7*(1024/8))] // ....e......................................................................................|........e..................................................................................... - // sub v24.4s, v9.4s, v10.4s // .......*...................................................................................|...........*.................................................................................. - // add v9.4s, v9.4s, v10.4s // .........*.................................................................................|.............*................................................................................ - // mul v10.4s, v24.4s, v1.s[2] // ................*..........................................................................|....................*......................................................................... - // sqrdmulh v24.4s, v24.4s, v1.s[3] // .....................*.....................................................................|.........................*.................................................................... - // mls v10.4s, v24.4s, v8.s[0] // ..............................*............................................................|..................................*........................................................... - // sub v24.4s, v11.4s, v12.4s // ...........................................................................................|.*............................................................................................ - // add v11.4s, v11.4s, v12.4s // ...........................................................................................|...*.......................................................................................... 
- // mul v12.4s, v24.4s, v2.s[0] // .....*.....................................................................................|.........*.................................................................................... - // sqrdmulh v24.4s, v24.4s, v2.s[1] // ..........*................................................................................|..............*............................................................................... - // mls v12.4s, v24.4s, v8.s[0] // ...................*.......................................................................|.......................*...................................................................... - // sub v24.4s, v13.4s, v14.4s // ..........................e................................................................|..............................e............................................................... - // add v13.4s, v13.4s, v14.4s // .*.........................................................................................|.....*........................................................................................ - // mul v14.4s, v24.4s, v2.s[2] // ...........................................................................................|..*........................................................................................... - // sqrdmulh v24.4s, v24.4s, v2.s[3] // ........................................................................................e..|............................................................................................e. - // mls v14.4s, v24.4s, v8.s[0] // ............*..............................................................................|................*............................................................................. 
- // sub v24.4s, v15.4s, v16.4s // .............e.............................................................................|.................e............................................................................ - // add v15.4s, v15.4s, v16.4s // ..................e........................................................................|......................e....................................................................... - // mul v16.4s, v24.4s, v3.s[0] // .......................................................................................e...|...........................................................................................e.. - // sqrdmulh v24.4s, v24.4s, v3.s[1] // ....................................................................................e......|........................................................................................e..... - // mls v16.4s, v24.4s, v8.s[0] // ..........................................................................................e|.............................................................................................. - // sub v24.4s, v9.4s, v11.4s // ..............*............................................................................|..................*........................................................................... - // add v9.4s, v9.4s, v11.4s // ...............*...........................................................................|...................*.......................................................................... - // mul v11.4s, v24.4s, v0.s[2] // .........................*.................................................................|.............................*................................................................ 
- // sqrdmulh v24.4s, v24.4s, v0.s[3] // ...........................*...............................................................|...............................*.............................................................. - // mls v11.4s, v24.4s, v8.s[0] // ...............................*...........................................................|...................................*.......................................................... - // sub v24.4s, v10.4s, v12.4s // .................................*.........................................................|.....................................*........................................................ - // add v10.4s, v10.4s, v12.4s // ..................................*........................................................|......................................*....................................................... - // mul v12.4s, v24.4s, v0.s[2] // ....................................*......................................................|........................................*..................................................... - // sqrdmulh v24.4s, v24.4s, v0.s[3] // .......................................*...................................................|...........................................*.................................................. - // mls v12.4s, v24.4s, v8.s[0] // ............................................*..............................................|................................................*............................................. - // sub v24.4s, v13.4s, v15.4s // ...........*...............................................................................|...............*.............................................................................. 
- // add v13.4s, v13.4s, v15.4s // .................*.........................................................................|.....................*........................................................................ - // mul v15.4s, v24.4s, v1.s[0] // ............................*..............................................................|................................*............................................................. - // sqrdmulh v24.4s, v24.4s, v1.s[1] // .............................*.............................................................|.................................*............................................................ - // mls v15.4s, v24.4s, v8.s[0] // ................................*..........................................................|....................................*......................................................... - // sub v24.4s, v14.4s, v16.4s // ....................*......................................................................|........................*..................................................................... - // add v14.4s, v14.4s, v16.4s // ......................*....................................................................|..........................*................................................................... - // mul v16.4s, v24.4s, v1.s[0] // ...................................*.......................................................|.......................................*...................................................... - // sqrdmulh v24.4s, v24.4s, v1.s[1] // .........................................*.................................................|.............................................*................................................ 
- // mls v16.4s, v24.4s, v8.s[0] // .............................................*.............................................|.................................................*............................................ - // sub v24.4s, v9.4s, v13.4s // .......................*...................................................................|...........................*.................................................................. - // add v9.4s, v9.4s, v13.4s // ........................*..................................................................|............................*................................................................. - // mul v13.4s, v24.4s, v0.s[0] // ...........................................*...............................................|...............................................*.............................................. - // sqrdmulh v24.4s, v24.4s, v0.s[1] // ..............................................*............................................|..................................................*........................................... - // mls v13.4s, v24.4s, v8.s[0] // ....................................................*......................................|........................................................*..................................... - // sub v24.4s, v10.4s, v14.4s // .....................................*.....................................................|.........................................*.................................................... - // add v10.4s, v10.4s, v14.4s // ......................................*....................................................|..........................................*................................................... 
- // mul v14.4s, v24.4s, v0.s[0] // ...............................................*...........................................|...................................................*.......................................... - // sqrdmulh v24.4s, v24.4s, v0.s[1] // .....................................................*.....................................|.........................................................*.................................... - // mls v14.4s, v24.4s, v8.s[0] // .........................................................*.................................|.............................................................*................................ - // sub v24.4s, v11.4s, v15.4s // ........................................*..................................................|............................................*................................................. - // add v11.4s, v11.4s, v15.4s // ..........................................*................................................|..............................................*............................................... - // mul v15.4s, v24.4s, v0.s[0] // ......................................................*....................................|..........................................................*................................... - // sqrdmulh v24.4s, v24.4s, v0.s[1] // ........................................................*..................................|............................................................*................................. - // mls v15.4s, v24.4s, v8.s[0] // ...........................................................*...............................|...............................................................*.............................. 
- // sub v24.4s, v12.4s, v16.4s // ................................................*..........................................|....................................................*......................................... - // add v12.4s, v12.4s, v16.4s // .................................................*.........................................|.....................................................*........................................ - // mul v16.4s, v24.4s, v0.s[0] // ..........................................................*................................|..............................................................*............................... - // sqrdmulh v24.4s, v24.4s, v0.s[1] // .............................................................*.............................|.................................................................*............................ - // mls v16.4s, v24.4s, v8.s[0] // .................................................................*.........................|.....................................................................*........................ - // srshr v24.4S, v13.4S, #23 // .......................................................*...................................|...........................................................*.................................. - // mls v13.4s, v24.4s, v8.4s // ..............................................................*............................|..................................................................*........................... - // srshr v24.4S, v14.4S, #23 // ............................................................*..............................|................................................................*............................. 
- // mls v14.4s, v24.4s, v8.4s // ................................................................*..........................|....................................................................*......................... - // srshr v24.4S, v15.4S, #23 // ...............................................................*...........................|...................................................................*.......................... - // mls v15.4s, v24.4s, v8.4s // ...................................................................*.......................|.......................................................................*...................... - // srshr v24.4S, v16.4S, #23 // ......................................................................*....................|..........................................................................*................... - // mls v16.4s, v24.4s, v8.4s // .........................................................................*.................|.............................................................................*................ - // str q13, [x0, #(4*(1024/8))] // ..................................................................*........................|......................................................................*....................... - // str q14, [x0, #(5*(1024/8))] // ....................................................................*......................|........................................................................*..................... - // str q15, [x0, #(6*(1024/8))] // ........................................................................*..................|............................................................................*................. 
- // str q16, [x0, #(7*(1024/8))] // .............................................................................*.............|.................................................................................*............ - // mul v13.4s, v9.4s, v25.4s // ..................................................*........................................|......................................................*....................................... - // sqrdmulh v9.4s, v9.4s, v26.4s // ...................................................*.......................................|.......................................................*...................................... - // mls v13.4s, v9.4s, v8.s[0] // .....................................................................*.....................|.........................................................................*.................... - // mul v14.4s, v10.4s, v25.4s // .......................................................................*...................|...........................................................................*.................. - // sqrdmulh v10.4s, v10.4s, v26.4s // ...........................................................................*...............|...............................................................................*.............. - // mls v14.4s, v10.4s, v8.s[0] // ...............................................................................*...........|...................................................................................*.......... - // mul v15.4s, v11.4s, v25.4s // ............................................................................*..............|................................................................................*............. 
- // sqrdmulh v11.4s, v11.4s, v26.4s // ..............................................................................*............|..................................................................................*........... - // mls v15.4s, v11.4s, v8.s[0] // ...................................................................................*.......|.......................................................................................*...... - // mul v16.4s, v12.4s, v25.4s // ................................................................................*..........|....................................................................................*......... - // sqrdmulh v12.4s, v12.4s, v26.4s // .................................................................................*.........|.....................................................................................*........ - // mls v16.4s, v12.4s, v8.s[0] // .....................................................................................*.....|.........................................................................................*.... - // str q13, [x0], #(16) // ..........................................................................*................|..............................................................................*............... - // str q14, [x0, #(-16 + 1*(1024/8))] // ..................................................................................*........|......................................................................................*....... - // str q15, [x0, #(-16 + 2*(1024/8))] // ......................................................................................*....|..........................................................................................*... 
- // str q16, [x0, #(-16 + 3*(1024/8))] // .........................................................................................*.|.............................................................................................* + // ldr q9, [x0, #0] // ...................................................................................................................*...................................................................................................................... + // ldr q10, [x0, #(1*(1024/8))] // ...................................................................................................................|*..................................................................................................................... + // ldr q11, [x0, #(2*(1024/8))] // e..................................................................................................................|....e................................................................................................................. + // ldr q12, [x0, #(3*(1024/8))] // ...e...............................................................................................................|.......e.............................................................................................................. + // ldr q13, [x0, #(4*(1024/8))] // .....e.............................................................................................................|.........e............................................................................................................ + // ldr q14, [x0, #(5*(1024/8))] // ......e............................................................................................................|..........e........................................................................................................... 
+ // ldr q15, [x0, #(6*(1024/8))] // ..e................................................................................................................|......e............................................................................................................... + // ldr q16, [x0, #(7*(1024/8))] // ........e..........................................................................................................|............e......................................................................................................... + // sub v24.4s, v9.4s, v10.4s // .........*.........................................................................................................|.............*........................................................................................................ + // add v9.4s, v9.4s, v10.4s // .......*...........................................................................................................|...........*.......................................................................................................... + // mul v10.4s, v24.4s, v1.s[2] // ................*..................................................................................................|....................*................................................................................................. + // sqrdmulh v24.4s, v24.4s, v1.s[3] // .....................*.............................................................................................|.........................*............................................................................................ + // mls v10.4s, v24.4s, v8.s[0] // ..............................*....................................................................................|..................................*................................................................................... 
+ // sub v24.4s, v11.4s, v12.4s // ...................................................................................................................|.*.................................................................................................................... + // add v11.4s, v11.4s, v12.4s // ...................................................................................................................|..*................................................................................................................... + // mul v12.4s, v24.4s, v2.s[0] // ....*..............................................................................................................|........*............................................................................................................. + // sqrdmulh v24.4s, v24.4s, v2.s[1] // ..........*........................................................................................................|..............*....................................................................................................... + // mls v12.4s, v24.4s, v8.s[0] // ...................*...............................................................................................|.......................*.............................................................................................. + // sub v24.4s, v13.4s, v14.4s // ..........................e........................................................................................|..............................e....................................................................................... + // add v13.4s, v13.4s, v14.4s // .*.................................................................................................................|.....*................................................................................................................ 
+ // mul v14.4s, v24.4s, v2.s[2] // ...................................................................................................................|...*.................................................................................................................. + // sqrdmulh v24.4s, v24.4s, v2.s[3] // ................................................................................................................e..|....................................................................................................................e. + // mls v14.4s, v24.4s, v8.s[0] // ............*......................................................................................................|................*..................................................................................................... + // sub v24.4s, v15.4s, v16.4s // .................e.................................................................................................|.....................e................................................................................................ + // add v15.4s, v15.4s, v16.4s // ..................e................................................................................................|......................e............................................................................................... + // mul v16.4s, v24.4s, v3.s[0] // ..............................................................................................................e....|..................................................................................................................e... + // sqrdmulh v24.4s, v24.4s, v3.s[1] // ........................................................................................................e..........|............................................................................................................e......... 
+ // mls v16.4s, v24.4s, v8.s[0] // ..................................................................................................................e|...................................................................................................................... + // sub v24.4s, v9.4s, v11.4s // .............*.....................................................................................................|.................*.................................................................................................... + // add v9.4s, v9.4s, v11.4s // ..............*....................................................................................................|..................*................................................................................................... + // mul v11.4s, v24.4s, v0.s[2] // .........................*.........................................................................................|.............................*........................................................................................ + // sqrdmulh v24.4s, v24.4s, v0.s[3] // ...........................*.......................................................................................|...............................*...................................................................................... + // mls v11.4s, v24.4s, v8.s[0] // ...............................*...................................................................................|...................................*.................................................................................. + // sub v24.4s, v10.4s, v12.4s // .................................*.................................................................................|.....................................*................................................................................ 
+ // add v10.4s, v10.4s, v12.4s // ..................................*................................................................................|......................................*............................................................................... + // mul v12.4s, v24.4s, v0.s[2] // ....................................*..............................................................................|........................................*............................................................................. + // sqrdmulh v24.4s, v24.4s, v0.s[3] // .......................................*...........................................................................|...........................................*.......................................................................... + // mls v12.4s, v24.4s, v8.s[0] // ............................................*......................................................................|................................................*..................................................................... + // sub v24.4s, v13.4s, v15.4s // ...........*.......................................................................................................|...............*...................................................................................................... + // add v13.4s, v13.4s, v15.4s // ...............*...................................................................................................|...................*.................................................................................................. + // mul v15.4s, v24.4s, v1.s[0] // ............................*......................................................................................|................................*..................................................................................... 
+ // sqrdmulh v24.4s, v24.4s, v1.s[1] // .............................*.....................................................................................|.................................*.................................................................................... + // mls v15.4s, v24.4s, v8.s[0] // ................................*..................................................................................|....................................*................................................................................. + // sub v24.4s, v14.4s, v16.4s // ....................*..............................................................................................|........................*............................................................................................. + // add v14.4s, v14.4s, v16.4s // ......................*............................................................................................|..........................*........................................................................................... + // mul v16.4s, v24.4s, v1.s[0] // ...................................*...............................................................................|.......................................*.............................................................................. + // sqrdmulh v24.4s, v24.4s, v1.s[1] // .........................................*.........................................................................|.............................................*........................................................................ + // mls v16.4s, v24.4s, v8.s[0] // .............................................*.....................................................................|.................................................*.................................................................... 
+ // sub v24.4s, v9.4s, v13.4s // .......................*...........................................................................................|...........................*.......................................................................................... + // add v9.4s, v9.4s, v13.4s // ........................*..........................................................................................|............................*......................................................................................... + // mul v13.4s, v24.4s, v0.s[0] // ...........................................*.......................................................................|...............................................*...................................................................... + // sqrdmulh v24.4s, v24.4s, v0.s[1] // ..............................................*....................................................................|..................................................*................................................................... + // mls v13.4s, v24.4s, v8.s[0] // ....................................................*..............................................................|........................................................*............................................................. + // sub v24.4s, v10.4s, v14.4s // .....................................*.............................................................................|.........................................*............................................................................ + // add v10.4s, v10.4s, v14.4s // ......................................*............................................................................|..........................................*........................................................................... 
+ // mul v14.4s, v24.4s, v0.s[0] // ...............................................*...................................................................|...................................................*.................................................................. + // sqrdmulh v24.4s, v24.4s, v0.s[1] // .....................................................*.............................................................|.........................................................*............................................................ + // mls v14.4s, v24.4s, v8.s[0] // ..........................................................*........................................................|..............................................................*....................................................... + // sub v24.4s, v11.4s, v15.4s // ........................................*..........................................................................|............................................*......................................................................... + // add v11.4s, v11.4s, v15.4s // ..........................................*........................................................................|..............................................*....................................................................... + // mul v15.4s, v24.4s, v0.s[0] // ......................................................*............................................................|..........................................................*........................................................... + // sqrdmulh v24.4s, v24.4s, v0.s[1] // ........................................................*..........................................................|............................................................*......................................................... 
+ // mls v15.4s, v24.4s, v8.s[0] // .............................................................*.....................................................|.................................................................*.................................................... + // sub v24.4s, v12.4s, v16.4s // ................................................*..................................................................|....................................................*................................................................. + // add v12.4s, v12.4s, v16.4s // .................................................*.................................................................|.....................................................*................................................................ + // mul v16.4s, v24.4s, v0.s[0] // ............................................................*......................................................|................................................................*..................................................... + // sqrdmulh v24.4s, v24.4s, v0.s[1] // ...............................................................*...................................................|...................................................................*.................................................. + // mls v16.4s, v24.4s, v8.s[0] // ......................................................................*............................................|..........................................................................*........................................... + // cmge v27.4s, v31.4s, v13.4s // .......................................................*...........................................................|...........................................................*.......................................................... 
+ // cmge v28.4s, v13.4s, v30.4s // .........................................................*.........................................................|.............................................................*........................................................ + // sub v28.4s, v27.4s, v28.4s // ...........................................................*.......................................................|...............................................................*...................................................... + // mls v13.4s, v28.4s, v29.4s // .................................................................*.................................................|.....................................................................*................................................ + // cmge v27.4s, v31.4s, v14.4s // ..............................................................*....................................................|..................................................................*................................................... + // cmge v28.4s, v14.4s, v30.4s // ................................................................*..................................................|....................................................................*................................................. + // sub v28.4s, v27.4s, v28.4s // ..................................................................*................................................|......................................................................*............................................... + // mls v14.4s, v28.4s, v29.4s // ........................................................................*..........................................|............................................................................*......................................... 
+ // cmge v27.4s, v31.4s, v15.4s // ....................................................................*..............................................|........................................................................*............................................. + // cmge v28.4s, v15.4s, v30.4s // .....................................................................*.............................................|.........................................................................*............................................ + // sub v28.4s, v27.4s, v28.4s // .........................................................................*.........................................|.............................................................................*........................................ + // mls v15.4s, v28.4s, v29.4s // ..............................................................................*....................................|..................................................................................*................................... + // cmge v27.4s, v31.4s, v16.4s // .............................................................................*.....................................|.................................................................................*.................................... + // cmge v28.4s, v16.4s, v30.4s // ...............................................................................*...................................|...................................................................................*.................................. + // sub v28.4s, v27.4s, v28.4s // ...................................................................................*...............................|.......................................................................................*.............................. 
+ // mls v16.4s, v28.4s, v29.4s // ...............................................................................................*...................|...................................................................................................*.................. + // str q13, [x0, #(4*(1024/8))] // .......................................................................*...........................................|...........................................................................*.......................................... + // str q14, [x0, #(5*(1024/8))] // ................................................................................*..................................|....................................................................................*................................. + // str q15, [x0, #(6*(1024/8))] // .....................................................................................*.............................|.........................................................................................*............................ + // str q16, [x0, #(7*(1024/8))] // ......................................................................................................*............|..........................................................................................................*........... + // mul v13.4s, v9.4s, v25.4s // ..................................................*................................................................|......................................................*............................................................... + // sqrdmulh v9.4s, v9.4s, v26.4s // ...................................................*...............................................................|.......................................................*.............................................................. 
+ // mls v13.4s, v9.4s, v8.s[0] // ...................................................................*...............................................|.......................................................................*.............................................. + // mul v14.4s, v10.4s, v25.4s // ............................................................................*......................................|................................................................................*..................................... + // sqrdmulh v10.4s, v10.4s, v26.4s // ..................................................................................*................................|......................................................................................*............................... + // mls v14.4s, v10.4s, v8.s[0] // .......................................................................................*...........................|...........................................................................................*.......................... + // mul v15.4s, v11.4s, v25.4s // ....................................................................................*..............................|........................................................................................*............................. + // sqrdmulh v11.4s, v11.4s, v26.4s // ......................................................................................*............................|..........................................................................................*........................... + // mls v15.4s, v11.4s, v8.s[0] // .........................................................................................*.........................|.............................................................................................*........................ 
+ // mul v16.4s, v12.4s, v25.4s // ...........................................................................................*.......................|...............................................................................................*...................... + // sqrdmulh v12.4s, v12.4s, v26.4s // ........................................................................................*..........................|............................................................................................*......................... + // mls v16.4s, v12.4s, v8.s[0] // .............................................................................................*.....................|.................................................................................................*.................... + // cmge v27.4s, v31.4s, v13.4s // ..........................................................................*........................................|..............................................................................*....................................... + // cmge v28.4s, v13.4s, v30.4s // ...........................................................................*.......................................|...............................................................................*...................................... + // sub v28.4s, v27.4s, v28.4s // .................................................................................*.................................|.....................................................................................*................................ + // mls v13.4s, v28.4s, v29.4s // ..................................................................................................*................|......................................................................................................*............... 
+ // cmge v27.4s, v31.4s, v14.4s // ..........................................................................................*........................|..............................................................................................*....................... + // cmge v28.4s, v14.4s, v30.4s // ............................................................................................*......................|................................................................................................*..................... + // sub v28.4s, v27.4s, v28.4s // ..............................................................................................*....................|..................................................................................................*................... + // mls v14.4s, v28.4s, v29.4s // ....................................................................................................*..............|........................................................................................................*............. + // cmge v27.4s, v31.4s, v15.4s // ................................................................................................*..................|....................................................................................................*................. + // cmge v28.4s, v15.4s, v30.4s // .................................................................................................*.................|.....................................................................................................*................ + // sub v28.4s, v27.4s, v28.4s // .......................................................................................................*...........|...........................................................................................................*.......... 
+ // mls v15.4s, v28.4s, v29.4s // ...........................................................................................................*.......|...............................................................................................................*...... + // cmge v27.4s, v31.4s, v16.4s // ...................................................................................................*...............|.......................................................................................................*.............. + // cmge v28.4s, v16.4s, v30.4s // .....................................................................................................*.............|.........................................................................................................*............ + // sub v28.4s, v27.4s, v28.4s // ..........................................................................................................*........|..............................................................................................................*....... + // mls v16.4s, v28.4s, v29.4s // .............................................................................................................*.....|.................................................................................................................*.... + // str q13, [x0], #(16) // .........................................................................................................*.........|.............................................................................................................*........ + // str q14, [x0, #(-16 + 1*(1024/8))] // ............................................................................................................*......|................................................................................................................*..... 
+ // str q15, [x0, #(-16 + 2*(1024/8))] // ...............................................................................................................*...|...................................................................................................................*.. + // str q16, [x0, #(-16 + 3*(1024/8))] // .................................................................................................................*.|.....................................................................................................................* sub count, count, #1 cbnz count, layer123_start - sub v28.4S, v6.4S, v29.4S // ..*................................................................................ - mul v17.4S, v30.4S, v2.S[2] // ...*............................................................................... - ldr q21, [x0, #0] // *.................................................................................. - add v14.4S, v6.4S, v29.4S // ....*.............................................................................. - ldr q24, [x0, #128] // .*................................................................................. - // gap // ................................................................................... - add v6.4S, v9.4S, v7.4S // .....*............................................................................. - mls v17.4S, v18.4S, v8.S[0] // ...........*....................................................................... - // gap // ................................................................................... - // gap // ................................................................................... - // gap // ................................................................................... - // gap // ................................................................................... - mul v31.4S, v28.4S, v2.S[0] // ......*............................................................................ 
- // gap // ................................................................................... - // gap // ................................................................................... - sub v15.4S, v6.4S, v23.4S // ..........*........................................................................ - // gap // ................................................................................... - // gap // ................................................................................... - add v6.4S, v6.4S, v23.4S // ...............*................................................................... - sqrdmulh v28.4S, v28.4S, v2.S[1] // .........*......................................................................... - // gap // ................................................................................... - sub v29.4S, v17.4S, v10.4S // .................*................................................................. - // gap // ................................................................................... - // gap // ................................................................................... - add v17.4S, v17.4S, v10.4S // ...................*............................................................... - mul v13.4S, v15.4S, v1.S[0] // ........................*.......................................................... - // gap // ................................................................................... - add v16.4S, v21.4S, v24.4S // ........*.......................................................................... - // gap // ................................................................................... - // gap // ................................................................................... - sub v21.4S, v21.4S, v24.4S // .......*........................................................................... 
- sqrdmulh v24.4S, v15.4S, v1.S[1] // .........................*......................................................... - // gap // ................................................................................... - // gap // ................................................................................... - // gap // ................................................................................... - // gap // ................................................................................... - mls v31.4S, v28.4S, v8.S[0] // ................*.................................................................. - sub v28.4S, v16.4S, v14.4S // ............*...................................................................... - // gap // ................................................................................... - add v14.4S, v16.4S, v14.4S // .............*..................................................................... - // gap // ................................................................................... - // gap // ................................................................................... - mul v15.4S, v21.4S, v1.S[2] // ..............*.................................................................... - // gap // ................................................................................... - // gap // ................................................................................... - // gap // ................................................................................... - // gap // ................................................................................... - // gap // ................................................................................... - sqrdmulh v21.4S, v21.4S, v1.S[3] // ..................*................................................................ - sub v16.4S, v14.4S, v6.4S // ....................*.............................................................. 
- // gap // ................................................................................... - add v14.4S, v14.4S, v6.4S // .....................*............................................................. - // gap // ................................................................................... - // gap // ................................................................................... - mul v6.4S, v28.4S, v0.S[2] // ......................*............................................................ - // gap // ................................................................................... - // gap // ................................................................................... - // gap // ................................................................................... - // gap // ................................................................................... - // gap // ................................................................................... - sqrdmulh v28.4S, v28.4S, v0.S[3] // .......................*........................................................... - // gap // ................................................................................... - // gap // ................................................................................... - // gap // ................................................................................... - // gap // ................................................................................... - // gap // ................................................................................... - mul v9.4S, v29.4S, v1.S[0] // ...............................*................................................... - // gap // ................................................................................... - // gap // ................................................................................... 
- // gap // ................................................................................... - // gap // ................................................................................... - // gap // ................................................................................... - sqrdmulh v29.4S, v29.4S, v1.S[1] // .....................................*............................................. - // gap // ................................................................................... - // gap // ................................................................................... - // gap // ................................................................................... - // gap // ................................................................................... - // gap // ................................................................................... - mls v15.4S, v21.4S, v8.S[0] // ..........................*........................................................ - // gap // ................................................................................... - // gap // ................................................................................... - // gap // ................................................................................... - // gap // ................................................................................... - // gap // ................................................................................... - mls v13.4S, v24.4S, v8.S[0] // ............................*...................................................... - // gap // ................................................................................... - // gap // ................................................................................... - // gap // ................................................................................... - // gap // ................................................................................... 
- // gap // ................................................................................... - mls v6.4S, v28.4S, v8.S[0] // ...........................*....................................................... - // gap // ................................................................................... - // gap // ................................................................................... - add v21.4S, v15.4S, v31.4S // ..............................*.................................................... - // gap // ................................................................................... - // gap // ................................................................................... - sub v24.4S, v15.4S, v31.4S // .............................*..................................................... - mul v28.4S, v16.4S, v0.S[0] // .......................................*........................................... - // gap // ................................................................................... - // gap // ................................................................................... - // gap // ................................................................................... - // gap // ................................................................................... - mls v9.4S, v29.4S, v8.S[0] // .........................................*......................................... - sub v31.4S, v21.4S, v17.4S // .................................*................................................. - // gap // ................................................................................... - add v17.4S, v21.4S, v17.4S // ..................................*................................................ - // gap // ................................................................................... - // gap // ................................................................................... 
- mul v21.4S, v24.4S, v0.S[2] // ................................*.................................................. - sub v15.4S, v6.4S, v13.4S // ....................................*.............................................. - // gap // ................................................................................... - add v6.4S, v6.4S, v13.4S // ......................................*............................................ - // gap // ................................................................................... - // gap // ................................................................................... - sqrdmulh v24.4S, v24.4S, v0.S[3] // ...................................*............................................... - // gap // ................................................................................... - // gap // ................................................................................... - // gap // ................................................................................... - // gap // ................................................................................... - // gap // ................................................................................... - sqrdmulh v29.4S, v16.4S, v0.S[1] // ..........................................*........................................ - // gap // ................................................................................... - // gap // ................................................................................... - // gap // ................................................................................... - // gap // ................................................................................... - // gap // ................................................................................... - mul v13.4S, v31.4S, v0.S[0] // ...........................................*....................................... 
- // gap // ................................................................................... - // gap // ................................................................................... - // gap // ................................................................................... - // gap // ................................................................................... - // gap // ................................................................................... - mls v21.4S, v24.4S, v8.S[0] // ........................................*.......................................... - // gap // ................................................................................... - // gap // ................................................................................... - // gap // ................................................................................... - // gap // ................................................................................... - // gap // ................................................................................... - mul v24.4S, v14.4S, v25.4S // ..............................................*.................................... - // gap // ................................................................................... - // gap // ................................................................................... - // gap // ................................................................................... - // gap // ................................................................................... - // gap // ................................................................................... - sqrdmulh v14.4S, v14.4S, v26.4S // ...............................................*................................... - // gap // ................................................................................... - // gap // ................................................................................... 
- sub v16.4S, v21.4S, v9.4S // ............................................*...................................... - // gap // ................................................................................... - // gap // ................................................................................... - mls v28.4S, v29.4S, v8.S[0] // ................................................*.................................. - add v21.4S, v21.4S, v9.4S // .............................................*..................................... - // gap // ................................................................................... - // gap // ................................................................................... - // gap // ................................................................................... - // gap // ................................................................................... - sqrdmulh v31.4S, v31.4S, v0.S[1] // .................................................*................................. - // gap // ................................................................................... - // gap // ................................................................................... - // gap // ................................................................................... - // gap // ................................................................................... - // gap // ................................................................................... - mul v29.4S, v15.4S, v0.S[0] // ..................................................*................................ - // gap // ................................................................................... - // gap // ................................................................................... - srshr v9.4S, v28.4S, #23 // ...................................................*............................... 
- // gap // ................................................................................... - // gap // ................................................................................... - sqrdmulh v15.4S, v15.4S, v0.S[1] // ....................................................*.............................. - // gap // ................................................................................... - // gap // ................................................................................... - // gap // ................................................................................... - // gap // ................................................................................... - // gap // ................................................................................... - mls v13.4S, v31.4S, v8.S[0] // .....................................................*............................. - // gap // ................................................................................... - // gap // ................................................................................... - // gap // ................................................................................... - // gap // ................................................................................... - // gap // ................................................................................... - mul v31.4S, v16.4S, v0.S[0] // ......................................................*............................ - // gap // ................................................................................... - // gap // ................................................................................... - // gap // ................................................................................... - // gap // ................................................................................... - // gap // ................................................................................... 
- mls v29.4S, v15.4S, v8.S[0] // .......................................................*........................... - // gap // ................................................................................... - // gap // ................................................................................... - srshr v15.4S, v13.4S, #23 // ........................................................*.......................... - // gap // ................................................................................... - // gap // ................................................................................... - sqrdmulh v16.4S, v16.4S, v0.S[1] // .........................................................*......................... - // gap // ................................................................................... - // gap // ................................................................................... - // gap // ................................................................................... - // gap // ................................................................................... - // gap // ................................................................................... - mls v28.4S, v9.4S, v8.4S // ..........................................................*........................ - // gap // ................................................................................... - // gap // ................................................................................... - srshr v9.4S, v29.4S, #23 // ...........................................................*....................... - // gap // ................................................................................... - // gap // ................................................................................... - mls v13.4S, v15.4S, v8.4S // ............................................................*...................... 
- // gap // ................................................................................... - // gap // ................................................................................... - // gap // ................................................................................... - // gap // ................................................................................... - // gap // ................................................................................... - mls v31.4S, v16.4S, v8.S[0] // .............................................................*..................... - // gap // ................................................................................... - // gap // ................................................................................... - str q28, [x0, #512] // ..............................................................*.................... - // gap // ................................................................................... - // gap // ................................................................................... - mls v29.4S, v9.4S, v8.4S // ...............................................................*................... - // gap // ................................................................................... - // gap // ................................................................................... - str q13, [x0, #640] // ................................................................*.................. - // gap // ................................................................................... - // gap // ................................................................................... - mls v24.4S, v14.4S, v8.S[0] // .................................................................*................. - // gap // ................................................................................... 
- // gap // ................................................................................... - srshr v14.4S, v31.4S, #23 // ..................................................................*................ - // gap // ................................................................................... - // gap // ................................................................................... - mul v28.4S, v17.4S, v25.4S // ...................................................................*............... - // gap // ................................................................................... - // gap // ................................................................................... - str q29, [x0, #768] // ....................................................................*.............. - // gap // ................................................................................... - // gap // ................................................................................... - mls v31.4S, v14.4S, v8.4S // .....................................................................*............. - // gap // ................................................................................... - // gap // ................................................................................... - str q24, [x0], #(16) // ......................................................................*............ - // gap // ................................................................................... - // gap // ................................................................................... - sqrdmulh v17.4S, v17.4S, v26.4S // .......................................................................*........... - // gap // ................................................................................... - // gap // ................................................................................... 
- // gap // ................................................................................... - // gap // ................................................................................... - // gap // ................................................................................... - mul v14.4S, v6.4S, v25.4S // ........................................................................*.......... - // gap // ................................................................................... - // gap // ................................................................................... - str q31, [x0, #880] // .........................................................................*......... - // gap // ................................................................................... - // gap // ................................................................................... - sqrdmulh v24.4S, v6.4S, v26.4S // ..........................................................................*........ - // gap // ................................................................................... - // gap // ................................................................................... - // gap // ................................................................................... - // gap // ................................................................................... - // gap // ................................................................................... - mls v28.4S, v17.4S, v8.S[0] // ...........................................................................*....... - // gap // ................................................................................... - // gap // ................................................................................... - // gap // ................................................................................... - // gap // ................................................................................... 
- // gap // ................................................................................... - sqrdmulh v17.4S, v21.4S, v26.4S // .............................................................................*..... - // gap // ................................................................................... - // gap // ................................................................................... - // gap // ................................................................................... - // gap // ................................................................................... - // gap // ................................................................................... - mul v21.4S, v21.4S, v25.4S // ............................................................................*...... - // gap // ................................................................................... - // gap // ................................................................................... - str q28, [x0, #112] // ..............................................................................*.... - // gap // ................................................................................... - // gap // ................................................................................... - mls v14.4S, v24.4S, v8.S[0] // ...............................................................................*... - // gap // ................................................................................... - // gap // ................................................................................... - // gap // ................................................................................... - // gap // ................................................................................... - // gap // ................................................................................... 
- mls v21.4S, v17.4S, v8.S[0] // ................................................................................*.. - // gap // ................................................................................... - // gap // ................................................................................... - // gap // ................................................................................... - // gap // ................................................................................... - // gap // ................................................................................... - // gap // ................................................................................... - // gap // ................................................................................... - // gap // ................................................................................... - str q14, [x0, #240] // .................................................................................*. - // gap // ................................................................................... - // gap // ................................................................................... - // gap // ................................................................................... - // gap // ................................................................................... - // gap // ................................................................................... - str q21, [x0, #368] // ..................................................................................* - // gap // ................................................................................... - // gap // ................................................................................... + mul v19.4S, v20.4S, v2.S[2] // ....*...................................................................................................... 
+ add v5.4S, v22.4S, v4.4S // .....*..................................................................................................... + ldr q9, [x0, #0] // *.......................................................................................................... + ldr q12, [x0, #128] // .*......................................................................................................... + sub v15.4S, v13.4S, v17.4S // ..*........................................................................................................ + // gap // ........................................................................................................... + add v13.4S, v13.4S, v17.4S // ...*....................................................................................................... + mls v19.4S, v11.4S, v8.S[0] // ...........*............................................................................................... + // gap // ........................................................................................................... + sub v18.4S, v5.4S, v23.4S // ..........*................................................................................................ + // gap // ........................................................................................................... + // gap // ........................................................................................................... + add v5.4S, v5.4S, v23.4S // ..............*............................................................................................ + mul v27.4S, v15.4S, v2.S[0] // ......*.................................................................................................... + // gap // ........................................................................................................... + add v17.4S, v9.4S, v12.4S // .......*................................................................................................... 
+ // gap // ........................................................................................................... + // gap // ........................................................................................................... + sub v9.4S, v9.4S, v12.4S // ........*.................................................................................................. + sqrdmulh v12.4S, v15.4S, v2.S[1] // .........*................................................................................................. + // gap // ........................................................................................................... + sub v15.4S, v19.4S, v21.4S // .................*......................................................................................... + // gap // ........................................................................................................... + // gap // ........................................................................................................... + add v19.4S, v19.4S, v21.4S // ...................*....................................................................................... + mul v28.4S, v18.4S, v1.S[0] // ........................*.................................................................................. + // gap // ........................................................................................................... + sub v22.4S, v17.4S, v13.4S // ............*.............................................................................................. + // gap // ........................................................................................................... + // gap // ........................................................................................................... + add v13.4S, v17.4S, v13.4S // .............*............................................................................................. 
+ sqrdmulh v18.4S, v18.4S, v1.S[1] // .........................*................................................................................. + // gap // ........................................................................................................... + // gap // ........................................................................................................... + // gap // ........................................................................................................... + // gap // ........................................................................................................... + mul v17.4S, v9.4S, v1.S[2] // ...............*........................................................................................... + // gap // ........................................................................................................... + // gap // ........................................................................................................... + sub v4.4S, v13.4S, v5.4S // ....................*...................................................................................... + // gap // ........................................................................................................... + // gap // ........................................................................................................... + add v5.4S, v13.4S, v5.4S // .....................*..................................................................................... + mls v27.4S, v12.4S, v8.S[0] // ................*.......................................................................................... + // gap // ........................................................................................................... + // gap // ........................................................................................................... 
+ // gap // ........................................................................................................... + // gap // ........................................................................................................... + sqrdmulh v9.4S, v9.4S, v1.S[3] // ..................*........................................................................................ + // gap // ........................................................................................................... + // gap // ........................................................................................................... + // gap // ........................................................................................................... + // gap // ........................................................................................................... + // gap // ........................................................................................................... + mul v12.4S, v22.4S, v0.S[2] // ......................*.................................................................................... + // gap // ........................................................................................................... + // gap // ........................................................................................................... + // gap // ........................................................................................................... + // gap // ........................................................................................................... + // gap // ........................................................................................................... + sqrdmulh v13.4S, v22.4S, v0.S[3] // .......................*................................................................................... + // gap // ........................................................................................................... 
+ // gap // ........................................................................................................... + // gap // ........................................................................................................... + // gap // ........................................................................................................... + // gap // ........................................................................................................... + mul v22.4S, v15.4S, v1.S[0] // ...............................*........................................................................... + // gap // ........................................................................................................... + // gap // ........................................................................................................... + // gap // ........................................................................................................... + // gap // ........................................................................................................... + // gap // ........................................................................................................... + sqrdmulh v15.4S, v15.4S, v1.S[1] // .....................................*..................................................................... + // gap // ........................................................................................................... + // gap // ........................................................................................................... + // gap // ........................................................................................................... + // gap // ........................................................................................................... + // gap // ........................................................................................................... 
+ mls v17.4S, v9.4S, v8.S[0] // ..........................*................................................................................ + // gap // ........................................................................................................... + // gap // ........................................................................................................... + // gap // ........................................................................................................... + // gap // ........................................................................................................... + // gap // ........................................................................................................... + mls v28.4S, v18.4S, v8.S[0] // ............................*.............................................................................. + // gap // ........................................................................................................... + // gap // ........................................................................................................... + // gap // ........................................................................................................... + // gap // ........................................................................................................... + // gap // ........................................................................................................... + mls v12.4S, v13.4S, v8.S[0] // ...........................*............................................................................... + // gap // ........................................................................................................... + // gap // ........................................................................................................... 
+ add v9.4S, v17.4S, v27.4S // ..............................*............................................................................ + // gap // ........................................................................................................... + // gap // ........................................................................................................... + sub v13.4S, v17.4S, v27.4S // .............................*............................................................................. + mul v18.4S, v4.4S, v0.S[0] // .......................................*................................................................... + // gap // ........................................................................................................... + // gap // ........................................................................................................... + // gap // ........................................................................................................... + // gap // ........................................................................................................... + mls v22.4S, v15.4S, v8.S[0] // .........................................*................................................................. + sub v15.4S, v9.4S, v19.4S // .................................*......................................................................... + // gap // ........................................................................................................... + add v19.4S, v9.4S, v19.4S // ..................................*........................................................................ + // gap // ........................................................................................................... + // gap // ........................................................................................................... 
+ mul v9.4S, v13.4S, v0.S[2] // ................................*.......................................................................... + sub v27.4S, v12.4S, v28.4S // ....................................*...................................................................... + // gap // ........................................................................................................... + add v12.4S, v12.4S, v28.4S // ......................................*.................................................................... + // gap // ........................................................................................................... + // gap // ........................................................................................................... + sqrdmulh v13.4S, v13.4S, v0.S[3] // ...................................*....................................................................... + // gap // ........................................................................................................... + // gap // ........................................................................................................... + // gap // ........................................................................................................... + // gap // ........................................................................................................... + // gap // ........................................................................................................... + sqrdmulh v17.4S, v4.4S, v0.S[1] // ..........................................*................................................................ + // gap // ........................................................................................................... + // gap // ........................................................................................................... 
+ // gap // ........................................................................................................... + // gap // ........................................................................................................... + // gap // ........................................................................................................... + mul v28.4S, v15.4S, v0.S[0] // ...........................................*............................................................... + // gap // ........................................................................................................... + // gap // ........................................................................................................... + // gap // ........................................................................................................... + // gap // ........................................................................................................... + // gap // ........................................................................................................... + mul v4.4S, v5.4S, v25.4S // ..............................................*............................................................ + // gap // ........................................................................................................... + // gap // ........................................................................................................... + // gap // ........................................................................................................... + // gap // ........................................................................................................... + // gap // ........................................................................................................... + sqrdmulh v5.4S, v5.4S, v26.4S // ...............................................*........................................................... 
+ // gap // ........................................................................................................... + // gap // ........................................................................................................... + // gap // ........................................................................................................... + // gap // ........................................................................................................... + // gap // ........................................................................................................... + mls v9.4S, v13.4S, v8.S[0] // ........................................*.................................................................. + // gap // ........................................................................................................... + // gap // ........................................................................................................... + // gap // ........................................................................................................... + // gap // ........................................................................................................... + // gap // ........................................................................................................... + mls v18.4S, v17.4S, v8.S[0] // ................................................*.......................................................... + // gap // ........................................................................................................... + // gap // ........................................................................................................... + // gap // ........................................................................................................... + // gap // ........................................................................................................... 
+ // gap // ........................................................................................................... + sqrdmulh v15.4S, v15.4S, v0.S[1] // .................................................*......................................................... + // gap // ........................................................................................................... + // gap // ........................................................................................................... + sub v13.4S, v9.4S, v22.4S // ............................................*.............................................................. + // gap // ........................................................................................................... + // gap // ........................................................................................................... + add v9.4S, v9.4S, v22.4S // .............................................*............................................................. + mul v17.4S, v27.4S, v0.S[0] // ..................................................*........................................................ + // gap // ........................................................................................................... + cmge v22.4S, v31.4S, v18.4S // ...................................................*....................................................... + // gap // ........................................................................................................... + // gap // ........................................................................................................... + sqrdmulh v27.4S, v27.4S, v0.S[1] // ....................................................*...................................................... + cmge v16.4S, v18.4S, v30.4S // .....................................................*..................................................... 
+ // gap // ........................................................................................................... + // gap // ........................................................................................................... + // gap // ........................................................................................................... + // gap // ........................................................................................................... + mls v28.4S, v15.4S, v8.S[0] // ......................................................*.................................................... + // gap // ........................................................................................................... + // gap // ........................................................................................................... + sub v15.4S, v22.4S, v16.4S // .......................................................*................................................... + // gap // ........................................................................................................... + // gap // ........................................................................................................... + mul v22.4S, v13.4S, v0.S[0] // ........................................................*.................................................. + // gap // ........................................................................................................... + // gap // ........................................................................................................... + // gap // ........................................................................................................... + // gap // ........................................................................................................... + // gap // ........................................................................................................... 
+ mls v17.4S, v27.4S, v8.S[0] // .........................................................*................................................. + // gap // ........................................................................................................... + // gap // ........................................................................................................... + cmge v27.4S, v31.4S, v28.4S // ..........................................................*................................................ + // gap // ........................................................................................................... + // gap // ........................................................................................................... + sqrdmulh v13.4S, v13.4S, v0.S[1] // ...........................................................*............................................... + cmge v16.4S, v28.4S, v30.4S // ............................................................*.............................................. + // gap // ........................................................................................................... + // gap // ........................................................................................................... + // gap // ........................................................................................................... + // gap // ........................................................................................................... + mls v18.4S, v15.4S, v29.4S // .............................................................*............................................. + // gap // ........................................................................................................... + // gap // ........................................................................................................... 
+ sub v15.4S, v27.4S, v16.4S // ..............................................................*............................................ + // gap // ........................................................................................................... + // gap // ........................................................................................................... + mls v4.4S, v5.4S, v8.S[0] // ...............................................................*........................................... + cmge v5.4S, v31.4S, v17.4S // ................................................................*.......................................... + // gap // ........................................................................................................... + cmge v27.4S, v17.4S, v30.4S // .................................................................*......................................... + // gap // ........................................................................................................... + // gap // ........................................................................................................... + mls v22.4S, v13.4S, v8.S[0] // ..................................................................*........................................ + // gap // ........................................................................................................... + // gap // ........................................................................................................... + str q18, [x0, #512] // ...................................................................*....................................... + // gap // ........................................................................................................... + // gap // ........................................................................................................... 
+ mls v28.4S, v15.4S, v29.4S // ....................................................................*...................................... + sub v5.4S, v5.4S, v27.4S // .....................................................................*..................................... + // gap // ........................................................................................................... + cmge v15.4S, v31.4S, v4.4S // ......................................................................*.................................... + // gap // ........................................................................................................... + // gap // ........................................................................................................... + cmge v13.4S, v4.4S, v30.4S // .......................................................................*................................... + mul v18.4S, v19.4S, v25.4S // ........................................................................*.................................. + // gap // ........................................................................................................... + cmge v27.4S, v31.4S, v22.4S // .........................................................................*................................. + // gap // ........................................................................................................... + // gap // ........................................................................................................... + sqrdmulh v19.4S, v19.4S, v26.4S // ..............................................................................*............................ + cmge v16.4S, v22.4S, v30.4S // ...........................................................................*............................... + // gap // ........................................................................................................... 
+ str q28, [x0, #640] // ............................................................................*.............................. + sub v15.4S, v15.4S, v13.4S // .............................................................................*............................. + // gap // ........................................................................................................... + mul v13.4S, v12.4S, v25.4S // ................................................................................*.......................... + // gap // ........................................................................................................... + // gap // ........................................................................................................... + sub v27.4S, v27.4S, v16.4S // ...............................................................................*........................... + // gap // ........................................................................................................... + // gap // ........................................................................................................... + sqrdmulh v12.4S, v12.4S, v26.4S // ..................................................................................*........................ + // gap // ........................................................................................................... + // gap // ........................................................................................................... + // gap // ........................................................................................................... + // gap // ........................................................................................................... + // gap // ........................................................................................................... 
+ mls v18.4S, v19.4S, v8.S[0] // ...................................................................................*....................... + // gap // ........................................................................................................... + // gap // ........................................................................................................... + // gap // ........................................................................................................... + // gap // ........................................................................................................... + // gap // ........................................................................................................... + sqrdmulh v19.4S, v9.4S, v26.4S // ....................................................................................*...................... + // gap // ........................................................................................................... + // gap // ........................................................................................................... + // gap // ........................................................................................................... + // gap // ........................................................................................................... + // gap // ........................................................................................................... + mls v13.4S, v12.4S, v8.S[0] // .....................................................................................*..................... + // gap // ........................................................................................................... + // gap // ........................................................................................................... 
+ cmge v12.4S, v31.4S, v18.4S // ......................................................................................*.................... + // gap // ........................................................................................................... + // gap // ........................................................................................................... + mul v9.4S, v9.4S, v25.4S // .......................................................................................*................... + cmge v28.4S, v18.4S, v30.4S // ........................................................................................*.................. + // gap // ........................................................................................................... + // gap // ........................................................................................................... + // gap // ........................................................................................................... + // gap // ........................................................................................................... + mls v9.4S, v19.4S, v8.S[0] // .........................................................................................*................. + // gap // ........................................................................................................... + // gap // ........................................................................................................... + sub v19.4S, v12.4S, v28.4S // ..........................................................................................*................ + // gap // ........................................................................................................... + // gap // ........................................................................................................... 
+ mls v17.4S, v5.4S, v29.4S // ..........................................................................*................................ + cmge v5.4S, v31.4S, v13.4S // ............................................................................................*.............. + // gap // ........................................................................................................... + cmge v12.4S, v13.4S, v30.4S // .............................................................................................*............. + // gap // ........................................................................................................... + // gap // ........................................................................................................... + mls v22.4S, v27.4S, v29.4S // ...........................................................................................*............... + // gap // ........................................................................................................... + // gap // ........................................................................................................... + cmge v27.4S, v31.4S, v9.4S // ...............................................................................................*........... + // gap // ........................................................................................................... + // gap // ........................................................................................................... + mls v4.4S, v15.4S, v29.4S // ..............................................................................................*............ + cmge v15.4S, v9.4S, v30.4S // .................................................................................................*......... + // gap // ........................................................................................................... 
+ str q17, [x0, #768] // .................................................................................*......................... + sub v5.4S, v5.4S, v12.4S // ...................................................................................................*....... + // gap // ........................................................................................................... + mls v18.4S, v19.4S, v29.4S // ................................................................................................*.......... + // gap // ........................................................................................................... + // gap // ........................................................................................................... + str q22, [x0, #896] // ..................................................................................................*........ + sub v19.4S, v27.4S, v15.4S // .....................................................................................................*..... + // gap // ........................................................................................................... + mls v13.4S, v5.4S, v29.4S // ......................................................................................................*.... + // gap // ........................................................................................................... + // gap // ........................................................................................................... + str q4, [x0], #(16) // ....................................................................................................*...... + // gap // ........................................................................................................... + // gap // ........................................................................................................... 
+ mls v9.4S, v19.4S, v29.4S // ........................................................................................................*.. + // gap // ........................................................................................................... + // gap // ........................................................................................................... + str q18, [x0, #112] // .......................................................................................................*... + // gap // ........................................................................................................... + // gap // ........................................................................................................... + // gap // ........................................................................................................... + // gap // ........................................................................................................... + // gap // ........................................................................................................... + str q13, [x0, #240] // .........................................................................................................*. + // gap // ........................................................................................................... + // gap // ........................................................................................................... + // gap // ........................................................................................................... + // gap // ........................................................................................................... + // gap // ........................................................................................................... 
+ str q9, [x0, #368] // ..........................................................................................................* + // gap // ........................................................................................................... + // gap // ........................................................................................................... // original source code - // ldr q17, [x0, #0] // ..*................................................................................ - // ldr q21, [x0, #128] // ....*.............................................................................. - // sub v14.4S, v6.4S, v29.4S // *.................................................................................. - // mul v24.4S, v30.4S, v2.S[2] // .*................................................................................. - // add v28.4S, v6.4S, v29.4S // ...*............................................................................... - // add v31.4S, v9.4S, v7.4S // .....*............................................................................. - // mul v16.4S, v14.4S, v2.S[0] // .......*........................................................................... - // sub v30.4S, v17.4S, v21.4S // ...............*................................................................... - // add v17.4S, v17.4S, v21.4S // ..............*.................................................................... - // sqrdmulh v21.4S, v14.4S, v2.S[1] // ..........*........................................................................ - // sub v14.4S, v31.4S, v23.4S // ........*.......................................................................... - // mls v24.4S, v18.4S, v8.S[0] // ......*............................................................................ - // sub v12.4S, v17.4S, v28.4S // ..................*................................................................ 
- // add v17.4S, v17.4S, v28.4S // ...................*............................................................... - // mul v28.4S, v30.4S, v1.S[2] // ....................*.............................................................. - // add v31.4S, v31.4S, v23.4S // .........*......................................................................... - // mls v16.4S, v21.4S, v8.S[0] // .................*................................................................. - // sub v21.4S, v24.4S, v10.4S // ...........*....................................................................... - // sqrdmulh v15.4S, v30.4S, v1.S[3] // .....................*............................................................. - // add v24.4S, v24.4S, v10.4S // ............*...................................................................... - // sub v13.4S, v17.4S, v31.4S // ......................*............................................................ - // add v17.4S, v17.4S, v31.4S // .......................*........................................................... - // mul v31.4S, v12.4S, v0.S[2] // ........................*.......................................................... - // sqrdmulh v12.4S, v12.4S, v0.S[3] // .........................*......................................................... - // mul v10.4S, v14.4S, v1.S[0] // .............*..................................................................... - // sqrdmulh v14.4S, v14.4S, v1.S[1] // ................*.................................................................. - // mls v28.4S, v15.4S, v8.S[0] // ............................*...................................................... - // mls v31.4S, v12.4S, v8.S[0] // ..............................*.................................................... - // mls v10.4S, v14.4S, v8.S[0] // .............................*..................................................... 
- // sub v14.4S, v28.4S, v16.4S // ................................*.................................................. - // add v28.4S, v28.4S, v16.4S // ...............................*................................................... - // mul v15.4S, v21.4S, v1.S[0] // ..........................*........................................................ - // mul v16.4S, v14.4S, v0.S[2] // .....................................*............................................. - // sub v12.4S, v28.4S, v24.4S // ...................................*............................................... - // add v24.4S, v28.4S, v24.4S // ....................................*.............................................. - // sqrdmulh v14.4S, v14.4S, v0.S[3] // ........................................*.......................................... - // sub v28.4S, v31.4S, v10.4S // ......................................*............................................ - // sqrdmulh v21.4S, v21.4S, v1.S[1] // ...........................*....................................................... - // add v31.4S, v31.4S, v10.4S // .......................................*........................................... - // mul v10.4S, v13.4S, v0.S[0] // .................................*................................................. - // mls v16.4S, v14.4S, v8.S[0] // ...........................................*....................................... - // mls v15.4S, v21.4S, v8.S[0] // ..................................*................................................ - // sqrdmulh v21.4S, v13.4S, v0.S[1] // .........................................*......................................... - // mul v14.4S, v12.4S, v0.S[0] // ..........................................*........................................ - // sub v13.4S, v16.4S, v15.4S // ..............................................*.................................... 
- // add v15.4S, v16.4S, v15.4S // ................................................*.................................. - // mul v16.4S, v17.4S, v25.4S // ............................................*...................................... - // sqrdmulh v17.4S, v17.4S, v26.4S // .............................................*..................................... - // mls v10.4S, v21.4S, v8.S[0] // ...............................................*................................... - // sqrdmulh v21.4S, v12.4S, v0.S[1] // .................................................*................................. - // mul v12.4S, v28.4S, v0.S[0] // ..................................................*................................ - // srshr v4.4S, v10.4S, #23 // ...................................................*............................... - // sqrdmulh v28.4S, v28.4S, v0.S[1] // ....................................................*.............................. - // mls v14.4S, v21.4S, v8.S[0] // .....................................................*............................. - // mul v21.4S, v13.4S, v0.S[0] // ......................................................*............................ - // mls v12.4S, v28.4S, v8.S[0] // .......................................................*........................... - // srshr v28.4S, v14.4S, #23 // ........................................................*.......................... - // sqrdmulh v13.4S, v13.4S, v0.S[1] // .........................................................*......................... - // mls v10.4S, v4.4S, v8.4S // ..........................................................*........................ - // srshr v4.4S, v12.4S, #23 // ...........................................................*....................... - // mls v14.4S, v28.4S, v8.4S // ............................................................*...................... 
- // mls v21.4S, v13.4S, v8.S[0] // .............................................................*..................... - // str q10, [x0, #512] // ..............................................................*.................... - // mls v12.4S, v4.4S, v8.4S // ...............................................................*................... - // str q14, [x0, #640] // ................................................................*.................. - // mls v16.4S, v17.4S, v8.S[0] // .................................................................*................. - // srshr v17.4S, v21.4S, #23 // ..................................................................*................ - // mul v14.4S, v24.4S, v25.4S // ...................................................................*............... - // str q12, [x0, #768] // ....................................................................*.............. - // mls v21.4S, v17.4S, v8.4S // .....................................................................*............. - // str q16, [x0], #(16) // ......................................................................*............ - // sqrdmulh v17.4S, v24.4S, v26.4S // .......................................................................*........... - // mul v24.4S, v31.4S, v25.4S // ........................................................................*.......... - // str q21, [x0, #880] // .........................................................................*......... - // sqrdmulh v21.4S, v31.4S, v26.4S // ..........................................................................*........ - // mls v14.4S, v17.4S, v8.S[0] // ...........................................................................*....... - // mul v17.4S, v15.4S, v25.4S // .............................................................................*..... 
- // sqrdmulh v28.4S, v15.4S, v26.4S // ............................................................................*...... - // str q14, [x0, #112] // ..............................................................................*.... - // mls v24.4S, v21.4S, v8.S[0] // ...............................................................................*... - // mls v17.4S, v28.4S, v8.S[0] // ................................................................................*.. - // str q24, [x0, #240] // .................................................................................*. - // str q17, [x0, #368] // ..................................................................................* + // ldr q19, [x0, #0] // ..*........................................................................................................ + // ldr q5, [x0, #128] // ...*....................................................................................................... + // sub v9.4S, v13.4S, v17.4S // ....*...................................................................................................... + // add v15.4S, v13.4S, v17.4S // .....*..................................................................................................... + // mul v12.4S, v20.4S, v2.S[2] // *.......................................................................................................... + // add v18.4S, v22.4S, v4.4S // .*......................................................................................................... + // mul v28.4S, v9.4S, v2.S[0] // .........*................................................................................................. + // add v16.4S, v19.4S, v5.4S // ..........*................................................................................................ + // sub v19.4S, v19.4S, v5.4S // ...........*............................................................................................... 
+ // sqrdmulh v5.4S, v9.4S, v2.S[1] // ............*.............................................................................................. + // sub v9.4S, v18.4S, v23.4S // .......*................................................................................................... + // mls v12.4S, v11.4S, v8.S[0] // ......*.................................................................................................... + // sub v11.4S, v16.4S, v15.4S // ................*.......................................................................................... + // add v15.4S, v16.4S, v15.4S // .................*......................................................................................... + // add v18.4S, v18.4S, v23.4S // ........*.................................................................................................. + // mul v16.4S, v19.4S, v1.S[2] // ...................*....................................................................................... + // mls v28.4S, v5.4S, v8.S[0] // ......................*.................................................................................... + // sub v5.4S, v12.4S, v21.4S // .............*............................................................................................. + // sqrdmulh v19.4S, v19.4S, v1.S[3] // .......................*................................................................................... + // add v12.4S, v12.4S, v21.4S // ..............*............................................................................................ + // sub v27.4S, v15.4S, v18.4S // ....................*...................................................................................... + // add v15.4S, v15.4S, v18.4S // .....................*..................................................................................... 
+ // mul v18.4S, v11.4S, v0.S[2] // ........................*.................................................................................. + // sqrdmulh v11.4S, v11.4S, v0.S[3] // .........................*................................................................................. + // mul v21.4S, v9.4S, v1.S[0] // ...............*........................................................................................... + // sqrdmulh v9.4S, v9.4S, v1.S[1] // ..................*........................................................................................ + // mls v16.4S, v19.4S, v8.S[0] // ............................*.............................................................................. + // mls v18.4S, v11.4S, v8.S[0] // ..............................*............................................................................ + // mls v21.4S, v9.4S, v8.S[0] // .............................*............................................................................. + // sub v19.4S, v16.4S, v28.4S // ................................*.......................................................................... + // add v9.4S, v16.4S, v28.4S // ...............................*........................................................................... + // mul v28.4S, v5.4S, v1.S[0] // ..........................*................................................................................ + // mul v16.4S, v19.4S, v0.S[2] // .....................................*..................................................................... + // sub v11.4S, v9.4S, v12.4S // ...................................*....................................................................... + // add v9.4S, v9.4S, v12.4S // ....................................*...................................................................... 
+ // sqrdmulh v19.4S, v19.4S, v0.S[3] // ........................................*.................................................................. + // sub v12.4S, v18.4S, v21.4S // ......................................*.................................................................... + // sqrdmulh v5.4S, v5.4S, v1.S[1] // ...........................*............................................................................... + // add v18.4S, v18.4S, v21.4S // .......................................*................................................................... + // mul v21.4S, v27.4S, v0.S[0] // .................................*......................................................................... + // mls v16.4S, v19.4S, v8.S[0] // .............................................*............................................................. + // mls v28.4S, v5.4S, v8.S[0] // ..................................*........................................................................ + // sqrdmulh v19.4S, v27.4S, v0.S[1] // .........................................*................................................................. + // mul v5.4S, v11.4S, v0.S[0] // ..........................................*................................................................ + // sub v27.4S, v16.4S, v28.4S // ................................................*.......................................................... + // add v28.4S, v16.4S, v28.4S // .................................................*......................................................... + // mul v16.4S, v15.4S, v25.4S // ...........................................*............................................................... + // sqrdmulh v15.4S, v15.4S, v26.4S // ............................................*.............................................................. 
+ // mls v21.4S, v19.4S, v8.S[0] // ..............................................*............................................................ + // sqrdmulh v19.4S, v11.4S, v0.S[1] // ...............................................*........................................................... + // mul v11.4S, v12.4S, v0.S[0] // ..................................................*........................................................ + // cmge v24.4S, v31.4S, v21.4S // ...................................................*....................................................... + // sqrdmulh v12.4S, v12.4S, v0.S[1] // ....................................................*...................................................... + // cmge v7.4S, v21.4S, v30.4S // .....................................................*..................................................... + // mls v5.4S, v19.4S, v8.S[0] // ......................................................*.................................................... + // sub v19.4S, v24.4S, v7.4S // .......................................................*................................................... + // mul v24.4S, v27.4S, v0.S[0] // ........................................................*.................................................. + // mls v11.4S, v12.4S, v8.S[0] // .........................................................*................................................. + // cmge v12.4S, v31.4S, v5.4S // ..........................................................*................................................ + // sqrdmulh v27.4S, v27.4S, v0.S[1] // ...........................................................*............................................... + // cmge v7.4S, v5.4S, v30.4S // ............................................................*.............................................. 
+ // mls v21.4S, v19.4S, v29.4S // .............................................................*............................................. + // sub v19.4S, v12.4S, v7.4S // ..............................................................*............................................ + // mls v16.4S, v15.4S, v8.S[0] // ...............................................................*........................................... + // cmge v12.4S, v31.4S, v11.4S // ................................................................*.......................................... + // cmge v15.4S, v11.4S, v30.4S // .................................................................*......................................... + // mls v24.4S, v27.4S, v8.S[0] // ..................................................................*........................................ + // str q21, [x0, #512] // ...................................................................*....................................... + // mls v5.4S, v19.4S, v29.4S // ....................................................................*...................................... + // sub v19.4S, v12.4S, v15.4S // .....................................................................*..................................... + // cmge v12.4S, v31.4S, v16.4S // ......................................................................*.................................... + // cmge v15.4S, v16.4S, v30.4S // .......................................................................*................................... + // mul v27.4S, v9.4S, v25.4S // ........................................................................*.................................. + // cmge v21.4S, v31.4S, v24.4S // .........................................................................*................................. + // mls v11.4S, v19.4S, v29.4S // .........................................................................................*................. 
+ // cmge v19.4S, v24.4S, v30.4S // ...........................................................................*............................... + // str q5, [x0, #640] // ............................................................................*.............................. + // sub v5.4S, v12.4S, v15.4S // .............................................................................*............................. + // sqrdmulh v9.4S, v9.4S, v26.4S // ..........................................................................*................................ + // sub v19.4S, v21.4S, v19.4S // ...............................................................................*........................... + // mul v12.4S, v18.4S, v25.4S // ..............................................................................*............................ + // str q11, [x0, #768] // ................................................................................................*.......... + // sqrdmulh v15.4S, v18.4S, v26.4S // ................................................................................*.......................... + // mls v27.4S, v9.4S, v8.S[0] // .................................................................................*......................... + // sqrdmulh v9.4S, v28.4S, v26.4S // ..................................................................................*........................ + // mls v12.4S, v15.4S, v8.S[0] // ...................................................................................*....................... + // cmge v15.4S, v31.4S, v27.4S // ....................................................................................*...................... + // mul v18.4S, v28.4S, v25.4S // .....................................................................................*..................... 
+ // cmge v28.4S, v27.4S, v30.4S // ......................................................................................*.................... + // mls v18.4S, v9.4S, v8.S[0] // .......................................................................................*................... + // sub v9.4S, v15.4S, v28.4S // ........................................................................................*.................. + // mls v24.4S, v19.4S, v29.4S // ............................................................................................*.............. + // cmge v19.4S, v31.4S, v12.4S // ..........................................................................................*................ + // cmge v15.4S, v12.4S, v30.4S // ...........................................................................................*............... + // mls v16.4S, v5.4S, v29.4S // ..............................................................................................*............ + // cmge v5.4S, v31.4S, v18.4S // .............................................................................................*............. + // mls v27.4S, v9.4S, v29.4S // ..................................................................................................*........ + // cmge v9.4S, v18.4S, v30.4S // ...............................................................................................*........... + // str q24, [x0, #896] // ...................................................................................................*....... + // sub v19.4S, v19.4S, v15.4S // .................................................................................................*......... + // str q16, [x0], #(16) // ......................................................................................................*.... + // sub v5.4S, v5.4S, v9.4S // ....................................................................................................*...... 
+ // mls v12.4S, v19.4S, v29.4S // .....................................................................................................*..... + // str q27, [x0, #112] // ........................................................................................................*.. + // mls v18.4S, v5.4S, v29.4S // .......................................................................................................*... + // str q12, [x0, #240] // .........................................................................................................*. + // str q18, [x0, #368] // ..........................................................................................................* pop_stack diff --git a/tests/ntt_dilithium/manual/intt_dilithium_123_45678_manual_ld4_opt_m1_firestorm.s b/tests/ntt_dilithium/manual/intt_dilithium_123_45678_manual_ld4_opt_m1_firestorm.s index 9d7b69c..0c486d5 100644 --- a/tests/ntt_dilithium/manual/intt_dilithium_123_45678_manual_ld4_opt_m1_firestorm.s +++ b/tests/ntt_dilithium/manual/intt_dilithium_123_45678_manual_ld4_opt_m1_firestorm.s @@ -371,6 +371,8 @@ _intt_dilithium_123_45678_manual_ld4_opt_m1_firestorm: consts .req v8 qform_consts .req q8 + modulus .req v29 + ASM_LOAD(r_ptr0, roots_l345) ASM_LOAD(r_ptr1, roots_l67) @@ -393,1899 +395,2072 @@ _intt_dilithium_123_45678_manual_ld4_opt_m1_firestorm: qform_root3_tw .req q7 .p2align 2 - ldr q30, [x5, #16] // *..... - ldr q18, [x5, #128] // .*.... - ldr q0, [x1, #48] // ..*... - // gap // ...... - // gap // ...... - // gap // ...... - // gap // ...... - // gap // ...... - ldr q20, [x1, #32] // ...*.. - ldr q1, [x1, #16] // ....*. - ldr q27, [x1, #0] // .....* - // gap // ...... - // gap // ...... - // gap // ...... - // gap // ...... - // gap // ...... + ldr q0, [x1, #32] // *. + ldr q18, [x1, #48] // .* + // gap // .. + // gap // .. + // gap // .. + // gap // .. + // gap // .. + // gap // .. // original source code - // ldr q30, [x5, #16] // *..... 
- // ldr q18, [x5, #128] // .*.... - // ldr q0, [x1, #48] // ..*... - // ldr q20, [x1, #32] // ...*.. - // ldr q1, [x1, #16] // ....*. - // ldr q27, [x1, #0] // .....* + // ldr q0, [x1, #32] // *. + // ldr q18, [x1, #48] // .* sub count, count, #1 layer45678_start: - // gap // ...................................................................................................................................................................... - ldr q13, [x2, #48] // ...............*...................................................................................................................................................... - ldr q19, [x2, #16] // .............*........................................................................................................................................................ - trn1 v25.4S, v20.4S, v0.4S // ......*............................................................................................................................................................... - // gap // ...................................................................................................................................................................... - // gap // ...................................................................................................................................................................... - // gap // ...................................................................................................................................................................... - // gap // ...................................................................................................................................................................... - ldr q5, [x2, #0] // ............*......................................................................................................................................................... 
- ldr q31, [x2, #32] // ..............*....................................................................................................................................................... - trn1 v7.4S, v27.4S, v1.4S // ....*................................................................................................................................................................. - trn2 v27.4S, v27.4S, v1.4S // .....*................................................................................................................................................................ - // gap // ...................................................................................................................................................................... - // gap // ...................................................................................................................................................................... - // gap // ...................................................................................................................................................................... - // gap // ...................................................................................................................................................................... - ldr q1, [x5, #176] // .......................................................*.............................................................................................................. - trn2 v14.4S, v20.4S, v0.4S // .......*.............................................................................................................................................................. - ldr q12, [x5, #144] // .....................................................*................................................................................................................ 
- // gap // ...................................................................................................................................................................... - // gap // ...................................................................................................................................................................... - // gap // ...................................................................................................................................................................... - // gap // ...................................................................................................................................................................... - // gap // ...................................................................................................................................................................... - trn2 v10.2D, v7.2D, v25.2D // ........*............................................................................................................................................................. - trn1 v29.2D, v7.2D, v25.2D // ..........*........................................................................................................................................................... - ldr q11, [x5, #80] // .............................*........................................................................................................................................ - ldr q9, [x5, #32] // ..........................*........................................................................................................................................... - // gap // ...................................................................................................................................................................... 
- // gap // ...................................................................................................................................................................... - // gap // ...................................................................................................................................................................... - // gap // ...................................................................................................................................................................... - ldr q28, [x5, #64] // ............................*......................................................................................................................................... - ldr q22, [x5, #48] // ...........................*.......................................................................................................................................... - trn2 v24.2D, v27.2D, v14.2D // .........*............................................................................................................................................................ - trn1 v21.2D, v27.2D, v14.2D // ...........*.......................................................................................................................................................... - // gap // ...................................................................................................................................................................... - // gap // ...................................................................................................................................................................... - // gap // ...................................................................................................................................................................... 
- // gap // ...................................................................................................................................................................... - trn1 v14.4S, v31.4S, v13.4S // ..................*................................................................................................................................................... - trn2 v15.4S, v31.4S, v13.4S // ...................*.................................................................................................................................................. - trn2 v13.4S, v5.4S, v19.4S // .................*.................................................................................................................................................... - trn1 v3.4S, v5.4S, v19.4S // ................*..................................................................................................................................................... - ldr q5, [x5, #160] // ......................................................*............................................................................................................... - // gap // ...................................................................................................................................................................... - // gap // ...................................................................................................................................................................... - // gap // ...................................................................................................................................................................... - sub v2.4S, v29.4S, v21.4S // ..............................*....................................................................................................................................... 
- sub v16.4S, v10.4S, v24.4S // ...................................*.................................................................................................................................. - // gap // ...................................................................................................................................................................... - // gap // ...................................................................................................................................................................... - // gap // ...................................................................................................................................................................... - // gap // ...................................................................................................................................................................... - // gap // ...................................................................................................................................................................... - // gap // ...................................................................................................................................................................... - trn1 v0.2D, v3.2D, v14.2D // ......................*............................................................................................................................................... - trn2 v23.2D, v3.2D, v14.2D // ....................*................................................................................................................................................. - trn2 v6.2D, v13.2D, v15.2D // .....................*................................................................................................................................................ 
- trn1 v26.2D, v13.2D, v15.2D // .......................*.............................................................................................................................................. - ldr q31, [x5, #112] // ...................................................*.................................................................................................................. - // gap // ...................................................................................................................................................................... - // gap // ...................................................................................................................................................................... - // gap // ...................................................................................................................................................................... - mul v27.4S, v16.4S, v28.4S // .....................................*................................................................................................................................ - mul v14.4S, v2.4S, v9.4S // ................................*..................................................................................................................................... - sqrdmulh v13.4S, v2.4S, v22.4S // .................................*.................................................................................................................................... - // gap // ...................................................................................................................................................................... - // gap // ...................................................................................................................................................................... 
- // gap // ...................................................................................................................................................................... - // gap // ...................................................................................................................................................................... - // gap // ...................................................................................................................................................................... - add v15.4S, v0.4S, v26.4S // .........................................................*............................................................................................................ - add v9.4S, v29.4S, v21.4S // ...............................*...................................................................................................................................... - sub v19.4S, v0.4S, v26.4S // ........................................................*............................................................................................................. - ldr q20, [x5], #(12*16) // ........................*............................................................................................................................................. - sub v28.4S, v23.4S, v6.4S // .............................................................*........................................................................................................ - // gap // ...................................................................................................................................................................... - // gap // ...................................................................................................................................................................... 
- // gap // ...................................................................................................................................................................... - sqrdmulh v25.4S, v16.4S, v11.4S // ......................................*............................................................................................................................... - add v2.4S, v10.4S, v24.4S // ....................................*................................................................................................................................. - add v16.4S, v23.4S, v6.4S // ..............................................................*....................................................................................................... - ldr q10, [x4, #48] // ...............................................................................................*...................................................................... - // gap // ...................................................................................................................................................................... - // gap // ...................................................................................................................................................................... - // gap // ...................................................................................................................................................................... - // gap // ...................................................................................................................................................................... - mul v21.4S, v28.4S, v5.4S // ...............................................................*...................................................................................................... 
- sqrdmulh v3.4S, v28.4S, v1.4S // ................................................................*..................................................................................................... - sqrdmulh v4.4S, v19.4S, v12.4S // ...........................................................*.......................................................................................................... - mul v22.4S, v19.4S, v18.4S // ..........................................................*........................................................................................................... - // gap // ...................................................................................................................................................................... - // gap // ...................................................................................................................................................................... - // gap // ...................................................................................................................................................................... - // gap // ...................................................................................................................................................................... - add v5.4S, v15.4S, v16.4S // ...................................................................*.................................................................................................. - sub v15.4S, v15.4S, v16.4S // ..................................................................*................................................................................................... - ldr q26, [x5, #-96] // ..................................................*................................................................................................................... 
- // gap // ...................................................................................................................................................................... - // gap // ...................................................................................................................................................................... - // gap // ...................................................................................................................................................................... - // gap // ...................................................................................................................................................................... - // gap // ...................................................................................................................................................................... - mls v27.4S, v25.4S, v8.S[0] // .......................................*.............................................................................................................................. - mls v14.4S, v13.4S, v8.S[0] // ..................................*................................................................................................................................... - sub v12.4S, v9.4S, v2.4S // ........................................*............................................................................................................................. - // gap // ...................................................................................................................................................................... - // gap // ...................................................................................................................................................................... 
- // gap // ...................................................................................................................................................................... - // gap // ...................................................................................................................................................................... - // gap // ...................................................................................................................................................................... - mls v22.4S, v4.4S, v8.S[0] // ............................................................*......................................................................................................... - mls v21.4S, v3.4S, v8.S[0] // .................................................................*.................................................................................................... - // gap // ...................................................................................................................................................................... - // gap // ...................................................................................................................................................................... - // gap // ...................................................................................................................................................................... - // gap // ...................................................................................................................................................................... - // gap // ...................................................................................................................................................................... 
- // gap // ...................................................................................................................................................................... - add v28.4S, v9.4S, v2.4S // .........................................*............................................................................................................................ - mul v19.4S, v12.4S, v20.4S // ..........................................*........................................................................................................................... - sqrdmulh v11.4S, v12.4S, v30.4S // ...........................................*.......................................................................................................................... - // gap // ...................................................................................................................................................................... - // gap // ...................................................................................................................................................................... - // gap // ...................................................................................................................................................................... - // gap // ...................................................................................................................................................................... - // gap // ...................................................................................................................................................................... - // gap // ...................................................................................................................................................................... 
- // gap // ...................................................................................................................................................................... - add v0.4S, v14.4S, v27.4S // ..............................................*....................................................................................................................... - sub v6.4S, v14.4S, v27.4S // .............................................*........................................................................................................................ - sqrdmulh v14.4S, v15.4S, v31.4S // .....................................................................*................................................................................................ - // gap // ...................................................................................................................................................................... - // gap // ...................................................................................................................................................................... - // gap // ...................................................................................................................................................................... - sub v29.4S, v22.4S, v21.4S // .......................................................................*.............................................................................................. - add v1.4S, v22.4S, v21.4S // ........................................................................*............................................................................................. - mul v9.4S, v15.4S, v26.4S // ....................................................................*................................................................................................. 
- // gap // ...................................................................................................................................................................... - // gap // ...................................................................................................................................................................... - // gap // ...................................................................................................................................................................... - // gap // ...................................................................................................................................................................... - // gap // ...................................................................................................................................................................... - trn1 v24.4S, v28.4S, v0.4S // ............................................................................*......................................................................................... - mul v25.4S, v6.4S, v20.4S // ...............................................*...................................................................................................................... - sqrdmulh v17.4S, v6.4S, v30.4S // ................................................*..................................................................................................................... - // gap // ...................................................................................................................................................................... - mls v19.4S, v11.4S, v8.S[0] // ............................................*......................................................................................................................... 
- // gap // ...................................................................................................................................................................... - // gap // ...................................................................................................................................................................... - // gap // ...................................................................................................................................................................... - trn2 v22.4S, v5.4S, v1.4S // .....................................................................................*................................................................................ - mul v4.4S, v29.4S, v26.4S // .........................................................................*............................................................................................ - sqrdmulh v26.4S, v29.4S, v31.4S // ..........................................................................*........................................................................................... - // gap // ...................................................................................................................................................................... - // gap // ...................................................................................................................................................................... - // gap // ...................................................................................................................................................................... - // gap // ...................................................................................................................................................................... 
- // gap // ...................................................................................................................................................................... - trn2 v21.4S, v28.4S, v0.4S // .............................................................................*........................................................................................ - // gap // ...................................................................................................................................................................... - // gap // ...................................................................................................................................................................... - // gap // ...................................................................................................................................................................... - // gap // ...................................................................................................................................................................... - // gap // ...................................................................................................................................................................... - // gap // ...................................................................................................................................................................... - // gap // ...................................................................................................................................................................... - mls v25.4S, v17.4S, v8.S[0] // .................................................*.................................................................................................................... 
- // gap // ...................................................................................................................................................................... - // gap // ...................................................................................................................................................................... - // gap // ...................................................................................................................................................................... - // gap // ...................................................................................................................................................................... - // gap // ...................................................................................................................................................................... - // gap // ...................................................................................................................................................................... - // gap // ...................................................................................................................................................................... - mls v4.4S, v26.4S, v8.S[0] // ...........................................................................*.......................................................................................... - // gap // ...................................................................................................................................................................... - mls v9.4S, v14.4S, v8.S[0] // ......................................................................*............................................................................................... 
- // gap // ...................................................................................................................................................................... - // gap // ...................................................................................................................................................................... - // gap // ...................................................................................................................................................................... - // gap // ...................................................................................................................................................................... - // gap // ...................................................................................................................................................................... - trn1 v7.4S, v5.4S, v1.4S // ....................................................................................*................................................................................. - // gap // ...................................................................................................................................................................... - // gap // ...................................................................................................................................................................... - // gap // ...................................................................................................................................................................... - // gap // ...................................................................................................................................................................... - // gap // ...................................................................................................................................................................... 
- // gap // ...................................................................................................................................................................... - // gap // ...................................................................................................................................................................... - trn1 v30.4S, v19.4S, v25.4S // ..............................................................................*....................................................................................... - trn2 v11.4S, v19.4S, v25.4S // ...............................................................................*...................................................................................... - ldr q25, [x4, #16] // .............................................................................................*........................................................................ - // gap // ...................................................................................................................................................................... - // gap // ...................................................................................................................................................................... - // gap // ...................................................................................................................................................................... - // gap // ...................................................................................................................................................................... - // gap // ...................................................................................................................................................................... 
- trn2 v27.4S, v9.4S, v4.4S // .......................................................................................*.............................................................................. - trn1 v0.4S, v9.4S, v4.4S // ......................................................................................*............................................................................... - // gap // ...................................................................................................................................................................... - // gap // ...................................................................................................................................................................... - // gap // ...................................................................................................................................................................... - // gap // ...................................................................................................................................................................... - // gap // ...................................................................................................................................................................... - // gap // ...................................................................................................................................................................... - trn2 v14.2D, v24.2D, v30.2D // ................................................................................*..................................................................................... - trn2 v28.2D, v21.2D, v11.2D // .................................................................................*.................................................................................... 
- trn1 v29.2D, v24.2D, v30.2D // ..................................................................................*................................................................................... - trn1 v17.2D, v21.2D, v11.2D // ...................................................................................*.................................................................................. - ldr q11, [x4, #32] // ..............................................................................................*....................................................................... - // gap // ...................................................................................................................................................................... - // gap // ...................................................................................................................................................................... - // gap // ...................................................................................................................................................................... - trn1 v1.2D, v7.2D, v0.2D // ..........................................................................................*........................................................................... - trn1 v12.2D, v22.2D, v27.2D // ...........................................................................................*.......................................................................... - trn2 v30.2D, v7.2D, v0.2D // ........................................................................................*............................................................................. - trn2 v13.2D, v22.2D, v27.2D // .........................................................................................*............................................................................ 
- // gap // ...................................................................................................................................................................... - // gap // ...................................................................................................................................................................... - // gap // ...................................................................................................................................................................... - // gap // ...................................................................................................................................................................... - sub v31.4S, v14.4S, v28.4S // .....................................................................................................*................................................................ - sub v20.4S, v29.4S, v17.4S // ................................................................................................*..................................................................... - add v3.4S, v14.4S, v28.4S // ......................................................................................................*............................................................... - // gap // ...................................................................................................................................................................... - // gap // ...................................................................................................................................................................... - // gap // ...................................................................................................................................................................... 
- // gap // ...................................................................................................................................................................... - // gap // ...................................................................................................................................................................... - sub v18.4S, v1.4S, v12.4S // ..........................................................................................................*........................................................... - sub v9.4S, v30.4S, v13.4S // ...............................................................................................................*...................................................... - add v0.4S, v1.4S, v12.4S // ...........................................................................................................*.......................................................... - // gap // ...................................................................................................................................................................... - // gap // ...................................................................................................................................................................... - // gap // ...................................................................................................................................................................... - // gap // ...................................................................................................................................................................... - // gap // ...................................................................................................................................................................... 
- mul v22.4S, v31.4S, v11.S[0] // .......................................................................................................*.............................................................. - sqrdmulh v4.4S, v31.4S, v11.S[1] // ........................................................................................................*............................................................. - mul v6.4S, v20.4S, v25.S[2] // ..................................................................................................*................................................................... - sqrdmulh v27.4S, v20.4S, v25.S[3] // ...................................................................................................*.................................................................. - // gap // ...................................................................................................................................................................... - // gap // ...................................................................................................................................................................... - // gap // ...................................................................................................................................................................... - // gap // ...................................................................................................................................................................... - mul v16.4S, v18.4S, v11.S[2] // ............................................................................................................*......................................................... - mul v26.4S, v9.4S, v10.S[0] // .................................................................................................................*.................................................... 
- sqrdmulh v20.4S, v18.4S, v11.S[3] // .............................................................................................................*........................................................ - sqrdmulh v18.4S, v9.4S, v10.S[1] // ..................................................................................................................*................................................... - // gap // ...................................................................................................................................................................... - // gap // ...................................................................................................................................................................... - // gap // ...................................................................................................................................................................... - // gap // ...................................................................................................................................................................... - add v14.4S, v29.4S, v17.4S // .................................................................................................*.................................................................... - add v10.4S, v30.4S, v13.4S // ................................................................................................................*..................................................... - // gap // ...................................................................................................................................................................... - // gap // ...................................................................................................................................................................... 
- // gap // ...................................................................................................................................................................... - // gap // ...................................................................................................................................................................... - // gap // ...................................................................................................................................................................... - // gap // ...................................................................................................................................................................... - mls v22.4S, v4.4S, v8.S[0] // .........................................................................................................*............................................................ - // gap // ...................................................................................................................................................................... - // gap // ...................................................................................................................................................................... - // gap // ...................................................................................................................................................................... - // gap // ...................................................................................................................................................................... - // gap // ...................................................................................................................................................................... - // gap // ...................................................................................................................................................................... 
- // gap // ...................................................................................................................................................................... - sub v30.4S, v14.4S, v3.4S // ....................................................................................................................*................................................. - mls v6.4S, v27.4S, v8.S[0] // ....................................................................................................*................................................................. - mls v26.4S, v18.4S, v8.S[0] // ...................................................................................................................*.................................................. - mls v16.4S, v20.4S, v8.S[0] // ..............................................................................................................*....................................................... - ldr q18, [x4], #64 // ............................................................................................*......................................................................... - // gap // ...................................................................................................................................................................... - // gap // ...................................................................................................................................................................... - // gap // ...................................................................................................................................................................... - sub v13.4S, v0.4S, v10.4S // ..............................................................................................................................*....................................... 
- // gap // ...................................................................................................................................................................... - add v3.4S, v14.4S, v3.4S // .....................................................................................................................*................................................ - // gap // ...................................................................................................................................................................... - // gap // ...................................................................................................................................................................... - // gap // ...................................................................................................................................................................... - // gap // ...................................................................................................................................................................... - // gap // ...................................................................................................................................................................... - add v31.4S, v0.4S, v10.4S // ...............................................................................................................................*...................................... - // gap // ...................................................................................................................................................................... - // gap // ...................................................................................................................................................................... 
- // gap // ...................................................................................................................................................................... - // gap // ...................................................................................................................................................................... - // gap // ...................................................................................................................................................................... - // gap // ...................................................................................................................................................................... - // gap // ...................................................................................................................................................................... - sub v17.4S, v16.4S, v26.4S // ...................................................................................................................................*.................................. - sub v29.4S, v6.4S, v22.4S // .........................................................................................................................*............................................ - mul v9.4S, v13.4S, v25.S[0] // ................................................................................................................................*..................................... - sqrdmulh v14.4S, v13.4S, v25.S[1] // .................................................................................................................................*.................................... - // gap // ...................................................................................................................................................................... 
- // gap // ...................................................................................................................................................................... - // gap // ...................................................................................................................................................................... - // gap // ...................................................................................................................................................................... - mul v10.4S, v30.4S, v18.S[2] // ......................................................................................................................*............................................... - sqrdmulh v19.4S, v30.4S, v18.S[3] // .......................................................................................................................*.............................................. - ldr q30, [x5, #16] // .........................e............................................................................................................................................ - sub v1.4S, v3.4S, v31.4S // ........................................................................................................................................*............................. - // gap // ...................................................................................................................................................................... - // gap // ...................................................................................................................................................................... - // gap // ...................................................................................................................................................................... 
- // gap // ...................................................................................................................................................................... - sqrdmulh v0.4S, v17.4S, v25.S[1] // ......................................................................................................................................*............................... - mul v25.4S, v17.4S, v25.S[0] // .....................................................................................................................................*................................ - mul v27.4S, v29.4S, v18.S[2] // ...........................................................................................................................*.......................................... - sqrdmulh v20.4S, v29.4S, v18.S[3] // ............................................................................................................................*......................................... - // gap // ...................................................................................................................................................................... - // gap // ...................................................................................................................................................................... - // gap // ...................................................................................................................................................................... - // gap // ...................................................................................................................................................................... - mls v9.4S, v14.4S, v8.S[0] // ..................................................................................................................................*................................... 
- add v3.4S, v3.4S, v31.4S // .........................................................................................................................................*............................ - mul v4.4S, v1.4S, v18.S[0] // ..........................................................................................................................................*........................... - add v7.4S, v6.4S, v22.4S // ..........................................................................................................................*........................................... - // gap // ...................................................................................................................................................................... - // gap // ...................................................................................................................................................................... - // gap // ...................................................................................................................................................................... - // gap // ...................................................................................................................................................................... - mls v10.4S, v19.4S, v8.S[0] // ........................................................................................................................*............................................. - sqrdmulh v24.4S, v1.4S, v18.S[1] // ...........................................................................................................................................*.......................... - add v19.4S, v16.4S, v26.4S // ....................................................................................................................................*................................. 
- // gap // ...................................................................................................................................................................... - // gap // ...................................................................................................................................................................... - // gap // ...................................................................................................................................................................... - // gap // ...................................................................................................................................................................... - // gap // ...................................................................................................................................................................... - mls v25.4S, v0.4S, v8.S[0] // .......................................................................................................................................*.............................. - mls v27.4S, v20.4S, v8.S[0] // .............................................................................................................................*........................................ - str q3, [x1], #(16*4) // ............................................................................................................................................................*......... - // gap // ...................................................................................................................................................................... - // gap // ...................................................................................................................................................................... 
- // gap // ...................................................................................................................................................................... - // gap // ...................................................................................................................................................................... - // gap // ...................................................................................................................................................................... - add v2.4S, v7.4S, v19.4S // ..............................................................................................................................................*....................... - sub v13.4S, v7.4S, v19.4S // .............................................................................................................................................*........................ - // gap // ...................................................................................................................................................................... - // gap // ...................................................................................................................................................................... - // gap // ...................................................................................................................................................................... - // gap // ...................................................................................................................................................................... - // gap // ...................................................................................................................................................................... 
- // gap // ...................................................................................................................................................................... - sub v5.4S, v10.4S, v9.4S // ..................................................................................................................................................*................... - add v19.4S, v10.4S, v9.4S // ...................................................................................................................................................*.................. - mls v4.4S, v24.4S, v8.S[0] // ............................................................................................................................................*......................... - // gap // ...................................................................................................................................................................... - // gap // ...................................................................................................................................................................... - // gap // ...................................................................................................................................................................... - // gap // ...................................................................................................................................................................... - // gap // ...................................................................................................................................................................... - sub v9.4S, v27.4S, v25.4S // .......................................................................................................................................................*.............. 
- add v31.4S, v27.4S, v25.4S // ........................................................................................................................................................*............. - str q2, [x1, #-48] // .............................................................................................................................................................*........ - sqrdmulh v14.4S, v13.4S, v18.S[1] // ................................................................................................................................................*..................... - // gap // ...................................................................................................................................................................... - // gap // ...................................................................................................................................................................... - // gap // ...................................................................................................................................................................... - // gap // ...................................................................................................................................................................... - mul v17.4S, v13.4S, v18.S[0] // ...............................................................................................................................................*...................... - str q19, [x1, #-32] // ..............................................................................................................................................................*....... - mul v19.4S, v5.4S, v18.S[0] // ....................................................................................................................................................*................. 
- // gap // ...................................................................................................................................................................... - // gap // ...................................................................................................................................................................... - // gap // ...................................................................................................................................................................... - // gap // ...................................................................................................................................................................... - // gap // ...................................................................................................................................................................... - str q31, [x1, #-16] // ...............................................................................................................................................................*...... - sqrdmulh v31.4S, v5.4S, v18.S[1] // .....................................................................................................................................................*................ - mul v28.4S, v9.4S, v18.S[0] // .........................................................................................................................................................*............ - sqrdmulh v5.4S, v9.4S, v18.S[1] // ..........................................................................................................................................................*........... - ldr q18, [x5, #128] // ....................................................e................................................................................................................. 
- add x1, x1, #64 // ....................................................................................................................................................................*. - // gap // ...................................................................................................................................................................... - // gap // ...................................................................................................................................................................... - str q4, [x2], #(16*4) // ................................................................................................................................................................*..... - // gap // ...................................................................................................................................................................... - // gap // ...................................................................................................................................................................... - // gap // ...................................................................................................................................................................... - // gap // ...................................................................................................................................................................... - // gap // ...................................................................................................................................................................... - // gap // ...................................................................................................................................................................... 
- ldr q0, [x1, #48] // ...e.................................................................................................................................................................. - mls v17.4S, v14.4S, v8.S[0] // .................................................................................................................................................*.................... - // gap // ...................................................................................................................................................................... - // gap // ...................................................................................................................................................................... - // gap // ...................................................................................................................................................................... - // gap // ...................................................................................................................................................................... - // gap // ...................................................................................................................................................................... - // gap // ...................................................................................................................................................................... - // gap // ...................................................................................................................................................................... - mls v19.4S, v31.4S, v8.S[0] // ......................................................................................................................................................*............... 
- mls v28.4S, v5.4S, v8.S[0] // ...........................................................................................................................................................*.......... - // gap // ...................................................................................................................................................................... - // gap // ...................................................................................................................................................................... - // gap // ...................................................................................................................................................................... - // gap // ...................................................................................................................................................................... - ldr q20, [x1, #32] // ..e................................................................................................................................................................... - ldr q1, [x1, #16] // .e.................................................................................................................................................................... - ldr q27, [x1, #0] // e..................................................................................................................................................................... - // gap // ...................................................................................................................................................................... - // gap // ...................................................................................................................................................................... 
- // gap // ...................................................................................................................................................................... - // gap // ...................................................................................................................................................................... - // gap // ...................................................................................................................................................................... - // gap // ...................................................................................................................................................................... - // gap // ...................................................................................................................................................................... - str q17, [x2, #-48] // .................................................................................................................................................................*.... - // gap // ...................................................................................................................................................................... - // gap // ...................................................................................................................................................................... - // gap // ...................................................................................................................................................................... - // gap // ...................................................................................................................................................................... - // gap // ...................................................................................................................................................................... 
- // gap // ...................................................................................................................................................................... - // gap // ...................................................................................................................................................................... - str q19, [x2, #-32] // ..................................................................................................................................................................*... - str q28, [x2, #-16] // ...................................................................................................................................................................*.. - add x2, x2, #64 // .....................................................................................................................................................................* - // gap // ...................................................................................................................................................................... - // gap // ...................................................................................................................................................................... - // gap // ...................................................................................................................................................................... - // gap // ...................................................................................................................................................................... - // gap // ...................................................................................................................................................................... 
+ ldr q29, [x1, #0] // *............................................................................................................................................................................. + ldr q19, [x1, #16] // .*............................................................................................................................................................................ + ldr q5, [x2, #0] // ............*................................................................................................................................................................. + trn1 v9.4S, v0.4S, v18.4S // ......*....................................................................................................................................................................... + trn2 v0.4S, v0.4S, v18.4S // .......*...................................................................................................................................................................... + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + ldr q2, [x2, #16] // .............*................................................................................................................................................................ + ldr q12, [x2, #32] // ..............*............................................................................................................................................................... 
+ ldr q15, [x2, #48] // ...............*.............................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + ldr q13, [x5, #32] // ..........................*................................................................................................................................................... + ldr q18, [x5], #(12*16) // ........................*..................................................................................................................................................... + ldr q27, [x5, #-176] // .........................*.................................................................................................................................................... + // gap // .............................................................................................................................................................................. 
+ // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + ldr q17, [x5, #-144] // ...........................*.................................................................................................................................................. + ldr q31, [x5, #-128] // ............................*................................................................................................................................................. + ldr q25, [x5, #-112] // .............................*................................................................................................................................................ + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. 
+ // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + trn1 v28.4S, v29.4S, v19.4S // ....*......................................................................................................................................................................... + trn2 v29.4S, v29.4S, v19.4S // .....*........................................................................................................................................................................ + ldr q19, [x5, #-64] // ....................................................*......................................................................................................................... + ldr q3, [x5, #-96] // ..................................................*........................................................................................................................... + ldr q22, [x5, #-80] // ...................................................*.......................................................................................................................... + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. 
+ trn1 v1.4S, v5.4S, v2.4S // ................*............................................................................................................................................................. + trn2 v5.4S, v5.4S, v2.4S // .................*............................................................................................................................................................ + trn1 v2.4S, v12.4S, v15.4S // ..................*........................................................................................................................................................... + trn2 v12.4S, v12.4S, v15.4S // ...................*.......................................................................................................................................................... + ldr q15, [x5, #-48] // .....................................................*........................................................................................................................ + ldr q4, [x5, #-32] // ......................................................*....................................................................................................................... + ldr q16, [x5, #-16] // .......................................................*...................................................................................................................... + // gap // .............................................................................................................................................................................. + trn2 v20.2D, v28.2D, v9.2D // ........*..................................................................................................................................................................... 
+ trn1 v9.2D, v28.2D, v9.2D // ..........*................................................................................................................................................................... + trn2 v28.2D, v29.2D, v0.2D // .........*.................................................................................................................................................................... + trn1 v29.2D, v29.2D, v0.2D // ...........*.................................................................................................................................................................. + ldr q0, [x4, #32] // ..............................................................................................*............................................................................... + ldr q11, [x4, #16] // .............................................................................................*................................................................................ + ldr q6, [x4], #64 // ............................................................................................*................................................................................. + // gap // .............................................................................................................................................................................. + trn2 v23.2D, v1.2D, v2.2D // ....................*......................................................................................................................................................... + trn1 v2.2D, v1.2D, v2.2D // ......................*....................................................................................................................................................... 
+ trn2 v1.2D, v5.2D, v12.2D // .....................*........................................................................................................................................................ + trn1 v5.2D, v5.2D, v12.2D // .......................*...................................................................................................................................................... + ldr q12, [x4, #-16] // ...............................................................................................*.............................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + sub v21.4S, v20.4S, v28.4S // ...................................*.......................................................................................................................................... + add v28.4S, v20.4S, v28.4S // ....................................*......................................................................................................................................... + sub v20.4S, v9.4S, v29.4S // ..............................*............................................................................................................................................... + add v29.4S, v9.4S, v29.4S // ...............................*.............................................................................................................................................. 
+ // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + sub v9.4S, v23.4S, v1.4S // .............................................................*................................................................................................................ + add v1.4S, v23.4S, v1.4S // ..............................................................*............................................................................................................... + sub v23.4S, v2.4S, v5.4S // ........................................................*..................................................................................................................... + add v5.4S, v2.4S, v5.4S // .........................................................*.................................................................................................................... + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. 
+ // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + mul v2.4S, v20.4S, v13.4S // ................................*............................................................................................................................................. + sqrdmulh v13.4S, v20.4S, v17.4S // .................................*............................................................................................................................................ + mul v17.4S, v21.4S, v31.4S // .....................................*........................................................................................................................................ + sqrdmulh v31.4S, v21.4S, v25.4S // ......................................*....................................................................................................................................... + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. 
+ mul v19.4S, v23.4S, v19.4S // ..........................................................*................................................................................................................... + sqrdmulh v15.4S, v23.4S, v15.4S // ...........................................................*.................................................................................................................. + mul v25.4S, v9.4S, v4.4S // ...............................................................*.............................................................................................................. + sqrdmulh v9.4S, v9.4S, v16.4S // ................................................................*............................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + sub v4.4S, v5.4S, v1.4S // ..................................................................*........................................................................................................... + add v5.4S, v5.4S, v1.4S // ...................................................................*.......................................................................................................... 
+ sub v1.4S, v29.4S, v28.4S // ........................................*..................................................................................................................................... + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + add v29.4S, v29.4S, v28.4S // .........................................*.................................................................................................................................... + mls v2.4S, v13.4S, v8.S[0] // ..................................*........................................................................................................................................... + mls v17.4S, v31.4S, v8.S[0] // .......................................*...................................................................................................................................... + // gap // .............................................................................................................................................................................. 
+ // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + mls v19.4S, v15.4S, v8.S[0] // ............................................................*................................................................................................................. + mls v25.4S, v9.4S, v8.S[0] // .................................................................*............................................................................................................ + mul v9.4S, v1.4S, v18.4S // ..........................................*................................................................................................................................... + sqrdmulh v15.4S, v1.4S, v27.4S // ...........................................*.................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. 
+ // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + mul v13.4S, v4.4S, v3.4S // ....................................................................*......................................................................................................... + sqrdmulh v31.4S, v4.4S, v22.4S // .....................................................................*........................................................................................................ + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. 
+ sub v28.4S, v2.4S, v17.4S // .............................................*................................................................................................................................ + add v2.4S, v2.4S, v17.4S // ..............................................*............................................................................................................................... + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + sub v17.4S, v19.4S, v25.4S // .......................................................................*...................................................................................................... + add v19.4S, v19.4S, v25.4S // ........................................................................*..................................................................................................... 
+ mls v9.4S, v15.4S, v8.S[0] // ............................................*................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + mul v15.4S, v28.4S, v18.4S // ...............................................*.............................................................................................................................. + sqrdmulh v18.4S, v28.4S, v27.4S // ................................................*............................................................................................................................. + mls v13.4S, v31.4S, v8.S[0] // ......................................................................*....................................................................................................... + trn1 v27.4S, v29.4S, v2.4S // ............................................................................*................................................................................................. 
+ // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + mul v31.4S, v17.4S, v3.4S // .........................................................................*.................................................................................................... + sqrdmulh v17.4S, v17.4S, v22.4S // ..........................................................................*................................................................................................... + trn2 v29.4S, v29.4S, v2.4S // .............................................................................*................................................................................................ + trn1 v2.4S, v5.4S, v19.4S // ....................................................................................*......................................................................................... + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. 
+ // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + trn2 v19.4S, v5.4S, v19.4S // .....................................................................................*........................................................................................ + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. 
+ mls v15.4S, v18.4S, v8.S[0] // .................................................*............................................................................................................................ + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + mls v31.4S, v17.4S, v8.S[0] // ...........................................................................*.................................................................................................. + // gap // .............................................................................................................................................................................. 
+ // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. 
+ // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + trn1 v5.4S, v9.4S, v15.4S // ..............................................................................*............................................................................................... + trn2 v9.4S, v9.4S, v15.4S // ...............................................................................*.............................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. 
+ // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + trn1 v15.4S, v13.4S, v31.4S // ......................................................................................*....................................................................................... + trn2 v13.4S, v13.4S, v31.4S // .......................................................................................*...................................................................................... + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. 
+ trn2 v18.2D, v27.2D, v5.2D // ................................................................................*............................................................................................. + trn1 v5.2D, v27.2D, v5.2D // ..................................................................................*........................................................................................... + trn2 v27.2D, v29.2D, v9.2D // .................................................................................*............................................................................................ + trn1 v29.2D, v29.2D, v9.2D // ...................................................................................*.......................................................................................... + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + trn2 v9.2D, v2.2D, v15.2D // ........................................................................................*..................................................................................... + trn1 v2.2D, v2.2D, v15.2D // ..........................................................................................*................................................................................... 
+ trn2 v15.2D, v19.2D, v13.2D // .........................................................................................*.................................................................................... + trn1 v19.2D, v19.2D, v13.2D // ...........................................................................................*.................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + sub v13.4S, v18.4S, v27.4S // .....................................................................................................*........................................................................ + add v18.4S, v18.4S, v27.4S // ......................................................................................................*....................................................................... + sub v27.4S, v5.4S, v29.4S // ................................................................................................*............................................................................. + add v29.4S, v5.4S, v29.4S // .................................................................................................*............................................................................ 
+ // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + sub v5.4S, v9.4S, v15.4S // ...............................................................................................................*.............................................................. + add v15.4S, v9.4S, v15.4S // ................................................................................................................*............................................................. + sub v17.4S, v2.4S, v19.4S // ..........................................................................................................*................................................................... + add v19.4S, v2.4S, v19.4S // ...........................................................................................................*.................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. 
+ // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + mul v2.4S, v13.4S, v0.S[0] // .......................................................................................................*...................................................................... + sqrdmulh v13.4S, v13.4S, v0.S[1] // ........................................................................................................*..................................................................... + mul v9.4S, v27.4S, v11.S[2] // ..................................................................................................*........................................................................... + sqrdmulh v27.4S, v27.4S, v11.S[3] // ...................................................................................................*.......................................................................... + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. 
+ mul v31.4S, v17.4S, v0.S[2] // ............................................................................................................*................................................................. + sqrdmulh v0.4S, v17.4S, v0.S[3] // .............................................................................................................*................................................................ + mul v17.4S, v5.4S, v12.S[0] // .................................................................................................................*............................................................ + sqrdmulh v5.4S, v5.4S, v12.S[1] // ..................................................................................................................*........................................................... + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + sub v12.4S, v29.4S, v18.4S // ....................................................................................................................*......................................................... 
+ add v29.4S, v29.4S, v18.4S // .....................................................................................................................*........................................................ + sub v18.4S, v19.4S, v15.4S // ..............................................................................................................................*............................................... + add v19.4S, v19.4S, v15.4S // ...............................................................................................................................*.............................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + mls v9.4S, v27.4S, v8.S[0] // ....................................................................................................*......................................................................... + mls v2.4S, v13.4S, v8.S[0] // .........................................................................................................*.................................................................... + // gap // .............................................................................................................................................................................. 
+ // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + mls v31.4S, v0.4S, v8.S[0] // ..............................................................................................................*............................................................... + mls v17.4S, v5.4S, v8.S[0] // ...................................................................................................................*.......................................................... + mul v5.4S, v12.4S, v6.S[2] // ......................................................................................................................*....................................................... + sqrdmulh v0.4S, v12.4S, v6.S[3] // .......................................................................................................................*...................................................... + // gap // .............................................................................................................................................................................. 
+ // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + mul v12.4S, v18.4S, v11.S[0] // ................................................................................................................................*............................................. + sqrdmulh v15.4S, v18.4S, v11.S[1] // .................................................................................................................................*............................................ + srshr v13.4S, v29.4S, #23 // ........................................................................................................................................*..................................... + srshr v18.4S, v19.4S, #23 // ............................................................................................................................................*................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. 
+ // gap // .............................................................................................................................................................................. + sub v27.4S, v9.4S, v2.4S // .........................................................................................................................*.................................................... + add v9.4S, v9.4S, v2.4S // ..........................................................................................................................*................................................... + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + sub v2.4S, v31.4S, v17.4S // ...................................................................................................................................*.......................................... 
+ add v17.4S, v31.4S, v17.4S // ....................................................................................................................................*......................................... + mls v5.4S, v0.4S, v8.S[0] // ........................................................................................................................*..................................................... + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + mls v29.4S, v13.4S, v8.4S // .........................................................................................................................................*.................................... + mls v12.4S, v15.4S, v8.S[0] // ..................................................................................................................................*........................................... + mul v0.4S, v27.4S, v6.S[2] // ...........................................................................................................................*.................................................. 
+ sqrdmulh v15.4S, v27.4S, v6.S[3] // ............................................................................................................................*................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + mul v13.4S, v2.4S, v11.S[0] // .....................................................................................................................................*........................................ + sqrdmulh v2.4S, v2.4S, v11.S[1] // ......................................................................................................................................*....................................... + srshr v27.4S, v9.4S, #23 // ..........................................................................................................................................*................................... + srshr v31.4S, v17.4S, #23 // ..............................................................................................................................................*............................... + // gap // .............................................................................................................................................................................. 
+ // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + mls v19.4S, v18.4S, v8.4S // .............................................................................................................................................*................................ + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. 
+ // gap // .............................................................................................................................................................................. + add v18.4S, v5.4S, v12.4S // ...........................................................................................................................................................*.................. + sub v5.4S, v5.4S, v12.4S // ..........................................................................................................................................................*................... + mls v0.4S, v15.4S, v8.S[0] // .............................................................................................................................*................................................ + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + mls v9.4S, v27.4S, v8.4S // ...........................................................................................................................................*.................................. 
+ mls v17.4S, v31.4S, v8.4S // ...............................................................................................................................................*.............................. + mls v13.4S, v2.4S, v8.S[0] // .......................................................................................................................................*...................................... + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + add v2.4S, v29.4S, v19.4S // .................................................................................................................................................*............................ + sub v29.4S, v29.4S, v19.4S // ................................................................................................................................................*............................. + str q18, [x1, #32] // ......................................................................................................................................................................*....... 
+ mul v19.4S, v5.4S, v6.S[0] // ............................................................................................................................................................*................. + sqrdmulh v5.4S, v5.4S, v6.S[1] // .............................................................................................................................................................*................ + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. 
+ // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + sub v12.4S, v0.4S, v13.4S // ...............................................................................................................................................................*.............. + add v0.4S, v0.4S, v13.4S // ................................................................................................................................................................*............. + sub v15.4S, v9.4S, v17.4S // .....................................................................................................................................................*........................ + add v9.4S, v9.4S, v17.4S // ......................................................................................................................................................*....................... + str q2, [x1], #(16*4) // ....................................................................................................................................................................*......... + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. 
+ // gap // .............................................................................................................................................................................. + mul v2.4S, v29.4S, v6.S[0] // ..................................................................................................................................................*........................... + sqrdmulh v29.4S, v29.4S, v6.S[1] // ...................................................................................................................................................*.......................... + mls v19.4S, v5.4S, v8.S[0] // ..............................................................................................................................................................*............... + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + mul v5.4S, v12.4S, v6.S[0] // .................................................................................................................................................................*............ 
+ sqrdmulh v12.4S, v12.4S, v6.S[1] // ..................................................................................................................................................................*........... + mul v13.4S, v15.4S, v6.S[0] // .......................................................................................................................................................*...................... + sqrdmulh v15.4S, v15.4S, v6.S[1] // ........................................................................................................................................................*..................... + str q0, [x1, #-16] // .......................................................................................................................................................................*...... + str q9, [x1, #-48] // .....................................................................................................................................................................*........ + add x1, x1, #64 // ............................................................................................................................................................................*. + // gap // .............................................................................................................................................................................. + ldr q0, [x1, #32] // ..e........................................................................................................................................................................... + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. 
+ // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + mls v2.4S, v29.4S, v8.S[0] // ....................................................................................................................................................*......................... + str q19, [x2, #32] // ..........................................................................................................................................................................*... + ldr q18, [x1, #48] // ...e.......................................................................................................................................................................... + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. 
+ // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + mls v5.4S, v12.4S, v8.S[0] // ...................................................................................................................................................................*.......... + mls v13.4S, v15.4S, v8.S[0] // .........................................................................................................................................................*.................... + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. 
+ // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + str q2, [x2], #(16*4) // ........................................................................................................................................................................*..... 
+ // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + str q5, [x2, #-16] // ...........................................................................................................................................................................*.. + str q13, [x2, #-48] // .........................................................................................................................................................................*.... 
+ add x2, x2, #64 // .............................................................................................................................................................................* + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. // original source code - // ldr q9, [x1, #0] // .........................................e....|................................................................................................................................................................e.... - // ldr q10, [x1, #16] // ........................................e.....|...............................................................................................................................................................e..... - // ldr q11, [x1, #32] // .......................................e......|..............................................................................................................................................................e...... 
- // ldr q12, [x1, #48] // ...................................e..........|..........................................................................................................................................................e.......... - // trn1 v25.4s, v9.4s, v10.4s // ..............................................|....*................................................................................................................................................................ - // trn2 v26.4s, v9.4s, v10.4s // ..............................................|.....*............................................................................................................................................................... - // trn1 v27.4s, v11.4s, v12.4s // ..............................................|.*................................................................................................................................................................... - // trn2 v28.4s, v11.4s, v12.4s // ..............................................|.......*............................................................................................................................................................. - // trn2 v11.2d, v25.2d, v27.2d // ..............................................|.........*........................................................................................................................................................... - // trn2 v12.2d, v26.2d, v28.2d // ..............................................|...............*..................................................................................................................................................... - // trn1 v9.2d, v25.2d, v27.2d // ..............................................|..........*.......................................................................................................................................................... 
- // trn1 v10.2d, v26.2d, v28.2d // ..............................................|................*.................................................................................................................................................... - // ldr q13, [x2, #0] // ..............................................|..*.................................................................................................................................................................. - // ldr q14, [x2, #16] // ..............................................|*.................................................................................................................................................................... - // ldr q15, [x2, #32] // ..............................................|...*................................................................................................................................................................. - // ldr q16, [x2, #48] // ..............................................*..................................................................................................................................................................... - // trn1 v25.4s, v13.4s, v14.4s // ..............................................|....................*................................................................................................................................................ - // trn2 v26.4s, v13.4s, v14.4s // ..............................................|...................*................................................................................................................................................. - // trn1 v27.4s, v15.4s, v16.4s // ..............................................|.................*................................................................................................................................................... 
- // trn2 v28.4s, v15.4s, v16.4s // ..............................................|..................*.................................................................................................................................................. - // trn2 v15.2d, v25.2d, v27.2d // ..............................................|.........................*........................................................................................................................................... - // trn2 v16.2d, v26.2d, v28.2d // ..............................................|..........................*.......................................................................................................................................... - // trn1 v13.2d, v25.2d, v27.2d // ..............................................|........................*............................................................................................................................................ - // trn1 v14.2d, v26.2d, v28.2d // ..............................................|...........................*......................................................................................................................................... - // ldr q0, [x5], #(12*16) // ..............................................|...................................*................................................................................................................................. - // ldr q4, [x5, #(-12*16 + 1*16)] // e.............................................|.......................................................................................................................e............................................. - // ldr q1, [x5, #(-12*16 + 2*16)] // ..............................................|............*........................................................................................................................................................ 
- // ldr q5, [x5, #(-12*16 + 3*16)] // ..............................................|..............*...................................................................................................................................................... - // ldr q2, [x5, #(-12*16 + 4*16)] // ..............................................|.............*....................................................................................................................................................... - // ldr q6, [x5, #(-12*16 + 5*16)] // ..............................................|...........*......................................................................................................................................................... - // sub v24.4s, v9.4s, v10.4s // ..............................................|......................*.............................................................................................................................................. - // add v9.4s, v9.4s, v10.4s // ..............................................|.................................*................................................................................................................................... - // mul v10.4s, v24.4s, v1.4s // ..............................................|..............................*...................................................................................................................................... - // sqrdmulh v24.4s, v24.4s, v5.4s // ..............................................|...............................*..................................................................................................................................... - // mls v10.4s, v24.4s, v8.s[0] // ..............................................|.................................................*................................................................................................................... 
- // sub v24.4s, v11.4s, v12.4s // ..............................................|.......................*............................................................................................................................................. - // add v11.4s, v11.4s, v12.4s // ..............................................|......................................*.............................................................................................................................. - // mul v12.4s, v24.4s, v2.4s // ..............................................|.............................*....................................................................................................................................... - // sqrdmulh v24.4s, v24.4s, v6.4s // ..............................................|.....................................*............................................................................................................................... - // mls v12.4s, v24.4s, v8.s[0] // ..............................................|................................................*.................................................................................................................... - // sub v24.4s, v9.4s, v11.4s // ..............................................|..................................................*.................................................................................................................. - // add v9.4s, v9.4s, v11.4s // ..............................................|.....................................................*............................................................................................................... - // mul v11.4s, v24.4s, v0.4s // ..............................................|......................................................*.............................................................................................................. 
- // sqrdmulh v24.4s, v24.4s, v4.4s // ..............................................|.......................................................*............................................................................................................. - // mls v11.4s, v24.4s, v8.s[0] // ..............................................|.................................................................*................................................................................................... - // sub v24.4s, v10.4s, v12.4s // ..............................................|.........................................................*........................................................................................................... - // add v10.4s, v10.4s, v12.4s // ..............................................|........................................................*............................................................................................................ - // mul v12.4s, v24.4s, v0.4s // ..............................................|...............................................................*..................................................................................................... - // sqrdmulh v24.4s, v24.4s, v4.4s // ..............................................|................................................................*.................................................................................................... - // mls v12.4s, v24.4s, v8.s[0] // ..............................................|......................................................................*.............................................................................................. - // ldr q0, [x5, #(-12*16 + 6*16)] // ..............................................|...............................................*..................................................................................................................... 
- // ldr q4, [x5, #(-12*16 + 7*16)] // ..............................................|............................*........................................................................................................................................ - // ldr q1, [x5, #(-12*16 + 8*16)] // ................................e.............|.......................................................................................................................................................e............. - // ldr q5, [x5, #(-12*16 + 9*16)] // ..............................................|........*............................................................................................................................................................ - // ldr q2, [x5, #(-12*16 + 10*16)] // ..............................................|.....................*............................................................................................................................................... - // ldr q6, [x5, #(-12*16 + 11*16)] // ..............................................|......*.............................................................................................................................................................. - // sub v24.4s, v13.4s, v14.4s // ..............................................|..................................*.................................................................................................................................. - // add v13.4s, v13.4s, v14.4s // ..............................................|................................*.................................................................................................................................... 
- // mul v14.4s, v24.4s, v1.4s // ..............................................|............................................*........................................................................................................................ - // sqrdmulh v24.4s, v24.4s, v5.4s // ..............................................|...........................................*......................................................................................................................... - // mls v14.4s, v24.4s, v8.s[0] // ..............................................|...................................................*................................................................................................................. - // sub v24.4s, v15.4s, v16.4s // ..............................................|....................................*................................................................................................................................ - // add v15.4s, v15.4s, v16.4s // ..............................................|.......................................*............................................................................................................................. - // mul v16.4s, v24.4s, v2.4s // ..............................................|.........................................*........................................................................................................................... - // sqrdmulh v24.4s, v24.4s, v6.4s // ..............................................|..........................................*.......................................................................................................................... - // mls v16.4s, v24.4s, v8.s[0] // ..............................................|....................................................*................................................................................................................ 
- // sub v24.4s, v13.4s, v15.4s // ..............................................|..............................................*...................................................................................................................... - // add v13.4s, v13.4s, v15.4s // ..............................................|.............................................*....................................................................................................................... - // mul v15.4s, v24.4s, v0.4s // ..............................................|.............................................................*....................................................................................................... - // sqrdmulh v24.4s, v24.4s, v4.4s // ..............................................|..........................................................*.......................................................................................................... - // mls v15.4s, v24.4s, v8.s[0] // ..............................................|........................................................................*............................................................................................ - // sub v24.4s, v14.4s, v16.4s // ..............................................|...........................................................*......................................................................................................... - // add v14.4s, v14.4s, v16.4s // ..............................................|............................................................*........................................................................................................ - // mul v16.4s, v24.4s, v0.4s // ..............................................|...................................................................*................................................................................................. 
- // sqrdmulh v24.4s, v24.4s, v4.4s // ..............................................|....................................................................*................................................................................................ - // mls v16.4s, v24.4s, v8.s[0] // ..............................................|.......................................................................*............................................................................................. - // trn1 v25.4s, v9.4s, v10.4s // ..............................................|..............................................................*...................................................................................................... - // trn2 v26.4s, v9.4s, v10.4s // ..............................................|.....................................................................*............................................................................................... - // trn1 v27.4s, v11.4s, v12.4s // ..............................................|..........................................................................*.......................................................................................... - // trn2 v28.4s, v11.4s, v12.4s // ..............................................|...........................................................................*......................................................................................... - // trn2 v11.2d, v25.2d, v27.2d // ..............................................|...............................................................................*..................................................................................... - // trn2 v12.2d, v26.2d, v28.2d // ..............................................|................................................................................*.................................................................................... 
- // trn1 v9.2d, v25.2d, v27.2d // ..............................................|.................................................................................*................................................................................... - // trn1 v10.2d, v26.2d, v28.2d // ..............................................|..................................................................................*.................................................................................. - // trn1 v25.4s, v13.4s, v14.4s // ..............................................|.........................................................................*........................................................................................... - // trn2 v26.4s, v13.4s, v14.4s // ..............................................|..................................................................*.................................................................................................. - // trn1 v27.4s, v15.4s, v16.4s // ..............................................|..............................................................................*...................................................................................... - // trn2 v28.4s, v15.4s, v16.4s // ..............................................|.............................................................................*....................................................................................... - // trn2 v15.2d, v25.2d, v27.2d // ..............................................|......................................................................................*.............................................................................. - // trn2 v16.2d, v26.2d, v28.2d // ..............................................|.......................................................................................*............................................................................. 
- // trn1 v13.2d, v25.2d, v27.2d // ..............................................|....................................................................................*................................................................................ - // trn1 v14.2d, v26.2d, v28.2d // ..............................................|.....................................................................................*............................................................................... - // ldr q0, [x4], #64 // ..............................................|.............................................................................................................*....................................................... - // ldr q1, [x4, #(-64 + 16)] // ..............................................|............................................................................*........................................................................................ - // ldr q2, [x4, #(-64 + 32)] // ..............................................|...................................................................................*................................................................................. - // ldr q3, [x4, #(-64 + 48)] // ..............................................|........................................*............................................................................................................................ - // sub v24.4s, v9.4s, v10.4s // ..............................................|.........................................................................................*........................................................................... - // add v9.4s, v9.4s, v10.4s // ..............................................|......................................................................................................*.............................................................. 
- // mul v10.4s, v24.4s, v1.s[2] // ..............................................|................................................................................................*.................................................................... - // sqrdmulh v24.4s, v24.4s, v1.s[3] // ..............................................|.................................................................................................*................................................................... - // mls v10.4s, v24.4s, v8.s[0] // ..............................................|..........................................................................................................*.......................................................... - // sub v24.4s, v11.4s, v12.4s // ..............................................|........................................................................................*............................................................................ - // add v11.4s, v11.4s, v12.4s // ..............................................|..........................................................................................*.......................................................................... - // mul v12.4s, v24.4s, v2.s[0] // ..............................................|..............................................................................................*...................................................................... - // sqrdmulh v24.4s, v24.4s, v2.s[1] // ..............................................|...............................................................................................*..................................................................... - // mls v12.4s, v24.4s, v8.s[0] // ..............................................|........................................................................................................*............................................................ 
- // sub v24.4s, v13.4s, v14.4s // ..............................................|...........................................................................................*......................................................................... - // add v13.4s, v13.4s, v14.4s // ..............................................|.............................................................................................*....................................................................... - // mul v14.4s, v24.4s, v2.s[2] // ..............................................|..................................................................................................*.................................................................. - // sqrdmulh v24.4s, v24.4s, v2.s[3] // ..............................................|....................................................................................................*................................................................ - // mls v14.4s, v24.4s, v8.s[0] // ..............................................|............................................................................................................*........................................................ - // sub v24.4s, v15.4s, v16.4s // ..............................................|............................................................................................*........................................................................ - // add v15.4s, v15.4s, v16.4s // ..............................................|.......................................................................................................*............................................................. - // mul v16.4s, v24.4s, v3.s[0] // ..............................................|...................................................................................................*................................................................. 
- // sqrdmulh v24.4s, v24.4s, v3.s[1] // ..............................................|.....................................................................................................*............................................................... - // mls v16.4s, v24.4s, v8.s[0] // ..............................................|...........................................................................................................*......................................................... - // sub v24.4s, v9.4s, v11.4s // ..............................................|.........................................................................................................*........................................................... - // add v9.4s, v9.4s, v11.4s // ..............................................|...............................................................................................................*..................................................... - // mul v11.4s, v24.4s, v0.s[2] // ..............................................|.....................................................................................................................*............................................... - // sqrdmulh v24.4s, v24.4s, v0.s[3] // ..............................................|......................................................................................................................*.............................................. - // mls v11.4s, v24.4s, v8.s[0] // ..........*...................................|.................................................................................................................................*................................... - // sub v24.4s, v10.4s, v12.4s // ..............................................|..................................................................................................................*.................................................. 
- // add v10.4s, v10.4s, v12.4s // .........*....................................|................................................................................................................................*.................................... - // mul v12.4s, v24.4s, v0.s[2] // ....*.........................................|...........................................................................................................................*......................................... - // sqrdmulh v24.4s, v24.4s, v0.s[3] // .....*........................................|............................................................................................................................*........................................ - // mls v12.4s, v24.4s, v8.s[0] // ..............*...............................|.....................................................................................................................................*............................... - // sub v24.4s, v13.4s, v15.4s // ..............................................|..............................................................................................................*...................................................... - // add v13.4s, v13.4s, v15.4s // ..............................................|................................................................................................................*.................................................... - // mul v15.4s, v24.4s, v1.s[0] // ..............................................|...................................................................................................................*................................................. - // sqrdmulh v24.4s, v24.4s, v1.s[1] // ..............................................|....................................................................................................................*................................................ 
- // mls v15.4s, v24.4s, v8.s[0] // ......*.......................................|.............................................................................................................................*....................................... - // sub v24.4s, v14.4s, v16.4s // ..............................................|.................................................................................................................*................................................... - // add v14.4s, v14.4s, v16.4s // ............*.................................|...................................................................................................................................*................................. - // mul v16.4s, v24.4s, v1.s[0] // ...*..........................................|..........................................................................................................................*.......................................... - // sqrdmulh v24.4s, v24.4s, v1.s[1] // ..*...........................................|.........................................................................................................................*........................................... - // mls v16.4s, v24.4s, v8.s[0] // .............*................................|....................................................................................................................................*................................ - // sub v24.4s, v9.4s, v13.4s // .*............................................|........................................................................................................................*............................................ - // add v9.4s, v9.4s, v13.4s // .......*......................................|..............................................................................................................................*...................................... 
- // mul v13.4s, v24.4s, v0.s[0] // ........*.....................................|...............................................................................................................................*..................................... - // sqrdmulh v24.4s, v24.4s, v0.s[1] // ...........*..................................|..................................................................................................................................*.................................. - // mls v13.4s, v24.4s, v8.s[0] // ....................*.........................|...........................................................................................................................................*......................... - // sub v24.4s, v10.4s, v14.4s // .................*............................|........................................................................................................................................*............................ - // add v10.4s, v10.4s, v14.4s // ................*.............................|.......................................................................................................................................*............................. - // mul v14.4s, v24.4s, v0.s[0] // .........................*....................|................................................................................................................................................*.................... - // sqrdmulh v24.4s, v24.4s, v0.s[1] // ........................*.....................|...............................................................................................................................................*..................... - // mls v14.4s, v24.4s, v8.s[0] // ....................................*.........|...........................................................................................................................................................*......... 
- // sub v24.4s, v11.4s, v15.4s // ..................*...........................|.........................................................................................................................................*........................... - // add v11.4s, v11.4s, v15.4s // ...................*..........................|..........................................................................................................................................*.......................... - // mul v15.4s, v24.4s, v0.s[0] // ...........................*..................|..................................................................................................................................................*.................. - // sqrdmulh v24.4s, v24.4s, v0.s[1] // .............................*................|....................................................................................................................................................*................ - // mls v15.4s, v24.4s, v8.s[0] // .....................................*........|............................................................................................................................................................*........ - // sub v24.4s, v12.4s, v16.4s // .....................*........................|............................................................................................................................................*........................ - // add v12.4s, v12.4s, v16.4s // ......................*.......................|.............................................................................................................................................*....................... - // mul v16.4s, v24.4s, v0.s[0] // ..............................*...............|.....................................................................................................................................................*............... 
- // sqrdmulh v24.4s, v24.4s, v0.s[1] // ...............................*..............|......................................................................................................................................................*.............. - // mls v16.4s, v24.4s, v8.s[0] // ......................................*.......|.............................................................................................................................................................*....... - // str q9, [x1], #(16*4) // ...............*..............................|......................................................................................................................................*.............................. - // str q10, [x1, #(-16*4 + 1*16)] // .......................*......................|..............................................................................................................................................*...................... - // str q11, [x1, #(-16*4 + 2*16)] // ..........................*...................|.................................................................................................................................................*................... - // str q12, [x1, #(-16*4 + 3*16)] // ............................*.................|...................................................................................................................................................*................. - // str q13, [x2], #(16*4) // ..................................*...........|.........................................................................................................................................................*........... - // str q14, [x2, #(-16*4 + 1*16)] // ..........................................*...|.................................................................................................................................................................*... 
- // str q15, [x2, #(-16*4 + 2*16)] // ...........................................*..|..................................................................................................................................................................*.. - // str q16, [x2, #(-16*4 + 3*16)] // ............................................*.|...................................................................................................................................................................*. - // add x1, x1, #64 // .................................*............|........................................................................................................................................................*............ - // add x2, x2, #64 // .............................................*|....................................................................................................................................................................* + // ldr q9, [x1, #0] // ..........*............................................................................................................................................................................. + // ldr q10, [x1, #16] // ..........|*............................................................................................................................................................................ + // ldr q11, [x1, #32] // e.........|...................................................................................................................................................................e......... + // ldr q12, [x1, #48] // ...e......|......................................................................................................................................................................e...... 
+ // trn1 v25.4s, v9.4s, v10.4s // ..........|.............*............................................................................................................................................................... + // trn2 v26.4s, v9.4s, v10.4s // ..........|..............*.............................................................................................................................................................. + // trn1 v27.4s, v11.4s, v12.4s // ..........|..*.......................................................................................................................................................................... + // trn2 v28.4s, v11.4s, v12.4s // ..........|...*......................................................................................................................................................................... + // trn2 v11.2d, v25.2d, v27.2d // ..........|.........................*................................................................................................................................................... + // trn2 v12.2d, v26.2d, v28.2d // ..........|...........................*................................................................................................................................................. + // trn1 v9.2d, v25.2d, v27.2d // ..........|..........................*.................................................................................................................................................. + // trn1 v10.2d, v26.2d, v28.2d // ..........|............................*................................................................................................................................................ + // ldr q13, [x2, #0] // ..........|.*........................................................................................................................................................................... 
+ // ldr q14, [x2, #16] // ..........|....*........................................................................................................................................................................ + // ldr q15, [x2, #32] // ..........|.....*....................................................................................................................................................................... + // ldr q16, [x2, #48] // ..........|......*...................................................................................................................................................................... + // trn1 v25.4s, v13.4s, v14.4s // ..........|..................*.......................................................................................................................................................... + // trn2 v26.4s, v13.4s, v14.4s // ..........|...................*......................................................................................................................................................... + // trn1 v27.4s, v15.4s, v16.4s // ..........|....................*........................................................................................................................................................ + // trn2 v28.4s, v15.4s, v16.4s // ..........|.....................*....................................................................................................................................................... + // trn2 v15.2d, v25.2d, v27.2d // ..........|................................*............................................................................................................................................ + // trn2 v16.2d, v26.2d, v28.2d // ..........|..................................*.......................................................................................................................................... 
+ // trn1 v13.2d, v25.2d, v27.2d // ..........|.................................*........................................................................................................................................... + // trn1 v14.2d, v26.2d, v28.2d // ..........|...................................*......................................................................................................................................... + // ldr q0, [x5], #(12*16) // ..........|........*.................................................................................................................................................................... + // ldr q4, [x5, #(-12*16 + 1*16)] // ..........|.........*................................................................................................................................................................... + // ldr q1, [x5, #(-12*16 + 2*16)] // ..........|.......*..................................................................................................................................................................... + // ldr q5, [x5, #(-12*16 + 3*16)] // ..........|..........*.................................................................................................................................................................. + // ldr q2, [x5, #(-12*16 + 4*16)] // ..........|...........*................................................................................................................................................................. + // ldr q6, [x5, #(-12*16 + 5*16)] // ..........|............*................................................................................................................................................................ + // sub v24.4s, v9.4s, v10.4s // ..........|.......................................*..................................................................................................................................... 
+ // add v9.4s, v9.4s, v10.4s // ..........|........................................*.................................................................................................................................... + // mul v10.4s, v24.4s, v1.4s // ..........|.............................................*............................................................................................................................... + // sqrdmulh v24.4s, v24.4s, v5.4s // ..........|..............................................*.............................................................................................................................. + // mls v10.4s, v24.4s, v8.s[0] // ..........|.........................................................*................................................................................................................... + // sub v24.4s, v11.4s, v12.4s // ..........|.....................................*....................................................................................................................................... + // add v11.4s, v11.4s, v12.4s // ..........|......................................*...................................................................................................................................... + // mul v12.4s, v24.4s, v2.4s // ..........|...............................................*............................................................................................................................. + // sqrdmulh v24.4s, v24.4s, v6.4s // ..........|................................................*............................................................................................................................ + // mls v12.4s, v24.4s, v8.s[0] // ..........|..........................................................*.................................................................................................................. 
+ // sub v24.4s, v9.4s, v11.4s // ..........|.......................................................*..................................................................................................................... + // add v9.4s, v9.4s, v11.4s // ..........|........................................................*.................................................................................................................... + // mul v11.4s, v24.4s, v0.4s // ..........|.............................................................*............................................................................................................... + // sqrdmulh v24.4s, v24.4s, v4.4s // ..........|..............................................................*.............................................................................................................. + // mls v11.4s, v24.4s, v8.s[0] // ..........|.....................................................................*....................................................................................................... + // sub v24.4s, v10.4s, v12.4s // ..........|.................................................................*........................................................................................................... + // add v10.4s, v10.4s, v12.4s // ..........|..................................................................*.......................................................................................................... + // mul v12.4s, v24.4s, v0.4s // ..........|......................................................................*...................................................................................................... + // sqrdmulh v24.4s, v24.4s, v4.4s // ..........|.......................................................................*..................................................................................................... 
+ // mls v12.4s, v24.4s, v8.s[0] // ..........|...............................................................................*............................................................................................. + // ldr q0, [x5, #(-12*16 + 6*16)] // ..........|................*............................................................................................................................................................ + // ldr q4, [x5, #(-12*16 + 7*16)] // ..........|.................*........................................................................................................................................................... + // ldr q1, [x5, #(-12*16 + 8*16)] // ..........|...............*............................................................................................................................................................. + // ldr q5, [x5, #(-12*16 + 9*16)] // ..........|......................*...................................................................................................................................................... + // ldr q2, [x5, #(-12*16 + 10*16)] // ..........|.......................*..................................................................................................................................................... + // ldr q6, [x5, #(-12*16 + 11*16)] // ..........|........................*.................................................................................................................................................... + // sub v24.4s, v13.4s, v14.4s // ..........|...........................................*................................................................................................................................. 
+ // add v13.4s, v13.4s, v14.4s // ..........|............................................*................................................................................................................................ + // mul v14.4s, v24.4s, v1.4s // ..........|.................................................*........................................................................................................................... + // sqrdmulh v24.4s, v24.4s, v5.4s // ..........|..................................................*.......................................................................................................................... + // mls v14.4s, v24.4s, v8.s[0] // ..........|...........................................................*................................................................................................................. + // sub v24.4s, v15.4s, v16.4s // ..........|.........................................*................................................................................................................................... + // add v15.4s, v15.4s, v16.4s // ..........|..........................................*.................................................................................................................................. + // mul v16.4s, v24.4s, v2.4s // ..........|...................................................*......................................................................................................................... + // sqrdmulh v24.4s, v24.4s, v6.4s // ..........|....................................................*........................................................................................................................ + // mls v16.4s, v24.4s, v8.s[0] // ..........|............................................................*................................................................................................................ 
+ // sub v24.4s, v13.4s, v15.4s // ..........|.....................................................*....................................................................................................................... + // add v13.4s, v13.4s, v15.4s // ..........|......................................................*...................................................................................................................... + // mul v15.4s, v24.4s, v0.4s // ..........|...............................................................*............................................................................................................. + // sqrdmulh v24.4s, v24.4s, v4.4s // ..........|................................................................*............................................................................................................ + // mls v15.4s, v24.4s, v8.s[0] // ..........|........................................................................*.................................................................................................... + // sub v24.4s, v14.4s, v16.4s // ..........|...................................................................*......................................................................................................... + // add v14.4s, v14.4s, v16.4s // ..........|....................................................................*........................................................................................................ + // mul v16.4s, v24.4s, v0.4s // ..........|..........................................................................*.................................................................................................. + // sqrdmulh v24.4s, v24.4s, v4.4s // ..........|...........................................................................*................................................................................................. 
+ // mls v16.4s, v24.4s, v8.s[0] // ..........|................................................................................*............................................................................................ + // trn1 v25.4s, v9.4s, v10.4s // ..........|.........................................................................*................................................................................................... + // trn2 v26.4s, v9.4s, v10.4s // ..........|............................................................................*................................................................................................ + // trn1 v27.4s, v11.4s, v12.4s // ..........|.................................................................................*........................................................................................... + // trn2 v28.4s, v11.4s, v12.4s // ..........|..................................................................................*.......................................................................................... + // trn2 v11.2d, v25.2d, v27.2d // ..........|.....................................................................................*....................................................................................... + // trn2 v12.2d, v26.2d, v28.2d // ..........|.......................................................................................*..................................................................................... + // trn1 v9.2d, v25.2d, v27.2d // ..........|......................................................................................*...................................................................................... + // trn1 v10.2d, v26.2d, v28.2d // ..........|........................................................................................*.................................................................................... 
+ // trn1 v25.4s, v13.4s, v14.4s // ..........|.............................................................................*............................................................................................... + // trn2 v26.4s, v13.4s, v14.4s // ..........|..............................................................................*.............................................................................................. + // trn1 v27.4s, v15.4s, v16.4s // ..........|...................................................................................*......................................................................................... + // trn2 v28.4s, v15.4s, v16.4s // ..........|....................................................................................*........................................................................................ + // trn2 v15.2d, v25.2d, v27.2d // ..........|.........................................................................................*................................................................................... + // trn2 v16.2d, v26.2d, v28.2d // ..........|...........................................................................................*................................................................................. + // trn1 v13.2d, v25.2d, v27.2d // ..........|..........................................................................................*.................................................................................. + // trn1 v14.2d, v26.2d, v28.2d // ..........|............................................................................................*................................................................................ + // ldr q0, [x4], #64 // ..........|...............................*............................................................................................................................................. 
+ // ldr q1, [x4, #(-64 + 16)] // ..........|..............................*.............................................................................................................................................. + // ldr q2, [x4, #(-64 + 32)] // ..........|.............................*............................................................................................................................................... + // ldr q3, [x4, #(-64 + 48)] // ..........|....................................*........................................................................................................................................ + // sub v24.4s, v9.4s, v10.4s // ..........|...............................................................................................*............................................................................. + // add v9.4s, v9.4s, v10.4s // ..........|................................................................................................*............................................................................ + // mul v10.4s, v24.4s, v1.s[2] // ..........|.......................................................................................................*..................................................................... + // sqrdmulh v24.4s, v24.4s, v1.s[3] // ..........|........................................................................................................*.................................................................... + // mls v10.4s, v24.4s, v8.s[0] // ..........|.................................................................................................................*........................................................... + // sub v24.4s, v11.4s, v12.4s // ..........|.............................................................................................*............................................................................... 
+ // add v11.4s, v11.4s, v12.4s // ..........|..............................................................................................*.............................................................................. + // mul v12.4s, v24.4s, v2.s[0] // ..........|.....................................................................................................*....................................................................... + // sqrdmulh v24.4s, v24.4s, v2.s[1] // ..........|......................................................................................................*...................................................................... + // mls v12.4s, v24.4s, v8.s[0] // ..........|..................................................................................................................*.......................................................... + // sub v24.4s, v13.4s, v14.4s // ..........|...................................................................................................*......................................................................... + // add v13.4s, v13.4s, v14.4s // ..........|....................................................................................................*........................................................................ + // mul v14.4s, v24.4s, v2.s[2] // ..........|.........................................................................................................*................................................................... + // sqrdmulh v24.4s, v24.4s, v2.s[3] // ..........|..........................................................................................................*.................................................................. + // mls v14.4s, v24.4s, v8.s[0] // ..........|...................................................................................................................*......................................................... 
+ // sub v24.4s, v15.4s, v16.4s // ..........|.................................................................................................*........................................................................... + // add v15.4s, v15.4s, v16.4s // ..........|..................................................................................................*.......................................................................... + // mul v16.4s, v24.4s, v3.s[0] // ..........|...........................................................................................................*................................................................. + // sqrdmulh v24.4s, v24.4s, v3.s[1] // ..........|............................................................................................................*................................................................ + // mls v16.4s, v24.4s, v8.s[0] // ..........|....................................................................................................................*........................................................ + // sub v24.4s, v9.4s, v11.4s // ..........|.............................................................................................................*............................................................... + // add v9.4s, v9.4s, v11.4s // ..........|..............................................................................................................*.............................................................. + // mul v11.4s, v24.4s, v0.s[2] // ..........|.....................................................................................................................*....................................................... + // sqrdmulh v24.4s, v24.4s, v0.s[3] // ..........|......................................................................................................................*...................................................... 
+ // mls v11.4s, v24.4s, v8.s[0] // ..........|...............................................................................................................................*............................................. + // sub v24.4s, v10.4s, v12.4s // ..........|...........................................................................................................................*................................................. + // add v10.4s, v10.4s, v12.4s // ..........|............................................................................................................................*................................................ + // mul v12.4s, v24.4s, v0.s[2] // ..........|..................................................................................................................................*.......................................... + // sqrdmulh v24.4s, v24.4s, v0.s[3] // ..........|...................................................................................................................................*......................................... + // mls v12.4s, v24.4s, v8.s[0] // ..........|...........................................................................................................................................*................................. + // sub v24.4s, v13.4s, v15.4s // ..........|...............................................................................................................*............................................................. + // add v13.4s, v13.4s, v15.4s // ..........|................................................................................................................*............................................................ + // mul v15.4s, v24.4s, v1.s[0] // ..........|.......................................................................................................................*..................................................... 
+ // sqrdmulh v24.4s, v24.4s, v1.s[1] // ..........|........................................................................................................................*.................................................... + // mls v15.4s, v24.4s, v8.s[0] // ..........|.................................................................................................................................*........................................... + // sub v24.4s, v14.4s, v16.4s // ..........|.............................................................................................................................*............................................... + // add v14.4s, v14.4s, v16.4s // ..........|..............................................................................................................................*.............................................. + // mul v16.4s, v24.4s, v1.s[0] // ..........|....................................................................................................................................*........................................ + // sqrdmulh v24.4s, v24.4s, v1.s[1] // ..........|.....................................................................................................................................*....................................... + // mls v16.4s, v24.4s, v8.s[0] // ..........|..............................................................................................................................................*.............................. + // srshr v24.4S, v9.4S, #23 // ..........|.........................................................................................................................*................................................... + // mls v9.4s, v24.4s, v8.4s // ..........|................................................................................................................................*............................................ 
+ // srshr v24.4S, v10.4S, #23 // ..........|......................................................................................................................................*...................................... + // mls v10.4s, v24.4s, v8.4s // ..........|............................................................................................................................................*................................ + // srshr v24.4S, v13.4S, #23 // ..........|..........................................................................................................................*.................................................. + // mls v13.4s, v24.4s, v8.4s // ..........|........................................................................................................................................*.................................... + // srshr v24.4S, v14.4S, #23 // ..........|.......................................................................................................................................*..................................... + // mls v14.4s, v24.4s, v8.4s // ..........|.............................................................................................................................................*............................... + // sub v24.4s, v9.4s, v13.4s // ..........|................................................................................................................................................*............................ + // add v9.4s, v9.4s, v13.4s // ..........|...............................................................................................................................................*............................. + // mul v13.4s, v24.4s, v0.s[0] // ..........|.........................................................................................................................................................*................... 
+ // sqrdmulh v24.4s, v24.4s, v0.s[1] // ..........|..........................................................................................................................................................*.................. + // mls v13.4s, v24.4s, v8.s[0] // .*........|....................................................................................................................................................................*........ + // sub v24.4s, v10.4s, v14.4s // ..........|......................................................................................................................................................*...................... + // add v10.4s, v10.4s, v14.4s // ..........|.......................................................................................................................................................*..................... + // mul v14.4s, v24.4s, v0.s[0] // ..........|..............................................................................................................................................................*.............. + // sqrdmulh v24.4s, v24.4s, v0.s[1] // ..........|...............................................................................................................................................................*............. + // mls v14.4s, v24.4s, v8.s[0] // .....*....|........................................................................................................................................................................*.... + // sub v24.4s, v11.4s, v15.4s // ..........|..........................................................................................................................................*.................................. + // add v11.4s, v11.4s, v15.4s // ..........|.........................................................................................................................................*................................... 
+ // mul v15.4s, v24.4s, v0.s[0] // ..........|..................................................................................................................................................*.......................... + // sqrdmulh v24.4s, v24.4s, v0.s[1] // ..........|...................................................................................................................................................*......................... + // mls v15.4s, v24.4s, v8.s[0] // ..........|...........................................................................................................................................................*................. + // sub v24.4s, v12.4s, v16.4s // ..........|....................................................................................................................................................*........................ + // add v12.4s, v12.4s, v16.4s // ..........|.....................................................................................................................................................*....................... + // mul v16.4s, v24.4s, v0.s[0] // ..........|............................................................................................................................................................*................ + // sqrdmulh v24.4s, v24.4s, v0.s[1] // ..........|.............................................................................................................................................................*............... + // mls v16.4s, v24.4s, v8.s[0] // ....*.....|.......................................................................................................................................................................*..... + // str q9, [x1], #(16*4) // ..........|........................................................................................................................................................*.................... 
+ // str q10, [x1, #(-16*4 + 1*16)] // ..........|.................................................................................................................................................................*........... + // str q11, [x1, #(-16*4 + 2*16)] // ..........|.................................................................................................................................................*........................... + // str q12, [x1, #(-16*4 + 3*16)] // ..........|................................................................................................................................................................*............ + // str q13, [x2], #(16*4) // ......*...|.........................................................................................................................................................................*... + // str q14, [x2, #(-16*4 + 1*16)] // ........*.|...........................................................................................................................................................................*. + // str q15, [x2, #(-16*4 + 2*16)] // ..*.......|.....................................................................................................................................................................*....... + // str q16, [x2, #(-16*4 + 3*16)] // .......*..|..........................................................................................................................................................................*.. + // add x1, x1, #64 // ..........|..................................................................................................................................................................*.......... 
+ // add x2, x2, #64 // .........*|............................................................................................................................................................................* sub count, count, #1 cbnz count, layer45678_start - trn1 v10.4S, v20.4S, v0.4S // ..*............................................................................................................................................................. - trn2 v22.4S, v20.4S, v0.4S // ........*....................................................................................................................................................... - trn1 v26.4S, v27.4S, v1.4S // .....*.......................................................................................................................................................... - trn2 v7.4S, v27.4S, v1.4S // ......*......................................................................................................................................................... - ldr q27, [x2, #16] // .*.............................................................................................................................................................. - ldr q13, [x2, #0] // ...*............................................................................................................................................................ - ldr q15, [x2, #48] // *............................................................................................................................................................... - // gap // ................................................................................................................................................................ - ldr q24, [x2, #32] // ....*........................................................................................................................................................... 
- ldr q6, [x5, #80] // ............*................................................................................................................................................... - ldr q11, [x5, #176] // .......*........................................................................................................................................................ - // gap // ................................................................................................................................................................ - // gap // ................................................................................................................................................................ - // gap // ................................................................................................................................................................ - // gap // ................................................................................................................................................................ - // gap // ................................................................................................................................................................ - trn2 v23.2D, v26.2D, v10.2D // ..........*..................................................................................................................................................... - trn1 v10.2D, v26.2D, v10.2D // ...........*.................................................................................................................................................... - trn2 v26.2D, v7.2D, v22.2D // ................*............................................................................................................................................... 
- trn1 v22.2D, v7.2D, v22.2D // .................*.............................................................................................................................................. - ldr q7, [x5, #32] // .............*.................................................................................................................................................. - ldr q12, [x5, #64] // ..............*................................................................................................................................................. - ldr q31, [x5, #48] // ...............*................................................................................................................................................ - // gap // ................................................................................................................................................................ - ldr q5, [x5, #144] // .........*...................................................................................................................................................... - ldr q9, [x5, #160] // ......................*......................................................................................................................................... - ldr q16, [x5, #112] // .............................*.................................................................................................................................. - // gap // ................................................................................................................................................................ - // gap // ................................................................................................................................................................ - // gap // ................................................................................................................................................................ 
- // gap // ................................................................................................................................................................ - // gap // ................................................................................................................................................................ - trn2 v20.4S, v13.4S, v27.4S // ....................*........................................................................................................................................... - trn1 v27.4S, v13.4S, v27.4S // .....................*.......................................................................................................................................... - sub v13.4S, v23.4S, v26.4S // ........................*....................................................................................................................................... - sub v4.4S, v10.4S, v22.4S // .......................*........................................................................................................................................ - ldr q17, [x5], #(12*16) // ....................................*........................................................................................................................... - ldr q1, [x5, #-96] // ................................................*............................................................................................................... - ldr q19, [x4, #48] // .........................................*...................................................................................................................... - // gap // ................................................................................................................................................................ 
- trn2 v21.4S, v24.4S, v15.4S // ...................*............................................................................................................................................ - trn1 v15.4S, v24.4S, v15.4S // ..................*............................................................................................................................................. - add v26.4S, v23.4S, v26.4S // .......................................*........................................................................................................................ - add v10.4S, v10.4S, v22.4S // ..................................*............................................................................................................................. - ldr q22, [x4, #32] // ....................................................................................*........................................................................... - ldr q24, [x4, #16] // .............................................................................*.................................................................................. - ldr q23, [x4], #64 // ..............................................................................................................*................................................. - // gap // ................................................................................................................................................................ - sqrdmulh v6.4S, v13.4S, v6.4S // ......................................*......................................................................................................................... - mul v7.4S, v4.4S, v7.4S // ...............................*................................................................................................................................ 
- mul v13.4S, v13.4S, v12.4S // ..............................*................................................................................................................................. - sqrdmulh v12.4S, v4.4S, v31.4S // ................................*............................................................................................................................... - // gap // ................................................................................................................................................................ - // gap // ................................................................................................................................................................ - // gap // ................................................................................................................................................................ - // gap // ................................................................................................................................................................ - trn2 v31.2D, v20.2D, v21.2D // ...........................*.................................................................................................................................... - trn1 v20.2D, v20.2D, v21.2D // ............................*................................................................................................................................... - trn2 v4.2D, v27.2D, v15.2D // ..........................*..................................................................................................................................... - trn1 v27.2D, v27.2D, v15.2D // .........................*...................................................................................................................................... 
- // gap // ................................................................................................................................................................ - // gap // ................................................................................................................................................................ - // gap // ................................................................................................................................................................ - // gap // ................................................................................................................................................................ - sub v15.4S, v10.4S, v26.4S // ...................................................*............................................................................................................ - add v10.4S, v10.4S, v26.4S // ......................................................*......................................................................................................... - // gap // ................................................................................................................................................................ - // gap // ................................................................................................................................................................ - // gap // ................................................................................................................................................................ - // gap // ................................................................................................................................................................ - // gap // ................................................................................................................................................................ 
- // gap // ................................................................................................................................................................ - mls v13.4S, v6.4S, v8.S[0] // .................................................*.............................................................................................................. - mls v7.4S, v12.4S, v8.S[0] // ..................................................*............................................................................................................. - sub v26.4S, v27.4S, v20.4S // ...................................*............................................................................................................................ - sub v6.4S, v4.4S, v31.4S // .....................................*.......................................................................................................................... - // gap // ................................................................................................................................................................ - // gap // ................................................................................................................................................................ - // gap // ................................................................................................................................................................ - // gap // ................................................................................................................................................................ - add v12.4S, v4.4S, v31.4S // ........................................*....................................................................................................................... 
- add v27.4S, v27.4S, v20.4S // .................................*.............................................................................................................................. - mul v31.4S, v15.4S, v17.4S // .......................................................*........................................................................................................ - sqrdmulh v15.4S, v15.4S, v30.4S // ........................................................*....................................................................................................... - // gap // ................................................................................................................................................................ - // gap // ................................................................................................................................................................ - // gap // ................................................................................................................................................................ - // gap // ................................................................................................................................................................ - mul v18.4S, v26.4S, v18.4S // .............................................*.................................................................................................................. - sqrdmulh v11.4S, v6.4S, v11.4S // ...........................................*.................................................................................................................... - sqrdmulh v26.4S, v26.4S, v5.4S // ............................................*................................................................................................................... 
- mul v6.4S, v6.4S, v9.4S // ..........................................*..................................................................................................................... - // gap // ................................................................................................................................................................ - // gap // ................................................................................................................................................................ - // gap // ................................................................................................................................................................ - // gap // ................................................................................................................................................................ - sub v5.4S, v7.4S, v13.4S // ..........................................................*..................................................................................................... - add v7.4S, v7.4S, v13.4S // .........................................................*...................................................................................................... - sub v13.4S, v27.4S, v12.4S // ...............................................*................................................................................................................ - add v27.4S, v27.4S, v12.4S // ..............................................*................................................................................................................. - // gap // ................................................................................................................................................................ - // gap // ................................................................................................................................................................ 
- // gap // ................................................................................................................................................................ - // gap // ................................................................................................................................................................ - mls v31.4S, v15.4S, v8.S[0] // ..................................................................*............................................................................................. - // gap // ................................................................................................................................................................ - // gap // ................................................................................................................................................................ - // gap // ................................................................................................................................................................ - // gap // ................................................................................................................................................................ - // gap // ................................................................................................................................................................ - // gap // ................................................................................................................................................................ - // gap // ................................................................................................................................................................ - sqrdmulh v30.4S, v5.4S, v30.4S // .................................................................*.............................................................................................. 
- mls v18.4S, v26.4S, v8.S[0] // ....................................................*........................................................................................................... - mls v6.4S, v11.4S, v8.S[0] // .....................................................*.......................................................................................................... - sqrdmulh v26.4S, v13.4S, v16.4S // ...........................................................*.................................................................................................... - // gap // ................................................................................................................................................................ - // gap // ................................................................................................................................................................ - // gap // ................................................................................................................................................................ - // gap // ................................................................................................................................................................ - mul v15.4S, v5.4S, v17.4S // ................................................................*............................................................................................... - trn1 v11.4S, v10.4S, v7.4S // ...............................................................*................................................................................................ - trn2 v10.4S, v10.4S, v7.4S // ......................................................................*......................................................................................... 
- mul v7.4S, v13.4S, v1.4S // ..............................................................*................................................................................................. - // gap // ................................................................................................................................................................ - // gap // ................................................................................................................................................................ - // gap // ................................................................................................................................................................ - // gap // ................................................................................................................................................................ - // gap // ................................................................................................................................................................ - // gap // ................................................................................................................................................................ - // gap // ................................................................................................................................................................ - // gap // ................................................................................................................................................................ - // gap // ................................................................................................................................................................ - // gap // ................................................................................................................................................................ 
- // gap // ................................................................................................................................................................ - // gap // ................................................................................................................................................................ - sub v13.4S, v18.4S, v6.4S // ............................................................*................................................................................................... - add v18.4S, v18.4S, v6.4S // .............................................................*.................................................................................................. - // gap // ................................................................................................................................................................ - // gap // ................................................................................................................................................................ - // gap // ................................................................................................................................................................ - // gap // ................................................................................................................................................................ - // gap // ................................................................................................................................................................ - // gap // ................................................................................................................................................................ - mls v15.4S, v30.4S, v8.S[0] // .......................................................................*........................................................................................ 
- mls v7.4S, v26.4S, v8.S[0] // .........................................................................*...................................................................................... - // gap // ................................................................................................................................................................ - // gap // ................................................................................................................................................................ - // gap // ................................................................................................................................................................ - // gap // ................................................................................................................................................................ - // gap // ................................................................................................................................................................ - // gap // ................................................................................................................................................................ - sqrdmulh v26.4S, v13.4S, v16.4S // .....................................................................*.......................................................................................... - mul v30.4S, v13.4S, v1.4S // ....................................................................*........................................................................................... - trn2 v13.4S, v27.4S, v18.4S // ...................................................................*............................................................................................ 
- trn1 v18.4S, v27.4S, v18.4S // ..........................................................................*..................................................................................... - // gap // ................................................................................................................................................................ - // gap // ................................................................................................................................................................ - // gap // ................................................................................................................................................................ - // gap // ................................................................................................................................................................ - // gap // ................................................................................................................................................................ - // gap // ................................................................................................................................................................ - // gap // ................................................................................................................................................................ - // gap // ................................................................................................................................................................ - // gap // ................................................................................................................................................................ - // gap // ................................................................................................................................................................ 
- // gap // ................................................................................................................................................................ - // gap // ................................................................................................................................................................ - trn1 v27.4S, v31.4S, v15.4S // ...........................................................................*.................................................................................... - trn2 v15.4S, v31.4S, v15.4S // ............................................................................*................................................................................... - // gap // ................................................................................................................................................................ - // gap // ................................................................................................................................................................ - // gap // ................................................................................................................................................................ - // gap // ................................................................................................................................................................ - // gap // ................................................................................................................................................................ - // gap // ................................................................................................................................................................ - mls v30.4S, v26.4S, v8.S[0] // ........................................................................*....................................................................................... 
- // gap // ................................................................................................................................................................ - // gap // ................................................................................................................................................................ - // gap // ................................................................................................................................................................ - // gap // ................................................................................................................................................................ - // gap // ................................................................................................................................................................ - // gap // ................................................................................................................................................................ - // gap // ................................................................................................................................................................ - trn2 v26.2D, v11.2D, v27.2D // ................................................................................*............................................................................... - trn1 v27.2D, v11.2D, v27.2D // ..................................................................................*............................................................................. - trn2 v6.2D, v10.2D, v15.2D // .................................................................................*.............................................................................. - trn1 v10.2D, v10.2D, v15.2D // ...................................................................................*............................................................................ 
- // gap // ................................................................................................................................................................ - // gap // ................................................................................................................................................................ - // gap // ................................................................................................................................................................ - // gap // ................................................................................................................................................................ - // gap // ................................................................................................................................................................ - // gap // ................................................................................................................................................................ - // gap // ................................................................................................................................................................ - // gap // ................................................................................................................................................................ - // gap // ................................................................................................................................................................ - // gap // ................................................................................................................................................................ - // gap // ................................................................................................................................................................ 
- // gap // ................................................................................................................................................................ - trn2 v15.4S, v7.4S, v30.4S // ..............................................................................*................................................................................. - trn1 v7.4S, v7.4S, v30.4S // ...............................................................................*................................................................................ - sub v30.4S, v26.4S, v6.4S // .........................................................................................*...................................................................... - sub v11.4S, v27.4S, v10.4S // ..........................................................................................*..................................................................... - // gap // ................................................................................................................................................................ - // gap // ................................................................................................................................................................ - // gap // ................................................................................................................................................................ - // gap // ................................................................................................................................................................ - add v26.4S, v26.4S, v6.4S // ...........................................................................................*.................................................................... 
- add v10.4S, v27.4S, v10.4S // .......................................................................................................*........................................................ - // gap // ................................................................................................................................................................ - // gap // ................................................................................................................................................................ - // gap // ................................................................................................................................................................ - // gap // ................................................................................................................................................................ - // gap // ................................................................................................................................................................ - // gap // ................................................................................................................................................................ - trn1 v27.2D, v13.2D, v15.2D // ......................................................................................*......................................................................... - trn2 v13.2D, v13.2D, v15.2D // ........................................................................................*....................................................................... - trn1 v15.2D, v18.2D, v7.2D // .....................................................................................*.......................................................................... 
- trn2 v18.2D, v18.2D, v7.2D // .......................................................................................*........................................................................ - // gap // ................................................................................................................................................................ - // gap // ................................................................................................................................................................ - // gap // ................................................................................................................................................................ - // gap // ................................................................................................................................................................ - mul v7.4S, v30.4S, v22.S[0] // ...............................................................................................*................................................................ - sqrdmulh v30.4S, v30.4S, v22.S[1] // ................................................................................................*............................................................... - mul v6.4S, v11.4S, v24.S[2] // .................................................................................................*.............................................................. - sqrdmulh v11.4S, v11.4S, v24.S[3] // ..................................................................................................*............................................................. - // gap // ................................................................................................................................................................ 
- // gap // ................................................................................................................................................................ - // gap // ................................................................................................................................................................ - // gap // ................................................................................................................................................................ - sub v12.4S, v15.4S, v27.4S // ............................................................................................*................................................................... - add v27.4S, v15.4S, v27.4S // ..............................................................................................*................................................................. - sub v15.4S, v18.4S, v13.4S // .............................................................................................*.................................................................. - add v18.4S, v18.4S, v13.4S // ........................................................................................................*....................................................... - // gap // ................................................................................................................................................................ - // gap // ................................................................................................................................................................ - // gap // ................................................................................................................................................................ - // gap // ................................................................................................................................................................ 
- sub v13.4S, v10.4S, v26.4S // ..........................................................................................................*..................................................... - add v10.4S, v10.4S, v26.4S // ................................................................................................................*............................................... - // gap // ................................................................................................................................................................ - // gap // ................................................................................................................................................................ - // gap // ................................................................................................................................................................ - // gap // ................................................................................................................................................................ - // gap // ................................................................................................................................................................ - // gap // ................................................................................................................................................................ - mul v26.4S, v15.4S, v19.S[0] // ....................................................................................................*........................................................... - sqrdmulh v15.4S, v15.4S, v19.S[1] // ......................................................................................................*......................................................... 
- mul v31.4S, v12.4S, v22.S[2] // ...................................................................................................*............................................................ - sqrdmulh v22.4S, v12.4S, v22.S[3] // .....................................................................................................*.......................................................... - // gap // ................................................................................................................................................................ - // gap // ................................................................................................................................................................ - // gap // ................................................................................................................................................................ - // gap // ................................................................................................................................................................ - mls v7.4S, v30.4S, v8.S[0] // .........................................................................................................*...................................................... - mls v6.4S, v11.4S, v8.S[0] // ...........................................................................................................*.................................................... - sub v30.4S, v27.4S, v18.4S // ...............................................................................................................*................................................ - add v18.4S, v27.4S, v18.4S // .................................................................................................................*.............................................. 
- // gap // ................................................................................................................................................................ - // gap // ................................................................................................................................................................ - // gap // ................................................................................................................................................................ - // gap // ................................................................................................................................................................ - mul v11.4S, v13.4S, v23.S[2] // ......................................................................................................................*......................................... - sqrdmulh v27.4S, v13.4S, v23.S[3] // .......................................................................................................................*........................................ - // gap // ................................................................................................................................................................ - // gap // ................................................................................................................................................................ - // gap // ................................................................................................................................................................ - // gap // ................................................................................................................................................................ - // gap // ................................................................................................................................................................ 
- // gap // ................................................................................................................................................................ - mls v26.4S, v15.4S, v8.S[0] // ............................................................................................................*................................................... - mls v31.4S, v22.4S, v8.S[0] // .............................................................................................................*.................................................. - mul v22.4S, v30.4S, v24.S[0] // ....................................................................................................................*........................................... - sqrdmulh v30.4S, v30.4S, v24.S[1] // .....................................................................................................................*.......................................... - // gap // ................................................................................................................................................................ - // gap // ................................................................................................................................................................ - // gap // ................................................................................................................................................................ - // gap // ................................................................................................................................................................ - add v13.4S, v10.4S, v18.4S // ..............................................................................................................................*................................. 
- sub v10.4S, v10.4S, v18.4S // ........................................................................................................................*....................................... - sub v18.4S, v6.4S, v7.4S // ...................................................................................................................*............................................ - add v7.4S, v6.4S, v7.4S // ................................................................................................................................*............................... - // gap // ................................................................................................................................................................ - // gap // ................................................................................................................................................................ - // gap // ................................................................................................................................................................ - // gap // ................................................................................................................................................................ - mls v11.4S, v27.4S, v8.S[0] // .................................................................................................................................*.............................. - // gap // ................................................................................................................................................................ - // gap // ................................................................................................................................................................ - // gap // ................................................................................................................................................................ 
- // gap // ................................................................................................................................................................ - // gap // ................................................................................................................................................................ - // gap // ................................................................................................................................................................ - // gap // ................................................................................................................................................................ - mls v22.4S, v30.4S, v8.S[0] // .............................................................................................................................*.................................. - str q13, [x1], #(16*4) // ......................................................................................................................................*......................... - mul v13.4S, v18.4S, v23.S[2] // ...........................................................................................................................*.................................... - sqrdmulh v18.4S, v18.4S, v23.S[3] // ............................................................................................................................*................................... - sub v30.4S, v31.4S, v26.4S // ..................................................................................................................*............................................. - // gap // ................................................................................................................................................................ 
- // gap // ................................................................................................................................................................ - // gap // ................................................................................................................................................................ - add v26.4S, v31.4S, v26.4S // ...................................................................................................................................*............................ - mul v15.4S, v10.4S, v23.S[0] // ...............................................................................................................................*................................ - sqrdmulh v10.4S, v10.4S, v23.S[1] // ..................................................................................................................................*............................. - // gap // ................................................................................................................................................................ - // gap // ................................................................................................................................................................ - // gap // ................................................................................................................................................................ - // gap // ................................................................................................................................................................ - // gap // ................................................................................................................................................................ - sqrdmulh v6.4S, v30.4S, v24.S[1] // .........................................................................................................................*...................................... 
- mul v27.4S, v30.4S, v24.S[0] // ..........................................................................................................................*..................................... - // gap // ................................................................................................................................................................ - // gap // ................................................................................................................................................................ - // gap // ................................................................................................................................................................ - // gap // ................................................................................................................................................................ - // gap // ................................................................................................................................................................ - // gap // ................................................................................................................................................................ - add v30.4S, v7.4S, v26.4S // .......................................................................................................................................*........................ - sub v26.4S, v7.4S, v26.4S // ........................................................................................................................................*....................... - mls v13.4S, v18.4S, v8.S[0] // .....................................................................................................................................*.......................... 
- sub v18.4S, v11.4S, v22.4S // .........................................................................................................................................*...................... - // gap // ................................................................................................................................................................ - // gap // ................................................................................................................................................................ - // gap // ................................................................................................................................................................ - // gap // ................................................................................................................................................................ - add v22.4S, v11.4S, v22.4S // ..........................................................................................................................................*..................... - mls v15.4S, v10.4S, v8.S[0] // ...........................................................................................................................................*.................... - // gap // ................................................................................................................................................................ - // gap // ................................................................................................................................................................ - // gap // ................................................................................................................................................................ - // gap // ................................................................................................................................................................ 
- // gap // ................................................................................................................................................................ - // gap // ................................................................................................................................................................ - mls v27.4S, v6.4S, v8.S[0] // ....................................................................................................................................*........................... - str q30, [x1, #-48] // ..............................................................................................................................................*................. - sqrdmulh v10.4S, v26.4S, v23.S[1] // ...............................................................................................................................................*................ - mul v26.4S, v26.4S, v23.S[0] // ................................................................................................................................................*............... - mul v7.4S, v18.4S, v23.S[0] // ..................................................................................................................................................*............. - // gap // ................................................................................................................................................................ - // gap // ................................................................................................................................................................ - // gap // ................................................................................................................................................................ 
- sqrdmulh v18.4S, v18.4S, v23.S[1] // ....................................................................................................................................................*........... - str q22, [x1, #-32] // .................................................................................................................................................*.............. - // gap // ................................................................................................................................................................ - // gap // ................................................................................................................................................................ - // gap // ................................................................................................................................................................ - // gap // ................................................................................................................................................................ - // gap // ................................................................................................................................................................ - // gap // ................................................................................................................................................................ - str q15, [x2], #(16*4) // ........................................................................................................................................................*....... - // gap // ................................................................................................................................................................ - // gap // ................................................................................................................................................................ 
- // gap // ................................................................................................................................................................ - // gap // ................................................................................................................................................................ - // gap // ................................................................................................................................................................ - // gap // ................................................................................................................................................................ - // gap // ................................................................................................................................................................ - sub v30.4S, v13.4S, v27.4S // ............................................................................................................................................*................... - add v22.4S, v13.4S, v27.4S // .............................................................................................................................................*.................. - mls v26.4S, v10.4S, v8.S[0] // .........................................................................................................................................................*...... - // gap // ................................................................................................................................................................ - // gap // ................................................................................................................................................................ - // gap // ................................................................................................................................................................ 
- // gap // ................................................................................................................................................................ - // gap // ................................................................................................................................................................ - mls v7.4S, v18.4S, v8.S[0] // ..........................................................................................................................................................*..... - // gap // ................................................................................................................................................................ - // gap // ................................................................................................................................................................ - // gap // ................................................................................................................................................................ - // gap // ................................................................................................................................................................ - // gap // ................................................................................................................................................................ - // gap // ................................................................................................................................................................ - // gap // ................................................................................................................................................................ - str q22, [x1, #-16] // ...................................................................................................................................................*............ 
- add x1, x1, #64 // .......................................................................................................................................................*........ - mul v10.4S, v30.4S, v23.S[0] // .....................................................................................................................................................*.......... - sqrdmulh v18.4S, v30.4S, v23.S[1] // ......................................................................................................................................................*......... - // gap // ................................................................................................................................................................ - // gap // ................................................................................................................................................................ - // gap // ................................................................................................................................................................ - // gap // ................................................................................................................................................................ - str q26, [x2, #-48] // ............................................................................................................................................................*... - // gap // ................................................................................................................................................................ - // gap // ................................................................................................................................................................ - // gap // ................................................................................................................................................................ 
- // gap // ................................................................................................................................................................ - // gap // ................................................................................................................................................................ - // gap // ................................................................................................................................................................ - // gap // ................................................................................................................................................................ - str q7, [x2, #-32] // .............................................................................................................................................................*.. - // gap // ................................................................................................................................................................ - // gap // ................................................................................................................................................................ - // gap // ................................................................................................................................................................ - // gap // ................................................................................................................................................................ - // gap // ................................................................................................................................................................ - // gap // ................................................................................................................................................................ 
- // gap // ................................................................................................................................................................ - mls v10.4S, v18.4S, v8.S[0] // ...........................................................................................................................................................*.... - // gap // ................................................................................................................................................................ - // gap // ................................................................................................................................................................ - // gap // ................................................................................................................................................................ - // gap // ................................................................................................................................................................ - // gap // ................................................................................................................................................................ - // gap // ................................................................................................................................................................ - // gap // ................................................................................................................................................................ - // gap // ................................................................................................................................................................ - // gap // ................................................................................................................................................................ 
- // gap // ................................................................................................................................................................ - // gap // ................................................................................................................................................................ - // gap // ................................................................................................................................................................ - // gap // ................................................................................................................................................................ - // gap // ................................................................................................................................................................ - // gap // ................................................................................................................................................................ - // gap // ................................................................................................................................................................ - // gap // ................................................................................................................................................................ - // gap // ................................................................................................................................................................ - // gap // ................................................................................................................................................................ - // gap // ................................................................................................................................................................ 
- // gap // ................................................................................................................................................................ - // gap // ................................................................................................................................................................ - // gap // ................................................................................................................................................................ - str q10, [x2, #-16] // ..............................................................................................................................................................*. - add x2, x2, #64 // ...............................................................................................................................................................* - // gap // ................................................................................................................................................................ - // gap // ................................................................................................................................................................ - // gap // ................................................................................................................................................................ - // gap // ................................................................................................................................................................ - // gap // ................................................................................................................................................................ - // gap // ................................................................................................................................................................ 
+ ldr q12, [x2, #32] // ......*..................................................................................................................................................................... + trn2 v9.4S, v0.4S, v18.4S // ....*....................................................................................................................................................................... + ldr q29, [x1, #0] // *........................................................................................................................................................................... + // gap // ............................................................................................................................................................................ + // gap // ............................................................................................................................................................................ + // gap // ............................................................................................................................................................................ + ldr q5, [x1, #16] // .*.......................................................................................................................................................................... + // gap // ............................................................................................................................................................................ + ldr q15, [x2, #48] // .......*.................................................................................................................................................................... + // gap // ............................................................................................................................................................................ 
+ // gap // ............................................................................................................................................................................ + // gap // ............................................................................................................................................................................ + // gap // ............................................................................................................................................................................ + // gap // ............................................................................................................................................................................ + ldr q19, [x2, #0] // ..*......................................................................................................................................................................... + ldr q2, [x2, #16] // .....*...................................................................................................................................................................... + ldr q28, [x5, #160] // ........................*................................................................................................................................................... + ldr q13, [x5], #(12*16) // .........*.................................................................................................................................................................. + // gap // ............................................................................................................................................................................ + // gap // ............................................................................................................................................................................ 
+ // gap // ............................................................................................................................................................................ + // gap // ............................................................................................................................................................................ + // gap // ............................................................................................................................................................................ + // gap // ............................................................................................................................................................................ + // gap // ............................................................................................................................................................................ + // gap // ............................................................................................................................................................................ + // gap // ............................................................................................................................................................................ + // gap // ............................................................................................................................................................................ + // gap // ............................................................................................................................................................................ + // gap // ............................................................................................................................................................................ 
+ // gap // ............................................................................................................................................................................ + // gap // ............................................................................................................................................................................ + trn1 v17.4S, v29.4S, v5.4S // ..............*............................................................................................................................................................. + trn2 v14.4S, v29.4S, v5.4S // ...............*............................................................................................................................................................ + ldr q29, [x5, #-144] // ...........*................................................................................................................................................................ + // gap // ............................................................................................................................................................................ + trn1 v5.4S, v0.4S, v18.4S // ...*........................................................................................................................................................................ + // gap // ............................................................................................................................................................................ + // gap // ............................................................................................................................................................................ + // gap // ............................................................................................................................................................................ 
+ trn2 v0.4S, v19.4S, v2.4S // ....................*....................................................................................................................................................... + trn1 v10.4S, v12.4S, v15.4S // .....................*...................................................................................................................................................... + // gap // ............................................................................................................................................................................ + // gap // ............................................................................................................................................................................ + // gap // ............................................................................................................................................................................ + trn2 v27.4S, v12.4S, v15.4S // ......................*..................................................................................................................................................... + trn1 v12.4S, v19.4S, v2.4S // ...................*........................................................................................................................................................ + ldr q19, [x5, #-160] // ........*................................................................................................................................................................... + trn2 v1.2D, v17.2D, v5.2D // ..........................*................................................................................................................................................. + trn1 v18.2D, v17.2D, v5.2D // ...........................*................................................................................................................................................ 
+ trn2 v31.2D, v14.2D, v9.2D // ............................*............................................................................................................................................... + trn1 v25.2D, v14.2D, v9.2D // .............................*.............................................................................................................................................. + ldr q9, [x5, #-128] // ............*............................................................................................................................................................... + ldr q15, [x5, #-112] // .............*.............................................................................................................................................................. + ldr q5, [x5, #-64] // ................*........................................................................................................................................................... + // gap // ............................................................................................................................................................................ + trn1 v4.2D, v0.2D, v27.2D // ....................................*....................................................................................................................................... + trn2 v17.2D, v0.2D, v27.2D // ...................................*........................................................................................................................................ + trn1 v27.2D, v12.2D, v10.2D // ..................................*......................................................................................................................................... 
+ trn2 v12.2D, v12.2D, v10.2D // .................................*.......................................................................................................................................... + ldr q2, [x5, #-48] // .......................*.................................................................................................................................................... + // gap // ............................................................................................................................................................................ + // gap // ............................................................................................................................................................................ + ldr q0, [x5, #-16] // .........................*.................................................................................................................................................. + sub v3.4S, v1.4S, v31.4S // ......................................*..................................................................................................................................... + add v31.4S, v1.4S, v31.4S // .......................................*.................................................................................................................................... + sub v22.4S, v18.4S, v25.4S // ........................................*................................................................................................................................... + // gap // ............................................................................................................................................................................ + // gap // ............................................................................................................................................................................ 
+ // gap // ............................................................................................................................................................................ + // gap // ............................................................................................................................................................................ + // gap // ............................................................................................................................................................................ + sub v16.4S, v27.4S, v4.4S // ............................................*............................................................................................................................... + sub v1.4S, v12.4S, v17.4S // ..........................................*................................................................................................................................. + // gap // ............................................................................................................................................................................ + // gap // ............................................................................................................................................................................ + // gap // ............................................................................................................................................................................ + // gap // ............................................................................................................................................................................ + // gap // ............................................................................................................................................................................ 
+ // gap // ............................................................................................................................................................................ + mul v14.4S, v22.4S, v19.4S // ..............................................*............................................................................................................................. + ldr q19, [x5, #-80] // ..................*......................................................................................................................................................... + sqrdmulh v29.4S, v22.4S, v29.4S // ...............................................*............................................................................................................................ + mul v20.4S, v3.4S, v9.4S // ................................................*........................................................................................................................... + sqrdmulh v3.4S, v3.4S, v15.4S // .................................................*.......................................................................................................................... + // gap // ............................................................................................................................................................................ + // gap // ............................................................................................................................................................................ + // gap // ............................................................................................................................................................................ + mul v22.4S, v1.4S, v28.4S // ....................................................*....................................................................................................................... 
+ sqrdmulh v26.4S, v16.4S, v2.4S // ...................................................*........................................................................................................................ + mul v9.4S, v16.4S, v5.4S // ..................................................*......................................................................................................................... + sqrdmulh v28.4S, v1.4S, v0.4S // .....................................................*...................................................................................................................... + // gap // ............................................................................................................................................................................ + // gap // ............................................................................................................................................................................ + // gap // ............................................................................................................................................................................ + // gap // ............................................................................................................................................................................ + add v16.4S, v18.4S, v25.4S // .........................................*.................................................................................................................................. + add v30.4S, v12.4S, v17.4S // ...........................................*................................................................................................................................ + // gap // ............................................................................................................................................................................ 
+ // gap // ............................................................................................................................................................................ + // gap // ............................................................................................................................................................................ + // gap // ............................................................................................................................................................................ + // gap // ............................................................................................................................................................................ + // gap // ............................................................................................................................................................................ + mls v14.4S, v29.4S, v8.S[0] // ..........................................................*................................................................................................................. + ldr q29, [x5, #-176] // ..........*................................................................................................................................................................. + mls v20.4S, v3.4S, v8.S[0] // ...........................................................*................................................................................................................ + // gap // ............................................................................................................................................................................ + // gap // ............................................................................................................................................................................ 
+ // gap // ............................................................................................................................................................................ + // gap // ............................................................................................................................................................................ + // gap // ............................................................................................................................................................................ + mls v22.4S, v28.4S, v8.S[0] // .............................................................*.............................................................................................................. + mls v9.4S, v26.4S, v8.S[0] // ............................................................*............................................................................................................... + sub v18.4S, v16.4S, v31.4S // ........................................................*................................................................................................................... + // gap // ............................................................................................................................................................................ + // gap // ............................................................................................................................................................................ + // gap // ............................................................................................................................................................................ + // gap // ............................................................................................................................................................................ 
+ // gap // ............................................................................................................................................................................ + add v10.4S, v27.4S, v4.4S // .............................................*.............................................................................................................................. + ldr q0, [x5, #-96] // .................*.......................................................................................................................................................... + // gap // ............................................................................................................................................................................ + // gap // ............................................................................................................................................................................ + // gap // ............................................................................................................................................................................ + // gap // ............................................................................................................................................................................ + // gap // ............................................................................................................................................................................ + // gap // ............................................................................................................................................................................ + add v5.4S, v14.4S, v20.4S // ...................................................................*........................................................................................................ 
+ sub v25.4S, v14.4S, v20.4S // ..................................................................*......................................................................................................... + // gap // ............................................................................................................................................................................ + // gap // ............................................................................................................................................................................ + // gap // ............................................................................................................................................................................ + // gap // ............................................................................................................................................................................ + // gap // ............................................................................................................................................................................ + mul v3.4S, v18.4S, v13.4S // ..............................................................*............................................................................................................. + sub v17.4S, v10.4S, v30.4S // ......................................................*..................................................................................................................... + sub v27.4S, v9.4S, v22.4S // ....................................................................*....................................................................................................... + // gap // ............................................................................................................................................................................ 
+ // gap // ............................................................................................................................................................................ + // gap // ............................................................................................................................................................................ + // gap // ............................................................................................................................................................................ + // gap // ............................................................................................................................................................................ + // gap // ............................................................................................................................................................................ + mul v28.4S, v25.4S, v13.4S // .......................................................................*.................................................................................................... + sqrdmulh v18.4S, v18.4S, v29.4S // ...............................................................*............................................................................................................ + sqrdmulh v25.4S, v25.4S, v29.4S // ........................................................................*................................................................................................... + ldr q29, [x4, #16] // ...............................*............................................................................................................................................ + // gap // ............................................................................................................................................................................ 
+ // gap // ............................................................................................................................................................................ + // gap // ............................................................................................................................................................................ + // gap // ............................................................................................................................................................................ + sqrdmulh v12.4S, v17.4S, v19.4S // .................................................................*.......................................................................................................... + sqrdmulh v13.4S, v27.4S, v19.4S // ............................................................................*............................................................................................... + mul v2.4S, v17.4S, v0.4S // ................................................................*........................................................................................................... + mul v27.4S, v27.4S, v0.4S // ...........................................................................*................................................................................................ + // gap // ............................................................................................................................................................................ + // gap // ............................................................................................................................................................................ + // gap // ............................................................................................................................................................................ 
+ // gap // ............................................................................................................................................................................ + add v14.4S, v10.4S, v30.4S // .......................................................*.................................................................................................................... + add v19.4S, v16.4S, v31.4S // .........................................................*.................................................................................................................. + // gap // ............................................................................................................................................................................ + // gap // ............................................................................................................................................................................ + // gap // ............................................................................................................................................................................ + // gap // ............................................................................................................................................................................ + // gap // ............................................................................................................................................................................ + // gap // ............................................................................................................................................................................ + mls v28.4S, v25.4S, v8.S[0] // ................................................................................*........................................................................................... 
+ mls v3.4S, v18.4S, v8.S[0] // ......................................................................*..................................................................................................... + // gap // ............................................................................................................................................................................ + // gap // ............................................................................................................................................................................ + // gap // ............................................................................................................................................................................ + // gap // ............................................................................................................................................................................ + // gap // ............................................................................................................................................................................ + // gap // ............................................................................................................................................................................ + mls v2.4S, v12.4S, v8.S[0] // .........................................................................*.................................................................................................. + add v12.4S, v9.4S, v22.4S // .....................................................................*...................................................................................................... + mls v27.4S, v13.4S, v8.S[0] // .................................................................................*.......................................................................................... 
+ // gap // ............................................................................................................................................................................ + // gap // ............................................................................................................................................................................ + // gap // ............................................................................................................................................................................ + // gap // ............................................................................................................................................................................ + // gap // ............................................................................................................................................................................ + trn1 v7.4S, v19.4S, v5.4S // ..........................................................................*................................................................................................. + trn2 v9.4S, v19.4S, v5.4S // .............................................................................*.............................................................................................. + ldr q19, [x4, #32] // ..............................*............................................................................................................................................. + // gap // ............................................................................................................................................................................ + // gap // ............................................................................................................................................................................ 
+ // gap // ............................................................................................................................................................................ + // gap // ............................................................................................................................................................................ + // gap // ............................................................................................................................................................................ + trn2 v18.4S, v3.4S, v28.4S // ...................................................................................*........................................................................................ + trn1 v15.4S, v3.4S, v28.4S // ..................................................................................*......................................................................................... + // gap // ............................................................................................................................................................................ + // gap // ............................................................................................................................................................................ + // gap // ............................................................................................................................................................................ + // gap // ............................................................................................................................................................................ + // gap // ............................................................................................................................................................................ 
+ // gap // ............................................................................................................................................................................ + trn1 v5.4S, v14.4S, v12.4S // ..............................................................................*............................................................................................. + trn2 v0.4S, v14.4S, v12.4S // ...............................................................................*............................................................................................ + trn2 v17.4S, v2.4S, v27.4S // .....................................................................................*...................................................................................... + trn1 v2.4S, v2.4S, v27.4S // ....................................................................................*....................................................................................... + // gap // ............................................................................................................................................................................ + // gap // ............................................................................................................................................................................ + // gap // ............................................................................................................................................................................ + // gap // ............................................................................................................................................................................ + trn1 v13.2D, v9.2D, v18.2D // .........................................................................................*.................................................................................. 
+ trn2 v18.2D, v9.2D, v18.2D // ........................................................................................*................................................................................... + trn2 v9.2D, v7.2D, v15.2D // ......................................................................................*..................................................................................... + trn1 v15.2D, v7.2D, v15.2D // .......................................................................................*.................................................................................... + // gap // ............................................................................................................................................................................ + // gap // ............................................................................................................................................................................ + // gap // ............................................................................................................................................................................ + // gap // ............................................................................................................................................................................ + trn2 v12.2D, v0.2D, v17.2D // ............................................................................................*............................................................................... + trn1 v17.2D, v0.2D, v17.2D // .............................................................................................*.............................................................................. + trn1 v0.2D, v5.2D, v2.2D // ...........................................................................................*................................................................................ 
+ trn2 v31.2D, v5.2D, v2.2D // ..........................................................................................*................................................................................. + // gap // ............................................................................................................................................................................ + // gap // ............................................................................................................................................................................ + // gap // ............................................................................................................................................................................ + ldr q5, [x4, #48] // .....................................*...................................................................................................................................... + sub v25.4S, v9.4S, v18.4S // ..............................................................................................*............................................................................. + add v18.4S, v9.4S, v18.4S // ...............................................................................................*............................................................................ + sub v9.4S, v15.4S, v13.4S // ................................................................................................*........................................................................... + add v2.4S, v15.4S, v13.4S // .................................................................................................*.......................................................................... + // gap // ............................................................................................................................................................................ 
+ // gap // ............................................................................................................................................................................ + // gap // ............................................................................................................................................................................ + // gap // ............................................................................................................................................................................ + sub v15.4S, v31.4S, v12.4S // ..................................................................................................*......................................................................... + sub v13.4S, v0.4S, v17.4S // ....................................................................................................*....................................................................... + // gap // ............................................................................................................................................................................ + // gap // ............................................................................................................................................................................ + // gap // ............................................................................................................................................................................ + // gap // ............................................................................................................................................................................ + // gap // ............................................................................................................................................................................ 
+ // gap // ............................................................................................................................................................................ + sqrdmulh v28.4S, v25.4S, v19.S[1] // .......................................................................................................*.................................................................... + mul v25.4S, v25.4S, v19.S[0] // ......................................................................................................*..................................................................... + sqrdmulh v3.4S, v9.4S, v29.S[3] // .........................................................................................................*.................................................................. + mul v9.4S, v9.4S, v29.S[2] // ........................................................................................................*................................................................... + // gap // ............................................................................................................................................................................ + // gap // ............................................................................................................................................................................ + // gap // ............................................................................................................................................................................ + // gap // ............................................................................................................................................................................ + mul v20.4S, v15.4S, v5.S[0] // ............................................................................................................*............................................................... 
+ sqrdmulh v27.4S, v15.4S, v5.S[1] // .............................................................................................................*.............................................................. + mul v15.4S, v13.4S, v19.S[2] // ..........................................................................................................*................................................................. + sqrdmulh v22.4S, v13.4S, v19.S[3] // ...........................................................................................................*................................................................ + // gap // ............................................................................................................................................................................ + // gap // ............................................................................................................................................................................ + // gap // ............................................................................................................................................................................ + ldr q5, [x4], #64 // ................................*........................................................................................................................................... + add v19.4S, v31.4S, v12.4S // ...................................................................................................*........................................................................ + add v13.4S, v0.4S, v17.4S // .....................................................................................................*...................................................................... + // gap // ............................................................................................................................................................................ 
+ // gap // ............................................................................................................................................................................ + // gap // ............................................................................................................................................................................ + // gap // ............................................................................................................................................................................ + // gap // ............................................................................................................................................................................ + // gap // ............................................................................................................................................................................ + mls v9.4S, v3.4S, v8.S[0] // ..................................................................................................................*......................................................... + mls v25.4S, v28.4S, v8.S[0] // ...................................................................................................................*........................................................ + sub v31.4S, v2.4S, v18.4S // ..............................................................................................................*............................................................. + // gap // ............................................................................................................................................................................ + // gap // ............................................................................................................................................................................ 
+ // gap // ............................................................................................................................................................................ + // gap // ............................................................................................................................................................................ + // gap // ............................................................................................................................................................................ + mls v15.4S, v22.4S, v8.S[0] // ....................................................................................................................*....................................................... + sub v0.4S, v13.4S, v19.4S // ................................................................................................................*........................................................... + // gap // ............................................................................................................................................................................ + // gap // ............................................................................................................................................................................ + // gap // ............................................................................................................................................................................ + // gap // ............................................................................................................................................................................ + // gap // ............................................................................................................................................................................ 
+ mls v20.4S, v27.4S, v8.S[0] // .....................................................................................................................*...................................................... + add v17.4S, v13.4S, v19.4S // .................................................................................................................*.......................................................... + sqrdmulh v13.4S, v31.4S, v5.S[3] // .......................................................................................................................*.................................................... + // gap // ............................................................................................................................................................................ + // gap // ............................................................................................................................................................................ + // gap // ............................................................................................................................................................................ + // gap // ............................................................................................................................................................................ + // gap // ............................................................................................................................................................................ + // gap // ............................................................................................................................................................................ + add v2.4S, v2.4S, v18.4S // ...............................................................................................................*............................................................ 
+ sub v18.4S, v9.4S, v25.4S // ............................................................................................................................*............................................... + add v12.4S, v9.4S, v25.4S // .............................................................................................................................*.............................................. + mul v9.4S, v31.4S, v5.S[2] // ......................................................................................................................*..................................................... + // gap // ............................................................................................................................................................................ + // gap // ............................................................................................................................................................................ + // gap // ............................................................................................................................................................................ + // gap // ............................................................................................................................................................................ + sub v14.4S, v15.4S, v20.4S // ..............................................................................................................................*............................................. + add v28.4S, v15.4S, v20.4S // ...............................................................................................................................*............................................ + mul v15.4S, v0.4S, v29.S[0] // ........................................................................................................................*................................................... 
+ sqrdmulh v0.4S, v0.4S, v29.S[1] // .........................................................................................................................*.................................................. + // gap // ............................................................................................................................................................................ + // gap // ............................................................................................................................................................................ + // gap // ............................................................................................................................................................................ + // gap // ............................................................................................................................................................................ + srshr v27.4S, v17.4S, #23 // ...........................................................................................................................*................................................ + srshr v25.4S, v12.4S, #23 // .......................................................................................................................................*.................................... + srshr v31.4S, v2.4S, #23 // ..........................................................................................................................*................................................. + sqrdmulh v22.4S, v18.4S, v5.S[3] // ....................................................................................................................................*....................................... + // gap // ............................................................................................................................................................................ 
+ // gap // ............................................................................................................................................................................ + // gap // ............................................................................................................................................................................ + // gap // ............................................................................................................................................................................ + mul v19.4S, v18.4S, v5.S[2] // ...................................................................................................................................*........................................ + sqrdmulh v3.4S, v14.4S, v29.S[1] // ......................................................................................................................................*..................................... + mul v18.4S, v14.4S, v29.S[0] // .....................................................................................................................................*...................................... + // gap // ............................................................................................................................................................................ + // gap // ............................................................................................................................................................................ + // gap // ............................................................................................................................................................................ + // gap // ............................................................................................................................................................................ 
+ srshr v14.4S, v28.4S, #23 // ........................................................................................................................................*................................... + mls v9.4S, v13.4S, v8.S[0] // ................................................................................................................................*........................................... + mls v15.4S, v0.4S, v8.S[0] // ..................................................................................................................................*......................................... + // gap // ............................................................................................................................................................................ + // gap // ............................................................................................................................................................................ + // gap // ............................................................................................................................................................................ + // gap // ............................................................................................................................................................................ + // gap // ............................................................................................................................................................................ + // gap // ............................................................................................................................................................................ + mls v12.4S, v25.4S, v8.4S // .............................................................................................................................................*.............................. 
+ mls v17.4S, v27.4S, v8.4S // .........................................................................................................................................*.................................. + // gap // ............................................................................................................................................................................ + // gap // ............................................................................................................................................................................ + // gap // ............................................................................................................................................................................ + // gap // ............................................................................................................................................................................ + // gap // ............................................................................................................................................................................ + mls v2.4S, v31.4S, v8.4S // .................................................................................................................................*.......................................... + mls v28.4S, v14.4S, v8.4S // ..............................................................................................................................................*............................. + mls v19.4S, v22.4S, v8.S[0] // ............................................................................................................................................*............................... + mls v18.4S, v3.4S, v8.S[0] // ...............................................................................................................................................*............................ 
+ // gap // ............................................................................................................................................................................ + // gap // ............................................................................................................................................................................ + // gap // ............................................................................................................................................................................ + // gap // ............................................................................................................................................................................ + // gap // ............................................................................................................................................................................ + sub v20.4S, v9.4S, v15.4S // ...........................................................................................................................................*................................ + // gap // ............................................................................................................................................................................ + // gap // ............................................................................................................................................................................ + // gap // ............................................................................................................................................................................ + // gap // ............................................................................................................................................................................ 
+ // gap // ............................................................................................................................................................................ + // gap // ............................................................................................................................................................................ + // gap // ............................................................................................................................................................................ + sub v13.4S, v2.4S, v17.4S // .................................................................................................................................................*.......................... + // gap // ............................................................................................................................................................................ + // gap // ............................................................................................................................................................................ + // gap // ............................................................................................................................................................................ + // gap // ............................................................................................................................................................................ + // gap // ............................................................................................................................................................................ + // gap // ............................................................................................................................................................................ 
+ // gap // ............................................................................................................................................................................ + sub v29.4S, v12.4S, v28.4S // .......................................................................................................................................................*.................... + sub v0.4S, v19.4S, v18.4S // .....................................................................................................................................................*...................... + add v23.4S, v12.4S, v28.4S // ........................................................................................................................................................*................... + // gap // ............................................................................................................................................................................ + // gap // ............................................................................................................................................................................ + // gap // ............................................................................................................................................................................ + // gap // ............................................................................................................................................................................ + // gap // ............................................................................................................................................................................ + mul v10.4S, v20.4S, v5.S[0] // ...................................................................................................................................................*........................ 
+ sqrdmulh v14.4S, v20.4S, v5.S[1] // ....................................................................................................................................................*....................... + mul v24.4S, v13.4S, v5.S[0] // ..........................................................................................................................................................*................. + sqrdmulh v13.4S, v13.4S, v5.S[1] // ...........................................................................................................................................................*................ + // gap // ............................................................................................................................................................................ + // gap // ............................................................................................................................................................................ + // gap // ............................................................................................................................................................................ + // gap // ............................................................................................................................................................................ + sqrdmulh v28.4S, v29.4S, v5.S[1] // ................................................................................................................................................................*........... + mul v27.4S, v29.4S, v5.S[0] // ...............................................................................................................................................................*............ + mul v29.4S, v0.4S, v5.S[0] // .............................................................................................................................................................*.............. 
+ sqrdmulh v0.4S, v0.4S, v5.S[1] // ..............................................................................................................................................................*............. + str q23, [x1, #16] // ..................................................................................................................................................................*......... + // gap // ............................................................................................................................................................................ + // gap // ............................................................................................................................................................................ + // gap // ............................................................................................................................................................................ + add v12.4S, v9.4S, v15.4S // ..........................................................................................................................................*................................. + add v5.4S, v2.4S, v17.4S // ................................................................................................................................................*........................... + // gap // ............................................................................................................................................................................ + // gap // ............................................................................................................................................................................ + // gap // ............................................................................................................................................................................ 
+ // gap // ............................................................................................................................................................................ + // gap // ............................................................................................................................................................................ + // gap // ............................................................................................................................................................................ + add v19.4S, v19.4S, v18.4S // ......................................................................................................................................................*..................... + mls v10.4S, v14.4S, v8.S[0] // ............................................................................................................................................................*............... + // gap // ............................................................................................................................................................................ + // gap // ............................................................................................................................................................................ + // gap // ............................................................................................................................................................................ + // gap // ............................................................................................................................................................................ + // gap // ............................................................................................................................................................................ 
+ mls v24.4S, v13.4S, v8.S[0] // ....................................................................................................................................................................*....... + mls v27.4S, v28.4S, v8.S[0] // .......................................................................................................................................................................*.... + mls v29.4S, v0.4S, v8.S[0] // ......................................................................................................................................................................*..... + str q12, [x1, #32] // ..................................................................................................................................................*......................... + str q5, [x1], #(16*4) // .........................................................................................................................................................*.................. + // gap // ............................................................................................................................................................................ + // gap // ............................................................................................................................................................................ + // gap // ............................................................................................................................................................................ + // gap // ............................................................................................................................................................................ + str q19, [x1, #-16] // .................................................................................................................................................................*.......... 
+ add x1, x1, #64 // ...................................................................................................................................................................*........ + // gap // ............................................................................................................................................................................ + // gap // ............................................................................................................................................................................ + // gap // ............................................................................................................................................................................ + // gap // ............................................................................................................................................................................ + // gap // ............................................................................................................................................................................ + // gap // ............................................................................................................................................................................ + str q10, [x2, #32] // .....................................................................................................................................................................*...... + str q24, [x2], #(16*4) // ........................................................................................................................................................................*... + // gap // ............................................................................................................................................................................ 
+ // gap // ............................................................................................................................................................................ + // gap // ............................................................................................................................................................................ + // gap // ............................................................................................................................................................................ + // gap // ............................................................................................................................................................................ + // gap // ............................................................................................................................................................................ + str q27, [x2, #-48] // ..........................................................................................................................................................................*. + str q29, [x2, #-16] // .........................................................................................................................................................................*.. + // gap // ............................................................................................................................................................................ + add x2, x2, #64 // ...........................................................................................................................................................................* + // gap // ............................................................................................................................................................................ 
+ // gap // ............................................................................................................................................................................ + // gap // ............................................................................................................................................................................ + // gap // ............................................................................................................................................................................ // original source code - // ldr q13, [x2, #48] // ......*......................................................................................................................................................... - // ldr q19, [x2, #16] // ....*........................................................................................................................................................... - // trn1 v25.4S, v20.4S, v0.4S // *............................................................................................................................................................... - // ldr q5, [x2, #0] // .....*.......................................................................................................................................................... - // ldr q31, [x2, #32] // .......*........................................................................................................................................................ - // trn1 v7.4S, v27.4S, v1.4S // ..*............................................................................................................................................................. - // trn2 v27.4S, v27.4S, v1.4S // ...*............................................................................................................................................................ 
- // ldr q1, [x5, #176] // .........*...................................................................................................................................................... - // trn2 v14.4S, v20.4S, v0.4S // .*.............................................................................................................................................................. - // ldr q12, [x5, #144] // .................*.............................................................................................................................................. - // trn2 v10.2D, v7.2D, v25.2D // ..........*..................................................................................................................................................... - // trn1 v29.2D, v7.2D, v25.2D // ...........*.................................................................................................................................................... - // ldr q11, [x5, #80] // ........*....................................................................................................................................................... - // ldr q9, [x5, #32] // ..............*................................................................................................................................................. - // ldr q28, [x5, #64] // ...............*................................................................................................................................................ - // ldr q22, [x5, #48] // ................*............................................................................................................................................... - // trn2 v24.2D, v27.2D, v14.2D // ............*................................................................................................................................................... 
- // trn1 v21.2D, v27.2D, v14.2D // .............*.................................................................................................................................................. - // trn1 v14.4S, v31.4S, v13.4S // ............................*................................................................................................................................... - // trn2 v15.4S, v31.4S, v13.4S // ...........................*.................................................................................................................................... - // trn2 v13.4S, v5.4S, v19.4S // ....................*........................................................................................................................................... - // trn1 v3.4S, v5.4S, v19.4S // .....................*.......................................................................................................................................... - // ldr q5, [x5, #160] // ..................*............................................................................................................................................. - // sub v2.4S, v29.4S, v21.4S // .......................*........................................................................................................................................ - // sub v16.4S, v10.4S, v24.4S // ......................*......................................................................................................................................... - // trn1 v0.2D, v3.2D, v14.2D // .........................................*...................................................................................................................... - // trn2 v23.2D, v3.2D, v14.2D // ........................................*....................................................................................................................... 
- // trn2 v6.2D, v13.2D, v15.2D // ......................................*......................................................................................................................... - // trn1 v26.2D, v13.2D, v15.2D // .......................................*........................................................................................................................ - // ldr q31, [x5, #112] // ...................*............................................................................................................................................ - // mul v27.4S, v16.4S, v28.4S // ....................................*........................................................................................................................... - // mul v14.4S, v2.4S, v9.4S // ...................................*............................................................................................................................ - // sqrdmulh v13.4S, v2.4S, v22.4S // .....................................*.......................................................................................................................... - // add v15.4S, v0.4S, v26.4S // .................................................*.............................................................................................................. - // add v9.4S, v29.4S, v21.4S // ..............................*................................................................................................................................. - // sub v19.4S, v0.4S, v26.4S // ..............................................*................................................................................................................. - // ldr q20, [x5], #(12*16) // ........................*....................................................................................................................................... 
- // sub v28.4S, v23.4S, v6.4S // ...............................................*................................................................................................................ - // sqrdmulh v25.4S, v16.4S, v11.4S // ..................................*............................................................................................................................. - // add v2.4S, v10.4S, v24.4S // .............................*.................................................................................................................................. - // add v16.4S, v23.4S, v6.4S // ................................................*............................................................................................................... - // ldr q10, [x4, #48] // ..........................*..................................................................................................................................... - // mul v21.4S, v28.4S, v5.4S // .......................................................*........................................................................................................ - // sqrdmulh v3.4S, v28.4S, v1.4S // .....................................................*.......................................................................................................... - // sqrdmulh v4.4S, v19.4S, v12.4S // ......................................................*......................................................................................................... - // mul v22.4S, v19.4S, v18.4S // ....................................................*........................................................................................................... - // add v5.4S, v15.4S, v16.4S // ...........................................................*.................................................................................................... 
- // sub v15.4S, v15.4S, v16.4S // ..........................................................*..................................................................................................... - // ldr q26, [x5, #-96] // .........................*...................................................................................................................................... - // mls v27.4S, v25.4S, v8.S[0] // ............................................*................................................................................................................... - // mls v14.4S, v13.4S, v8.S[0] // .............................................*.................................................................................................................. - // sub v12.4S, v9.4S, v2.4S // ..........................................*..................................................................................................................... - // mls v22.4S, v4.4S, v8.S[0] // ..............................................................*................................................................................................. - // mls v21.4S, v3.4S, v8.S[0] // ...............................................................*................................................................................................ - // add v28.4S, v9.4S, v2.4S // ...........................................*.................................................................................................................... - // mul v19.4S, v12.4S, v20.4S // ..................................................*............................................................................................................. - // sqrdmulh v11.4S, v12.4S, v30.4S // ...................................................*............................................................................................................ 
- // add v0.4S, v14.4S, v27.4S // .........................................................*...................................................................................................... - // sub v6.4S, v14.4S, v27.4S // ........................................................*....................................................................................................... - // sqrdmulh v14.4S, v15.4S, v31.4S // ................................................................*............................................................................................... - // sub v29.4S, v22.4S, v21.4S // .....................................................................*.......................................................................................... - // add v1.4S, v22.4S, v21.4S // ......................................................................*......................................................................................... - // mul v9.4S, v15.4S, v26.4S // ....................................................................*........................................................................................... - // trn1 v24.4S, v28.4S, v0.4S // ..................................................................*............................................................................................. - // mul v25.4S, v6.4S, v20.4S // .................................................................*.............................................................................................. - // sqrdmulh v17.4S, v6.4S, v30.4S // .............................................................*.................................................................................................. - // mls v19.4S, v11.4S, v8.S[0] // ............................................................*................................................................................................... 
- // trn2 v22.4S, v5.4S, v1.4S // ...........................................................................*.................................................................................... - // mul v4.4S, v29.4S, v26.4S // ..........................................................................*..................................................................................... - // sqrdmulh v26.4S, v29.4S, v31.4S // .........................................................................*...................................................................................... - // trn2 v21.4S, v28.4S, v0.4S // ...................................................................*............................................................................................ - // mls v25.4S, v17.4S, v8.S[0] // .......................................................................*........................................................................................ - // mls v4.4S, v26.4S, v8.S[0] // ...............................................................................*................................................................................ - // mls v9.4S, v14.4S, v8.S[0] // ........................................................................*....................................................................................... - // trn1 v7.4S, v5.4S, v1.4S // ............................................................................*................................................................................... - // trn1 v30.4S, v19.4S, v25.4S // .............................................................................*.................................................................................. - // trn2 v11.4S, v19.4S, v25.4S // ..............................................................................*................................................................................. 
- // ldr q25, [x4, #16] // ................................*............................................................................................................................... - // trn2 v27.4S, v9.4S, v4.4S // ....................................................................................*........................................................................... - // trn1 v0.4S, v9.4S, v4.4S // .....................................................................................*.......................................................................... - // trn2 v14.2D, v24.2D, v30.2D // ................................................................................*............................................................................... - // trn2 v28.2D, v21.2D, v11.2D // ..................................................................................*............................................................................. - // trn1 v29.2D, v24.2D, v30.2D // .................................................................................*.............................................................................. - // trn1 v17.2D, v21.2D, v11.2D // ...................................................................................*............................................................................ - // ldr q11, [x4, #32] // ...............................*................................................................................................................................ - // trn1 v1.2D, v7.2D, v0.2D // ............................................................................................*................................................................... - // trn1 v12.2D, v22.2D, v27.2D // ..........................................................................................*..................................................................... 
- // trn2 v30.2D, v7.2D, v0.2D // .............................................................................................*.................................................................. - // trn2 v13.2D, v22.2D, v27.2D // ...........................................................................................*.................................................................... - // sub v31.4S, v14.4S, v28.4S // ......................................................................................*......................................................................... - // sub v20.4S, v29.4S, v17.4S // .......................................................................................*........................................................................ - // add v3.4S, v14.4S, v28.4S // ........................................................................................*....................................................................... - // sub v18.4S, v1.4S, v12.4S // ..................................................................................................*............................................................. - // sub v9.4S, v30.4S, v13.4S // ....................................................................................................*........................................................... - // add v0.4S, v1.4S, v12.4S // ...................................................................................................*............................................................ - // mul v22.4S, v31.4S, v11.S[0] // ..............................................................................................*................................................................. - // sqrdmulh v4.4S, v31.4S, v11.S[1] // ...............................................................................................*................................................................ 
- // mul v6.4S, v20.4S, v25.S[2] // ................................................................................................*............................................................... - // sqrdmulh v27.4S, v20.4S, v25.S[3] // .................................................................................................*.............................................................. - // mul v16.4S, v18.4S, v11.S[2] // ..........................................................................................................*..................................................... - // mul v26.4S, v9.4S, v10.S[0] // ........................................................................................................*....................................................... - // sqrdmulh v20.4S, v18.4S, v11.S[3] // ...........................................................................................................*.................................................... - // sqrdmulh v18.4S, v9.4S, v10.S[1] // .........................................................................................................*...................................................... - // add v14.4S, v29.4S, v17.4S // .........................................................................................*...................................................................... - // add v10.4S, v30.4S, v13.4S // .....................................................................................................*.......................................................... - // mls v22.4S, v4.4S, v8.S[0] // ............................................................................................................*................................................... - // sub v30.4S, v14.4S, v3.4S // ......................................................................................................*......................................................... 
- // mls v6.4S, v27.4S, v8.S[0] // .............................................................................................................*.................................................. - // mls v26.4S, v18.4S, v8.S[0] // ..................................................................................................................*............................................. - // mls v16.4S, v20.4S, v8.S[0] // ...................................................................................................................*............................................ - // ldr q18, [x4], #64 // .................................*.............................................................................................................................. - // sub v13.4S, v0.4S, v10.4S // ..............................................................................................................*................................................. - // add v3.4S, v14.4S, v3.4S // .......................................................................................................*........................................................ - // add v31.4S, v0.4S, v10.4S // ...............................................................................................................*................................................ - // sub v17.4S, v16.4S, v26.4S // ...............................................................................................................................*................................ - // sub v29.4S, v6.4S, v22.4S // ........................................................................................................................*....................................... - // mul v9.4S, v13.4S, v25.S[0] // ....................................................................................................................*........................................... 
- // sqrdmulh v14.4S, v13.4S, v25.S[1] // .....................................................................................................................*.......................................... - // mul v10.4S, v30.4S, v18.S[2] // ................................................................................................................*............................................... - // sqrdmulh v19.4S, v30.4S, v18.S[3] // .................................................................................................................*.............................................. - // sub v1.4S, v3.4S, v31.4S // .......................................................................................................................*........................................ - // sqrdmulh v0.4S, v17.4S, v25.S[1] // ...................................................................................................................................*............................ - // mul v25.4S, v17.4S, v25.S[0] // ....................................................................................................................................*........................... - // mul v27.4S, v29.4S, v18.S[2] // .............................................................................................................................*.................................. - // sqrdmulh v20.4S, v29.4S, v18.S[3] // ..............................................................................................................................*................................. - // mls v9.4S, v14.4S, v8.S[0] // ...........................................................................................................................*.................................... - // add v3.4S, v3.4S, v31.4S // ......................................................................................................................*......................................... 
- // mul v4.4S, v1.4S, v18.S[0] // .................................................................................................................................*.............................. - // add v7.4S, v6.4S, v22.4S // .........................................................................................................................*...................................... - // mls v10.4S, v19.4S, v8.S[0] // ..........................................................................................................................*..................................... - // sqrdmulh v24.4S, v1.4S, v18.S[1] // ..................................................................................................................................*............................. - // add v19.4S, v16.4S, v26.4S // ................................................................................................................................*............................... - // mls v25.4S, v0.4S, v8.S[0] // ...........................................................................................................................................*.................... - // mls v27.4S, v20.4S, v8.S[0] // .......................................................................................................................................*........................ - // str q3, [x1], #(16*4) // ............................................................................................................................*................................... - // add v2.4S, v7.4S, v19.4S // .....................................................................................................................................*.......................... - // sub v13.4S, v7.4S, v19.4S // ......................................................................................................................................*......................... 
- // sub v5.4S, v10.4S, v9.4S // ........................................................................................................................................*....................... - // add v19.4S, v10.4S, v9.4S // .........................................................................................................................................*...................... - // mls v4.4S, v24.4S, v8.S[0] // ..........................................................................................................................................*..................... - // sub v9.4S, v27.4S, v25.4S // ...................................................................................................................................................*............ - // add v31.4S, v27.4S, v25.4S // ....................................................................................................................................................*........... - // str q2, [x1, #-48] // ............................................................................................................................................*................... - // sqrdmulh v14.4S, v13.4S, v18.S[1] // .............................................................................................................................................*.................. - // mul v17.4S, v13.4S, v18.S[0] // ..............................................................................................................................................*................. - // str q19, [x1, #-32] // .................................................................................................................................................*.............. - // mul v19.4S, v5.4S, v18.S[0] // ...............................................................................................................................................*................ 
- // str q31, [x1, #-16] // .......................................................................................................................................................*........ - // sqrdmulh v31.4S, v5.4S, v18.S[1] // ................................................................................................................................................*............... - // mul v28.4S, v9.4S, v18.S[0] // .........................................................................................................................................................*...... - // sqrdmulh v5.4S, v9.4S, v18.S[1] // ..........................................................................................................................................................*..... - // add x1, x1, #64 // ........................................................................................................................................................*....... - // str q4, [x2], #(16*4) // ..................................................................................................................................................*............. - // mls v17.4S, v14.4S, v8.S[0] // .....................................................................................................................................................*.......... - // mls v19.4S, v31.4S, v8.S[0] // ......................................................................................................................................................*......... - // mls v28.4S, v5.4S, v8.S[0] // .............................................................................................................................................................*.. - // str q17, [x2, #-48] // ...........................................................................................................................................................*.... 
- // str q19, [x2, #-32] // ............................................................................................................................................................*... - // str q28, [x2, #-16] // ..............................................................................................................................................................*. - // add x2, x2, #64 // ...............................................................................................................................................................* + // ldr q29, [x1, #0] // ..*......................................................................................................................................................................... + // ldr q19, [x1, #16] // ...*........................................................................................................................................................................ + // ldr q5, [x2, #0] // .....*...................................................................................................................................................................... + // trn1 v9.4S, v0.4S, v18.4S // ............*............................................................................................................................................................... + // trn2 v0.4S, v0.4S, v18.4S // .*.......................................................................................................................................................................... + // ldr q2, [x2, #16] // ......*..................................................................................................................................................................... + // ldr q12, [x2, #32] // *........................................................................................................................................................................... 
+ // ldr q15, [x2, #48] // ....*....................................................................................................................................................................... + // ldr q13, [x5, #32] // .................*.......................................................................................................................................................... + // ldr q18, [x5], #(12*16) // ........*................................................................................................................................................................... + // ldr q27, [x5, #-176] // ................................................*........................................................................................................................... + // ldr q17, [x5, #-144] // ...........*................................................................................................................................................................ + // ldr q31, [x5, #-128] // ......................*..................................................................................................................................................... + // ldr q25, [x5, #-112] // .......................*.................................................................................................................................................... + // trn1 v28.4S, v29.4S, v19.4S // .........*.................................................................................................................................................................. + // trn2 v29.4S, v29.4S, v19.4S // ..........*................................................................................................................................................................. 
+ // ldr q19, [x5, #-64] // ........................*................................................................................................................................................... + // ldr q3, [x5, #-96] // ......................................................*..................................................................................................................... + // ldr q22, [x5, #-80] // .....................................*...................................................................................................................................... + // trn1 v1.4S, v5.4S, v2.4S // ................*........................................................................................................................................................... + // trn2 v5.4S, v5.4S, v2.4S // .............*.............................................................................................................................................................. + // trn1 v2.4S, v12.4S, v15.4S // ..............*............................................................................................................................................................. + // trn2 v12.4S, v12.4S, v15.4S // ...............*............................................................................................................................................................ + // ldr q15, [x5, #-48] // .............................*.............................................................................................................................................. + // ldr q4, [x5, #-32] // .......*.................................................................................................................................................................... 
+ // ldr q16, [x5, #-16] // ..............................*............................................................................................................................................. + // trn2 v20.2D, v28.2D, v9.2D // ..................*......................................................................................................................................................... + // trn1 v9.2D, v28.2D, v9.2D // ...................*........................................................................................................................................................ + // trn2 v28.2D, v29.2D, v0.2D // ....................*....................................................................................................................................................... + // trn1 v29.2D, v29.2D, v0.2D // .....................*...................................................................................................................................................... + // ldr q0, [x4, #32] // .............................................................................*.............................................................................................. + // ldr q11, [x4, #16] // ...............................................................*............................................................................................................ + // ldr q6, [x4], #64 // ...........................................................................................................*................................................................ + // trn2 v23.2D, v1.2D, v2.2D // ............................*............................................................................................................................................... 
+ // trn1 v2.2D, v1.2D, v2.2D // ...........................*................................................................................................................................................ + // trn2 v1.2D, v5.2D, v12.2D // ..........................*................................................................................................................................................. + // trn1 v5.2D, v5.2D, v12.2D // .........................*.................................................................................................................................................. + // ldr q12, [x4, #-16] // ............................................................................................*............................................................................... + // sub v21.4S, v20.4S, v28.4S // ...............................*............................................................................................................................................ + // add v28.4S, v20.4S, v28.4S // ................................*........................................................................................................................................... + // sub v20.4S, v9.4S, v29.4S // .................................*.......................................................................................................................................... + // add v29.4S, v9.4S, v29.4S // .............................................*.............................................................................................................................. + // sub v9.4S, v23.4S, v1.4S // ...................................*........................................................................................................................................ 
+ // add v1.4S, v23.4S, v1.4S // ..............................................*............................................................................................................................. + // sub v23.4S, v2.4S, v5.4S // ..................................*......................................................................................................................................... + // add v5.4S, v2.4S, v5.4S // .....................................................*...................................................................................................................... + // mul v2.4S, v20.4S, v13.4S // ....................................*....................................................................................................................................... + // sqrdmulh v13.4S, v20.4S, v17.4S // ......................................*..................................................................................................................................... + // mul v17.4S, v21.4S, v31.4S // .......................................*.................................................................................................................................... + // sqrdmulh v31.4S, v21.4S, v25.4S // ........................................*................................................................................................................................... + // mul v19.4S, v23.4S, v19.4S // ...........................................*................................................................................................................................ + // sqrdmulh v15.4S, v23.4S, v15.4S // ..........................................*................................................................................................................................. 
+ // mul v25.4S, v9.4S, v4.4S // .........................................*.................................................................................................................................. + // sqrdmulh v9.4S, v9.4S, v16.4S // ............................................*............................................................................................................................... + // sub v4.4S, v5.4S, v1.4S // ..........................................................*................................................................................................................. + // add v5.4S, v5.4S, v1.4S // ....................................................................*....................................................................................................... + // sub v1.4S, v29.4S, v28.4S // ....................................................*....................................................................................................................... + // add v29.4S, v29.4S, v28.4S // .....................................................................*...................................................................................................... + // mls v2.4S, v13.4S, v8.S[0] // ...............................................*............................................................................................................................ + // mls v17.4S, v31.4S, v8.S[0] // .................................................*.......................................................................................................................... + // mls v19.4S, v15.4S, v8.S[0] // ...................................................*........................................................................................................................ 
+ // mls v25.4S, v9.4S, v8.S[0] // ..................................................*......................................................................................................................... + // mul v9.4S, v1.4S, v18.4S // .........................................................*.................................................................................................................. + // sqrdmulh v15.4S, v1.4S, v27.4S // .............................................................*.............................................................................................................. + // mul v13.4S, v4.4S, v3.4S // ..................................................................*......................................................................................................... + // sqrdmulh v31.4S, v4.4S, v22.4S // ................................................................*........................................................................................................... + // sub v28.4S, v2.4S, v17.4S // ........................................................*................................................................................................................... + // add v2.4S, v2.4S, v17.4S // .......................................................*.................................................................................................................... + // sub v17.4S, v19.4S, v25.4S // ...........................................................*................................................................................................................ + // add v19.4S, v19.4S, v25.4S // .........................................................................*.................................................................................................. 
+ // mls v9.4S, v15.4S, v8.S[0] // .......................................................................*.................................................................................................... + // mul v15.4S, v28.4S, v18.4S // ............................................................*............................................................................................................... + // sqrdmulh v18.4S, v28.4S, v27.4S // ..............................................................*............................................................................................................. + // mls v13.4S, v31.4S, v8.S[0] // ........................................................................*................................................................................................... + // trn1 v27.4S, v29.4S, v2.4S // ...........................................................................*................................................................................................ + // mul v31.4S, v17.4S, v3.4S // ...................................................................*........................................................................................................ + // sqrdmulh v17.4S, v17.4S, v22.4S // .................................................................*.......................................................................................................... + // trn2 v29.4S, v29.4S, v2.4S // ............................................................................*............................................................................................... + // trn1 v2.4S, v5.4S, v19.4S // ................................................................................*........................................................................................... 
+ // trn2 v19.4S, v5.4S, v19.4S // .................................................................................*.......................................................................................... + // mls v15.4S, v18.4S, v8.S[0] // ......................................................................*..................................................................................................... + // mls v31.4S, v17.4S, v8.S[0] // ..........................................................................*................................................................................................. + // trn1 v5.4S, v9.4S, v15.4S // ...............................................................................*............................................................................................ + // trn2 v9.4S, v9.4S, v15.4S // ..............................................................................*............................................................................................. + // trn1 v15.4S, v13.4S, v31.4S // ...................................................................................*........................................................................................ + // trn2 v13.4S, v13.4S, v31.4S // ..................................................................................*......................................................................................... + // trn2 v18.2D, v27.2D, v5.2D // ......................................................................................*..................................................................................... + // trn1 v5.2D, v27.2D, v5.2D // .......................................................................................*.................................................................................... 
+ // trn2 v27.2D, v29.2D, v9.2D // .....................................................................................*...................................................................................... + // trn1 v29.2D, v29.2D, v9.2D // ....................................................................................*....................................................................................... + // trn2 v9.2D, v2.2D, v15.2D // ...........................................................................................*................................................................................ + // trn1 v2.2D, v2.2D, v15.2D // ..........................................................................................*................................................................................. + // trn2 v15.2D, v19.2D, v13.2D // ........................................................................................*................................................................................... + // trn1 v19.2D, v19.2D, v13.2D // .........................................................................................*.................................................................................. + // sub v13.4S, v18.4S, v27.4S // .............................................................................................*.............................................................................. + // add v18.4S, v18.4S, v27.4S // ..............................................................................................*............................................................................. + // sub v27.4S, v5.4S, v29.4S // ...............................................................................................*............................................................................ 
+ // add v29.4S, v5.4S, v29.4S // ................................................................................................*........................................................................... + // sub v5.4S, v9.4S, v15.4S // .................................................................................................*.......................................................................... + // add v15.4S, v9.4S, v15.4S // ............................................................................................................*............................................................... + // sub v17.4S, v2.4S, v19.4S // ..................................................................................................*......................................................................... + // add v19.4S, v2.4S, v19.4S // .............................................................................................................*.............................................................. + // mul v2.4S, v13.4S, v0.S[0] // ....................................................................................................*....................................................................... + // sqrdmulh v13.4S, v13.4S, v0.S[1] // ...................................................................................................*........................................................................ + // mul v9.4S, v27.4S, v11.S[2] // ......................................................................................................*..................................................................... + // sqrdmulh v27.4S, v27.4S, v11.S[3] // .....................................................................................................*...................................................................... 
+ // mul v31.4S, v17.4S, v0.S[2] // .........................................................................................................*.................................................................. + // sqrdmulh v0.4S, v17.4S, v0.S[3] // ..........................................................................................................*................................................................. + // mul v17.4S, v5.4S, v12.S[0] // .......................................................................................................*.................................................................... + // sqrdmulh v5.4S, v5.4S, v12.S[1] // ........................................................................................................*................................................................... + // sub v12.4S, v29.4S, v18.4S // ................................................................................................................*........................................................... + // add v29.4S, v29.4S, v18.4S // ......................................................................................................................*..................................................... + // sub v18.4S, v19.4S, v15.4S // ..................................................................................................................*......................................................... + // add v19.4S, v19.4S, v15.4S // ....................................................................................................................*....................................................... + // mls v9.4S, v27.4S, v8.S[0] // ..............................................................................................................*............................................................. 
+ // mls v2.4S, v13.4S, v8.S[0] // ...............................................................................................................*............................................................ + // mls v31.4S, v0.4S, v8.S[0] // .................................................................................................................*.......................................................... + // mls v17.4S, v5.4S, v8.S[0] // ...................................................................................................................*........................................................ + // mul v5.4S, v12.4S, v6.S[2] // .........................................................................................................................*.................................................. + // sqrdmulh v0.4S, v12.4S, v6.S[3] // .....................................................................................................................*...................................................... + // mul v12.4S, v18.4S, v11.S[0] // ............................................................................................................................*............................................... + // sqrdmulh v15.4S, v18.4S, v11.S[1] // .............................................................................................................................*.............................................. + // srshr v13.4S, v29.4S, #23 // ................................................................................................................................*........................................... + // srshr v18.4S, v19.4S, #23 // ..............................................................................................................................*............................................. 
+ // sub v27.4S, v9.4S, v2.4S // .......................................................................................................................*.................................................... + // add v9.4S, v9.4S, v2.4S // ........................................................................................................................*................................................... + // sub v2.4S, v31.4S, v17.4S // ..........................................................................................................................*................................................. + // add v17.4S, v31.4S, v17.4S // ...........................................................................................................................*................................................ + // mls v5.4S, v0.4S, v8.S[0] // ......................................................................................................................................*..................................... + // mls v29.4S, v13.4S, v8.4S // ..........................................................................................................................................*................................. + // mls v12.4S, v15.4S, v8.S[0] // .......................................................................................................................................*.................................... + // mul v0.4S, v27.4S, v6.S[2] // ..................................................................................................................................*......................................... + // sqrdmulh v15.4S, v27.4S, v6.S[3] // .................................................................................................................................*.......................................... 
+ // mul v13.4S, v2.4S, v11.S[0] // ....................................................................................................................................*....................................... + // sqrdmulh v2.4S, v2.4S, v11.S[1] // ...................................................................................................................................*........................................ + // srshr v27.4S, v9.4S, #23 // ...............................................................................................................................*............................................ + // srshr v31.4S, v17.4S, #23 // .....................................................................................................................................*...................................... + // mls v19.4S, v18.4S, v8.4S // .........................................................................................................................................*.................................. + // add v18.4S, v5.4S, v12.4S // ............................................................................................................................................................*............... + // sub v5.4S, v5.4S, v12.4S // ..............................................................................................................................................*............................. + // mls v0.4S, v15.4S, v8.S[0] // ............................................................................................................................................*............................... + // mls v9.4S, v27.4S, v8.4S // ........................................................................................................................................*................................... 
+ // mls v17.4S, v31.4S, v8.4S // ...........................................................................................................................................*................................ + // mls v13.4S, v2.4S, v8.S[0] // .............................................................................................................................................*.............................. + // add v2.4S, v29.4S, v19.4S // .............................................................................................................................................................*.............. + // sub v29.4S, v29.4S, v19.4S // ...............................................................................................................................................*............................ + // str q18, [x1, #32] // ...................................................................................................................................................................*........ + // mul v19.4S, v5.4S, v6.S[0] // ...................................................................................................................................................*........................ + // sqrdmulh v5.4S, v5.4S, v6.S[1] // ....................................................................................................................................................*....................... + // sub v12.4S, v0.4S, v13.4S // .................................................................................................................................................*.......................... + // add v0.4S, v0.4S, v13.4S // ..............................................................................................................................................................*............. 
+ // sub v15.4S, v9.4S, v17.4S // ................................................................................................................................................*........................... + // add v9.4S, v9.4S, v17.4S // ..................................................................................................................................................*......................... + // str q2, [x1], #(16*4) // ....................................................................................................................................................................*....... + // mul v2.4S, v29.4S, v6.S[0] // .....................................................................................................................................................*...................... + // sqrdmulh v29.4S, v29.4S, v6.S[1] // ......................................................................................................................................................*..................... + // mls v19.4S, v5.4S, v8.S[0] // ...............................................................................................................................................................*............ + // mul v5.4S, v12.4S, v6.S[0] // .........................................................................................................................................................*.................. + // sqrdmulh v12.4S, v12.4S, v6.S[1] // ..........................................................................................................................................................*................. + // mul v13.4S, v15.4S, v6.S[0] // ........................................................................................................................................................*................... 
+ // sqrdmulh v15.4S, v15.4S, v6.S[1] // .......................................................................................................................................................*.................... + // str q0, [x1, #-16] // .....................................................................................................................................................................*...... + // str q9, [x1, #-48] // ...........................................................................................................................................................*................ + // add x1, x1, #64 // ......................................................................................................................................................................*..... + // mls v2.4S, v29.4S, v8.S[0] // ................................................................................................................................................................*........... + // str q19, [x2, #32] // .......................................................................................................................................................................*.... + // mls v5.4S, v12.4S, v8.S[0] // ..................................................................................................................................................................*......... + // mls v13.4S, v15.4S, v8.S[0] // .................................................................................................................................................................*.......... + // str q2, [x2], #(16*4) // ........................................................................................................................................................................*... 
+ // str q5, [x2, #-16] // ..........................................................................................................................................................................*. + // str q13, [x2, #-48] // .........................................................................................................................................................................*.. + // add x2, x2, #64 // ...........................................................................................................................................................................* // ----------------------------------------------------------------------------- ninv .req v25 ninv_tw .req v26 + modulus_half .req v30 + neg_modulus_half .req v31 ASM_LOAD(xtmp, ninv_addr) ld1r {ninv.4s}, [xtmp] ASM_LOAD(xtmp, ninv_tw_addr) ld1r {ninv_tw.4s}, [xtmp] + ushr modulus_half.4S, modulus.4S, #1 + neg neg_modulus_half.4S, modulus_half.4S + mov count, #8 ASM_LOAD(r_ptr0, roots_l012) load_roots_123 .p2align 2 - ldr q18, [x0, #384] // .*.................................................. - ldr q10, [x0, #256] // ..*................................................. - ldr q22, [x0, #640] // *................................................... - // gap // .................................................... - // gap // .................................................... - // gap // .................................................... - // gap // .................................................... - // gap // .................................................... - ldr q30, [x0, #512] // ...*................................................ - ldr q27, [x0, #896] // ....*............................................... - ldr q13, [x0, #768] // .....*.............................................. - // gap // .................................................... - // gap // .................................................... 
- // gap // .................................................... - // gap // .................................................... - // gap // .................................................... - ldr q15, [x0, #0] // ......*............................................. - ldr q24, [x0, #128] // .......*............................................ - // gap // .................................................... - // gap // .................................................... - // gap // .................................................... - // gap // .................................................... - // gap // .................................................... - // gap // .................................................... - // gap // .................................................... - // gap // .................................................... - // gap // .................................................... - // gap // .................................................... - // gap // .................................................... - // gap // .................................................... - // gap // .................................................... - // gap // .................................................... - sub v6.4S, v10.4S, v18.4S // ...............*.................................... - add v10.4S, v10.4S, v18.4S // ........*........................................... - // gap // .................................................... - // gap // .................................................... - // gap // .................................................... - // gap // .................................................... - // gap // .................................................... - // gap // .................................................... - sub v7.4S, v13.4S, v27.4S // .........*.......................................... - sub v18.4S, v30.4S, v22.4S // ............*....................................... 
- add v22.4S, v30.4S, v22.4S // ...........*........................................ - add v30.4S, v13.4S, v27.4S // ..........*......................................... - // gap // .................................................... - // gap // .................................................... - // gap // .................................................... - // gap // .................................................... - sub v27.4S, v15.4S, v24.4S // .............*...................................... - add v13.4S, v15.4S, v24.4S // ..............*..................................... - mul v15.4S, v6.4S, v2.S[0] // .......................*............................ - sqrdmulh v24.4S, v6.4S, v2.S[1] // .........................*.......................... - // gap // .................................................... - // gap // .................................................... - // gap // .................................................... - // gap // .................................................... - mul v6.4S, v7.4S, v3.S[0] // ..................*................................. - sqrdmulh v7.4S, v7.4S, v3.S[1] // ...................*................................ - sub v11.4S, v22.4S, v30.4S // ................*................................... - mul v23.4S, v18.4S, v2.S[2] // .................*.................................. - // gap // .................................................... - // gap // .................................................... - // gap // .................................................... - // gap // .................................................... - sqrdmulh v18.4S, v18.4S, v2.S[3] // .....................*.............................. - sqrdmulh v12.4S, v27.4S, v1.S[3] // ......................*............................. - mul v27.4S, v27.4S, v1.S[2] // ........................*........................... - sub v31.4S, v13.4S, v10.4S // ....................*............................... 
- // gap // .................................................... - // gap // .................................................... - // gap // .................................................... - // gap // .................................................... - add v10.4S, v13.4S, v10.4S // ........................................*........... - mls v15.4S, v24.4S, v8.S[0] // ..............................*..................... - sqrdmulh v13.4S, v11.4S, v1.S[1] // ..........................*......................... - mul v24.4S, v11.4S, v1.S[0] // ............................*....................... - // gap // .................................................... - // gap // .................................................... - // gap // .................................................... - // gap // .................................................... - add v22.4S, v22.4S, v30.4S // .........................................*.......... - mls v6.4S, v7.4S, v8.S[0] // .............................*...................... - sqrdmulh v7.4S, v31.4S, v0.S[3] // ...............................*.................... - mul v30.4S, v31.4S, v0.S[2] // .................................*.................. - // gap // .................................................... - // gap // .................................................... - // gap // .................................................... - // gap // .................................................... - mls v23.4S, v18.4S, v8.S[0] // ...........................*........................ - mls v27.4S, v12.4S, v8.S[0] // ................................*................... - // gap // .................................................... - // gap // .................................................... - // gap // .................................................... - // gap // .................................................... - // gap // .................................................... 
- // gap // .................................................... - sub v16.4S, v10.4S, v22.4S // ................................................*... - add v17.4S, v10.4S, v22.4S // .................................................*.. - mls v24.4S, v13.4S, v8.S[0] // ....................................*............... - // gap // .................................................... - // gap // .................................................... - // gap // .................................................... - // gap // .................................................... - // gap // .................................................... - mls v30.4S, v7.4S, v8.S[0] // .......................................*............ - // gap // .................................................... - // gap // .................................................... - // gap // .................................................... - // gap // .................................................... - // gap // .................................................... - // gap // .................................................... - // gap // .................................................... - sub v10.4S, v27.4S, v15.4S // .....................................*.............. - add v22.4S, v27.4S, v15.4S // ......................................*............. - add v7.4S, v23.4S, v6.4S // ..................................*................. - sub v13.4S, v23.4S, v6.4S // ...................................*................ - // gap // .................................................... - // gap // .................................................... - // gap // .................................................... - // gap // .................................................... - // gap // .................................................... - // gap // .................................................... - // gap // .................................................... 
- // gap // .................................................... - // gap // .................................................... - // gap // .................................................... - // gap // .................................................... - // gap // .................................................... - sqrdmulh v19.4S, v10.4S, v0.S[3] // ...........................................*........ - mul v27.4S, v10.4S, v0.S[2] // .............................................*...... - mul v6.4S, v13.4S, v1.S[0] // ..........................................*......... - sub v20.4S, v22.4S, v7.4S // ............................................*....... - // gap // .................................................... - // gap // .................................................... - // gap // .................................................... - // gap // .................................................... - sub v12.4S, v30.4S, v24.4S // ..................................................*. - add v18.4S, v30.4S, v24.4S // ...................................................* - add v21.4S, v22.4S, v7.4S // ..............................................*..... - sqrdmulh v30.4S, v13.4S, v1.S[1] // ...............................................*.... - // gap // .................................................... - // gap // .................................................... - // gap // .................................................... - // gap // .................................................... + ldr q5, [x0, #0] // .*................................................................................. + ldr q19, [x0, #128] // ..*................................................................................ + ldr q9, [x0, #384] // *.................................................................................. + // gap // ................................................................................... 
+ // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + ldr q12, [x0, #256] // ....*.............................................................................. + ldr q15, [x0, #768] // ...*............................................................................... + ldr q13, [x0, #896] // .....*............................................................................. + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + ldr q18, [x0, #640] // ......*............................................................................ + ldr q27, [x0, #512] // .......*........................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... 
+ // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + sub v17.4S, v5.4S, v19.4S // .........*......................................................................... + add v19.4S, v5.4S, v19.4S // ........*.......................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + sub v5.4S, v12.4S, v9.4S // ............*...................................................................... + add v9.4S, v12.4S, v9.4S // .............*..................................................................... + sub v12.4S, v15.4S, v13.4S // ...........*....................................................................... 
+ add v15.4S, v15.4S, v13.4S // ..........*........................................................................ + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + sub v13.4S, v27.4S, v18.4S // .................*................................................................. + add v18.4S, v27.4S, v18.4S // ................*.................................................................. + sqrdmulh v27.4S, v17.4S, v1.S[3] // ..............*.................................................................... + mul v17.4S, v17.4S, v1.S[2] // ...............*................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + mul v28.4S, v5.4S, v2.S[0] // ....................*.............................................................. + sqrdmulh v5.4S, v5.4S, v2.S[1] // .....................*............................................................. + sub v22.4S, v19.4S, v9.4S // ...................*............................................................... + sqrdmulh v20.4S, v12.4S, v3.S[1] // ..................*................................................................ + // gap // ................................................................................... 
+ // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + mul v12.4S, v12.4S, v3.S[0] // .......................*........................................................... + mul v11.4S, v13.4S, v2.S[2] // ......................*............................................................ + sqrdmulh v13.4S, v13.4S, v2.S[3] // ........................*.......................................................... + add v16.4S, v18.4S, v15.4S // .........................*......................................................... + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + add v4.4S, v19.4S, v9.4S // ..........................*........................................................ + sub v19.4S, v18.4S, v15.4S // .............................*..................................................... + mul v9.4S, v22.4S, v0.S[2] // ...........................*....................................................... + sqrdmulh v15.4S, v22.4S, v0.S[3] // ............................*...................................................... + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... 
+ mls v28.4S, v5.4S, v8.S[0] // ...............................*................................................... + mls v17.4S, v27.4S, v8.S[0] // ..............................*.................................................... + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + mls v12.4S, v20.4S, v8.S[0] // ................................*.................................................. + mls v11.4S, v13.4S, v8.S[0] // ...................................*............................................... + sqrdmulh v5.4S, v19.4S, v1.S[1] // .................................*................................................. + mul v19.4S, v19.4S, v1.S[0] // ..................................*................................................ + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + mls v9.4S, v15.4S, v8.S[0] // ....................................*.............................................. + add v15.4S, v4.4S, v16.4S // .....................................*............................................. + // gap // ................................................................................... 
+ // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + sub v13.4S, v17.4S, v28.4S // .......................................*........................................... + add v18.4S, v17.4S, v28.4S // ......................................*............................................ + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + add v27.4S, v11.4S, v12.4S // .........................................*......................................... + sub v17.4S, v11.4S, v12.4S // ........................................*.......................................... + mls v19.4S, v5.4S, v8.S[0] // ...........................................*....................................... + sqrdmulh v5.4S, v15.4S, v26.4S // ..........................................*........................................ + // gap // ................................................................................... + // gap // ................................................................................... 
+ // gap // ................................................................................... + // gap // ................................................................................... + mul v12.4S, v15.4S, v25.4S // ............................................*...................................... + sqrdmulh v15.4S, v13.4S, v0.S[3] // .............................................*..................................... + mul v28.4S, v13.4S, v0.S[2] // ..............................................*.................................... + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + add v13.4S, v18.4S, v27.4S // .................................................*................................. + sub v18.4S, v18.4S, v27.4S // ..................................................*................................ + mul v27.4S, v17.4S, v1.S[0] // ...............................................*................................... + sqrdmulh v17.4S, v17.4S, v1.S[1] // ................................................*.................................. + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + add v22.4S, v9.4S, v19.4S // ...................................................*............................... 
+ sub v9.4S, v9.4S, v19.4S // ....................................................*.............................. + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + mls v28.4S, v15.4S, v8.S[0] // .......................................................*........................... + mul v15.4S, v13.4S, v25.4S // ......................................................*............................ + sqrdmulh v20.4S, v13.4S, v26.4S // ........................................................*.......................... + sqrdmulh v11.4S, v18.4S, v0.S[1] // .....................................................*............................. + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + mls v12.4S, v5.4S, v8.S[0] // ............................................................*...................... + mul v19.4S, v18.4S, v0.S[0] // .........................................................*......................... + mls v27.4S, v17.4S, v8.S[0] // ...........................................................*....................... 
+ mul v13.4S, v22.4S, v25.4S // ..........................................................*........................ + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + sqrdmulh v18.4S, v22.4S, v26.4S // .............................................................*..................... + sqrdmulh v17.4S, v9.4S, v0.S[1] // ..............................................................*.................... + mul v5.4S, v9.4S, v0.S[0] // ...............................................................*................... + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + mls v15.4S, v20.4S, v8.S[0] // ................................................................*.................. + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... 
+ // gap // ................................................................................... + // gap // ................................................................................... + sub v9.4S, v28.4S, v27.4S // .................................................................*................. + add v27.4S, v28.4S, v27.4S // ..................................................................*................ + mls v19.4S, v11.4S, v8.S[0] // ...................................................................*............... + cmge v24.4S, v31.4S, v12.4S // .....................................................................*............. + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + mls v13.4S, v18.4S, v8.S[0] // ......................................................................*............ + mls v5.4S, v17.4S, v8.S[0] // ....................................................................*.............. + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + sqrdmulh v20.4S, v9.4S, v0.S[1] // .........................................................................*......... 
+ mul v9.4S, v9.4S, v0.S[0] // ........................................................................*.......... + sqrdmulh v10.4S, v27.4S, v26.4S // .......................................................................*........... + mul v18.4S, v27.4S, v25.4S // ..........................................................................*........ + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + cmge v11.4S, v31.4S, v15.4S // ...........................................................................*....... + cmge v17.4S, v19.4S, v30.4S // ............................................................................*...... + cmge v27.4S, v15.4S, v30.4S // .............................................................................*..... + cmge v23.4S, v31.4S, v19.4S // ..............................................................................*.... + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + cmge v6.4S, v5.4S, v30.4S // ...............................................................................*... + cmge v7.4S, v13.4S, v30.4S // ................................................................................*.. + cmge v28.4S, v31.4S, v5.4S // .................................................................................*. 
+ cmge v22.4S, v31.4S, v13.4S // ..................................................................................* + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... // original source code - // ldr q23, [x0, #640] // ..*................................................. - // ldr q22, [x0, #384] // *................................................... - // ldr q5, [x0, #256] // .*.................................................. - // ldr q20, [x0, #512] // ...*................................................ - // ldr q28, [x0, #896] // ....*............................................... - // ldr q21, [x0, #768] // .....*.............................................. - // ldr q9, [x0, #0] // ......*............................................. - // ldr q16, [x0, #128] // .......*............................................ - // add v19.4S, v5.4S, v22.4S // .........*.......................................... - // sub v15.4S, v21.4S, v28.4S // ..........*......................................... - // add v4.4S, v21.4S, v28.4S // .............*...................................... - // add v6.4S, v20.4S, v23.4S // ............*....................................... - // sub v28.4S, v20.4S, v23.4S // ...........*........................................ - // sub v23.4S, v9.4S, v16.4S // ..............*..................................... - // add v12.4S, v9.4S, v16.4S // ...............*.................................... - // sub v9.4S, v5.4S, v22.4S // ........*........................................... - // sub v20.4S, v6.4S, v4.4S // ....................*............................... 
- // mul v27.4S, v28.4S, v2.S[2] // .....................*.............................. - // mul v13.4S, v15.4S, v3.S[0] // ..................*................................. - // sqrdmulh v21.4S, v15.4S, v3.S[1] // ...................*................................ - // sub v5.4S, v12.4S, v19.4S // .........................*.......................... - // sqrdmulh v15.4S, v28.4S, v2.S[3] // ......................*............................. - // sqrdmulh v28.4S, v23.4S, v1.S[3] // .......................*............................ - // mul v30.4S, v9.4S, v2.S[0] // ................*................................... - // mul v7.4S, v23.4S, v1.S[2] // ........................*........................... - // sqrdmulh v23.4S, v9.4S, v2.S[1] // .................*.................................. - // sqrdmulh v9.4S, v20.4S, v1.S[1] // ............................*....................... - // mls v27.4S, v15.4S, v8.S[0] // ..................................*................. - // mul v24.4S, v20.4S, v1.S[0] // .............................*...................... - // mls v13.4S, v21.4S, v8.S[0] // ...............................*.................... - // mls v30.4S, v23.4S, v8.S[0] // ...........................*........................ - // sqrdmulh v23.4S, v5.4S, v0.S[3] // ................................*................... - // mls v7.4S, v28.4S, v8.S[0] // ...................................*................ - // mul v15.4S, v5.4S, v0.S[2] // .................................*.................. - // add v28.4S, v27.4S, v13.4S // ..........................................*......... - // sub v13.4S, v27.4S, v13.4S // ...........................................*........ - // mls v24.4S, v9.4S, v8.S[0] // ......................................*............. - // sub v9.4S, v7.4S, v30.4S // ........................................*........... - // add v7.4S, v7.4S, v30.4S // .........................................*.......... 
- // mls v15.4S, v23.4S, v8.S[0] // .......................................*............ - // add v12.4S, v12.4S, v19.4S // ..........................*......................... - // add v5.4S, v6.4S, v4.4S // ..............................*..................... - // mul v6.4S, v13.4S, v1.S[0] // ..............................................*..... - // sqrdmulh v19.4S, v9.4S, v0.S[3] // ............................................*....... - // sub v20.4S, v7.4S, v28.4S // ...............................................*.... - // mul v27.4S, v9.4S, v0.S[2] // .............................................*...... - // add v21.4S, v7.4S, v28.4S // ..................................................*. - // sqrdmulh v30.4S, v13.4S, v1.S[1] // ...................................................* - // sub v16.4S, v12.4S, v5.4S // ....................................*............... - // add v17.4S, v12.4S, v5.4S // .....................................*.............. - // sub v12.4S, v15.4S, v24.4S // ................................................*... - // add v18.4S, v15.4S, v24.4S // .................................................*.. + // ldr q16, [x0, #384] // ..*................................................................................ + // ldr q17, [x0, #0] // *.................................................................................. + // ldr q23, [x0, #128] // .*................................................................................. + // ldr q27, [x0, #768] // ....*.............................................................................. + // ldr q21, [x0, #256] // ...*............................................................................... + // ldr q28, [x0, #896] // .....*............................................................................. + // ldr q20, [x0, #640] // ......*............................................................................ 
+ // ldr q22, [x0, #512] // .......*........................................................................... + // add v4.4S, v17.4S, v23.4S // .........*......................................................................... + // sub v23.4S, v17.4S, v23.4S // ........*.......................................................................... + // add v17.4S, v27.4S, v28.4S // .............*..................................................................... + // sub v19.4S, v27.4S, v28.4S // ............*...................................................................... + // sub v28.4S, v21.4S, v16.4S // ..........*........................................................................ + // add v15.4S, v21.4S, v16.4S // ...........*....................................................................... + // sqrdmulh v21.4S, v23.4S, v1.S[3] // ................*.................................................................. + // mul v27.4S, v23.4S, v1.S[2] // .................*................................................................. + // add v23.4S, v22.4S, v20.4S // ...............*................................................................... + // sub v10.4S, v22.4S, v20.4S // ..............*.................................................................... + // sqrdmulh v20.4S, v19.4S, v3.S[1] // .....................*............................................................. + // sub v6.4S, v4.4S, v15.4S // ....................*.............................................................. + // mul v13.4S, v28.4S, v2.S[0] // ..................*................................................................ + // sqrdmulh v7.4S, v28.4S, v2.S[1] // ...................*............................................................... + // mul v28.4S, v10.4S, v2.S[2] // .......................*........................................................... 
+ // mul v22.4S, v19.4S, v3.S[0] // ......................*............................................................ + // sqrdmulh v10.4S, v10.4S, v2.S[3] // ........................*.......................................................... + // add v16.4S, v23.4S, v17.4S // .........................*......................................................... + // add v4.4S, v4.4S, v15.4S // ..........................*........................................................ + // mul v15.4S, v6.4S, v0.S[2] // ............................*...................................................... + // sqrdmulh v6.4S, v6.4S, v0.S[3] // .............................*..................................................... + // sub v23.4S, v23.4S, v17.4S // ...........................*....................................................... + // mls v27.4S, v21.4S, v8.S[0] // ...............................*................................................... + // mls v13.4S, v7.4S, v8.S[0] // ..............................*.................................................... + // mls v22.4S, v20.4S, v8.S[0] // ................................*.................................................. + // sqrdmulh v20.4S, v23.4S, v1.S[1] // ..................................*................................................ + // mul v17.4S, v23.4S, v1.S[0] // ...................................*............................................... + // mls v28.4S, v10.4S, v8.S[0] // .................................*................................................. + // mls v15.4S, v6.4S, v8.S[0] // ....................................*.............................................. + // add v11.4S, v4.4S, v16.4S // .....................................*............................................. + // add v10.4S, v27.4S, v13.4S // .......................................*........................................... 
+ // sub v27.4S, v27.4S, v13.4S // ......................................*............................................ + // sub v13.4S, v28.4S, v22.4S // .........................................*......................................... + // add v19.4S, v28.4S, v22.4S // ........................................*.......................................... + // sqrdmulh v23.4S, v11.4S, v26.4S // ...........................................*....................................... + // mls v17.4S, v20.4S, v8.S[0] // ..........................................*........................................ + // mul v12.4S, v11.4S, v25.4S // ............................................*...................................... + // sqrdmulh v28.4S, v27.4S, v0.S[3] // .............................................*..................................... + // mul v22.4S, v27.4S, v0.S[2] // ..............................................*.................................... + // mul v20.4S, v13.4S, v1.S[0] // .................................................*................................. + // sqrdmulh v6.4S, v13.4S, v1.S[1] // ..................................................*................................ + // add v13.4S, v10.4S, v19.4S // ...............................................*................................... + // sub v19.4S, v10.4S, v19.4S // ................................................*.................................. + // add v5.4S, v15.4S, v17.4S // ...................................................*............................... + // sub v21.4S, v15.4S, v17.4S // ....................................................*.............................. + // sqrdmulh v17.4S, v19.4S, v0.S[1] // ........................................................*.......................... + // mul v15.4S, v13.4S, v25.4S // ......................................................*............................ 
+ // mls v22.4S, v28.4S, v8.S[0] // .....................................................*............................. + // sqrdmulh v28.4S, v13.4S, v26.4S // .......................................................*........................... + // mul v19.4S, v19.4S, v0.S[0] // ..........................................................*........................ + // mul v13.4S, v5.4S, v25.4S // ............................................................*...................... + // mls v20.4S, v6.4S, v8.S[0] // ...........................................................*....................... + // mls v12.4S, v23.4S, v8.S[0] // .........................................................*......................... + // sqrdmulh v10.4S, v5.4S, v26.4S // .............................................................*..................... + // sqrdmulh v23.4S, v21.4S, v0.S[1] // ..............................................................*.................... + // mul v5.4S, v21.4S, v0.S[0] // ...............................................................*................... + // mls v15.4S, v28.4S, v8.S[0] // ................................................................*.................. + // sub v7.4S, v22.4S, v20.4S // .................................................................*................. + // add v6.4S, v22.4S, v20.4S // ..................................................................*................ + // mls v19.4S, v17.4S, v8.S[0] // ...................................................................*............... + // mls v5.4S, v23.4S, v8.S[0] // ......................................................................*............ + // cmge v24.4S, v31.4S, v12.4S // ....................................................................*.............. + // mls v13.4S, v10.4S, v8.S[0] // .....................................................................*............. 
+ // sqrdmulh v10.4S, v6.4S, v26.4S // .........................................................................*......... + // mul v9.4S, v7.4S, v0.S[0] // ........................................................................*.......... + // sqrdmulh v20.4S, v7.4S, v0.S[1] // .......................................................................*........... + // mul v18.4S, v6.4S, v25.4S // ..........................................................................*........ + // cmge v11.4S, v31.4S, v15.4S // ...........................................................................*....... + // cmge v17.4S, v19.4S, v30.4S // ............................................................................*...... + // cmge v27.4S, v15.4S, v30.4S // .............................................................................*..... + // cmge v23.4S, v31.4S, v19.4S // ..............................................................................*.... + // cmge v6.4S, v5.4S, v30.4S // ...............................................................................*... + // cmge v7.4S, v13.4S, v30.4S // ................................................................................*.. + // cmge v28.4S, v31.4S, v5.4S // .................................................................................*. + // cmge v22.4S, v31.4S, v13.4S // ..................................................................................* sub count, count, #1 layer123_start: - sqrdmulh v13.4S, v21.4S, v26.4S // ....................................................................................*........... - // gap // ................................................................................................ - ldr q23, [x0, #656] // .....e.......................................................................................... - ldr q22, [x0, #400] // ...e............................................................................................ 
- sqrdmulh v15.4S, v20.4S, v0.S[1] // ........................................................*....................................... - mul v7.4S, v21.4S, v25.4S // ...................................................................................*............ - mul v10.4S, v20.4S, v0.S[0] // .......................................................*........................................ - ldr q5, [x0, #272] // ..e............................................................................................. - mul v14.4S, v16.4S, v0.S[0] // ..................................................*............................................. - mls v6.4S, v30.4S, v8.S[0] // ...............................................*................................................ - ldr q20, [x0, #528] // ....e........................................................................................... - ldr q28, [x0, #912] // .......e........................................................................................ - // gap // ................................................................................................ - ldr q21, [x0, #784] // ......e......................................................................................... - mls v27.4S, v19.4S, v8.S[0] // .....................................*.......................................................... - sqrdmulh v4.4S, v16.4S, v0.S[1] // ...................................................*............................................ - ldr q9, [x0, #16] // e............................................................................................... - ldr q16, [x0, #144] // .e.............................................................................................. - mul v11.4S, v17.4S, v25.4S // ................................................................................*............... - // gap // ................................................................................................ 
- // gap // ................................................................................................ - mul v31.4S, v18.4S, v25.4S // ......................................................................................*......... - sqrdmulh v30.4S, v12.4S, v0.S[1] // .............................................................*.................................. - sqrdmulh v24.4S, v18.4S, v26.4S // .......................................................................................*........ - mul v18.4S, v12.4S, v0.S[0] // ............................................................*................................... - // gap // ................................................................................................ - // gap // ................................................................................................ - // gap // ................................................................................................ - // gap // ................................................................................................ - mls v7.4S, v13.4S, v8.S[0] // .....................................................................................*.......... - sqrdmulh v17.4S, v17.4S, v26.4S // .................................................................................*.............. - mls v10.4S, v15.4S, v8.S[0] // .........................................................*...................................... - add v19.4S, v5.4S, v22.4S // ..............e................................................................................. - mls v14.4S, v4.4S, v8.S[0] // ....................................................*........................................... - sub v13.4S, v27.4S, v6.4S // ...............................................................*................................ - // gap // ................................................................................................ 
- // gap // ................................................................................................ - // gap // ................................................................................................ - add v29.4S, v27.4S, v6.4S // ................................................................*............................... - // gap // ................................................................................................ - sub v15.4S, v21.4S, v28.4S // .......................e........................................................................ - add v4.4S, v21.4S, v28.4S // ........................e....................................................................... - // gap // ................................................................................................ - // gap // ................................................................................................ - // gap // ................................................................................................ - // gap // ................................................................................................ - add v6.4S, v20.4S, v23.4S // ...................e............................................................................ - sub v28.4S, v20.4S, v23.4S // ..................e............................................................................. - sub v23.4S, v9.4S, v16.4S // ........e....................................................................................... - add v12.4S, v9.4S, v16.4S // .........e...................................................................................... - sub v9.4S, v5.4S, v22.4S // .............e.................................................................................. - // gap // ................................................................................................ 
- mul v22.4S, v13.4S, v0.S[0] // .................................................................*.............................. - // gap // ................................................................................................ - // gap // ................................................................................................ - // gap // ................................................................................................ - sub v20.4S, v6.4S, v4.4S // ......................................e......................................................... - // gap // ................................................................................................ - mul v27.4S, v28.4S, v2.S[2] // ....................e........................................................................... - // gap // ................................................................................................ - // gap // ................................................................................................ - // gap // ................................................................................................ - sqrdmulh v16.4S, v13.4S, v0.S[1] // ..................................................................*............................. - mul v13.4S, v15.4S, v3.S[0] // .........................e...................................................................... - sqrdmulh v21.4S, v15.4S, v3.S[1] // ..........................e..................................................................... - sub v5.4S, v12.4S, v19.4S // ............................e................................................................... - sqrdmulh v15.4S, v28.4S, v2.S[3] // .....................e.......................................................................... - // gap // ................................................................................................ 
- // gap // ................................................................................................ - // gap // ................................................................................................ - str q7, [x0, #128] // .............................................................................................*.. - mls v18.4S, v30.4S, v8.S[0] // ..............................................................*................................. - sqrdmulh v28.4S, v23.4S, v1.S[3] // ...........e.................................................................................... - mul v30.4S, v9.4S, v2.S[0] // ...............e................................................................................ - mul v7.4S, v23.4S, v1.S[2] // ..........e..................................................................................... - // gap // ................................................................................................ - // gap // ................................................................................................ - // gap // ................................................................................................ - // gap // ................................................................................................ - sqrdmulh v23.4S, v9.4S, v2.S[1] // ................e............................................................................... - sqrdmulh v9.4S, v20.4S, v1.S[1] // .........................................e...................................................... - mls v22.4S, v16.4S, v8.S[0] // ...................................................................*............................ - // gap // ................................................................................................ - srshr v16.4S, v14.4S, #23 // ....................................................................*........................... 
- // gap // ................................................................................................ - // gap // ................................................................................................ - // gap // ................................................................................................ - mls v31.4S, v24.4S, v8.S[0] // ........................................................................................*....... - mls v27.4S, v15.4S, v8.S[0] // ......................e......................................................................... - mul v24.4S, v20.4S, v1.S[0] // ........................................e....................................................... - mls v13.4S, v21.4S, v8.S[0] // ...........................e.................................................................... - // gap // ................................................................................................ - // gap // ................................................................................................ - // gap // ................................................................................................ - // gap // ................................................................................................ - srshr v21.4S, v10.4S, #23 // ......................................................................*......................... - // gap // ................................................................................................ - // gap // ................................................................................................ - srshr v20.4S, v18.4S, #23 // ........................................................................*....................... - // gap // ................................................................................................ - // gap // ................................................................................................ 
- mls v30.4S, v23.4S, v8.S[0] // .................e.............................................................................. - sqrdmulh v23.4S, v5.4S, v0.S[3] // ...............................e................................................................ - mls v7.4S, v28.4S, v8.S[0] // ............e................................................................................... - mul v15.4S, v5.4S, v0.S[2] // ..............................e................................................................. - srshr v5.4S, v22.4S, #23 // ..........................................................................*..................... - str q31, [x0, #256] // ..............................................................................................*. - mul v31.4S, v29.4S, v25.4S // .........................................................................................*...... - // gap // ................................................................................................ - // gap // ................................................................................................ - // gap // ................................................................................................ - sqrdmulh v29.4S, v29.4S, v26.4S // ..........................................................................................*..... - add v28.4S, v27.4S, v13.4S // ............................................e................................................... - mls v11.4S, v17.4S, v8.S[0] // ..................................................................................*............. - // gap // ................................................................................................ - // gap // ................................................................................................ - // gap // ................................................................................................ 
- // gap // ................................................................................................ - mls v10.4S, v21.4S, v8.4S // .......................................................................*........................ - sub v13.4S, v27.4S, v13.4S // ...........................................e.................................................... - // gap // ................................................................................................ - mls v24.4S, v9.4S, v8.S[0] // ..........................................e..................................................... - mls v14.4S, v16.4S, v8.4S // .....................................................................*.......................... - // gap // ................................................................................................ - // gap // ................................................................................................ - mls v18.4S, v20.4S, v8.4S // .........................................................................*...................... - // gap // ................................................................................................ - sub v9.4S, v7.4S, v30.4S // .................................e.............................................................. - add v7.4S, v7.4S, v30.4S // ..................................e............................................................. - mls v31.4S, v29.4S, v8.S[0] // ...........................................................................................*.... - // gap // ................................................................................................ - // gap // ................................................................................................ - // gap // ................................................................................................ 
- // gap // ................................................................................................ - mls v22.4S, v5.4S, v8.4S // ...........................................................................*.................... - mls v15.4S, v23.4S, v8.S[0] // ................................e............................................................... - add v12.4S, v12.4S, v19.4S // .............................e.................................................................. - str q10, [x0, #640] // .............................................................................*.................. - // gap // ................................................................................................ - str q11, [x0], #(16) // ............................................................................................*... - add v5.4S, v6.4S, v4.4S // .......................................e........................................................ - // gap // ................................................................................................ - mul v6.4S, v13.4S, v1.S[0] // .............................................e.................................................. - sqrdmulh v19.4S, v9.4S, v0.S[3] // ....................................e........................................................... - str q14, [x0, #496] // ............................................................................*................... - sub v20.4S, v7.4S, v28.4S // .....................................................e.......................................... - mul v27.4S, v9.4S, v0.S[2] // ...................................e............................................................ - add v21.4S, v7.4S, v28.4S // ......................................................e......................................... - // gap // ................................................................................................ 
- // gap // ................................................................................................ - sqrdmulh v30.4S, v13.4S, v1.S[1] // ..............................................e................................................. - str q18, [x0, #752] // ..............................................................................*................. - sub v16.4S, v12.4S, v5.4S // ................................................e............................................... - add v17.4S, v12.4S, v5.4S // .................................................e.............................................. - str q31, [x0, #368] // ...............................................................................................* - str q22, [x0, #880] // ...............................................................................*................ - // gap // ................................................................................................ - // gap // ................................................................................................ - sub v12.4S, v15.4S, v24.4S // ..........................................................e..................................... - add v18.4S, v15.4S, v24.4S // ...........................................................e.................................... + sub v14.4S, v4.4S, v16.4S // ................................................*....................................................................... + ldr q16, [x0, #400] // ...e.................................................................................................................... + sub v4.4S, v23.4S, v17.4S // ..........................................................................*............................................. + ldr q17, [x0, #16] // e....................................................................................................................... 
+ // gap // ........................................................................................................................ + sub v11.4S, v11.4S, v27.4S // ..........................................................................................................*............. + mls v18.4S, v10.4S, v8.S[0] // ...................................................................................................*.................... + ldr q23, [x0, #144] // .e...................................................................................................................... + ldr q27, [x0, #784] // ......e................................................................................................................. + sub v6.4S, v28.4S, v6.4S // ..............................................................................*......................................... + ldr q21, [x0, #272] // ..e..................................................................................................................... + sub v10.4S, v22.4S, v7.4S // ..............................................................................................................*......... + ldr q28, [x0, #912] // .......e................................................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + cmge v7.4S, v12.4S, v30.4S // .....................................................................................................*.................. + mls v9.4S, v20.4S, v8.S[0] // ...................................................................*.................................................... 
+ // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + ldr q20, [x0, #656] // .....e.................................................................................................................. + ldr q22, [x0, #528] // ....e................................................................................................................... + mls v19.4S, v4.4S, v29.4S // ...........................................................................*............................................ + mls v15.4S, v11.4S, v29.4S // ...........................................................................................................*............ + // gap // ........................................................................................................................ + mls v13.4S, v10.4S, v29.4S // ...............................................................................................................*........ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + sqrdmulh v11.4S, v14.4S, v0.S[1] // ...................................................*.................................................................... + mul v14.4S, v14.4S, v0.S[0] // ..................................................*..................................................................... + // gap // ........................................................................................................................ 
+ sub v24.4S, v24.4S, v7.4S // ......................................................................................................*................. + add v4.4S, v17.4S, v23.4S // .........e.............................................................................................................. + mls v5.4S, v6.4S, v29.4S // ...............................................................................*........................................ + sub v23.4S, v17.4S, v23.4S // ........e............................................................................................................... + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + str q15, [x0, #128] // .....................................................................................................................*.. + add v17.4S, v27.4S, v28.4S // ........................e............................................................................................... + str q19, [x0, #640] // .....................................................................................*.................................. + sub v19.4S, v27.4S, v28.4S // .......................e................................................................................................ 
+ sub v28.4S, v21.4S, v16.4S // .............e.......................................................................................................... + add v15.4S, v21.4S, v16.4S // ..............e......................................................................................................... + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + sqrdmulh v21.4S, v23.4S, v1.S[3] // ...........e............................................................................................................ + mul v27.4S, v23.4S, v1.S[2] // ..........e............................................................................................................. + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + add v23.4S, v22.4S, v20.4S // ...................e.................................................................................................... + sub v10.4S, v22.4S, v20.4S // ..................e..................................................................................................... + sqrdmulh v20.4S, v19.4S, v3.S[1] // ..........................e............................................................................................. 
+ sub v6.4S, v4.4S, v15.4S // ............................e........................................................................................... + str q13, [x0, #256] // ......................................................................................................................*. + mul v13.4S, v28.4S, v2.S[0] // ...............e........................................................................................................ + sqrdmulh v7.4S, v28.4S, v2.S[1] // ................e....................................................................................................... + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + mul v28.4S, v10.4S, v2.S[2] // ....................e................................................................................................... + mul v22.4S, v19.4S, v3.S[0] // .........................e.............................................................................................. + sqrdmulh v10.4S, v10.4S, v2.S[3] // .....................e.................................................................................................. + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ 
+ add v16.4S, v23.4S, v17.4S // .......................................e................................................................................ + // gap // ........................................................................................................................ + add v4.4S, v4.4S, v15.4S // .............................e.......................................................................................... + mul v15.4S, v6.4S, v0.S[2] // ..............................e......................................................................................... + sqrdmulh v6.4S, v6.4S, v0.S[3] // ...............................e........................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + sub v23.4S, v23.4S, v17.4S // ......................................e................................................................................. + mls v27.4S, v21.4S, v8.S[0] // ............e........................................................................................................... + // gap // ........................................................................................................................ + mls v14.4S, v11.4S, v8.S[0] // ....................................................*................................................................... 
+ // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + cmge v21.4S, v31.4S, v9.4S // ................................................................................*....................................... + // gap // ........................................................................................................................ + mls v13.4S, v7.4S, v8.S[0] // .................e...................................................................................................... + mls v22.4S, v20.4S, v8.S[0] // ...........................e............................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + sqrdmulh v20.4S, v23.4S, v1.S[1] // .........................................e.............................................................................. + mul v17.4S, v23.4S, v1.S[0] // ........................................e............................................................................... + mls v28.4S, v10.4S, v8.S[0] // ......................e................................................................................................. 
+ mls v12.4S, v24.4S, v29.4S // .......................................................................................................*................ + mls v15.4S, v6.4S, v8.S[0] // ................................e....................................................................................... + cmge v6.4S, v9.4S, v30.4S // .................................................................................*...................................... + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + add v11.4S, v4.4S, v16.4S // .................................................e...................................................................... + // gap // ........................................................................................................................ + add v10.4S, v27.4S, v13.4S // ..................................e..................................................................................... + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + cmge v24.4S, v14.4S, v30.4S // .....................................................................*.................................................. 
+ cmge v7.4S, v31.4S, v14.4S // ....................................................................*................................................... + sub v27.4S, v27.4S, v13.4S // .................................e...................................................................................... + sub v13.4S, v28.4S, v22.4S // ...........................................e............................................................................ + // gap // ........................................................................................................................ + add v19.4S, v28.4S, v22.4S // ............................................e........................................................................... + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + sqrdmulh v23.4S, v11.4S, v26.4S // .........................................................................................e.............................. + mls v17.4S, v20.4S, v8.S[0] // ..........................................e............................................................................. + str q12, [x0], #(16) // ....................................................................................................................*... + mul v12.4S, v11.4S, v25.4S // ........................................................................................e............................... + sqrdmulh v28.4S, v27.4S, v0.S[3] // ....................................e................................................................................... 
+ mul v22.4S, v27.4S, v0.S[2] // ...................................e.................................................................................... + sub v27.4S, v21.4S, v6.4S // ..................................................................................*..................................... + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + str q5, [x0, #752] // ......................................................................................*................................. + mul v20.4S, v13.4S, v1.S[0] // .............................................e.......................................................................... + sqrdmulh v6.4S, v13.4S, v1.S[1] // ..............................................e......................................................................... + // gap // ........................................................................................................................ + add v13.4S, v10.4S, v19.4S // ......................................................e................................................................. + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + sub v19.4S, v10.4S, v19.4S // .....................................................e.................................................................. 
+ cmge v10.4S, v18.4S, v30.4S // .................................................................................................................*...... + cmge v11.4S, v31.4S, v18.4S // ................................................................................................................*....... + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + add v5.4S, v15.4S, v17.4S // ...........................................................e............................................................ + sub v21.4S, v15.4S, v17.4S // ..........................................................e............................................................. + sqrdmulh v17.4S, v19.4S, v0.S[1] // ........................................................e............................................................... + mul v15.4S, v13.4S, v25.4S // ...........................................................................................e............................ + mls v22.4S, v28.4S, v8.S[0] // .....................................e.................................................................................. + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ 
+ // gap // ........................................................................................................................ + sqrdmulh v28.4S, v13.4S, v26.4S // ............................................................................................e........................... + // gap // ........................................................................................................................ + mul v19.4S, v19.4S, v0.S[0] // .......................................................e................................................................ + // gap // ........................................................................................................................ + sub v11.4S, v11.4S, v10.4S // ..................................................................................................................*..... + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + mul v13.4S, v5.4S, v25.4S // ..............................................................................................e......................... + mls v20.4S, v6.4S, v8.S[0] // ...............................................e........................................................................ + mls v12.4S, v23.4S, v8.S[0] // ..........................................................................................e............................. + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ 
+ // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + sqrdmulh v10.4S, v5.4S, v26.4S // ...............................................................................................e........................ + sqrdmulh v23.4S, v21.4S, v0.S[1] // .............................................................e.......................................................... + mul v5.4S, v21.4S, v0.S[0] // ............................................................e........................................................... + mls v9.4S, v27.4S, v29.4S // ...................................................................................*.................................... + mls v15.4S, v28.4S, v8.S[0] // .............................................................................................e.......................... + sub v27.4S, v7.4S, v24.4S // ......................................................................*................................................. + mls v18.4S, v11.4S, v29.4S // ...................................................................................................................*.... + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ 
+ // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + sub v7.4S, v22.4S, v20.4S // ...............................................................e........................................................ + add v6.4S, v22.4S, v20.4S // ................................................................e....................................................... + mls v19.4S, v17.4S, v8.S[0] // .........................................................e.............................................................. + mls v5.4S, v23.4S, v8.S[0] // ..............................................................e......................................................... + // gap // ........................................................................................................................ + mls v14.4S, v27.4S, v29.4S // .......................................................................*................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ 
+ cmge v24.4S, v31.4S, v12.4S // ....................................................................................................e................... + mls v13.4S, v10.4S, v8.S[0] // ................................................................................................e....................... + str q9, [x0, #880] // .......................................................................................*................................ + str q18, [x0, #368] // .......................................................................................................................* + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + sqrdmulh v10.4S, v6.4S, v26.4S // ..................................................................................................e..................... + mul v9.4S, v7.4S, v0.S[0] // .................................................................e...................................................... + sqrdmulh v20.4S, v7.4S, v0.S[1] // ..................................................................e..................................................... + mul v18.4S, v6.4S, v25.4S // .................................................................................................e...................... + // gap // ........................................................................................................................ + cmge v11.4S, v31.4S, v15.4S // ........................................................................................................e............... + // gap // ........................................................................................................................ 
+ cmge v17.4S, v19.4S, v30.4S // .........................................................................e.............................................. + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + cmge v27.4S, v15.4S, v30.4S // .........................................................................................................e.............. + cmge v23.4S, v31.4S, v19.4S // ........................................................................e............................................... + cmge v6.4S, v5.4S, v30.4S // .............................................................................e.......................................... + str q14, [x0, #496] // ....................................................................................*................................... + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + cmge v7.4S, v13.4S, v30.4S // .............................................................................................................e.......... + cmge v28.4S, v31.4S, v5.4S // ............................................................................e........................................... + cmge v22.4S, v31.4S, v13.4S // ............................................................................................................e........... 
// original source code - // ldr q9, [x0, #0] // .............e.................................................................................|.............e............................................................................... - // ldr q10, [x0, #(1*(1024/8))] // ..............e................................................................................|..............e.............................................................................. - // ldr q11, [x0, #(2*(1024/8))] // .....e.........................................................................................|.....e....................................................................................... - // ldr q12, [x0, #(3*(1024/8))] // .e.............................................................................................|.e........................................................................................... - // ldr q13, [x0, #(4*(1024/8))] // ........e......................................................................................|........e.................................................................................... - // ldr q14, [x0, #(5*(1024/8))] // e..............................................................................................|e............................................................................................ - // ldr q15, [x0, #(6*(1024/8))] // ..........e....................................................................................|..........e.................................................................................. - // ldr q16, [x0, #(7*(1024/8))] // .........e.....................................................................................|.........e................................................................................... 
- // sub v24.4s, v9.4s, v10.4s // ...............................e...............................................................|...............................e............................................................. - // add v9.4s, v9.4s, v10.4s // ................................e..............................................................|................................e............................................................ - // mul v10.4s, v24.4s, v1.s[2] // ..............................................e................................................|..............................................e.............................................. - // sqrdmulh v24.4s, v24.4s, v1.s[3] // ............................................e..................................................|............................................e................................................ - // mls v10.4s, v24.4s, v8.s[0] // ...........................................................e...................................|...........................................................e................................. - // sub v24.4s, v11.4s, v12.4s // .................................e.............................................................|.................................e........................................................... - // add v11.4s, v11.4s, v12.4s // .......................e.......................................................................|.......................e..................................................................... - // mul v12.4s, v24.4s, v2.s[0] // .............................................e.................................................|.............................................e............................................... 
- // sqrdmulh v24.4s, v24.4s, v2.s[1] // ...............................................e...............................................|...............................................e............................................. - // mls v12.4s, v24.4s, v8.s[0] // .........................................................e.....................................|.........................................................e................................... - // sub v24.4s, v13.4s, v14.4s // ..............................e................................................................|..............................e.............................................................. - // add v13.4s, v13.4s, v14.4s // .............................e.................................................................|.............................e............................................................... - // mul v14.4s, v24.4s, v2.s[2] // ....................................e..........................................................|....................................e........................................................ - // sqrdmulh v24.4s, v24.4s, v2.s[3] // .........................................e.....................................................|.........................................e................................................... - // mls v14.4s, v24.4s, v8.s[0] // ....................................................e..........................................|....................................................e........................................ - // sub v24.4s, v15.4s, v16.4s // ...........................e...................................................................|...........................e................................................................. 
- // add v15.4s, v15.4s, v16.4s // ............................e..................................................................|............................e................................................................ - // mul v16.4s, v24.4s, v3.s[0] // ......................................e........................................................|......................................e...................................................... - // sqrdmulh v24.4s, v24.4s, v3.s[1] // .......................................e.......................................................|.......................................e..................................................... - // mls v16.4s, v24.4s, v8.s[0] // ......................................................e........................................|......................................................e...................................... - // sub v24.4s, v9.4s, v11.4s // ........................................e......................................................|........................................e.................................................... - // add v9.4s, v9.4s, v11.4s // .............................................................................e.................|.............................................................................e............... - // mul v11.4s, v24.4s, v0.s[2] // ............................................................e..................................|............................................................e................................ - // sqrdmulh v24.4s, v24.4s, v0.s[3] // ..........................................................e....................................|..........................................................e.................................. 
- // mls v11.4s, v24.4s, v8.s[0] // ............................................................................e..................|............................................................................e................ - // sub v24.4s, v10.4s, v12.4s // ........................................................................e......................|........................................................................e.................... - // add v10.4s, v10.4s, v12.4s // .........................................................................e.....................|.........................................................................e................... - // mul v12.4s, v24.4s, v0.s[2] // .....................................................................................e.........|.....................................................................................e....... - // sqrdmulh v24.4s, v24.4s, v0.s[3] // ..................................................................................e............|..................................................................................e.......... - // mls v12.4s, v24.4s, v8.s[0] // ...........*...................................................................................|...........*................................................................................. - // sub v24.4s, v13.4s, v15.4s // ...................................e...........................................................|...................................e......................................................... - // add v13.4s, v13.4s, v15.4s // ................................................................................e..............|................................................................................e............ 
- // mul v15.4s, v24.4s, v1.s[0] // .....................................................e.........................................|.....................................................e....................................... - // sqrdmulh v24.4s, v24.4s, v1.s[1] // ................................................e..............................................|................................................e............................................ - // mls v15.4s, v24.4s, v8.s[0] // .....................................................................e.........................|.....................................................................e....................... - // sub v24.4s, v14.4s, v16.4s // ....................................................................e..........................|....................................................................e........................ - // add v14.4s, v14.4s, v16.4s // .................................................................e.............................|.................................................................e........................... - // mul v16.4s, v24.4s, v1.s[0] // .................................................................................e.............|.................................................................................e........... - // sqrdmulh v24.4s, v24.4s, v1.s[1] // .......................................................................................e.......|.......................................................................................e..... - // mls v16.4s, v24.4s, v8.s[0] // .......*.......................................................................................|.......*..................................................................................... 
- // sub v24.4s, v9.4s, v13.4s // .........................................................................................e.....|.........................................................................................e... - // add v9.4s, v9.4s, v13.4s // ..........................................................................................e....|..........................................................................................e.. - // mul v13.4s, v24.4s, v0.s[0] // ......*........................................................................................|......*...................................................................................... - // sqrdmulh v24.4s, v24.4s, v0.s[1] // ............*..................................................................................|............*................................................................................ - // mls v13.4s, v24.4s, v8.s[0] // ........................*......................................................................|........................*.................................................................... - // sub v24.4s, v10.4s, v14.4s // ....................................................................................e..........|....................................................................................e........ - // add v10.4s, v10.4s, v14.4s // ......................................................................................e........|......................................................................................e...... - // mul v14.4s, v24.4s, v0.s[0] // ....*..........................................................................................|....*........................................................................................ 
- // sqrdmulh v24.4s, v24.4s, v0.s[1] // ..*............................................................................................|..*.......................................................................................... - // mls v14.4s, v24.4s, v8.s[0] // ......................*........................................................................|......................*...................................................................... - // sub v24.4s, v11.4s, v15.4s // .............................................................................................e.|............................................................................................. - // add v11.4s, v11.4s, v15.4s // ..............................................................................................e|............................................................................................. - // mul v15.4s, v24.4s, v0.s[0] // ...................*...........................................................................|...................*......................................................................... - // sqrdmulh v24.4s, v24.4s, v0.s[1] // .................*.............................................................................|.................*........................................................................... - // mls v15.4s, v24.4s, v8.s[0] // ...........................................*...................................................|...........................................*................................................. - // sub v24.4s, v12.4s, v16.4s // .........................*.....................................................................|.........................*................................................................... 
- // add v12.4s, v12.4s, v16.4s // ..........................*....................................................................|..........................*.................................................................. - // mul v16.4s, v24.4s, v0.s[0] // ..................................*............................................................|..................................*.......................................................... - // sqrdmulh v24.4s, v24.4s, v0.s[1] // .....................................*.........................................................|.....................................*....................................................... - // mls v16.4s, v24.4s, v8.s[0] // .................................................*.............................................|.................................................*........................................... - // srshr v24.4S, v13.4S, #23 // ..................................................*............................................|..................................................*.......................................... - // mls v13.4s, v24.4s, v8.4s // ......................................................................*........................|......................................................................*...................... - // srshr v24.4S, v14.4S, #23 // .......................................................*.......................................|.......................................................*..................................... - // mls v14.4s, v24.4s, v8.4s // ...................................................................*...........................|...................................................................*......................... 
- // srshr v24.4S, v15.4S, #23 // ........................................................*......................................|........................................................*.................................... - // mls v15.4s, v24.4s, v8.4s // .......................................................................*.......................|.......................................................................*..................... - // srshr v24.4S, v16.4S, #23 // .............................................................*.................................|.............................................................*............................... - // mls v16.4s, v24.4s, v8.4s // ...........................................................................*...................|...........................................................................*................. - // str q13, [x0, #(4*(1024/8))] // ...................................................................................*...........|...................................................................................*......... - // str q14, [x0, #(5*(1024/8))] // ..............................................................................*................|..............................................................................*.............. - // str q15, [x0, #(6*(1024/8))] // ........................................................................................*......|........................................................................................*.... 
- // str q16, [x0, #(7*(1024/8))] // ............................................................................................*..|............................................................................................* - // mul v13.4s, v9.4s, v25.4s // ...............*...............................................................................|...............*............................................................................. - // sqrdmulh v9.4s, v9.4s, v26.4s // .....................*.........................................................................|.....................*....................................................................... - // mls v13.4s, v9.4s, v8.s[0] // ..................................................................*............................|..................................................................*.......................... - // mul v14.4s, v10.4s, v25.4s // ...*...........................................................................................|...*......................................................................................... - // sqrdmulh v10.4s, v10.4s, v26.4s // ...............................................................................................*............................................................................................. - // mls v14.4s, v10.4s, v8.s[0] // ....................*..........................................................................|....................*........................................................................ - // mul v15.4s, v11.4s, v25.4s // ................*..............................................................................|................*............................................................................ 
- // sqrdmulh v11.4s, v11.4s, v26.4s // ..................*............................................................................|..................*.......................................................................... - // mls v15.4s, v11.4s, v8.s[0] // ...................................................*...........................................|...................................................*......................................... - // mul v16.4s, v12.4s, v25.4s // ...............................................................*...............................|...............................................................*............................. - // sqrdmulh v12.4s, v12.4s, v26.4s // ................................................................*..............................|................................................................*............................ - // mls v16.4s, v12.4s, v8.s[0] // ..........................................................................*....................|..........................................................................*.................. - // str q13, [x0], #(16) // ...............................................................................*...............|...............................................................................*............. - // str q14, [x0, #(-16 + 1*(1024/8))] // ..........................................*....................................................|..........................................*.................................................. - // str q15, [x0, #(-16 + 2*(1024/8))] // ..............................................................*................................|..............................................................*.............................. 
- // str q16, [x0, #(-16 + 3*(1024/8))] // ...........................................................................................*...|...........................................................................................*. + // ldr q9, [x0, #0] // ..e....................................................................................................................|..e................................................................................................................. + // ldr q10, [x0, #(1*(1024/8))] // .....e.................................................................................................................|.....e.............................................................................................................. + // ldr q11, [x0, #(2*(1024/8))] // ........e..............................................................................................................|........e........................................................................................................... + // ldr q12, [x0, #(3*(1024/8))] // e......................................................................................................................|e................................................................................................................... + // ldr q13, [x0, #(4*(1024/8))] // ..............e........................................................................................................|..............e..................................................................................................... + // ldr q14, [x0, #(5*(1024/8))] // .............e.........................................................................................................|.............e...................................................................................................... 
+ // ldr q15, [x0, #(6*(1024/8))] // ......e................................................................................................................|......e............................................................................................................. + // ldr q16, [x0, #(7*(1024/8))] // ..........e............................................................................................................|..........e......................................................................................................... + // sub v24.4s, v9.4s, v10.4s // .......................e...............................................................................................|.......................e............................................................................................ + // add v9.4s, v9.4s, v10.4s // .....................e.................................................................................................|.....................e.............................................................................................. + // mul v10.4s, v24.4s, v1.s[2] // ...............................e.......................................................................................|...............................e.................................................................................... + // sqrdmulh v24.4s, v24.4s, v1.s[3] // ..............................e........................................................................................|..............................e..................................................................................... + // mls v10.4s, v24.4s, v8.s[0] // ...............................................e.......................................................................|...............................................e.................................................................... 
+ // sub v24.4s, v11.4s, v12.4s // ............................e..........................................................................................|............................e....................................................................................... + // add v11.4s, v11.4s, v12.4s // .............................e.........................................................................................|.............................e...................................................................................... + // mul v12.4s, v24.4s, v2.s[0] // .....................................e.................................................................................|.....................................e.............................................................................. + // sqrdmulh v24.4s, v24.4s, v2.s[1] // ......................................e................................................................................|......................................e............................................................................. + // mls v12.4s, v24.4s, v8.s[0] // ..................................................e....................................................................|..................................................e................................................................. + // sub v24.4s, v13.4s, v14.4s // .................................e.....................................................................................|.................................e.................................................................................. + // add v13.4s, v13.4s, v14.4s // ................................e......................................................................................|................................e................................................................................... 
+ // mul v14.4s, v24.4s, v2.s[2] // .......................................e...............................................................................|.......................................e............................................................................ + // sqrdmulh v24.4s, v24.4s, v2.s[3] // .........................................e.............................................................................|.........................................e.......................................................................... + // mls v14.4s, v24.4s, v8.s[0] // ......................................................e................................................................|......................................................e............................................................. + // sub v24.4s, v15.4s, v16.4s // ...........................e...........................................................................................|...........................e........................................................................................ + // add v15.4s, v15.4s, v16.4s // .........................e.............................................................................................|.........................e.......................................................................................... + // mul v16.4s, v24.4s, v3.s[0] // ........................................e..............................................................................|........................................e........................................................................... + // sqrdmulh v24.4s, v24.4s, v3.s[1] // ..................................e....................................................................................|..................................e................................................................................. 
+ // mls v16.4s, v24.4s, v8.s[0] // ...................................................e...................................................................|...................................................e................................................................ + // sub v24.4s, v9.4s, v11.4s // ...................................e...................................................................................|...................................e................................................................................ + // add v9.4s, v9.4s, v11.4s // ...........................................e...........................................................................|...........................................e........................................................................ + // mul v11.4s, v24.4s, v0.s[2] // ............................................e..........................................................................|............................................e....................................................................... + // sqrdmulh v24.4s, v24.4s, v0.s[3] // .............................................e.........................................................................|.............................................e...................................................................... + // mls v11.4s, v24.4s, v8.s[0] // ........................................................e..............................................................|........................................................e........................................................... + // sub v24.4s, v10.4s, v12.4s // ..............................................................e........................................................|..............................................................e..................................................... 
+ // add v10.4s, v10.4s, v12.4s // ...........................................................e...........................................................|...........................................................e........................................................ + // mul v12.4s, v24.4s, v0.s[2] // ......................................................................e................................................|......................................................................e............................................. + // sqrdmulh v24.4s, v24.4s, v0.s[3] // .....................................................................e.................................................|.....................................................................e.............................................. + // mls v12.4s, v24.4s, v8.s[0] // ...................................................................................e...................................|...................................................................................e................................ + // sub v24.4s, v13.4s, v15.4s // ..............................................e........................................................................|..............................................e..................................................................... + // add v13.4s, v13.4s, v15.4s // ..........................................e............................................................................|..........................................e......................................................................... + // mul v15.4s, v24.4s, v1.s[0] // .....................................................e.................................................................|.....................................................e.............................................................. 
+ // sqrdmulh v24.4s, v24.4s, v1.s[1] // ....................................................e..................................................................|....................................................e............................................................... + // mls v15.4s, v24.4s, v8.s[0] // ..................................................................e....................................................|..................................................................e................................................. + // sub v24.4s, v14.4s, v16.4s // ...............................................................e.......................................................|...............................................................e.................................................... + // add v14.4s, v14.4s, v16.4s // ................................................................e......................................................|................................................................e................................................... + // mul v16.4s, v24.4s, v1.s[0] // .........................................................................e.............................................|.........................................................................e.......................................... + // sqrdmulh v24.4s, v24.4s, v1.s[1] // ..........................................................................e............................................|..........................................................................e......................................... + // mls v16.4s, v24.4s, v8.s[0] // ........................................................................................e..............................|........................................................................................e........................... 
+ // sub v24.4s, v9.4s, v13.4s // .......................................................................................................................*.................................................................................................................... + // add v9.4s, v9.4s, v13.4s // ..........................................................e............................................................|..........................................................e......................................................... + // mul v13.4s, v24.4s, v0.s[0] // ...................*...................................................................................................|...................*................................................................................................ + // sqrdmulh v24.4s, v24.4s, v0.s[1] // ..................*....................................................................................................|..................*................................................................................................. + // mls v13.4s, v24.4s, v8.s[0] // ................................................*......................................................................|................................................*................................................................... + // sub v24.4s, v10.4s, v14.4s // ............................................................................e..........................................|............................................................................e....................................... + // add v10.4s, v10.4s, v14.4s // ...........................................................................e...........................................|...........................................................................e........................................ 
+ // mul v14.4s, v24.4s, v0.s[0] // .....................................................................................e.................................|.....................................................................................e.............................. + // sqrdmulh v24.4s, v24.4s, v0.s[1] // .................................................................................e.....................................|.................................................................................e.................................. + // mls v14.4s, v24.4s, v8.s[0] // ...................................................................................................e...................|...................................................................................................e................ + // sub v24.4s, v11.4s, v15.4s // ................................................................................e......................................|................................................................................e................................... + // add v11.4s, v11.4s, v15.4s // ...............................................................................e.......................................|...............................................................................e.................................... + // mul v15.4s, v24.4s, v0.s[0] // ............................................................................................e..........................|............................................................................................e....................... + // sqrdmulh v24.4s, v24.4s, v0.s[1] // ...........................................................................................e...........................|...........................................................................................e........................ 
+ // mls v15.4s, v24.4s, v8.s[0] // ....................................................................................................e..................|....................................................................................................e............... + // sub v24.4s, v12.4s, v16.4s // .................................................................................................e.....................|.................................................................................................e.................. + // add v12.4s, v12.4s, v16.4s // ..................................................................................................e....................|..................................................................................................e................. + // mul v16.4s, v24.4s, v0.s[0] // ...........................................................................................................e...........|...........................................................................................................e........ + // sqrdmulh v24.4s, v24.4s, v0.s[1] // ............................................................................................................e..........|............................................................................................................e....... + // mls v16.4s, v24.4s, v8.s[0] // ............*..........................................................................................................|............*....................................................................................................... + // cmge v27.4s, v31.4s, v13.4s // .............................................................*.........................................................|.............................................................*...................................................... 
+ // cmge v28.4s, v13.4s, v30.4s // ............................................................*..........................................................|............................................................*....................................................... + // sub v28.4s, v27.4s, v28.4s // ...............................................................................................*.......................|...............................................................................................*.................... + // mls v13.4s, v28.4s, v29.4s // .....................................................................................................*.................|.....................................................................................................*.............. + // cmge v27.4s, v31.4s, v14.4s // .................................................................................................................e.....|.................................................................................................................e.. + // cmge v28.4s, v14.4s, v30.4s // ...............................................................................................................e.......|...............................................................................................................e.... + // sub v28.4s, v27.4s, v28.4s // .*.....................................................................................................................|.*.................................................................................................................. + // mls v14.4s, v28.4s, v29.4s // ...............*.......................................................................................................|...............*.................................................................................................... 
+ // cmge v27.4s, v31.4s, v15.4s // .....................................................................................................................e.|.................................................................................................................... + // cmge v28.4s, v15.4s, v30.4s // ..................................................................................................................e....|..................................................................................................................e. + // sub v28.4s, v27.4s, v28.4s // .......*...............................................................................................................|.......*............................................................................................................ + // mls v15.4s, v28.4s, v29.4s // ......................*................................................................................................|......................*............................................................................................. + // cmge v27.4s, v31.4s, v16.4s // .................................................*.....................................................................|.................................................*.................................................................. + // cmge v28.4s, v16.4s, v30.4s // .........................................................*.............................................................|.........................................................*.......................................................... + // sub v28.4s, v27.4s, v28.4s // .......................................................................*...............................................|.......................................................................*............................................ 
+ // mls v16.4s, v28.4s, v29.4s // .............................................................................................*.........................|.............................................................................................*...................... + // str q13, [x0, #(4*(1024/8))] // ...................................................................................................................*...|...................................................................................................................* + // str q14, [x0, #(5*(1024/8))] // ..........................*............................................................................................|..........................*......................................................................................... + // str q15, [x0, #(6*(1024/8))] // ........................................................................*..............................................|........................................................................*........................................... + // str q16, [x0, #(7*(1024/8))] // ........................................................................................................*..............|........................................................................................................*........... + // mul v13.4s, v9.4s, v25.4s // ....................................................................e..................................................|....................................................................e............................................... + // sqrdmulh v9.4s, v9.4s, v26.4s // .................................................................e.....................................................|.................................................................e.................................................. 
+ // mls v13.4s, v9.4s, v8.s[0] // .........................................................................................e.............................|.........................................................................................e.......................... + // mul v14.4s, v10.4s, v25.4s // ..................................................................................e....................................|..................................................................................e................................. + // sqrdmulh v10.4s, v10.4s, v26.4s // ....................................................................................e..................................|....................................................................................e............................... + // mls v14.4s, v10.4s, v8.s[0] // ..............................................................................................e........................|..............................................................................................e..................... + // mul v15.4s, v11.4s, v25.4s // .......................................................................................e...............................|.......................................................................................e............................ + // sqrdmulh v11.4s, v11.4s, v26.4s // ..........................................................................................e............................|..........................................................................................e......................... + // mls v15.4s, v11.4s, v8.s[0] // .......................................................................................................e...............|.......................................................................................................e............ 
+ // mul v16.4s, v12.4s, v25.4s // .............................................................................................................e.........|.............................................................................................................e...... + // sqrdmulh v12.4s, v12.4s, v26.4s // ..........................................................................................................e............|..........................................................................................................e......... + // mls v16.4s, v12.4s, v8.s[0] // ....*..................................................................................................................|....*............................................................................................................... + // cmge v27.4s, v31.4s, v13.4s // ......................................................................................................e................|......................................................................................................e............. + // cmge v28.4s, v13.4s, v30.4s // ...........*...........................................................................................................|...........*........................................................................................................ + // sub v28.4s, v27.4s, v28.4s // ....................*..................................................................................................|....................*............................................................................................... + // mls v13.4s, v28.4s, v29.4s // .......................................................*...............................................................|.......................................................*............................................................ 
+ // cmge v27.4s, v31.4s, v14.4s // ..............................................................................................................e........|..............................................................................................................e..... + // cmge v28.4s, v14.4s, v30.4s // ................................................................................................................e......|................................................................................................................e... + // sub v28.4s, v27.4s, v28.4s // ...*...................................................................................................................|...*................................................................................................................ + // mls v14.4s, v28.4s, v29.4s // ................*......................................................................................................|................*................................................................................................... + // cmge v27.4s, v31.4s, v15.4s // ......................................................................................................................e|.................................................................................................................... + // cmge v28.4s, v15.4s, v30.4s // ....................................................................................................................e..|.................................................................................................................... + // sub v28.4s, v27.4s, v28.4s // .........*.............................................................................................................|.........*.......................................................................................................... 
+ // mls v15.4s, v28.4s, v29.4s // .................*.....................................................................................................|.................*.................................................................................................. + // cmge v27.4s, v31.4s, v16.4s // ..............................................................................*........................................|..............................................................................*..................................... + // cmge v28.4s, v16.4s, v30.4s // .............................................................................*.........................................|.............................................................................*...................................... + // sub v28.4s, v27.4s, v28.4s // ......................................................................................*................................|......................................................................................*............................. + // mls v16.4s, v28.4s, v29.4s // ................................................................................................*......................|................................................................................................*................... + // str q13, [x0], #(16) // ...................................................................*...................................................|...................................................................*................................................ + // str q14, [x0, #(-16 + 1*(1024/8))] // ........................*..............................................................................................|........................*........................................................................................... 
+ // str q15, [x0, #(-16 + 2*(1024/8))] // ....................................*..................................................................................|....................................*............................................................................... + // str q16, [x0, #(-16 + 3*(1024/8))] // .........................................................................................................*.............|.........................................................................................................*.......... sub count, count, #1 cbnz count, layer123_start - sqrdmulh v9.4S, v18.4S, v26.4S // ...........*................................ - mls v6.4S, v30.4S, v8.S[0] // .....*...................................... - mls v27.4S, v19.4S, v8.S[0] // ......*..................................... - // gap // ............................................ - // gap // ............................................ - // gap // ............................................ - // gap // ............................................ - // gap // ............................................ - sqrdmulh v30.4S, v17.4S, v26.4S // ..............*............................. - // gap // ............................................ - mul v5.4S, v20.4S, v0.S[0] // ...*........................................ - sqrdmulh v31.4S, v20.4S, v0.S[1] // .*.......................................... - mul v11.4S, v16.4S, v0.S[0] // ....*....................................... - // gap // ............................................ - // gap // ............................................ - // gap // ............................................ - sqrdmulh v24.4S, v16.4S, v0.S[1] // .......*.................................... - // gap // ............................................ - // gap // ............................................ - // gap // ............................................ 
- mul v23.4S, v18.4S, v25.4S // .........*.................................. - // gap // ............................................ - // gap // ............................................ - // gap // ............................................ - // gap // ............................................ - sub v16.4S, v27.4S, v6.4S // .................*.......................... - mul v29.4S, v12.4S, v0.S[0] // ............*............................... - // gap // ............................................ - // gap // ............................................ - // gap // ............................................ - // gap // ............................................ - // gap // ............................................ - add v13.4S, v27.4S, v6.4S // ..................*......................... - sqrdmulh v14.4S, v12.4S, v0.S[1] // ..........*................................. - mls v5.4S, v31.4S, v8.S[0] // ...............*............................ - // gap // ............................................ - // gap // ............................................ - // gap // ............................................ - // gap // ............................................ - // gap // ............................................ - mul v27.4S, v16.4S, v0.S[0] // ...................*........................ - // gap // ............................................ - // gap // ............................................ - sqrdmulh v31.4S, v16.4S, v0.S[1] // ....................*....................... - // gap // ............................................ - // gap // ............................................ - // gap // ............................................ - // gap // ............................................ - mul v12.4S, v13.4S, v25.4S // ..............................*............. - sqrdmulh v20.4S, v13.4S, v26.4S // ...............................*............ 
- mul v4.4S, v17.4S, v25.4S // ........*................................... - // gap // ............................................ - // gap // ............................................ - // gap // ............................................ - // gap // ............................................ - // gap // ............................................ - mls v11.4S, v24.4S, v8.S[0] // ................*........................... - mls v29.4S, v14.4S, v8.S[0] // ......................*..................... - srshr v13.4S, v5.4S, #23 // ..........................*................. - // gap // ............................................ - // gap // ............................................ - // gap // ............................................ - // gap // ............................................ - // gap // ............................................ - mls v27.4S, v31.4S, v8.S[0] // .......................*.................... - mul v31.4S, v21.4S, v25.4S // ..*......................................... - mls v23.4S, v9.4S, v8.S[0] // .........................*.................. - sqrdmulh v9.4S, v21.4S, v26.4S // *........................................... - // gap // ............................................ - // gap // ............................................ - // gap // ............................................ - // gap // ............................................ - mls v4.4S, v30.4S, v8.S[0] // ................................*........... - // gap // ............................................ - // gap // ............................................ - // gap // ............................................ - // gap // ............................................ - // gap // ............................................ - // gap // ............................................ - // gap // ............................................ - mls v5.4S, v13.4S, v8.4S // .................................*.......... 
- srshr v13.4S, v11.4S, #23 // ........................*................... - srshr v30.4S, v29.4S, #23 // ...........................*................ - // gap // ............................................ - // gap // ............................................ - // gap // ............................................ - // gap // ............................................ - // gap // ............................................ - mls v12.4S, v20.4S, v8.S[0] // ....................................*....... - mls v31.4S, v9.4S, v8.S[0] // .............*.............................. - str q23, [x0, #256] // .............................*.............. - srshr v6.4S, v27.4S, #23 // ............................*............... - // gap // ............................................ - // gap // ............................................ - // gap // ............................................ - // gap // ............................................ - str q4, [x0], #(16) // .......................................*.... - // gap // ............................................ - // gap // ............................................ - // gap // ............................................ - // gap // ............................................ - // gap // ............................................ - // gap // ............................................ - // gap // ............................................ - mls v11.4S, v13.4S, v8.4S // ..................................*......... - mls v29.4S, v30.4S, v8.4S // ...................................*........ - str q5, [x0, #624] // ......................................*..... - // gap // ............................................ - // gap // ............................................ - // gap // ............................................ - // gap // ............................................ - // gap // ............................................ 
- mls v27.4S, v6.4S, v8.4S // .....................................*...... - str q31, [x0, #112] // .....................*...................... - // gap // ............................................ - // gap // ............................................ - // gap // ............................................ - // gap // ............................................ - // gap // ............................................ - // gap // ............................................ - str q12, [x0, #368] // ..........................................*. - // gap // ............................................ - // gap // ............................................ - // gap // ............................................ - // gap // ............................................ - // gap // ............................................ - // gap // ............................................ - // gap // ............................................ - str q11, [x0, #496] // ........................................*... - str q29, [x0, #752] // .........................................*.. - // gap // ............................................ - // gap // ............................................ - // gap // ............................................ - // gap // ............................................ - // gap // ............................................ - // gap // ............................................ - str q27, [x0, #880] // ...........................................* - // gap // ............................................ - // gap // ............................................ - // gap // ............................................ - // gap // ............................................ - // gap // ............................................ - // gap // ............................................ - // gap // ............................................ + sub v16.4S, v4.4S, v16.4S // *.................................... 
+ mls v18.4S, v10.4S, v8.S[0] // ...*................................. + cmge v14.4S, v12.4S, v30.4S // ......*.............................. + // gap // ..................................... + // gap // ..................................... + // gap // ..................................... + // gap // ..................................... + // gap // ..................................... + sub v17.4S, v23.4S, v17.4S // .*................................... + sub v10.4S, v28.4S, v6.4S // ....*................................ + sub v7.4S, v22.4S, v7.4S // .....*............................... + // gap // ..................................... + // gap // ..................................... + // gap // ..................................... + // gap // ..................................... + // gap // ..................................... + sub v14.4S, v24.4S, v14.4S // .............*....................... + sqrdmulh v24.4S, v16.4S, v0.S[1] // ...........*......................... + mul v6.4S, v16.4S, v0.S[0] // ............*........................ + // gap // ..................................... + // gap // ..................................... + // gap // ..................................... + // gap // ..................................... + // gap // ..................................... + mls v5.4S, v10.4S, v29.4S // ..............*...................... + sub v10.4S, v11.4S, v27.4S // ..*.................................. + mls v9.4S, v20.4S, v8.S[0] // .......*............................. + mls v19.4S, v17.4S, v29.4S // ........*............................ + // gap // ..................................... + // gap // ..................................... + // gap // ..................................... + // gap // ..................................... + mls v12.4S, v14.4S, v29.4S // ....................*................ + mls v13.4S, v7.4S, v29.4S // ..........*.......................... 
+ cmge v7.4S, v18.4S, v30.4S // ...........................*......... + cmge v14.4S, v31.4S, v18.4S // ............................*........ + // gap // ..................................... + // gap // ..................................... + // gap // ..................................... + // gap // ..................................... + mls v15.4S, v10.4S, v29.4S // .........*........................... + mls v6.4S, v24.4S, v8.S[0] // ..................*.................. + // gap // ..................................... + // gap // ..................................... + // gap // ..................................... + // gap // ..................................... + // gap // ..................................... + // gap // ..................................... + str q5, [x0, #768] // ..........................*.......... + str q19, [x0, #640] // ................*.................... + sub v14.4S, v14.4S, v7.4S // .............................*....... + cmge v7.4S, v31.4S, v9.4S // ...................*................. + cmge v19.4S, v9.4S, v30.4S // .....................*............... + // gap // ..................................... + // gap // ..................................... + // gap // ..................................... + str q13, [x0, #256] // .................*................... + // gap // ..................................... + // gap // ..................................... + // gap // ..................................... + // gap // ..................................... + // gap // ..................................... + // gap // ..................................... + // gap // ..................................... + mls v18.4S, v14.4S, v29.4S // ................................*.... + str q15, [x0, #128] // ...............*..................... + sub v10.4S, v7.4S, v19.4S // .........................*........... + cmge v7.4S, v6.4S, v30.4S // ......................*.............. 
+ cmge v14.4S, v31.4S, v6.4S // .......................*............. + // gap // ..................................... + // gap // ..................................... + // gap // ..................................... + str q12, [x0], #(16) // ........................*............ + // gap // ..................................... + // gap // ..................................... + // gap // ..................................... + // gap // ..................................... + // gap // ..................................... + // gap // ..................................... + // gap // ..................................... + mls v9.4S, v10.4S, v29.4S // ..............................*...... + sub v10.4S, v14.4S, v7.4S // ...............................*..... + // gap // ..................................... + // gap // ..................................... + // gap // ..................................... + // gap // ..................................... + // gap // ..................................... + // gap // ..................................... + str q18, [x0, #368] // ...................................*. + // gap // ..................................... + // gap // ..................................... + // gap // ..................................... + // gap // ..................................... + // gap // ..................................... + // gap // ..................................... + // gap // ..................................... + mls v6.4S, v10.4S, v29.4S // .................................*... + // gap // ..................................... + // gap // ..................................... + // gap // ..................................... + // gap // ..................................... + // gap // ..................................... + // gap // ..................................... + // gap // ..................................... + str q9, [x0, #880] // ..................................*.. 
+ // gap // ..................................... + // gap // ..................................... + // gap // ..................................... + // gap // ..................................... + // gap // ..................................... + // gap // ..................................... + // gap // ..................................... + // gap // ..................................... + // gap // ..................................... + // gap // ..................................... + // gap // ..................................... + // gap // ..................................... + // gap // ..................................... + // gap // ..................................... + // gap // ..................................... + str q6, [x0, #496] // ....................................* + // gap // ..................................... + // gap // ..................................... + // gap // ..................................... + // gap // ..................................... + // gap // ..................................... + // gap // ..................................... + // gap // ..................................... // original source code - // sqrdmulh v13.4S, v21.4S, v26.4S // .........................*.................. - // sqrdmulh v15.4S, v20.4S, v0.S[1] // .....*...................................... - // mul v7.4S, v21.4S, v25.4S // .......................*.................... - // mul v10.4S, v20.4S, v0.S[0] // ....*....................................... - // mul v14.4S, v16.4S, v0.S[0] // ......*..................................... - // mls v6.4S, v30.4S, v8.S[0] // .*.......................................... - // mls v27.4S, v19.4S, v8.S[0] // ..*......................................... - // sqrdmulh v4.4S, v16.4S, v0.S[1] // .......*.................................... - // mul v11.4S, v17.4S, v25.4S // ..................*......................... 
- // mul v31.4S, v18.4S, v25.4S // ........*................................... - // sqrdmulh v30.4S, v12.4S, v0.S[1] // ............*............................... - // sqrdmulh v24.4S, v18.4S, v26.4S // *........................................... - // mul v18.4S, v12.4S, v0.S[0] // ..........*................................. - // mls v7.4S, v13.4S, v8.S[0] // ...............................*............ - // sqrdmulh v17.4S, v17.4S, v26.4S // ...*........................................ - // mls v10.4S, v15.4S, v8.S[0] // .............*.............................. - // mls v14.4S, v4.4S, v8.S[0] // ...................*........................ - // sub v13.4S, v27.4S, v6.4S // .........*.................................. - // add v29.4S, v27.4S, v6.4S // ...........*................................ - // mul v22.4S, v13.4S, v0.S[0] // ..............*............................. - // sqrdmulh v16.4S, v13.4S, v0.S[1] // ...............*............................ - // str q7, [x0, #128] // .......................................*.... - // mls v18.4S, v30.4S, v8.S[0] // ....................*....................... - // mls v22.4S, v16.4S, v8.S[0] // ......................*..................... - // srshr v16.4S, v14.4S, #23 // ............................*............... - // mls v31.4S, v24.4S, v8.S[0] // ........................*................... - // srshr v21.4S, v10.4S, #23 // .....................*...................... - // srshr v20.4S, v18.4S, #23 // .............................*.............. - // srshr v5.4S, v22.4S, #23 // .................................*.......... - // str q31, [x0, #256] // ................................*........... - // mul v31.4S, v29.4S, v25.4S // ................*........................... - // sqrdmulh v29.4S, v29.4S, v26.4S // .................*.......................... - // mls v11.4S, v17.4S, v8.S[0] // ..........................*................. 
- // mls v10.4S, v21.4S, v8.4S // ...........................*................ - // mls v14.4S, v16.4S, v8.4S // ...................................*........ - // mls v18.4S, v20.4S, v8.4S // ....................................*....... - // mls v31.4S, v29.4S, v8.S[0] // ..............................*............. - // mls v22.4S, v5.4S, v8.4S // ......................................*..... - // str q10, [x0, #640] // .....................................*...... - // str q11, [x0], #(16) // ..................................*......... - // str q14, [x0, #496] // .........................................*.. - // str q18, [x0, #752] // ..........................................*. - // str q31, [x0, #368] // ........................................*... - // str q22, [x0, #880] // ...........................................* + // sub v14.4S, v4.4S, v16.4S // *.................................... + // sub v4.4S, v23.4S, v17.4S // ...*................................. + // sub v11.4S, v11.4S, v27.4S // ..........*.......................... + // mls v18.4S, v10.4S, v8.S[0] // .*................................... + // sub v6.4S, v28.4S, v6.4S // ....*................................ + // sub v10.4S, v22.4S, v7.4S // .....*............................... + // cmge v7.4S, v12.4S, v30.4S // ..*.................................. + // mls v9.4S, v20.4S, v8.S[0] // ...........*......................... + // mls v19.4S, v4.4S, v29.4S // ............*........................ + // mls v15.4S, v11.4S, v29.4S // .................*................... + // mls v13.4S, v10.4S, v29.4S // ..............*...................... + // sqrdmulh v11.4S, v14.4S, v0.S[1] // .......*............................. + // mul v14.4S, v14.4S, v0.S[0] // ........*............................ + // sub v24.4S, v24.4S, v7.4S // ......*.............................. + // mls v5.4S, v6.4S, v29.4S // .........*........................... + // str q15, [x0, #128] // ..........................*.......... 
+ // str q19, [x0, #640] // ....................*................ + // str q13, [x0, #256] // ........................*............ + // mls v14.4S, v11.4S, v8.S[0] // ..................*.................. + // cmge v21.4S, v31.4S, v9.4S // ......................*.............. + // mls v12.4S, v24.4S, v29.4S // .............*....................... + // cmge v6.4S, v9.4S, v30.4S // .......................*............. + // cmge v24.4S, v14.4S, v30.4S // ............................*........ + // cmge v7.4S, v31.4S, v14.4S // .............................*....... + // str q12, [x0], #(16) // ..............................*...... + // sub v27.4S, v21.4S, v6.4S // ...........................*......... + // str q5, [x0, #752] // ...................*................. + // cmge v10.4S, v18.4S, v30.4S // ...............*..................... + // cmge v11.4S, v31.4S, v18.4S // ................*.................... + // sub v11.4S, v11.4S, v10.4S // .....................*............... + // mls v9.4S, v27.4S, v29.4S // ...............................*..... + // sub v27.4S, v7.4S, v24.4S // ................................*.... + // mls v18.4S, v11.4S, v29.4S // .........................*........... + // mls v14.4S, v27.4S, v29.4S // ..................................*.. + // str q9, [x0, #880] // ...................................*. + // str q18, [x0, #368] // .................................*... 
+ // str q14, [x0, #496] // ....................................* pop_stack diff --git a/tests/ntt_dilithium/manual/intt_dilithium_123_45678_manual_ld4_opt_m1_icestorm.s b/tests/ntt_dilithium/manual/intt_dilithium_123_45678_manual_ld4_opt_m1_icestorm.s new file mode 100644 index 0000000..58ae551 --- /dev/null +++ b/tests/ntt_dilithium/manual/intt_dilithium_123_45678_manual_ld4_opt_m1_icestorm.s @@ -0,0 +1,2091 @@ + +// Needed to provide ASM_LOAD directive +#include + +// NOTE +// We use a lot of trivial macros to simplify the parsing burden for Slothy +// The macros are not unfolded by Slothy and thus interpreted as instructions, +// which are easier to parse due to e.g. the lack of size specifiers and simpler +// syntax for pre and post increment for loads and stores. + +// Eventually, NeLight should include a proper parser for AArch64, +// but for initial investigations, the below is enough. +xtmp0 .req x10 +xtmp1 .req x11 + +.macro ldr_vo vec, base, offset + ldr qform_\vec, [\base, #\offset] +.endm + +.macro ldr_vi vec, base, inc + ldr qform_\vec, [\base], #\inc +.endm + +.macro str_vo vec, base, offset + str qform_\vec, [\base, #\offset] +.endm +.macro str_vi vec, base, inc + str qform_\vec, [\base], #\inc +.endm +.macro vqrdmulh d,a,b + sqrdmulh \d\().4s, \a\().4s, \b\().4s +.endm +.macro vmls d,a,b + mls \d\().4s, \a\().4s, \b\().4s +.endm +.macro vqrdmulhq d,a,b,i + sqrdmulh \d\().4s, \a\().4s, \b\().s[\i] +.endm +.macro vqdmulhq d,a,b,i + sqdmulh \d\().4s, \a\().4s, \b\().4s[\i] +.endm +.macro vmulq d,a,b,i + mul \d\().4s, \a\().4s, \b\().s[\i] +.endm +.macro vmlsq d,a,b,i + mls \d\().4s, \a\().4s, \b\().s[\i] +.endm + +.macro mulmodq dst, src, const, idx0, idx1 + vmulq \dst, \src, \const, \idx0 + vqrdmulhq \src, \src, \const, \idx1 + vmlsq \dst, \src, consts, 0 +.endm + +.macro mulmod dst, src, const, const_twisted + mul \dst\().4s, \src\().4s, \const\().4s + vqrdmulh \src, \src, \const_twisted + vmlsq \dst, \src, consts, 0 +.endm + +.macro montg_reduce a 
+    srshr tmp.4S, \a\().4S, #23 // tmp = rounding arithmetic right shift of each 32-bit lane of the argument by 23
+    vmls \a, tmp, consts // argument -= tmp * consts per lane; NOTE(review): this shift-and-subtract shape looks like a Barrett-style reduction despite the macro name - confirm
+.endm
+// canonical_reduce: adds modulus to lanes of a that are <= neg_modulus_half and subtracts it from lanes that are >= modulus_half; tmp1 and tmp2 are clobbered.
+.macro canonical_reduce a, modulus_half, neg_modulus_half, tmp1, tmp2
+    cmge \tmp1\().4s, \neg_modulus_half\().4s, \a\().4s // tmp1 lane = all-ones (-1) where neg_modulus_half >= a, else 0
+    cmge \tmp2\().4s, \a\().4s, \modulus_half\().4s // tmp2 lane = all-ones (-1) where a >= modulus_half, else 0
+    sub \tmp2\().4s, \tmp1\().4s, \tmp2\().4s // tmp2 = tmp1 - tmp2: -1 for low lanes, +1 for high lanes, 0 otherwise
+    vmls \a, \tmp2, modulus // a -= tmp2 * modulus, i.e. add or subtract one modulus where flagged
+.endm
+// gs_butterfly: (a, b) <- (a + b, (a - b) times root) with the product formed by mulmodq; idx0 and idx1 are the lane indices of root handed to mulmodq (presumably twiddle and its precomputed companion - confirm). Clobbers tmp.
+.macro gs_butterfly a, b, root, idx0, idx1
+    sub tmp.4s, \a\().4s, \b\().4s // tmp = a - b, computed before a is overwritten
+    add \a\().4s, \a\().4s, \b\().4s // a = a + b
+    mulmodq \b, tmp, \root, \idx0, \idx1 // b = product of tmp and the selected root lanes; mulmodq also overwrites tmp
+.endm
+// mulmod_v: vector-operand multiply-and-reduce that subtracts against modulus. NOTE(review): vmul does not appear among the helper macros defined in this file and no use of mulmod_v is visible here - confirm before relying on it.
+.macro mulmod_v dst, src, const, const_twisted
+    vmul \dst, \src, \const
+    vqrdmulh \src, \src, \const_twisted // src is overwritten with the rounding doubling high half of src * const_twisted
+    vmls \dst, \src, modulus // dst -= src * modulus
+.endm
+// gs_butterfly_v: same butterfly as gs_butterfly, but root and root_twisted are full vectors consumed by mulmod. Clobbers tmp.
+.macro gs_butterfly_v a, b, root, root_twisted
+    sub tmp.4s, \a\().4s, \b\().4s // tmp = a - b, computed before a is overwritten
+    add \a\().4s, \a\().4s, \b\().4s // a = a + b
+    mulmod \b, tmp, \root, \root_twisted // b = product of tmp and root via mulmod; mulmod also overwrites tmp
+.endm
+// mul_ninv: dst_i = mulmod(src_i, ninv, ninv_tw) for i = 0..3; each src_i is overwritten by mulmod. Expects ninv and ninv_tw to be register aliases at the point of use.
+.macro mul_ninv dst0, dst1, dst2, dst3, src0, src1, src2, src3
+    mulmod \dst0, \src0, ninv, ninv_tw
+    mulmod \dst1, \src1, ninv, ninv_tw
+    mulmod \dst2, \src2, ninv, ninv_tw
+    mulmod \dst3, \src3, ninv, ninv_tw
+.endm
+// load_vectors: loads a0..a3 from four consecutive 16-byte slots starting at addr (ldr_vo needs a qform_ alias for each register name).
+.macro load_vectors a0, a1, a2, a3, addr
+    ldr_vo \a0, \addr, (16*0)
+    ldr_vo \a1, \addr, (16*1)
+    ldr_vo \a2, \addr, (16*2)
+    ldr_vo \a3, \addr, (16*3)
+.endm
+// load_vectors_with_offset: as load_vectors, with every slot displaced by offset bytes.
+.macro load_vectors_with_offset a0, a1, a2, a3, addr, offset
+    ldr_vo \a0, \addr, (16*0 + (\offset))
+    ldr_vo \a1, \addr, (16*1 + (\offset))
+    ldr_vo \a2, \addr, (16*2 + (\offset))
+    ldr_vo \a3, \addr, (16*3 + (\offset))
+.endm
+// store_vectors_with_inc: stores a0..a3 to four consecutive 16-byte slots at addr and leaves addr advanced by inc.
+.macro store_vectors_with_inc a0, a1, a2, a3, addr, inc
+    str_vi \a0, \addr, \inc // post-indexed store: addr is advanced by inc afterwards
+    str_vo \a1, \addr, (-(\inc) + 16*1) // offsets undo the increment so they are relative to the old addr
+    str_vo \a2, \addr, (-(\inc) + 16*2)
+    str_vo \a3, \addr, (-(\inc) + 16*3)
+.endm
+// vec_to_scalar_matrix: copies both 64-bit lanes of the vectors in0..in3 into the eight general-purpose aliases out_00..out_31 (out_ij = lane j of vector in<i>).
+.macro vec_to_scalar_matrix out, in
+    vext \out\()_00, \in\()0, 0
+    vext \out\()_01, \in\()0, 1
+    vext \out\()_10, \in\()1, 0
+    vext \out\()_11, \in\()1, 1
+    vext \out\()_20, \in\()2, 0
+    vext \out\()_21, \in\()2, 1
+    vext \out\()_30, \in\()3, 0
+    vext \out\()_31, \in\()3, 1
+.endm
+// store_scalar_matrix_with_inc: stores the eight registers <x>t_00..<x>t_31 to consecutive 8-byte slots at addr and leaves addr advanced by inc.
+.macro store_scalar_matrix_with_inc x, addr, inc
+    str \x\()t_00, [\addr], #( \inc) // post-indexed store: addr is advanced by inc afterwards
+    str \x\()t_01, [\addr, #(-\inc + 8*1)] // remaining offsets undo the increment so they are relative to the old addr
+    str \x\()t_10, [\addr, #(-\inc + 8*2)]
+    str \x\()t_11, [\addr, #(-\inc + 8*3)]
+    str \x\()t_20, [\addr, #(-\inc + 8*4)]
+    str \x\()t_21, [\addr, #(-\inc + 8*5)]
+    str \x\()t_30, [\addr, #(-\inc + 8*6)]
+    str \x\()t_31, [\addr, #(-\inc + 8*7)]
+.endm
+// vext: moves the 64-bit lane selected by lane from vec_in into the general-purpose register gpr_out (umov).
+.macro vext gpr_out, vec_in, lane
+    umov \gpr_out\(), \vec_in\().d[\lane]
+.endm
+// load_roots_123: loads root0..root3 from the 64 bytes at r_ptr0 and advances r_ptr0 by 64.
+.macro load_roots_123
+    ldr_vi root0, r_ptr0, 64 // post-indexed: r_ptr0 is advanced by 64 afterwards
+    ldr_vo root1, r_ptr0, (-64 + 16) // offsets are relative to the already advanced pointer
+    ldr_vo root2, r_ptr0, (-64 + 32)
+    ldr_vo root3, r_ptr0, (-64 + 48)
+.endm
+// load_roots_456: identical instruction sequence to load_roots_123 (reads the next 64 bytes at r_ptr0).
+.macro load_roots_456
+    ldr_vi root0, r_ptr0, 64 // post-indexed: r_ptr0 is advanced by 64 afterwards
+    ldr_vo root1, r_ptr0, (-64 + 16) // offsets are relative to the already advanced pointer
+    ldr_vo root2, r_ptr0, (-64 + 32)
+    ldr_vo root3, r_ptr0, (-64 + 48)
+.endm
+// load_roots_78_part1: advances r_ptr1 by 12*16 bytes and loads slots 0..5 of that 12-slot group into root0, root0_tw, root1, root1_tw, root2, root2_tw.
+.macro load_roots_78_part1
+    ldr_vi root0, r_ptr1, (12*16) // post-indexed: r_ptr1 now points past the whole 12-slot group
+    ldr_vo root0_tw, r_ptr1, (-12*16 + 1*16)
+    ldr_vo root1, r_ptr1, (-12*16 + 2*16)
+    ldr_vo root1_tw, r_ptr1, (-12*16 + 3*16)
+    ldr_vo root2, r_ptr1, (-12*16 + 4*16)
+    ldr_vo root2_tw, r_ptr1, (-12*16 + 5*16)
+.endm
+// load_roots_78_part2: loads slots 6..11 of the same group into the same six registers; offsets are relative to the already advanced r_ptr1, so it is meant to run after load_roots_78_part1.
+.macro load_roots_78_part2
+    ldr_vo root0, r_ptr1, (-12*16 + 6*16)
+    ldr_vo root0_tw, r_ptr1, (-12*16 + 7*16)
+    ldr_vo root1, r_ptr1, (-12*16 + 8*16)
+    ldr_vo root1_tw, r_ptr1, (-12*16 + 9*16)
+    ldr_vo root2, r_ptr1, (-12*16 + 10*16)
+    ldr_vo root2_tw, r_ptr1, (-12*16 + 11*16)
+.endm
+// transpose4: in-place 4x4 transpose of the 32-bit lanes held in data0..data3; clobbers t0..t3.
+.macro transpose4 data0, data1, data2, data3
+    trn1 t0.4s, \data0\().4s, \data1\().4s // first stage: interleave even 32-bit lanes of rows 0 and 1
+    trn2 t1.4s, \data0\().4s, \data1\().4s // interleave odd 32-bit lanes of rows 0 and 1
+    trn1 t2.4s, \data2\().4s, \data3\().4s // interleave even 32-bit lanes of rows 2 and 3
+    trn2 t3.4s, \data2\().4s, \data3\().4s // interleave odd 32-bit lanes of rows 2 and 3
+    // second stage: recombine the 64-bit halves of t0..t3 into the transposed rows
+    trn2 \data2\().2d, t0.2d, t2.2d
+    trn2 \data3\().2d, t1.2d, t3.2d
+    trn1 \data0\().2d, t0.2d, t2.2d
+    trn1 \data1\().2d, t1.2d, t3.2d
+.endm
+// transpose_single: only the first (32-bit trn1/trn2) stage of the transpose, written to separate output registers.
+.macro transpose_single data_out0, data_out1, data_out2, data_out3, data_in0, data_in1, data_in2, data_in3
+    trn1 \data_out0\().4s, \data_in0\().4s, \data_in1\().4s
+    trn2 \data_out1\().4s, \data_in0\().4s, \data_in1\().4s
+    trn1 \data_out2\().4s, \data_in2\().4s, \data_in3\().4s
+    trn2 \data_out3\().4s, \data_in2\().4s, \data_in3\().4s
+.endm
+
+.macro save_gprs //
slothy:no-unfold + sub sp, sp, #(16*6) + stp x19, x20, [sp, #16*0] + stp x19, x20, [sp, #16*0] + stp x21, x22, [sp, #16*1] + stp x23, x24, [sp, #16*2] + stp x25, x26, [sp, #16*3] + stp x27, x28, [sp, #16*4] + stp x29, x30, [sp, #16*5] +.endm + +.macro restore_gprs // slothy:no-unfold + ldp x19, x20, [sp, #16*0] + ldp x21, x22, [sp, #16*1] + ldp x23, x24, [sp, #16*2] + ldp x25, x26, [sp, #16*3] + ldp x27, x28, [sp, #16*4] + ldp x29, x30, [sp, #16*5] + add sp, sp, #(16*6) +.endm + +.macro save_vregs // slothy:no-unfold + sub sp, sp, #(16*4) + stp d8, d9, [sp, #16*0] + stp d10, d11, [sp, #16*1] + stp d12, d13, [sp, #16*2] + stp d14, d15, [sp, #16*3] +.endm + +.macro restore_vregs // slothy:no-unfold + ldp d8, d9, [sp, #16*0] + ldp d10, d11, [sp, #16*1] + ldp d12, d13, [sp, #16*2] + ldp d14, d15, [sp, #16*3] + add sp, sp, #(16*4) +.endm + +#define STACK_SIZE 16 +#define STACK0 0 + +.macro restore a, loc // slothy:no-unfold + ldr \a, [sp, #\loc\()] +.endm +.macro save loc, a // slothy:no-unfold + str \a, [sp, #\loc\()] +.endm +.macro push_stack // slothy:no-unfold + save_gprs + save_vregs + sub sp, sp, #STACK_SIZE +.endm + +.macro pop_stack // slothy:no-unfold + add sp, sp, #STACK_SIZE + restore_vregs + restore_gprs +.endm + +.data +.p2align 4 +roots: +#include "intt_dilithium_123_456_78_twiddles.s" +.text + + .global intt_dilithium_123_45678_manual_ld4_opt_m1_icestorm + .global _intt_dilithium_123_45678_manual_ld4_opt_m1_icestorm + +.p2align 4 +const_addr: .word 8380417 + .word 0 + .word 0 + .word 0 +ninv_addr: .quad 16382 +ninv_tw_addr: .quad 4197891 +intt_dilithium_123_45678_manual_ld4_opt_m1_icestorm: +_intt_dilithium_123_45678_manual_ld4_opt_m1_icestorm: + push_stack + + in .req x0 + inp .req x1 + inpp .req x2 + count .req x3 + r_ptr0 .req x4 + r_ptr1 .req x5 + xtmp .req x6 + + data0 .req v9 + data1 .req v10 + data2 .req v11 + data3 .req v12 + data4 .req v13 + data5 .req v14 + data6 .req v15 + data7 .req v16 + + qform_data0 .req q9 + qform_data1 .req q10 + 
qform_data2 .req q11 + qform_data3 .req q12 + qform_data4 .req q13 + qform_data5 .req q14 + qform_data6 .req q15 + qform_data7 .req q16 + + qform_v0 .req q0 + qform_v1 .req q1 + qform_v2 .req q2 + qform_v3 .req q3 + qform_v4 .req q4 + qform_v5 .req q5 + qform_v6 .req q6 + qform_v7 .req q7 + qform_v8 .req q8 + qform_v9 .req q9 + qform_v10 .req q10 + qform_v11 .req q11 + qform_v12 .req q12 + qform_v13 .req q13 + qform_v14 .req q14 + qform_v15 .req q15 + qform_v16 .req q16 + qform_v17 .req q17 + qform_v18 .req q18 + qform_v19 .req q19 + qform_v20 .req q20 + qform_v21 .req q21 + qform_v22 .req q22 + qform_v23 .req q23 + qform_v24 .req q24 + qform_v25 .req q25 + qform_v26 .req q26 + qform_v27 .req q27 + qform_v28 .req q28 + qform_v29 .req q29 + qform_v30 .req q30 + qform_v31 .req q31 + + x_00 .req x10 + x_01 .req x11 + x_10 .req x12 + x_11 .req x13 + x_20 .req x14 + x_21 .req x15 + x_30 .req x16 + x_31 .req x17 + + xt_00 .req x_00 + xt_01 .req x_20 + xt_10 .req x_10 + xt_11 .req x_30 + xt_20 .req x_01 + xt_21 .req x_21 + xt_30 .req x_11 + xt_31 .req x_31 + + root0 .req v0 + root1 .req v1 + root2 .req v2 + root3 .req v3 + + qform_root0 .req q0 + qform_root1 .req q1 + qform_root2 .req q2 + qform_root3 .req q3 + + tmp .req v24 + t0 .req v25 + t1 .req v26 + t2 .req v27 + t3 .req v28 + tp0 .req v17 + tp1 .req v18 + tp2 .req v19 + tp3 .req v20 + + consts .req v8 + qform_consts .req q8 + + modulus .req v29 + + ASM_LOAD(r_ptr0, roots_l345) + ASM_LOAD(r_ptr1, roots_l67) + + ASM_LOAD(xtmp, const_addr) + ld1r {consts.4s}, [xtmp] + save STACK0, in + + restore inp, STACK0 + mov inp, in + add inpp, inp, #64 + mov count, #8 + + root0_tw .req v4 + root1_tw .req v5 + root2_tw .req v6 + root3_tw .req v7 + qform_root0_tw .req q4 + qform_root1_tw .req q5 + qform_root2_tw .req q6 + qform_root3_tw .req q7 + + .p2align 2 + ldr q28, [x1, #16] // 
...*............................................................................................................................................................. + // gap // ................................................................................................................................................................. + // gap // ................................................................................................................................................................. + ldr q22, [x1, #0] // ..*.............................................................................................................................................................. + // gap // ................................................................................................................................................................. + // gap // ................................................................................................................................................................. + ldr q23, [x1, #32] // .*............................................................................................................................................................... + ldr q14, [x1, #48] // *................................................................................................................................................................ + // gap // ................................................................................................................................................................. + // gap // ................................................................................................................................................................. + ldr q27, [x2, #0] // .....................*........................................................................................................................................... 
+ ldr q29, [x2, #16] // ...................*............................................................................................................................................. + ldr q16, [x5, #128] // ......*.......................................................................................................................................................... + // gap // ................................................................................................................................................................. + // gap // ................................................................................................................................................................. + ldr q11, [x5, #160] // ...........*..................................................................................................................................................... + ldr q2, [x5, #144] // ..............*.................................................................................................................................................. + ldr q30, [x2, #48] // ................*................................................................................................................................................ + trn2 v18.4S, v22.4S, v28.4S // .............*................................................................................................................................................... + trn1 v10.4S, v22.4S, v28.4S // ............*.................................................................................................................................................... + // gap // ................................................................................................................................................................. 
+ trn1 v26.4S, v23.4S, v14.4S // ..........*...................................................................................................................................................... + ldr q13, [x2, #32] // ...............*................................................................................................................................................. + trn2 v12.4S, v23.4S, v14.4S // ........*........................................................................................................................................................ + // gap // ................................................................................................................................................................. + trn2 v20.4S, v27.4S, v29.4S // ...................................*............................................................................................................................. + trn1 v15.4S, v27.4S, v29.4S // ....................................*............................................................................................................................ + ldr q17, [x5, #32] // ...............................*................................................................................................................................. + trn2 v29.2D, v10.2D, v26.2D // ..................*.............................................................................................................................................. + ldr q7, [x5, #80] // .....*........................................................................................................................................................... + trn2 v21.2D, v18.2D, v12.2D // .................*............................................................................................................................................... 
+ ldr q31, [x5, #64] // .............................*................................................................................................................................... + ldr q27, [x4, #16] // ........................................................*........................................................................................................ + ldr q5, [x5, #48] // .......................*......................................................................................................................................... + trn1 v14.2D, v18.2D, v12.2D // ....................*............................................................................................................................................ + trn1 v25.2D, v10.2D, v26.2D // ......................*.......................................................................................................................................... + trn2 v26.4S, v13.4S, v30.4S // ..............................*.................................................................................................................................. + sub v4.4S, v29.4S, v21.4S // ........................*........................................................................................................................................ + ldr q28, [x5, #112] // .......*......................................................................................................................................................... + ldr q12, [x5, #96] // ..........................*...................................................................................................................................... + ldr q6, [x5], #(12*16) // ........................................*........................................................................................................................ 
+ // gap // ................................................................................................................................................................. + add v29.4S, v29.4S, v21.4S // .........................*....................................................................................................................................... + sub v23.4S, v25.4S, v14.4S // ...........................*..................................................................................................................................... + mul v9.4S, v4.4S, v31.4S // .......................................*......................................................................................................................... + ldr q3, [x4], #64 // .................................*............................................................................................................................... + // gap // ................................................................................................................................................................. + add v1.4S, v25.4S, v14.4S // ............................*.................................................................................................................................... + sqrdmulh v24.4S, v4.4S, v7.4S // ............................................*.................................................................................................................... + // gap // ................................................................................................................................................................. + // gap // ................................................................................................................................................................. 
+ trn1 v13.4S, v13.4S, v30.4S // ................................*................................................................................................................................ + mul v10.4S, v23.4S, v17.4S // ..........................................*...................................................................................................................... + sqrdmulh v30.4S, v23.4S, v5.4S // .....................................*........................................................................................................................... + // gap // ................................................................................................................................................................. + // gap // ................................................................................................................................................................. + trn1 v23.2D, v15.2D, v13.2D // .........................................*....................................................................................................................... + ldr q31, [x5, #-16] // ....*............................................................................................................................................................ + ldr q22, [x5, #-176] // .........*....................................................................................................................................................... + trn1 v7.2D, v20.2D, v26.2D // ......................................*.......................................................................................................................... + mls v9.4S, v24.4S, v8.S[0] // .................................................*............................................................................................................... 
+ // gap // ................................................................................................................................................................. + // gap // ................................................................................................................................................................. + trn2 v14.2D, v15.2D, v13.2D // .............................................*................................................................................................................... + mls v10.4S, v30.4S, v8.S[0] // ................................................*................................................................................................................ + trn2 v25.2D, v20.2D, v26.2D // ...........................................*..................................................................................................................... + // gap // ................................................................................................................................................................. + // gap // ................................................................................................................................................................. + sub v18.4S, v1.4S, v29.4S // ...............................................*................................................................................................................. + // gap // ................................................................................................................................................................. + // gap // ................................................................................................................................................................. 
+ sub v30.4S, v23.4S, v7.4S // ..............................................*.................................................................................................................. + add v26.4S, v14.4S, v25.4S // ...................................................................*............................................................................................. + // gap // ................................................................................................................................................................. + // gap // ................................................................................................................................................................. + add v24.4S, v23.4S, v7.4S // .....................................................*........................................................................................................... + sub v21.4S, v10.4S, v9.4S // .........................................................*....................................................................................................... + mul v5.4S, v30.4S, v16.4S // ......................................................*.......................................................................................................... + // gap // ................................................................................................................................................................. + // gap // ................................................................................................................................................................. + sub v23.4S, v24.4S, v26.4S // .........................................................................*....................................................................................... 
+ mul v17.4S, v18.4S, v6.4S // ...................................................*............................................................................................................. + // gap // ................................................................................................................................................................. + // gap // ................................................................................................................................................................. + mul v0.4S, v21.4S, v6.4S // ................................................................*................................................................................................ + sqrdmulh v20.4S, v21.4S, v22.4S // ...............................................................*................................................................................................. + // gap // ................................................................................................................................................................. + // gap // ................................................................................................................................................................. + // gap // ................................................................................................................................................................. + // gap // ................................................................................................................................................................. + add v21.4S, v10.4S, v9.4S // .............................................................*................................................................................................... 
+ sqrdmulh v7.4S, v18.4S, v22.4S // ..........................................................*...................................................................................................... + sqrdmulh v4.4S, v23.4S, v28.4S // .............................................................................*................................................................................... + // gap // ................................................................................................................................................................. + // gap // ................................................................................................................................................................. + sqrdmulh v22.4S, v30.4S, v2.4S // ..................................................*.............................................................................................................. + mls v0.4S, v20.4S, v8.S[0] // ......................................................................*.......................................................................................... + // gap // ................................................................................................................................................................. + // gap // ................................................................................................................................................................. + sub v19.4S, v14.4S, v25.4S // ....................................................*............................................................................................................ + // gap // ................................................................................................................................................................. 
+ // gap // ................................................................................................................................................................. + mls v17.4S, v7.4S, v8.S[0] // .....................................................................*........................................................................................... + add v2.4S, v1.4S, v29.4S // ..................................*.............................................................................................................................. + // gap // ................................................................................................................................................................. + // gap // ................................................................................................................................................................. + mul v29.4S, v19.4S, v11.4S // .......................................................*......................................................................................................... + sqrdmulh v16.4S, v19.4S, v31.4S // ...........................................................*..................................................................................................... + // gap // ................................................................................................................................................................. + // gap // ................................................................................................................................................................. + mls v5.4S, v22.4S, v8.S[0] // ............................................................*.................................................................................................... 
+ add v31.4S, v24.4S, v26.4S // ..................................................................................*.............................................................................. + trn1 v24.4S, v17.4S, v0.4S // ...............................................................................*................................................................................. + // gap // ................................................................................................................................................................. + // gap // ................................................................................................................................................................. + trn1 v10.4S, v2.4S, v21.4S // ..................................................................*.............................................................................................. + // gap // ................................................................................................................................................................. + // gap // ................................................................................................................................................................. + trn2 v11.4S, v2.4S, v21.4S // .......................................................................*......................................................................................... + trn2 v14.4S, v17.4S, v0.4S // ..............................................................................*.................................................................................. + // gap // ................................................................................................................................................................. 
+ ldr q9, [x4, #-32] // ....................................................................*............................................................................................ + trn2 v25.2D, v10.2D, v24.2D // ...................................................................................*............................................................................. + trn1 v30.2D, v10.2D, v24.2D // .........................................................................................*....................................................................... + // gap // ................................................................................................................................................................. + trn1 v6.2D, v11.2D, v14.2D // .....................................................................................*........................................................................... + // gap // ................................................................................................................................................................. + trn2 v7.2D, v11.2D, v14.2D // .................................................................................*............................................................................... + // gap // ................................................................................................................................................................. + mul v20.4S, v23.4S, v12.4S // ................................................................................*................................................................................ + // gap // ................................................................................................................................................................. 
+ mls v29.4S, v16.4S, v8.S[0] // .................................................................*............................................................................................... + // gap // ................................................................................................................................................................. + // gap // ................................................................................................................................................................. + add v14.4S, v25.4S, v7.4S // ........................................................................................*........................................................................ + add v10.4S, v30.4S, v6.4S // ..............................................................................................*.................................................................. + // gap // ................................................................................................................................................................. + // gap // ................................................................................................................................................................. + sub v24.4S, v25.4S, v7.4S // .......................................................................................*......................................................................... + sub v26.4S, v30.4S, v6.4S // ................................................................................................*................................................................ + add v13.4S, v10.4S, v14.4S // ...................................................................................................*............................................................. 
+ sub v21.4S, v10.4S, v14.4S // ........................................................................................................................*........................................ + // gap // ................................................................................................................................................................. + ldr q6, [x4, #-16] // ..............................................................*.................................................................................................. + // gap // ................................................................................................................................................................. + // gap // ................................................................................................................................................................. + sqrdmulh v17.4S, v26.4S, v27.S[3] // .........................................................................................................*....................................................... + sub v14.4S, v5.4S, v29.4S // ........................................................................*........................................................................................ + srshr v7.4S, v13.4S, #23 // .......................................................................................................................*......................................... + // gap // ................................................................................................................................................................. + // gap // ................................................................................................................................................................. 
+ mul v18.4S, v26.4S, v27.S[2] // .......................................................................................................*......................................................... + // gap // ................................................................................................................................................................. + // gap // ................................................................................................................................................................. + mul v10.4S, v14.4S, v12.4S // ............................................................................*.................................................................................... + sqrdmulh v14.4S, v14.4S, v28.4S // ...........................................................................*..................................................................................... + sqrdmulh v1.4S, v24.4S, v9.S[1] // .................................................................................................*............................................................... + // gap // ................................................................................................................................................................. + // gap // ................................................................................................................................................................. + add v30.4S, v5.4S, v29.4S // ..........................................................................*...................................................................................... + mul v12.4S, v24.4S, v9.S[0] // ............................................................................................*.................................................................... 
+ // gap // ................................................................................................................................................................. + // gap // ................................................................................................................................................................. + mls v18.4S, v17.4S, v8.S[0] // .................................................................................................................*............................................... + // gap // ................................................................................................................................................................. + // gap // ................................................................................................................................................................. + mls v10.4S, v14.4S, v8.S[0] // ....................................................................................*............................................................................ + mls v20.4S, v4.4S, v8.S[0] // ......................................................................................*.......................................................................... + mls v13.4S, v7.4S, v8.4S // .............................................................................................................................*................................... + // gap // ................................................................................................................................................................. + // gap // ................................................................................................................................................................. 
+ trn1 v26.4S, v31.4S, v30.4S // ...........................................................................................*..................................................................... + mls v12.4S, v1.4S, v8.S[0] // ...................................................................................................................*............................................. + // gap // ................................................................................................................................................................. + // gap // ................................................................................................................................................................. + trn2 v30.4S, v31.4S, v30.4S // ..........................................................................................*...................................................................... + // gap // ................................................................................................................................................................. + // gap // ................................................................................................................................................................. + trn2 v14.4S, v20.4S, v10.4S // ...............................................................................................*................................................................. + trn1 v10.4S, v20.4S, v10.4S // .............................................................................................*................................................................... + // gap // ................................................................................................................................................................. 
+ // gap // ................................................................................................................................................................. + mul v19.4S, v21.4S, v3.S[2] // ............................................................................................................................*.................................... + sqrdmulh v29.4S, v21.4S, v3.S[3] // ...................................................................................................................................*............................. + // gap // ................................................................................................................................................................. + // gap // ................................................................................................................................................................. + sub v11.4S, v18.4S, v12.4S // .........................................................................................................................*....................................... + trn2 v20.2D, v30.2D, v14.2D // ......................................................................................................*.......................................................... + // gap // ................................................................................................................................................................. + // gap // ................................................................................................................................................................. + trn1 v14.2D, v30.2D, v14.2D // ....................................................................................................*............................................................ 
+ trn1 v2.2D, v26.2D, v10.2D // ..................................................................................................*.............................................................. + // gap // ................................................................................................................................................................. + // gap // ................................................................................................................................................................. + sqrdmulh v24.4S, v11.4S, v3.S[3] // ....................................................................................................................................*............................ + trn2 v15.2D, v26.2D, v10.2D // .....................................................................................................*........................................................... + add v23.4S, v2.4S, v14.4S // ..............................................................................................................*.................................................. + // gap // ................................................................................................................................................................. + // gap // ................................................................................................................................................................. + sub v26.4S, v2.4S, v14.4S // ........................................................................................................*........................................................ + add v30.4S, v15.4S, v20.4S // ..........................................................................................................*...................................................... 
+ // gap // ................................................................................................................................................................. + // gap // ................................................................................................................................................................. + sub v7.4S, v15.4S, v20.4S // ...........................................................................................................*..................................................... + // gap // ................................................................................................................................................................. + // gap // ................................................................................................................................................................. + add v15.4S, v18.4S, v12.4S // ..................................................................................................................................*.............................. + mul v10.4S, v26.4S, v9.S[2] // .............................................................................................................*................................................... + add v5.4S, v23.4S, v30.4S // ..................................................................................................................*.............................................. + // gap // ................................................................................................................................................................. + // gap // ................................................................................................................................................................. 
+ mul v25.4S, v7.4S, v6.S[0] // ................................................................................................................*................................................ + // gap // ................................................................................................................................................................. + // gap // ................................................................................................................................................................. + sqrdmulh v14.4S, v7.4S, v6.S[1] // ...............................................................................................................*................................................. + sqrdmulh v26.4S, v26.4S, v9.S[3] // ............................................................................................................*.................................................... + sub v18.4S, v23.4S, v30.4S // .....................................................................................................................*........................................... + // gap // ................................................................................................................................................................. + // gap // ................................................................................................................................................................. + srshr v1.4S, v5.4S, #23 // ..........................................................................................................................*...................................... + // gap // ................................................................................................................................................................. 
+ // gap // ................................................................................................................................................................. + srshr v31.4S, v15.4S, #23 // .......................................................................................................................................*......................... + mls v19.4S, v29.4S, v8.S[0] // .................................................................................................................................................*............... + // gap // ................................................................................................................................................................. + // gap // ................................................................................................................................................................. + mls v25.4S, v14.4S, v8.S[0] // ......................................................................................................................*.......................................... + mls v10.4S, v26.4S, v8.S[0] // ....................................................................................................................*............................................ + sqrdmulh v7.4S, v18.4S, v27.S[1] // ........................................................................................................................................*........................ + // gap // ................................................................................................................................................................. + // gap // ................................................................................................................................................................. 
+ mls v5.4S, v1.4S, v8.4S // ...............................................................................................................................*................................. + mul v30.4S, v18.4S, v27.S[0] // ...........................................................................................................................................*..................... + // gap // ................................................................................................................................................................. + mul v28.4S, v11.4S, v3.S[2] // ................................................................................................................................*................................ + // gap // ................................................................................................................................................................. + // gap // ................................................................................................................................................................. + mls v15.4S, v31.4S, v8.4S // .............................................................................................................................................*................... + // gap // ................................................................................................................................................................. + add v2.4S, v10.4S, v25.4S // ..............................................................................................................................*.................................. + // gap // ................................................................................................................................................................. 
+ // gap // ................................................................................................................................................................. + sub v29.4S, v10.4S, v25.4S // ...........................................................................................................................*..................................... + sub v0.4S, v13.4S, v5.4S // ......................................................................................................................................*.......................... + // gap // ................................................................................................................................................................. + // gap // ................................................................................................................................................................. + mls v30.4S, v7.4S, v8.S[0] // ..................................................................................................................................................*.............. + srshr v10.4S, v2.4S, #23 // .....................................................................................................................................*........................... + // gap // ................................................................................................................................................................. + // gap // ................................................................................................................................................................. + mul v11.4S, v0.4S, v3.S[0] // .........................................................................................................................................*....................... 
+ sqrdmulh v7.4S, v0.4S, v3.S[1] // ..........................................................................................................................................*...................... + sqrdmulh v14.4S, v29.4S, v27.S[1] // .................................................................................................................................*............................... + // gap // ................................................................................................................................................................. + mul v23.4S, v29.4S, v27.S[0] // ..............................................................................................................................................*.................. + // gap // ................................................................................................................................................................. + sub v26.4S, v19.4S, v30.4S // .........................................................................................................................................................*....... + // gap // ................................................................................................................................................................. + // gap // ................................................................................................................................................................. + mls v2.4S, v10.4S, v8.4S // ............................................................................................................................................*.................... + // gap // ................................................................................................................................................................. 
+ // gap // ................................................................................................................................................................. + mls v11.4S, v7.4S, v8.S[0] // ...............................................................................................................................................*................. + add v10.4S, v19.4S, v30.4S // ..........................................................................................................................................................*...... + // gap // ................................................................................................................................................................. + mls v23.4S, v14.4S, v8.S[0] // ...................................................................................................................................................*............. + mls v28.4S, v24.4S, v8.S[0] // ................................................................................................................................................*................ + // gap // ................................................................................................................................................................. + str q10, [x1, #32] // ................................................................................................................................................................* + // gap // ................................................................................................................................................................. + add v29.4S, v15.4S, v2.4S // ....................................................................................................................................................*............ 
+ // gap // ................................................................................................................................................................. + // gap // ................................................................................................................................................................. + str q11, [x2], #(16*4) // ......................................................................................................................................................*.......... + mul v9.4S, v26.4S, v3.S[0] // ..............................................................................................................................................................*.. + add v31.4S, v13.4S, v5.4S // .....................................................................................................................................................*........... + add v14.4S, v28.4S, v23.4S // ............................................................................................................................................................*.... + // gap // ................................................................................................................................................................. + sub v20.4S, v15.4S, v2.4S // .......................................................................................................................................................*......... + str q29, [x1, #16] // ........................................................................................................................................................*........ + str q31, [x1], #(16*4) // ...........................................................................................................................................................*..... 
+ // gap // ................................................................................................................................................................. + sqrdmulh v30.4S, v26.4S, v3.S[1] // ...............................................................................................................................................................*. + sub v17.4S, v28.4S, v23.4S // .............................................................................................................................................................*... + + // original source code + // ldr q18, [x1, #48] // ...*............................................................................................................................................................. + // ldr q23, [x1, #32] // ..*.............................................................................................................................................................. + // ldr q27, [x1, #0] // .*............................................................................................................................................................... + // ldr q14, [x1, #16] // *................................................................................................................................................................ + // ldr q4, [x5, #176] // .........................................*....................................................................................................................... + // ldr q2, [x5, #80] // ...................*............................................................................................................................................. + // ldr q26, [x5, #128] // ......*.......................................................................................................................................................... 
+ // ldr q11, [x5, #112] // ............................*.................................................................................................................................... + // trn2 v13.4S, v23.4S, v18.4S // ..............*.................................................................................................................................................. + // ldr q17, [x5, #16] // ..........................................*...................................................................................................................... + // trn1 v30.4S, v23.4S, v18.4S // ............*.................................................................................................................................................... + // ldr q24, [x5, #160] // .......*......................................................................................................................................................... + // trn1 v20.4S, v27.4S, v14.4S // ...........*..................................................................................................................................................... + // trn2 v14.4S, v27.4S, v14.4S // ..........*...................................................................................................................................................... + // ldr q6, [x5, #144] // ........*........................................................................................................................................................ + // ldr q31, [x2, #32] // .............*................................................................................................................................................... + // ldr q22, [x2, #48] // .........*....................................................................................................................................................... 
+ // trn2 v5.2D, v14.2D, v13.2D // ....................*............................................................................................................................................ + // trn2 v3.2D, v20.2D, v30.2D // ..................*.............................................................................................................................................. + // ldr q29, [x2, #16] // .....*........................................................................................................................................................... + // trn1 v18.2D, v14.2D, v13.2D // ........................*........................................................................................................................................ + // ldr q25, [x2, #0] // ....*............................................................................................................................................................ + // trn1 v20.2D, v20.2D, v30.2D // .........................*....................................................................................................................................... + // ldr q16, [x5, #48] // .......................*......................................................................................................................................... + // sub v1.4S, v3.4S, v5.4S // ...........................*..................................................................................................................................... + // add v23.4S, v3.4S, v5.4S // ...............................*................................................................................................................................. + // ldr q27, [x5, #96] // .............................*................................................................................................................................... 
+ // sub v19.4S, v20.4S, v18.4S // ................................*................................................................................................................................ + // add v21.4S, v20.4S, v18.4S // ...................................*............................................................................................................................. + // ldr q10, [x5, #64] // .....................*........................................................................................................................................... + // trn2 v15.4S, v31.4S, v22.4S // ..........................*...................................................................................................................................... + // ldr q18, [x5, #32] // .................*............................................................................................................................................... + // trn1 v20.4S, v31.4S, v22.4S // .....................................*........................................................................................................................... + // ldr q3, [x4], #64 // ..................................*.............................................................................................................................. + // add v30.4S, v21.4S, v23.4S // .................................................................*............................................................................................... + // trn2 v9.4S, v25.4S, v29.4S // ...............*................................................................................................................................................. + // trn1 v12.4S, v25.4S, v29.4S // ................*................................................................................................................................................ 
+ // sqrdmulh v0.4S, v19.4S, v16.4S // .......................................*......................................................................................................................... + // trn1 v28.2D, v9.2D, v15.2D // ...........................................*..................................................................................................................... + // mul v29.4S, v1.4S, v10.4S // .................................*............................................................................................................................... + // ldr q22, [x5], #(12*16) // ..............................*.................................................................................................................................. + // trn1 v5.2D, v12.2D, v20.2D // ........................................*........................................................................................................................ + // mul v14.4S, v19.4S, v18.4S // ......................................*.......................................................................................................................... + // trn2 v7.2D, v9.2D, v15.2D // ...............................................*................................................................................................................. + // sqrdmulh v16.4S, v1.4S, v2.4S // ....................................*............................................................................................................................ + // trn2 v13.2D, v12.2D, v20.2D // .............................................*................................................................................................................... + // sub v1.4S, v5.4S, v28.4S // .................................................*............................................................................................................... 
+ // sub v12.4S, v21.4S, v23.4S // ................................................*................................................................................................................ + // mls v14.4S, v0.4S, v8.S[0] // ..............................................*.................................................................................................................. + // mls v29.4S, v16.4S, v8.S[0] // ............................................*.................................................................................................................... + // sqrdmulh v20.4S, v1.4S, v6.4S // .............................................................*................................................................................................... + // mul v19.4S, v12.4S, v22.4S // .......................................................*......................................................................................................... + // sub v10.4S, v13.4S, v7.4S // ...............................................................*................................................................................................. + // add v18.4S, v5.4S, v28.4S // ...................................................*............................................................................................................. + // mul v5.4S, v1.4S, v26.4S // .....................................................*........................................................................................................... + // mul v9.4S, v10.4S, v24.4S // ..................................................................*.............................................................................................. + // ldr q6, [x4, #-48] // ......................*.......................................................................................................................................... 
+ // sub v15.4S, v14.4S, v29.4S // ....................................................*............................................................................................................ + // sqrdmulh v31.4S, v12.4S, v17.4S // ...........................................................*..................................................................................................... + // sqrdmulh v4.4S, v10.4S, v4.4S // ...................................................................*............................................................................................. + // mls v5.4S, v20.4S, v8.S[0] // ....................................................................*............................................................................................ + // add v21.4S, v14.4S, v29.4S // ..........................................................*...................................................................................................... + // ldr q23, [x4, #-16] // .......................................................................................*......................................................................... + // sqrdmulh v24.4S, v15.4S, v17.4S // .........................................................*....................................................................................................... + // mul v29.4S, v15.4S, v22.4S // ........................................................*........................................................................................................ + // mls v9.4S, v4.4S, v8.S[0] // ................................................................................*................................................................................ + // trn1 v14.4S, v30.4S, v21.4S // .......................................................................*......................................................................................... 
+ // add v12.4S, v13.4S, v7.4S // ..................................................*.............................................................................................................. + // ldr q25, [x4, #-32] // ..........................................................................*...................................................................................... + // mls v19.4S, v31.4S, v8.S[0] // ................................................................*................................................................................................ + // mls v29.4S, v24.4S, v8.S[0] // ..............................................................*.................................................................................................. + // trn2 v0.4S, v30.4S, v21.4S // ........................................................................*........................................................................................ + // sub v2.4S, v5.4S, v9.4S // .........................................................................................*....................................................................... + // sub v30.4S, v18.4S, v12.4S // ......................................................*.......................................................................................................... + // add v22.4S, v5.4S, v9.4S // ...............................................................................................*................................................................. + // sqrdmulh v13.4S, v2.4S, v11.4S // .............................................................................................*................................................................... + // mul v5.4S, v2.4S, v27.4S // ............................................................................................*.................................................................... 
+ // sqrdmulh v26.4S, v30.4S, v11.4S // ............................................................*.................................................................................................... + // trn2 v16.4S, v19.4S, v29.4S // .........................................................................*....................................................................................... + // trn1 v20.4S, v19.4S, v29.4S // ......................................................................*.......................................................................................... + // mul v2.4S, v30.4S, v27.4S // ...............................................................................*................................................................................. + // trn2 v28.2D, v0.2D, v16.2D // ..............................................................................*.................................................................................. + // add v19.4S, v18.4S, v12.4S // .....................................................................*........................................................................................... + // trn2 v9.2D, v14.2D, v20.2D // ...........................................................................*..................................................................................... + // mls v5.4S, v13.4S, v8.S[0] // ..................................................................................................*.............................................................. + // trn1 v29.2D, v0.2D, v16.2D // .............................................................................*................................................................................... + // mls v2.4S, v26.4S, v8.S[0] // ...................................................................................................*............................................................. 
+ // sub v27.4S, v9.4S, v28.4S // ...................................................................................*............................................................................. + // add v26.4S, v9.4S, v28.4S // .................................................................................*............................................................................... + // trn1 v21.2D, v14.2D, v20.2D // ............................................................................*.................................................................................... + // trn2 v10.4S, v19.4S, v22.4S // .......................................................................................................*......................................................... + // trn1 v14.4S, v19.4S, v22.4S // .....................................................................................................*........................................................... + // mul v9.4S, v27.4S, v25.S[0] // ................................................................................................*................................................................ + // trn1 v22.4S, v2.4S, v5.4S // .........................................................................................................*....................................................... + // add v16.4S, v21.4S, v29.4S // ..................................................................................*.............................................................................. + // trn2 v28.4S, v2.4S, v5.4S // ........................................................................................................*........................................................ + // sub v15.4S, v21.4S, v29.4S // ....................................................................................*............................................................................ 
+ // sqrdmulh v21.4S, v27.4S, v25.S[1] // ..............................................................................................*.................................................................. + // trn1 v20.2D, v14.2D, v22.2D // ...............................................................................................................*................................................. + // add v17.4S, v16.4S, v26.4S // .....................................................................................*........................................................................... + // trn1 v2.2D, v10.2D, v28.2D // ..............................................................................................................*.................................................. + // trn2 v22.2D, v14.2D, v22.2D // .................................................................................................................*............................................... + // trn2 v14.2D, v10.2D, v28.2D // .............................................................................................................*................................................... + // mul v29.4S, v15.4S, v6.S[2] // ...........................................................................................*..................................................................... + // sub v13.4S, v20.4S, v2.4S // ...................................................................................................................*............................................. + // sqrdmulh v11.4S, v15.4S, v6.S[3] // ........................................................................................*........................................................................ + // add v4.4S, v22.4S, v14.4S // ....................................................................................................................*............................................ 
+ // sub v5.4S, v22.4S, v14.4S // .....................................................................................................................*........................................... + // sqrdmulh v31.4S, v13.4S, v25.S[3] // ...........................................................................................................................*..................................... + // mul v0.4S, v13.4S, v25.S[2] // .......................................................................................................................*......................................... + // add v12.4S, v20.4S, v2.4S // ..................................................................................................................*.............................................. + // sqrdmulh v28.4S, v5.4S, v23.S[1] // ..........................................................................................................................*...................................... + // mul v15.4S, v5.4S, v23.S[0] // .........................................................................................................................*....................................... + // mls v29.4S, v11.4S, v8.S[0] // .................................................................................................*............................................................... + // add v13.4S, v12.4S, v4.4S // ........................................................................................................................*........................................ + // mls v9.4S, v21.4S, v8.S[0] // ......................................................................................................*.......................................................... + // mls v0.4S, v31.4S, v8.S[0] // .................................................................................................................................*............................... 
+ // sub v21.4S, v12.4S, v4.4S // ............................................................................................................................*.................................... + // mls v15.4S, v28.4S, v8.S[0] // ................................................................................................................................*................................ + // srshr v30.4S, v17.4S, #23 // ..........................................................................................*...................................................................... + // sub v14.4S, v16.4S, v26.4S // ......................................................................................*.......................................................................... + // sub v10.4S, v29.4S, v9.4S // ............................................................................................................*.................................................... + // srshr v7.4S, v13.4S, #23 // .............................................................................................................................*................................... + // sub v16.4S, v0.4S, v15.4S // ........................................................................................................................................*........................ + // mul v31.4S, v14.4S, v3.S[2] // ..........................................................................................................*...................................................... + // mls v17.4S, v30.4S, v8.4S // ....................................................................................................*............................................................ + // add v5.4S, v0.4S, v15.4S // .......................................................................................................................................*......................... 
+ // mls v13.4S, v7.4S, v8.4S // ...................................................................................................................................*............................. + // mul v19.4S, v10.4S, v3.S[2] // .....................................................................................................................................*........................... + // sqrdmulh v4.4S, v16.4S, v6.S[1] // ..............................................................................................................................................*.................. + // add v27.4S, v29.4S, v9.4S // ......................................................................................................................*.......................................... + // sqrdmulh v9.4S, v14.4S, v3.S[3] // ...........................................................................................................*..................................................... + // sqrdmulh v10.4S, v10.4S, v3.S[3] // ................................................................................................................*................................................ + // srshr v1.4S, v5.4S, #23 // ...........................................................................................................................................*..................... + // sub v12.4S, v17.4S, v13.4S // .........................................................................................................................................*....................... + // srshr v7.4S, v27.4S, #23 // ..............................................................................................................................*.................................. + // sqrdmulh v26.4S, v21.4S, v6.S[1] // ..................................................................................................................................*.............................. 
+ // mul v18.4S, v12.4S, v3.S[0] // ............................................................................................................................................*.................... + // sqrdmulh v14.4S, v12.4S, v3.S[1] // .............................................................................................................................................*................... + // mul v0.4S, v21.4S, v6.S[0] // ....................................................................................................................................*............................ + // mls v5.4S, v1.4S, v8.4S // .................................................................................................................................................*............... + // mls v27.4S, v7.4S, v8.4S // ......................................................................................................................................*.......................... + // mul v1.4S, v16.4S, v6.S[0] // ...............................................................................................................................................*................. + // mls v18.4S, v14.4S, v8.S[0] // ..................................................................................................................................................*.............. + // mls v19.4S, v10.4S, v8.S[0] // .....................................................................................................................................................*........... + // mls v31.4S, v9.4S, v8.S[0] // ...............................................................................................................................*................................. + // mls v0.4S, v26.4S, v8.S[0] // ..........................................................................................................................................*...................... 
+ // mls v1.4S, v4.4S, v8.S[0] // ....................................................................................................................................................*............ + // add v22.4S, v27.4S, v5.4S // .......................................................................................................................................................*......... + // add v28.4S, v17.4S, v13.4S // ..........................................................................................................................................................*...... + // str q18, [x2], #(16*4) // ........................................................................................................................................................*........ + // sub v20.4S, v27.4S, v5.4S // ............................................................................................................................................................*.... + // str q22, [x1, #16] // .............................................................................................................................................................*... + // sub v10.4S, v31.4S, v0.4S // ................................................................................................................................................*................ + // add v5.4S, v31.4S, v0.4S // ...................................................................................................................................................*............. + // str q28, [x1], #(16*4) // ..............................................................................................................................................................*.. + // add v14.4S, v19.4S, v1.4S // ...........................................................................................................................................................*..... 
+ // sub v17.4S, v19.4S, v1.4S // ................................................................................................................................................................* + // mul v9.4S, v10.4S, v3.S[0] // .........................................................................................................................................................*....... + // sqrdmulh v30.4S, v10.4S, v3.S[1] // ...............................................................................................................................................................*. + // str q5, [x1, #-32] // ......................................................................................................................................................*.......... + + sub count, count, #1 +layer45678_start: + str q14, [x1, #-16] // .......................................................................................................................................................................*...... + add x1, x1, #64 // ............................................................................................................................................................................*. + sqrdmulh v0.4S, v17.4S, v3.S[1] // ..................................................................................................................................................................*........... + mul v19.4S, v17.4S, v3.S[0] // .................................................................................................................................................................*............ + ldr q18, [x1, #48] // ...e.......................................................................................................................................................................... 
+ ldr q23, [x1, #32] // ..e........................................................................................................................................................................... + mul v1.4S, v20.4S, v3.S[0] // .......................................................................................................................................................*...................... + sqrdmulh v25.4S, v20.4S, v3.S[1] // ........................................................................................................................................................*..................... + // gap // .............................................................................................................................................................................. + mls v9.4S, v30.4S, v8.S[0] // ..............................................................................................................................................................*............... + ldr q27, [x1, #0] // e............................................................................................................................................................................. + ldr q14, [x1, #16] // .e............................................................................................................................................................................ + ldr q4, [x5, #176] // .......................................................e...................................................................................................................... + ldr q2, [x5, #80] // .............................e................................................................................................................................................ 
+ mls v19.4S, v0.4S, v8.S[0] // ...................................................................................................................................................................*.......... + // gap // .............................................................................................................................................................................. + ldr q26, [x5, #128] // ....................................................e......................................................................................................................... + ldr q11, [x5, #112] // ...................................................e.......................................................................................................................... + mls v1.4S, v25.4S, v8.S[0] // .........................................................................................................................................................*.................... + // gap // .............................................................................................................................................................................. + trn2 v13.4S, v23.4S, v18.4S // .......e...................................................................................................................................................................... + ldr q17, [x5, #16] // .........................e.................................................................................................................................................... + trn1 v30.4S, v23.4S, v18.4S // ......e....................................................................................................................................................................... 
+ str q9, [x2, #-32] // ..........................................................................................................................................................................*... + str q19, [x2, #-16] // ...........................................................................................................................................................................*.. + ldr q24, [x5, #160] // ......................................................e....................................................................................................................... + trn1 v20.4S, v27.4S, v14.4S // ....e......................................................................................................................................................................... + trn2 v14.4S, v27.4S, v14.4S // .....e........................................................................................................................................................................ + str q1, [x2, #-48] // .........................................................................................................................................................................*.... + add x2, x2, #64 // .............................................................................................................................................................................* + // gap // .............................................................................................................................................................................. + ldr q6, [x5, #144] // .....................................................e........................................................................................................................ + ldr q31, [x2, #32] // ..............e............................................................................................................................................................... 
+ ldr q22, [x2, #48] // ...............e.............................................................................................................................................................. + trn2 v5.2D, v14.2D, v13.2D // .........e.................................................................................................................................................................... + trn2 v3.2D, v20.2D, v30.2D // ........e..................................................................................................................................................................... + ldr q29, [x2, #16] // .............e................................................................................................................................................................ + trn1 v18.2D, v14.2D, v13.2D // ...........e.................................................................................................................................................................. + ldr q25, [x2, #0] // ............e................................................................................................................................................................. + trn1 v20.2D, v20.2D, v30.2D // ..........e................................................................................................................................................................... + ldr q16, [x5, #48] // ...........................e.................................................................................................................................................. + sub v1.4S, v3.4S, v5.4S // ...................................e.......................................................................................................................................... 
+ add v23.4S, v3.4S, v5.4S // ....................................e......................................................................................................................................... + ldr q27, [x5, #96] // ..................................................e........................................................................................................................... + sub v19.4S, v20.4S, v18.4S // ..............................e............................................................................................................................................... + add v21.4S, v20.4S, v18.4S // ...............................e.............................................................................................................................................. + ldr q10, [x5, #64] // ............................e................................................................................................................................................. + // gap // .............................................................................................................................................................................. + trn2 v15.4S, v31.4S, v22.4S // ...................e.......................................................................................................................................................... + ldr q18, [x5, #32] // ..........................e................................................................................................................................................... + trn1 v20.4S, v31.4S, v22.4S // ..................e........................................................................................................................................................... 
+ ldr q3, [x4], #64 // ............................................................................................e................................................................................. + add v30.4S, v21.4S, v23.4S // .........................................e.................................................................................................................................... + trn2 v9.4S, v25.4S, v29.4S // .................e............................................................................................................................................................ + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + trn1 v12.4S, v25.4S, v29.4S // ................e............................................................................................................................................................. + sqrdmulh v0.4S, v19.4S, v16.4S // .................................e............................................................................................................................................ + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + trn1 v28.2D, v9.2D, v15.2D // .......................e...................................................................................................................................................... 
+ mul v29.4S, v1.4S, v10.4S // .....................................e........................................................................................................................................ + // gap // .............................................................................................................................................................................. + ldr q22, [x5], #(12*16) // ........................e..................................................................................................................................................... + trn1 v5.2D, v12.2D, v20.2D // ......................e....................................................................................................................................................... + // gap // .............................................................................................................................................................................. + mul v14.4S, v19.4S, v18.4S // ................................e............................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + trn2 v7.2D, v9.2D, v15.2D // .....................e........................................................................................................................................................ 
+ sqrdmulh v16.4S, v1.4S, v2.4S // ......................................e....................................................................................................................................... + trn2 v13.2D, v12.2D, v20.2D // ....................e......................................................................................................................................................... + // gap // .............................................................................................................................................................................. + sub v1.4S, v5.4S, v28.4S // ........................................................e..................................................................................................................... + // gap // .............................................................................................................................................................................. + sub v12.4S, v21.4S, v23.4S // ........................................e..................................................................................................................................... + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + mls v14.4S, v0.4S, v8.S[0] // ..................................e........................................................................................................................................... + // gap // .............................................................................................................................................................................. 
+ // gap // .............................................................................................................................................................................. + mls v29.4S, v16.4S, v8.S[0] // .......................................e...................................................................................................................................... + sqrdmulh v20.4S, v1.4S, v6.4S // ...........................................................e.................................................................................................................. + mul v19.4S, v12.4S, v22.4S // ..........................................e................................................................................................................................... + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + sub v10.4S, v13.4S, v7.4S // .............................................................e................................................................................................................ + add v18.4S, v5.4S, v28.4S // .........................................................e.................................................................................................................... + mul v5.4S, v1.4S, v26.4S // ..........................................................e................................................................................................................... + // gap // .............................................................................................................................................................................. 
+ // gap // .............................................................................................................................................................................. + mul v9.4S, v10.4S, v24.4S // ...............................................................e.............................................................................................................. + ldr q6, [x4, #-48] // .............................................................................................e................................................................................ + sub v15.4S, v14.4S, v29.4S // .............................................e................................................................................................................................ + // gap // .............................................................................................................................................................................. + sqrdmulh v31.4S, v12.4S, v17.4S // ...........................................e.................................................................................................................................. + // gap // .............................................................................................................................................................................. + sqrdmulh v4.4S, v10.4S, v4.4S // ................................................................e............................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. 
+ // gap // .............................................................................................................................................................................. + mls v5.4S, v20.4S, v8.S[0] // ............................................................e................................................................................................................. + add v21.4S, v14.4S, v29.4S // ..............................................e............................................................................................................................... + ldr q23, [x4, #-16] // ...............................................................................................e.............................................................................. + sqrdmulh v24.4S, v15.4S, v17.4S // ................................................e............................................................................................................................. + mul v29.4S, v15.4S, v22.4S // ...............................................e.............................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + mls v9.4S, v4.4S, v8.S[0] // .................................................................e............................................................................................................ 
+ trn1 v14.4S, v30.4S, v21.4S // ............................................................................e................................................................................................. + add v12.4S, v13.4S, v7.4S // ..............................................................e............................................................................................................... + ldr q25, [x4, #-32] // ..............................................................................................e............................................................................... + // gap // .............................................................................................................................................................................. + mls v19.4S, v31.4S, v8.S[0] // ............................................e................................................................................................................................. + mls v29.4S, v24.4S, v8.S[0] // .................................................e............................................................................................................................ + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. 
+ trn2 v0.4S, v30.4S, v21.4S // .............................................................................e................................................................................................ + // gap // .............................................................................................................................................................................. + sub v2.4S, v5.4S, v9.4S // .......................................................................e...................................................................................................... + sub v30.4S, v18.4S, v12.4S // ..................................................................e........................................................................................................... + add v22.4S, v5.4S, v9.4S // ........................................................................e..................................................................................................... + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + sqrdmulh v13.4S, v2.4S, v11.4S // ..........................................................................e................................................................................................... + // gap // .............................................................................................................................................................................. 
+ mul v5.4S, v2.4S, v27.4S // .........................................................................e.................................................................................................... + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + sqrdmulh v26.4S, v30.4S, v11.4S // .....................................................................e........................................................................................................ + trn2 v16.4S, v19.4S, v29.4S // ...............................................................................e.............................................................................................. + trn1 v20.4S, v19.4S, v29.4S // ..............................................................................e............................................................................................... + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + mul v2.4S, v30.4S, v27.4S // ....................................................................e......................................................................................................... + // gap // .............................................................................................................................................................................. 
+ // gap // .............................................................................................................................................................................. + trn2 v28.2D, v0.2D, v16.2D // .................................................................................e............................................................................................ + add v19.4S, v18.4S, v12.4S // ...................................................................e.......................................................................................................... + trn2 v9.2D, v14.2D, v20.2D // ................................................................................e............................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + mls v5.4S, v13.4S, v8.S[0] // ...........................................................................e.................................................................................................. + trn1 v29.2D, v0.2D, v16.2D // ...................................................................................e.......................................................................................... + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. 
+ mls v2.4S, v26.4S, v8.S[0] // ......................................................................e....................................................................................................... + // gap // .............................................................................................................................................................................. + sub v27.4S, v9.4S, v28.4S // .....................................................................................................e........................................................................ + // gap // .............................................................................................................................................................................. + add v26.4S, v9.4S, v28.4S // ......................................................................................................e....................................................................... + trn1 v21.2D, v14.2D, v20.2D // ..................................................................................e........................................................................................... + // gap // .............................................................................................................................................................................. + trn2 v10.4S, v19.4S, v22.4S // .....................................................................................e........................................................................................ + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. 
+ trn1 v14.4S, v19.4S, v22.4S // ....................................................................................e......................................................................................... + // gap // .............................................................................................................................................................................. + mul v9.4S, v27.4S, v25.S[0] // .......................................................................................................e...................................................................... + trn1 v22.4S, v2.4S, v5.4S // ......................................................................................e....................................................................................... + add v16.4S, v21.4S, v29.4S // .................................................................................................e............................................................................ + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + trn2 v28.4S, v2.4S, v5.4S // .......................................................................................e...................................................................................... + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. 
+ sub v15.4S, v21.4S, v29.4S // ................................................................................................e............................................................................. + sqrdmulh v21.4S, v27.4S, v25.S[1] // ........................................................................................................e..................................................................... + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + trn1 v20.2D, v14.2D, v22.2D // ..........................................................................................e................................................................................... + add v17.4S, v16.4S, v26.4S // .....................................................................................................................e........................................................ + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + trn1 v2.2D, v10.2D, v28.2D // ...........................................................................................e.................................................................................. + trn2 v22.2D, v14.2D, v22.2D // ........................................................................................e..................................................................................... 
+ // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + trn2 v14.2D, v10.2D, v28.2D // .........................................................................................e.................................................................................... + mul v29.4S, v15.4S, v6.S[2] // ..................................................................................................e........................................................................... + sub v13.4S, v20.4S, v2.4S // ..........................................................................................................e................................................................... + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + sqrdmulh v11.4S, v15.4S, v6.S[3] // ...................................................................................................e.......................................................................... + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. 
+ add v4.4S, v22.4S, v14.4S // ................................................................................................................e............................................................. + // gap // .............................................................................................................................................................................. + sub v5.4S, v22.4S, v14.4S // ...............................................................................................................e.............................................................. + sqrdmulh v31.4S, v13.4S, v25.S[3] // .............................................................................................................e................................................................ + // gap // .............................................................................................................................................................................. + mul v0.4S, v13.4S, v25.S[2] // ............................................................................................................e................................................................. + add v12.4S, v20.4S, v2.4S // ...........................................................................................................e.................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. 
+ sqrdmulh v28.4S, v5.4S, v23.S[1] // ..................................................................................................................e........................................................... + mul v15.4S, v5.4S, v23.S[0] // .................................................................................................................e............................................................ + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + mls v29.4S, v11.4S, v8.S[0] // ....................................................................................................e......................................................................... + // gap // .............................................................................................................................................................................. + add v13.4S, v12.4S, v4.4S // ...............................................................................................................................e.............................................. + // gap // .............................................................................................................................................................................. + mls v9.4S, v21.4S, v8.S[0] // .........................................................................................................e.................................................................... + mls v0.4S, v31.4S, v8.S[0] // ..............................................................................................................e............................................................... 
+ // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + sub v21.4S, v12.4S, v4.4S // ..............................................................................................................................e............................................... + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + mls v15.4S, v28.4S, v8.S[0] // ...................................................................................................................e.......................................................... + srshr v30.4S, v17.4S, #23 // ........................................................................................................................................e..................................... + // gap // .............................................................................................................................................................................. + sub v14.4S, v16.4S, v26.4S // ....................................................................................................................e......................................................... + // gap // .............................................................................................................................................................................. 
+ // gap // .............................................................................................................................................................................. + sub v10.4S, v29.4S, v9.4S // .........................................................................................................................e.................................................... + // gap // .............................................................................................................................................................................. + srshr v7.4S, v13.4S, #23 // ............................................................................................................................................e................................. + sub v16.4S, v0.4S, v15.4S // ...................................................................................................................................e.......................................... + // gap // .............................................................................................................................................................................. + mul v31.4S, v14.4S, v3.S[2] // ......................................................................................................................e....................................................... + // gap // .............................................................................................................................................................................. + mls v17.4S, v30.4S, v8.4S // .........................................................................................................................................e.................................... + add v5.4S, v0.4S, v15.4S // ....................................................................................................................................e......................................... 
+ // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + mls v13.4S, v7.4S, v8.4S // .............................................................................................................................................e................................ + // gap // .............................................................................................................................................................................. + mul v19.4S, v10.4S, v3.S[2] // ...........................................................................................................................e.................................................. + // gap // .............................................................................................................................................................................. + sqrdmulh v4.4S, v16.4S, v6.S[1] // ......................................................................................................................................e....................................... + add v27.4S, v29.4S, v9.4S // ..........................................................................................................................e................................................... + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. 
+ sqrdmulh v9.4S, v14.4S, v3.S[3] // .......................................................................................................................e...................................................... + sqrdmulh v10.4S, v10.4S, v3.S[3] // ............................................................................................................................e................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + srshr v1.4S, v5.4S, #23 // ..............................................................................................................................................e............................... + // gap // .............................................................................................................................................................................. + sub v12.4S, v17.4S, v13.4S // ................................................................................................................................................e............................. + // gap // .............................................................................................................................................................................. + srshr v7.4S, v27.4S, #23 // ..........................................................................................................................................e................................... 
+ sqrdmulh v26.4S, v21.4S, v6.S[1] // .................................................................................................................................e............................................ + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + mul v18.4S, v12.4S, v3.S[0] // ..................................................................................................................................................e........................... + sqrdmulh v14.4S, v12.4S, v3.S[1] // ...................................................................................................................................................e.......................... + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + mul v0.4S, v21.4S, v6.S[0] // ................................................................................................................................e............................................. + mls v5.4S, v1.4S, v8.4S // ...............................................................................................................................................e.............................. + // gap // .............................................................................................................................................................................. 
+ // gap // .............................................................................................................................................................................. + mls v27.4S, v7.4S, v8.4S // ...........................................................................................................................................e.................................. + mul v1.4S, v16.4S, v6.S[0] // .....................................................................................................................................e........................................ + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + mls v18.4S, v14.4S, v8.S[0] // ....................................................................................................................................................e......................... + mls v19.4S, v10.4S, v8.S[0] // .............................................................................................................................e................................................ + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + mls v31.4S, v9.4S, v8.S[0] // ........................................................................................................................e..................................................... 
+ mls v0.4S, v26.4S, v8.S[0] // ..................................................................................................................................e........................................... + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + mls v1.4S, v4.4S, v8.S[0] // .......................................................................................................................................e...................................... + // gap // .............................................................................................................................................................................. + add v22.4S, v27.4S, v5.4S // ......................................................................................................................................................e....................... + // gap // .............................................................................................................................................................................. + add v28.4S, v17.4S, v13.4S // .................................................................................................................................................e............................ + // gap // .............................................................................................................................................................................. + str q18, [x2], #(16*4) // ........................................................................................................................................................................e..... 
+ sub v20.4S, v27.4S, v5.4S // .....................................................................................................................................................e........................ + str q22, [x1, #16] // .....................................................................................................................................................................e........ + sub v10.4S, v31.4S, v0.4S // ..........................................................................................................................................................e................... + // gap // .............................................................................................................................................................................. + add v5.4S, v31.4S, v0.4S // ...........................................................................................................................................................e.................. + str q28, [x1], #(16*4) // ....................................................................................................................................................................e......... + // gap // .............................................................................................................................................................................. + add v14.4S, v19.4S, v1.4S // ................................................................................................................................................................e............. + sub v17.4S, v19.4S, v1.4S // ...............................................................................................................................................................e.............. 
+ mul v9.4S, v10.4S, v3.S[0] // ............................................................................................................................................................e................. + sqrdmulh v30.4S, v10.4S, v3.S[1] // .............................................................................................................................................................e................ + str q5, [x1, #-32] // ......................................................................................................................................................................e....... + // gap // .............................................................................................................................................................................. + + // original source code + // ldr q9, [x1, #0] // .....e....................................................................................................................................................................|........e................. + // ldr q10, [x1, #16] // ......e...................................................................................................................................................................|.........e................ + // ldr q11, [x1, #32] // .e........................................................................................................................................................................|....e..................... + // ldr q12, [x1, #48] // e.........................................................................................................................................................................|...e...................... + // trn1 v25.4s, v9.4s, v10.4s // ...................e......................................................................................................................................................|......................e... 
+ // trn2 v26.4s, v9.4s, v10.4s // ....................e.....................................................................................................................................................|.......................e.. + // trn1 v27.4s, v11.4s, v12.4s // ...............e..........................................................................................................................................................|..................e....... + // trn2 v28.4s, v11.4s, v12.4s // .............e............................................................................................................................................................|................e......... + // trn2 v11.2d, v25.2d, v27.2d // ...........................e..............................................................................................................................................|.......................... + // trn2 v12.2d, v26.2d, v28.2d // ..........................e...............................................................................................................................................|.......................... + // trn1 v9.2d, v25.2d, v27.2d // ...............................e..........................................................................................................................................|.......................... + // trn1 v10.2d, v26.2d, v28.2d // .............................e............................................................................................................................................|.......................... + // ldr q13, [x2, #0] // ..............................e...........................................................................................................................................|.......................... 
+ // ldr q14, [x2, #16] // ............................e.............................................................................................................................................|.......................... + // ldr q15, [x2, #32] // ........................e.................................................................................................................................................|.......................... + // ldr q16, [x2, #48] // .........................e................................................................................................................................................|.......................... + // trn1 v25.4s, v13.4s, v14.4s // .............................................e............................................................................................................................|.......................... + // trn2 v26.4s, v13.4s, v14.4s // ............................................e.............................................................................................................................|.......................... + // trn1 v27.4s, v15.4s, v16.4s // .........................................e................................................................................................................................|.......................... + // trn2 v28.4s, v15.4s, v16.4s // .......................................e..................................................................................................................................|.......................... + // trn2 v15.2d, v25.2d, v27.2d // ......................................................e...................................................................................................................|.......................... 
+ // trn2 v16.2d, v26.2d, v28.2d // ....................................................e.....................................................................................................................|.......................... + // trn1 v13.2d, v25.2d, v27.2d // ..................................................e.......................................................................................................................|.......................... + // trn1 v14.2d, v26.2d, v28.2d // ...............................................e..........................................................................................................................|.......................... + // ldr q0, [x5], #(12*16) // .................................................e........................................................................................................................|.......................... + // ldr q4, [x5, #(-12*16 + 1*16)] // ..............e...........................................................................................................................................................|.................e........ + // ldr q1, [x5, #(-12*16 + 2*16)] // ........................................e.................................................................................................................................|.......................... + // ldr q5, [x5, #(-12*16 + 3*16)] // ................................e.........................................................................................................................................|.......................... + // ldr q2, [x5, #(-12*16 + 4*16)] // ......................................e...................................................................................................................................|.......................... 
+ // ldr q6, [x5, #(-12*16 + 5*16)] // ........e.................................................................................................................................................................|...........e.............. + // sub v24.4s, v9.4s, v10.4s // ....................................e.....................................................................................................................................|.......................... + // add v9.4s, v9.4s, v10.4s // .....................................e....................................................................................................................................|.......................... + // mul v10.4s, v24.4s, v1.4s // ...................................................e......................................................................................................................|.......................... + // sqrdmulh v24.4s, v24.4s, v5.4s // ..............................................e...........................................................................................................................|.......................... + // mls v10.4s, v24.4s, v8.s[0] // .........................................................e................................................................................................................|.......................... + // sub v24.4s, v11.4s, v12.4s // .................................e........................................................................................................................................|.......................... + // add v11.4s, v11.4s, v12.4s // ..................................e.......................................................................................................................................|.......................... 
+ // mul v12.4s, v24.4s, v2.4s // ................................................e.........................................................................................................................|.......................... + // sqrdmulh v24.4s, v24.4s, v6.4s // .....................................................e....................................................................................................................|.......................... + // mls v12.4s, v24.4s, v8.s[0] // ..........................................................e...............................................................................................................|.......................... + // sub v24.4s, v9.4s, v11.4s // ........................................................e.................................................................................................................|.......................... + // add v9.4s, v9.4s, v11.4s // ...........................................e..............................................................................................................................|.......................... + // mul v11.4s, v24.4s, v0.4s // ............................................................e.............................................................................................................|.......................... + // sqrdmulh v24.4s, v24.4s, v4.4s // ...................................................................e......................................................................................................|.......................... + // mls v11.4s, v24.4s, v8.s[0] // ..............................................................................e...........................................................................................|.......................... 
+ // sub v24.4s, v10.4s, v12.4s // ..................................................................e.......................................................................................................|.......................... + // add v10.4s, v10.4s, v12.4s // ......................................................................e...................................................................................................|.......................... + // mul v12.4s, v24.4s, v0.4s // .........................................................................e................................................................................................|.......................... + // sqrdmulh v24.4s, v24.4s, v4.4s // ........................................................................e.................................................................................................|.......................... + // mls v12.4s, v24.4s, v8.s[0] // ...............................................................................e..........................................................................................|.......................... + // ldr q0, [x5, #(-12*16 + 6*16)] // ...................................e......................................................................................................................................|.......................... + // ldr q4, [x5, #(-12*16 + 7*16)] // ...........e..............................................................................................................................................................|..............e........... + // ldr q1, [x5, #(-12*16 + 8*16)] // ..........e...............................................................................................................................................................|.............e............ 
+ // ldr q5, [x5, #(-12*16 + 9*16)] // .......................e..................................................................................................................................................|.......................... + // ldr q2, [x5, #(-12*16 + 10*16)] // ..................e.......................................................................................................................................................|.....................e.... + // ldr q6, [x5, #(-12*16 + 11*16)] // .......e..................................................................................................................................................................|..........e............... + // sub v24.4s, v13.4s, v14.4s // .......................................................e..................................................................................................................|.......................... + // add v13.4s, v13.4s, v14.4s // ..............................................................e...........................................................................................................|.......................... + // mul v14.4s, v24.4s, v1.4s // ...............................................................e..........................................................................................................|.......................... + // sqrdmulh v24.4s, v24.4s, v5.4s // ...........................................................e..............................................................................................................|.......................... + // mls v14.4s, v24.4s, v8.s[0] // .....................................................................e....................................................................................................|.......................... 
+ // sub v24.4s, v15.4s, v16.4s // .............................................................e............................................................................................................|.......................... + // add v15.4s, v15.4s, v16.4s // ............................................................................e.............................................................................................|.......................... + // mul v16.4s, v24.4s, v2.4s // ................................................................e.........................................................................................................|.......................... + // sqrdmulh v24.4s, v24.4s, v6.4s // ....................................................................e.....................................................................................................|.......................... + // mls v16.4s, v24.4s, v8.s[0] // ..........................................................................e...............................................................................................|.......................... + // sub v24.4s, v13.4s, v15.4s // ..................................................................................e.......................................................................................|.......................... + // add v13.4s, v13.4s, v15.4s // ...........................................................................................e..............................................................................|.......................... + // mul v15.4s, v24.4s, v0.4s // .........................................................................................e................................................................................|.......................... 
+ // sqrdmulh v24.4s, v24.4s, v4.4s // ......................................................................................e...................................................................................|.......................... + // mls v15.4s, v24.4s, v8.s[0] // ...............................................................................................e..........................................................................|.......................... + // sub v24.4s, v14.4s, v16.4s // .................................................................................e........................................................................................|.......................... + // add v14.4s, v14.4s, v16.4s // ...................................................................................e......................................................................................|.......................... + // mul v16.4s, v24.4s, v0.4s // .....................................................................................e....................................................................................|.......................... + // sqrdmulh v24.4s, v24.4s, v4.4s // ....................................................................................e.....................................................................................|.......................... + // mls v16.4s, v24.4s, v8.s[0] // .............................................................................................e............................................................................|.......................... + // trn1 v25.4s, v9.4s, v10.4s // ...........................................................................e..............................................................................................|.......................... 
+ // trn2 v26.4s, v9.4s, v10.4s // ................................................................................e.........................................................................................|.......................... + // trn1 v27.4s, v11.4s, v12.4s // ........................................................................................e.................................................................................|.......................... + // trn2 v28.4s, v11.4s, v12.4s // .......................................................................................e..................................................................................|.......................... + // trn2 v11.2d, v25.2d, v27.2d // ............................................................................................e.............................................................................|.......................... + // trn2 v12.2d, v26.2d, v28.2d // ..........................................................................................e...............................................................................|.......................... + // trn1 v9.2d, v25.2d, v27.2d // ..................................................................................................e.......................................................................|.......................... + // trn1 v10.2d, v26.2d, v28.2d // ..............................................................................................e...........................................................................|.......................... + // trn1 v25.4s, v13.4s, v14.4s // ....................................................................................................e.....................................................................|.......................... 
+ // trn2 v26.4s, v13.4s, v14.4s // ...................................................................................................e......................................................................|.......................... + // trn1 v27.4s, v15.4s, v16.4s // ......................................................................................................e...................................................................|.......................... + // trn2 v28.4s, v15.4s, v16.4s // ........................................................................................................e.................................................................|.......................... + // trn2 v15.2d, v25.2d, v27.2d // ..............................................................................................................e...........................................................|.......................... + // trn2 v16.2d, v26.2d, v28.2d // ...............................................................................................................e..........................................................|.......................... + // trn1 v13.2d, v25.2d, v27.2d // ...........................................................................................................e..............................................................|.......................... + // trn1 v14.2d, v26.2d, v28.2d // .............................................................................................................e............................................................|.......................... + // ldr q0, [x4], #64 // ..........................................e...............................................................................................................................|.......................... 
+ // ldr q1, [x4, #(-64 + 16)] // .................................................................e........................................................................................................|.......................... + // ldr q2, [x4, #(-64 + 32)] // .............................................................................e............................................................................................|.......................... + // ldr q3, [x4, #(-64 + 48)] // .......................................................................e..................................................................................................|.......................... + // sub v24.4s, v9.4s, v10.4s // .........................................................................................................e................................................................|.......................... + // add v9.4s, v9.4s, v10.4s // .......................................................................................................e..................................................................|.......................... + // mul v10.4s, v24.4s, v1.s[2] // ................................................................................................................e.........................................................|.......................... + // sqrdmulh v24.4s, v24.4s, v1.s[3] // ..................................................................................................................e.......................................................|.......................... + // mls v10.4s, v24.4s, v8.s[0] // ..........................................................................................................................e...............................................|.......................... 
+ // sub v24.4s, v11.4s, v12.4s // ................................................................................................e.........................................................................|.......................... + // add v11.4s, v11.4s, v12.4s // .................................................................................................e........................................................................|.......................... + // mul v12.4s, v24.4s, v2.s[0] // .....................................................................................................e....................................................................|.......................... + // sqrdmulh v24.4s, v24.4s, v2.s[1] // ..........................................................................................................e...............................................................|.......................... + // mls v12.4s, v24.4s, v8.s[0] // ............................................................................................................................e.............................................|.......................... + // sub v24.4s, v13.4s, v14.4s // .................................................................................................................e........................................................|.......................... + // add v13.4s, v13.4s, v14.4s // .......................................................................................................................e..................................................|.......................... + // mul v14.4s, v24.4s, v2.s[2] // ......................................................................................................................e...................................................|.......................... 
+ // sqrdmulh v24.4s, v24.4s, v2.s[3] // .....................................................................................................................e....................................................|.......................... + // mls v14.4s, v24.4s, v8.s[0] // .............................................................................................................................e............................................|.......................... + // sub v24.4s, v15.4s, v16.4s // ....................................................................................................................e.....................................................|.......................... + // add v15.4s, v15.4s, v16.4s // ...................................................................................................................e......................................................|.......................... + // mul v16.4s, v24.4s, v3.s[0] // .........................................................................................................................e................................................|.......................... + // sqrdmulh v24.4s, v24.4s, v3.s[1] // ........................................................................................................................e.................................................|.......................... + // mls v16.4s, v24.4s, v8.s[0] // ...............................................................................................................................e..........................................|.......................... + // sub v24.4s, v9.4s, v11.4s // .................................................................................................................................e........................................|.......................... 
+ // add v9.4s, v9.4s, v11.4s // ............................................................................................................e.............................................................|.......................... + // mul v11.4s, v24.4s, v0.s[2] // .....................................................................................................................................e....................................|.......................... + // sqrdmulh v24.4s, v24.4s, v0.s[3] // ............................................................................................................................................e.............................|.......................... + // mls v11.4s, v24.4s, v8.s[0] // ..........................................................................................................................................................e...............|.......................... + // sub v24.4s, v10.4s, v12.4s // ..................................................................................................................................e.......................................|.......................... + // add v10.4s, v10.4s, v12.4s // ...........................................................................................................................................e..............................|.......................... + // mul v12.4s, v24.4s, v0.s[2] // .........................................................................................................................................e................................|.......................... + // sqrdmulh v24.4s, v24.4s, v0.s[3] // .............................................................................................................................................e............................|.......................... 
+ // mls v12.4s, v24.4s, v8.s[0] // .........................................................................................................................................................e................|.......................... + // sub v24.4s, v13.4s, v15.4s // ..............................................................................................................................e...........................................|.......................... + // add v13.4s, v13.4s, v15.4s // ...........................................................................................................................e..............................................|.......................... + // mul v15.4s, v24.4s, v1.s[0] // ....................................................................................................................................................e.....................|.......................... + // sqrdmulh v24.4s, v24.4s, v1.s[1] // .................................................................................................................................................e........................|.......................... + // mls v15.4s, v24.4s, v8.s[0] // ...........................................................................................................................................................e..............|.......................... + // sub v24.4s, v14.4s, v16.4s // ....................................................................................................................................e.....................................|.......................... + // add v14.4s, v14.4s, v16.4s // .......................................................................................................................................e..................................|.......................... 
+ // mul v16.4s, v24.4s, v1.s[0] // .......................................................................................................................................................e..................|.......................... + // sqrdmulh v24.4s, v24.4s, v1.s[1] // ..........................................................................................................................................e...............................|.......................... + // mls v16.4s, v24.4s, v8.s[0] // ............................................................................................................................................................e.............|.......................... + // srshr v24.4S, v9.4S, #23 // ................................................................................................................................e.........................................|.......................... + // mls v9.4s, v24.4s, v8.4s // ......................................................................................................................................e...................................|.......................... + // srshr v24.4S, v10.4S, #23 // ................................................................................................................................................e.........................|.......................... + // mls v10.4s, v24.4s, v8.4s // ......................................................................................................................................................e...................|.......................... + // srshr v24.4S, v13.4S, #23 // ...................................................................................................................................e......................................|.......................... 
+ // mls v13.4s, v24.4s, v8.4s // ........................................................................................................................................e.................................|.......................... + // srshr v24.4S, v14.4S, #23 // ..............................................................................................................................................e...........................|.......................... + // mls v14.4s, v24.4s, v8.4s // .....................................................................................................................................................e....................|.......................... + // sub v24.4s, v9.4s, v13.4s // ...............................................................................................................................................e..........................|.......................... + // add v9.4s, v9.4s, v13.4s // ..............................................................................................................................................................e...........|.......................... + // mul v13.4s, v24.4s, v0.s[0] // ..................................................................................................................................................e.......................|.......................... + // sqrdmulh v24.4s, v24.4s, v0.s[1] // ...................................................................................................................................................e......................|.......................... + // mls v13.4s, v24.4s, v8.s[0] // ........................................................................................................................................................e.................|.......................... 
+ // sub v24.4s, v10.4s, v14.4s // ................................................................................................................................................................e.........|.......................... + // add v10.4s, v10.4s, v14.4s // .............................................................................................................................................................e............|.......................... + // mul v14.4s, v24.4s, v0.s[0] // ..*.......................................................................................................................................................................|.....*.................... + // sqrdmulh v24.4s, v24.4s, v0.s[1] // ...*......................................................................................................................................................................|......*................... + // mls v14.4s, v24.4s, v8.s[0] // ............*.............................................................................................................................................................|...............*.......... + // sub v24.4s, v11.4s, v15.4s // ..................................................................................................................................................................e.......|.......................... + // add v11.4s, v11.4s, v15.4s // ...................................................................................................................................................................e......|.......................... + // mul v15.4s, v24.4s, v0.s[0] // .......................................................................................................................................................................e..|.......................... 
+ // sqrdmulh v24.4s, v24.4s, v0.s[1] // ........................................................................................................................................................................e.|.......................... + // mls v15.4s, v24.4s, v8.s[0] // ....*.....................................................................................................................................................................|.......*.................. + // sub v24.4s, v12.4s, v16.4s // ......................................................................................................................................................................e...|.......................... + // add v12.4s, v12.4s, v16.4s // .....................................................................................................................................................................e....|.......................... + // mul v16.4s, v24.4s, v0.s[0] // ..........................................................................................................................................................................|..*....................... + // sqrdmulh v24.4s, v24.4s, v0.s[1] // ..........................................................................................................................................................................|.*........................ + // mls v16.4s, v24.4s, v8.s[0] // .........*................................................................................................................................................................|............*............. + // str q9, [x1], #(16*4) // ....................................................................................................................................................................e.....|.......................... 
+ // str q10, [x1, #(-16*4 + 1*16)] // .................................................................................................................................................................e........|.......................... + // str q11, [x1, #(-16*4 + 2*16)] // .........................................................................................................................................................................e|.......................... + // str q12, [x1, #(-16*4 + 3*16)] // ..........................................................................................................................................................................*.......................... + // str q13, [x2], #(16*4) // ...............................................................................................................................................................e..........|.......................... + // str q14, [x2, #(-16*4 + 1*16)] // .....................*....................................................................................................................................................|........................*. + // str q15, [x2, #(-16*4 + 2*16)] // ................*.........................................................................................................................................................|...................*...... + // str q16, [x2, #(-16*4 + 3*16)] // .................*........................................................................................................................................................|....................*..... + // add x1, x1, #64 // ..........................................................................................................................................................................|*......................... 
+ // add x2, x2, #64 // ......................*...................................................................................................................................................|.........................* + + sub count, count, #1 + cbnz count, layer45678_start + // gap // ............. + // gap // ............. + sqrdmulh v19.4S, v17.4S, v3.S[1] // ..*.......... + mul v17.4S, v17.4S, v3.S[0] // ...*......... + // gap // ............. + // gap // ............. + mul v29.4S, v20.4S, v3.S[0] // ....*........ + sqrdmulh v10.4S, v20.4S, v3.S[1] // .....*....... + // gap // ............. + // gap // ............. + // gap // ............. + mls v9.4S, v30.4S, v8.S[0] // ......*...... + // gap // ............. + // gap // ............. + // gap // ............. + mls v17.4S, v19.4S, v8.S[0] // .......*..... + // gap // ............. + // gap // ............. + str q14, [x1, #-16] // *............ + mls v29.4S, v10.4S, v8.S[0] // ........*.... + // gap // ............. + // gap // ............. + // gap // ............. + str q9, [x2, #-32] // .........*... + // gap // ............. + // gap // ............. + // gap // ............. + str q17, [x2, #-16] // ..........*.. + // gap // ............. + add x1, x1, #64 // .*........... + str q29, [x2, #-48] // ...........*. + add x2, x2, #64 // ............* + + // original source code + // str q14, [x1, #-16] // ......*...... + // add x1, x1, #64 // ..........*.. + // sqrdmulh v0.4S, v17.4S, v3.S[1] // *............ + // mul v19.4S, v17.4S, v3.S[0] // .*........... + // mul v1.4S, v20.4S, v3.S[0] // ..*.......... + // sqrdmulh v25.4S, v20.4S, v3.S[1] // ...*......... + // mls v9.4S, v30.4S, v8.S[0] // ....*........ + // mls v19.4S, v0.4S, v8.S[0] // .....*....... + // mls v1.4S, v25.4S, v8.S[0] // .......*..... + // str q9, [x2, #-32] // ........*.... + // str q19, [x2, #-16] // .........*... + // str q1, [x2, #-48] // ...........*. 
+        // add x2, x2, #64 // ............*
+
+
+// -----------------------------------------------------------------------------
+// Setup and prologue for the loop at layer123_start: constants, loop counter, roots, and the first loads/arithmetic done ahead of the loop.
+        ninv .req v25 // alias for v25; used as a `mul` operand in the layer123 loop body
+        ninv_tw .req v26 // alias for v26; used as the matching `sqrdmulh` operand in the layer123 loop body
+        modulus_half .req v30 // alias for v30; one of the two `cmge` bounds in the layer123 loop body
+        neg_modulus_half .req v31 // alias for v31; the other `cmge` bound in the layer123 loop body
+
+        ASM_LOAD(xtmp, ninv_addr) // NOTE(review): presumably puts the address of ninv_addr in xtmp; macro defined elsewhere - confirm
+        ld1r {ninv.4s}, [xtmp] // load one 32-bit word from [xtmp] and replicate it into all four lanes
+        ASM_LOAD(xtmp, ninv_tw_addr) // NOTE(review): presumably puts the address of ninv_tw_addr in xtmp - confirm
+        ld1r {ninv_tw.4s}, [xtmp] // same replicate-load for the second constant
+
+        ushr modulus_half.4S, modulus.4S, #1 // per 32-bit lane: modulus >> 1 (unsigned shift); `modulus` alias is defined outside this section
+        neg neg_modulus_half.4S, modulus_half.4S // per lane: -(modulus >> 1)
+
+        mov count, #8 // loop counter; it is decremented once more right before the layer123_start label
+        ASM_LOAD(r_ptr0, roots_l012) // NOTE(review): presumably points r_ptr0 at the roots_l012 table - confirm
+        load_roots_123 // macro defined elsewhere; NOTE(review): presumably fills the root registers (v0-v3 are read by-lane below) - confirm
+// Prologue: six loads from x0+256..x0+896 plus the first sub/add/mul/sqrdmulh steps, issued before the loop is entered.
+        .p2align 2
+        ldr q28, [x0, #256] // .....*........
+        // gap // ..............
+        // gap // ..............
+        ldr q9, [x0, #384] // .*............
+        ldr q14, [x0, #512] // ..*...........
+        // gap // ..............
+        // gap // ..............
+        ldr q19, [x0, #640] // ....*.........
+        ldr q22, [x0, #768] // ...*..........
+        // gap // ..............
+        // gap // ..............
+        // gap // ..............
+        // gap // ..............
+        // gap // ..............
+        // gap // ..............
+        ldr q16, [x0, #896] // *.............
+        sub v15.4S, v28.4S, v9.4S // .......*......
+        add v6.4S, v28.4S, v9.4S // .............*
+        // gap // ..............
+        // gap // ..............
+        add v9.4S, v14.4S, v19.4S // ............*.
+        // gap // ..............
+        // gap // ..............
+        sub v4.4S, v14.4S, v19.4S // ......*.......
+        // gap // ..............
+        mul v17.4S, v15.4S, v2.S[0] // ...........*..
+        sqrdmulh v21.4S, v15.4S, v2.S[1] // ..........*...
+        // gap // ..............
+        // gap // ..............
+        // gap // ..............
+        mul v28.4S, v4.4S, v2.S[2] // ........*.....
+        add v7.4S, v22.4S, v16.4S // .........*....
+// Reading the columns: above, the '*' column is the instruction's index in the listing below; below, it is the index in the order above.
+        // original source code
+        // ldr q16, [x0, #896] // .....*........
+        // ldr q27, [x0, #384] // .*............
+        // ldr q20, [x0, #512] // ..*...........
+        // ldr q22, [x0, #768] // ....*.........
+        // ldr q6, [x0, #640] // ...*..........
+        // ldr q18, [x0, #256] // *.............
+        // sub v4.4S, v20.4S, v6.4S // .........*....
+        // sub v19.4S, v18.4S, v27.4S // ......*.......
+ // mul v28.4S, v4.4S, v2.S[2] // ............*. + // add v7.4S, v22.4S, v16.4S // .............* + // sqrdmulh v21.4S, v19.4S, v2.S[1] // ...........*.. + // mul v17.4S, v19.4S, v2.S[0] // ..........*... + // add v9.4S, v20.4S, v6.4S // ........*..... + // add v6.4S, v18.4S, v27.4S // .......*...... + + sub count, count, #1 +layer123_start: + sqrdmulh v19.4S, v4.4S, v2.S[3] // .....................*.................................................................................................. + ldr q20, [x0, #128] // .*...................................................................................................................... + sub v23.4S, v22.4S, v16.4S // .......................*................................................................................................ + ldr q4, [x0, #0] // *....................................................................................................................... + mls v17.4S, v21.4S, v8.S[0] // .................*...................................................................................................... + // gap // ........................................................................................................................ + ldr q16, [x0, #912] // .......e................................................................................................................ + sub v10.4S, v9.4S, v7.4S // ......................................*................................................................................. + ldr q27, [x0, #400] // ...e.................................................................................................................... + // gap // ........................................................................................................................ + sqrdmulh v5.4S, v23.4S, v3.S[1] // ..........................*............................................................................................. 
+ add v24.4S, v9.4S, v7.4S // .......................................*................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + mul v15.4S, v23.4S, v3.S[0] // .........................*.............................................................................................. + mls v28.4S, v19.4S, v8.S[0] // ......................*................................................................................................. + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + add v19.4S, v4.4S, v20.4S // .........*.............................................................................................................. + sqrdmulh v11.4S, v10.4S, v1.S[1] // .........................................*.............................................................................. + mul v13.4S, v10.4S, v1.S[0] // ........................................*............................................................................... + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + sub v22.4S, v4.4S, v20.4S // ........*............................................................................................................... 
+ add v4.4S, v19.4S, v6.4S // .............................*.......................................................................................... + ldr q20, [x0, #528] // ....e................................................................................................................... + // gap // ........................................................................................................................ + mls v15.4S, v5.4S, v8.S[0] // ...........................*............................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + mul v12.4S, v22.4S, v1.S[2] // ..........*............................................................................................................. + sqrdmulh v10.4S, v22.4S, v1.S[3] // ...........*............................................................................................................ + add v9.4S, v4.4S, v24.4S // .................................................*...................................................................... + sub v7.4S, v19.4S, v6.4S // ............................*........................................................................................... + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ 
+ sub v14.4S, v4.4S, v24.4S // ................................................*....................................................................... + sub v6.4S, v28.4S, v15.4S // ...........................................*............................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + sqrdmulh v21.4S, v7.4S, v0.S[3] // ...............................*........................................................................................ + mls v12.4S, v10.4S, v8.S[0] // ............*........................................................................................................... + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + mul v18.4S, v6.4S, v1.S[0] // .............................................*.......................................................................... + sqrdmulh v4.4S, v6.4S, v1.S[1] // ..............................................*......................................................................... + ldr q22, [x0, #784] // ......e................................................................................................................. + // gap // ........................................................................................................................ + sqrdmulh v10.4S, v14.4S, v0.S[1] // ...................................................*.................................................................... 
+ mul v5.4S, v7.4S, v0.S[2] // ..............................*......................................................................................... + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + sqrdmulh v23.4S, v9.4S, v26.4S // .........................................................................................*.............................. + add v24.4S, v28.4S, v15.4S // ............................................*........................................................................... + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + mls v18.4S, v4.4S, v8.S[0] // ...............................................*........................................................................ + add v6.4S, v12.4S, v17.4S // ..................................*..................................................................................... + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + mls v5.4S, v21.4S, v8.S[0] // ................................*....................................................................................... + sub v4.4S, v12.4S, v17.4S // .................................*...................................................................................... 
+ // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + add v19.4S, v6.4S, v24.4S // ......................................................*................................................................. + mls v13.4S, v11.4S, v8.S[0] // ..........................................*............................................................................. + sub v7.4S, v6.4S, v24.4S // .....................................................*.................................................................. + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + mul v12.4S, v9.4S, v25.4S // ........................................................................................*............................... + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + mul v15.4S, v19.4S, v25.4S // ...........................................................................................*............................ + sqrdmulh v24.4S, v19.4S, v26.4S // ............................................................................................*........................... + sqrdmulh v6.4S, v4.4S, v0.S[3] // ....................................*................................................................................... 
+ // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + add v9.4S, v5.4S, v13.4S // ...........................................................*............................................................ + mul v19.4S, v7.4S, v0.S[0] // .......................................................*................................................................ + mul v28.4S, v4.4S, v0.S[2] // ...................................*.................................................................................... + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + sqrdmulh v4.4S, v9.4S, v26.4S // ...............................................................................................*........................ + mls v15.4S, v24.4S, v8.S[0] // .............................................................................................*.......................... + sub v21.4S, v5.4S, v13.4S // ..........................................................*............................................................. + mul v13.4S, v9.4S, v25.4S // ..............................................................................................*......................... 
+ // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + sqrdmulh v9.4S, v7.4S, v0.S[1] // ........................................................*............................................................... + mls v28.4S, v6.4S, v8.S[0] // .....................................*.................................................................................. + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + cmge v24.4S, v31.4S, v15.4S // ........................................................................................................*............... + mul v14.4S, v14.4S, v0.S[0] // ..................................................*..................................................................... + // gap // ........................................................................................................................ + ldr q6, [x0, #656] // .....e.................................................................................................................. + mls v13.4S, v4.4S, v8.S[0] // ................................................................................................*....................... + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + cmge v7.4S, v15.4S, v30.4S // .........................................................................................................*.............. 
+ add v11.4S, v28.4S, v18.4S // ................................................................*....................................................... + sub v4.4S, v28.4S, v18.4S // ...............................................................*........................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + mls v14.4S, v10.4S, v8.S[0] // ....................................................*................................................................... + mul v5.4S, v21.4S, v0.S[0] // ............................................................*........................................................... + // gap // ........................................................................................................................ + cmge v28.4S, v13.4S, v30.4S // .............................................................................................................*.......... + sqrdmulh v10.4S, v11.4S, v26.4S // ..................................................................................................*..................... + // gap // ........................................................................................................................ + cmge v17.4S, v31.4S, v13.4S // ............................................................................................................*........... 
+ mul v18.4S, v11.4S, v25.4S // .................................................................................................*...................... + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + mls v19.4S, v9.4S, v8.S[0] // .........................................................*.............................................................. + mul v9.4S, v4.4S, v0.S[0] // .................................................................*...................................................... + // gap // ........................................................................................................................ + sub v28.4S, v17.4S, v28.4S // ..............................................................................................................*......... + // gap // ........................................................................................................................ + sub v24.4S, v24.4S, v7.4S // ..........................................................................................................*............. + // gap // ........................................................................................................................ + sqrdmulh v17.4S, v4.4S, v0.S[1] // ..................................................................*..................................................... + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ 
+ mls v18.4S, v10.4S, v8.S[0] // ...................................................................................................*.................... + mls v13.4S, v28.4S, v29.4S // ...............................................................................................................*........ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + cmge v10.4S, v31.4S, v14.4S // ....................................................................*................................................... + sqrdmulh v7.4S, v21.4S, v0.S[1] // .............................................................*.......................................................... + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + cmge v4.4S, v19.4S, v30.4S // .........................................................................*.............................................. + cmge v11.4S, v31.4S, v19.4S // ........................................................................*............................................... + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + cmge v28.4S, v18.4S, v30.4S // .................................................................................................................*...... 
+ str q13, [x0, #256] // ......................................................................................................................*. + // gap // ........................................................................................................................ + cmge v13.4S, v31.4S, v18.4S // ................................................................................................................*....... + mls v12.4S, v23.4S, v8.S[0] // ..........................................................................................*............................. + mls v5.4S, v7.4S, v8.S[0] // ..............................................................*......................................................... + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + cmge v23.4S, v14.4S, v30.4S // .....................................................................*.................................................. + mls v9.4S, v17.4S, v8.S[0] // ...................................................................*.................................................... + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + sub v7.4S, v13.4S, v28.4S // ..................................................................................................................*..... + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ 
+ sub v28.4S, v11.4S, v4.4S // ..........................................................................*............................................. + sub v23.4S, v10.4S, v23.4S // ......................................................................*................................................. + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + cmge v13.4S, v31.4S, v5.4S // ............................................................................*........................................... + cmge v4.4S, v31.4S, v12.4S // ....................................................................................................*................... + cmge v21.4S, v31.4S, v9.4S // ................................................................................*....................................... + mls v18.4S, v7.4S, v29.4S // ...................................................................................................................*.... + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + cmge v17.4S, v5.4S, v30.4S // .............................................................................*.......................................... + cmge v7.4S, v9.4S, v30.4S // .................................................................................*...................................... + // gap // ........................................................................................................................ 
+ // gap // ........................................................................................................................ + cmge v11.4S, v12.4S, v30.4S // .....................................................................................................*.................. + // gap // ........................................................................................................................ + mls v14.4S, v23.4S, v29.4S // .......................................................................*................................................ + // gap // ........................................................................................................................ + str q18, [x0, #384] // .......................................................................................................................* + ldr q18, [x0, #272] // ..e..................................................................................................................... + sub v17.4S, v13.4S, v17.4S // ..............................................................................*......................................... + mls v19.4S, v28.4S, v29.4S // ...........................................................................*............................................ + sub v28.4S, v21.4S, v7.4S // ..................................................................................*..................................... + mls v15.4S, v24.4S, v29.4S // ...........................................................................................................*............ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ 
+ sub v21.4S, v4.4S, v11.4S // ......................................................................................................*................. + // gap // ........................................................................................................................ + str q14, [x0, #512] // ....................................................................................*................................... + mls v5.4S, v17.4S, v29.4S // ...............................................................................*........................................ + mls v9.4S, v28.4S, v29.4S // ...................................................................................*.................................... + // gap // ........................................................................................................................ + sub v4.4S, v20.4S, v6.4S // ..................e..................................................................................................... + str q19, [x0, #640] // .....................................................................................*.................................. + mls v12.4S, v21.4S, v29.4S // .......................................................................................................*................ + // gap // ........................................................................................................................ + sub v19.4S, v18.4S, v27.4S // .............e.......................................................................................................... + str q15, [x0, #128] // .....................................................................................................................*.. + str q5, [x0, #768] // ......................................................................................*................................. 
+ // gap // ........................................................................................................................ + mul v28.4S, v4.4S, v2.S[2] // ....................e................................................................................................... + add v7.4S, v22.4S, v16.4S // ........................e............................................................................................... + str q9, [x0, #896] // .......................................................................................*................................ + sqrdmulh v21.4S, v19.4S, v2.S[1] // ................e....................................................................................................... + mul v17.4S, v19.4S, v2.S[0] // ...............e........................................................................................................ + // gap // ........................................................................................................................ + str q12, [x0], #(16) // ....................................................................................................................*... + add v9.4S, v20.4S, v6.4S // ...................e.................................................................................................... + // gap // ........................................................................................................................ + add v6.4S, v18.4S, v27.4S // ..............e......................................................................................................... + + // original source code + // ldr q9, [x0, #0] // ...................................................................................................................|..*.................................................................................................................. 
+ // ldr q10, [x0, #(1*(1024/8))] // ...................................................................................................................|*.................................................................................................................... + // ldr q11, [x0, #(2*(1024/8))] // ............................................................................................e......................|................................................................................................e.................... + // ldr q12, [x0, #(3*(1024/8))] // ..e................................................................................................................|......e.............................................................................................................. + // ldr q13, [x0, #(4*(1024/8))] // ............e......................................................................................................|................e.................................................................................................... + // ldr q14, [x0, #(5*(1024/8))] // ...................................................e...............................................................|.......................................................e............................................................. + // ldr q15, [x0, #(6*(1024/8))] // ........................e..........................................................................................|............................e........................................................................................ + // ldr q16, [x0, #(7*(1024/8))] // e..................................................................................................................|....e................................................................................................................ 
+ // sub v24.4s, v9.4s, v10.4s // ..........*........................................................................................................|..............*...................................................................................................... + // add v9.4s, v9.4s, v10.4s // .......*...........................................................................................................|...........*......................................................................................................... + // mul v10.4s, v24.4s, v1.s[2] // ..............*....................................................................................................|..................*.................................................................................................. + // sqrdmulh v24.4s, v24.4s, v1.s[3] // ...............*...................................................................................................|...................*................................................................................................. + // mls v10.4s, v24.4s, v8.s[0] // .....................*.............................................................................................|.........................*........................................................................................... + // sub v24.4s, v11.4s, v12.4s // ........................................................................................................e..........|............................................................................................................e........ + // add v11.4s, v11.4s, v12.4s // ..................................................................................................................e|..................................................................................................................... 
+ // mul v12.4s, v24.4s, v2.s[0] // ...............................................................................................................e...|...................................................................................................................e. + // sqrdmulh v24.4s, v24.4s, v2.s[1] // ..............................................................................................................e....|..................................................................................................................e.. + // mls v12.4s, v24.4s, v8.s[0] // ...................................................................................................................|...*................................................................................................................. + // sub v24.4s, v13.4s, v14.4s // .....................................................................................................e.............|.........................................................................................................e........... + // add v13.4s, v13.4s, v14.4s // .................................................................................................................e.|..................................................................................................................... + // mul v14.4s, v24.4s, v2.s[2] // ...........................................................................................................e.......|...............................................................................................................e..... + // sqrdmulh v24.4s, v24.4s, v2.s[3] // ...................................................................................................................*..................................................................................................................... 
+ // mls v14.4s, v24.4s, v8.s[0] // ......*............................................................................................................|..........*.......................................................................................................... + // sub v24.4s, v15.4s, v16.4s // ...................................................................................................................|.*................................................................................................................... + // add v15.4s, v15.4s, v16.4s // ............................................................................................................e......|................................................................................................................e.... + // mul v16.4s, v24.4s, v3.s[0] // .....*.............................................................................................................|.........*........................................................................................................... + // sqrdmulh v24.4s, v24.4s, v3.s[1] // ...*...............................................................................................................|.......*............................................................................................................. + // mls v16.4s, v24.4s, v8.s[0] // .............*.....................................................................................................|.................*................................................................................................... + // sub v24.4s, v9.4s, v11.4s // .................*.................................................................................................|.....................*............................................................................................... 
+ // add v9.4s, v9.4s, v11.4s // ...........*.......................................................................................................|...............*..................................................................................................... + // mul v11.4s, v24.4s, v0.s[2] // ..........................*........................................................................................|..............................*...................................................................................... + // sqrdmulh v24.4s, v24.4s, v0.s[3] // ....................*..............................................................................................|........................*............................................................................................ + // mls v11.4s, v24.4s, v8.s[0] // ...............................*...................................................................................|...................................*................................................................................. + // sub v24.4s, v10.4s, v12.4s // ................................*..................................................................................|....................................*................................................................................ + // add v10.4s, v10.4s, v12.4s // ..............................*....................................................................................|..................................*.................................................................................. + // mul v12.4s, v24.4s, v0.s[2] // ..........................................*........................................................................|..............................................*...................................................................... 
+ // sqrdmulh v24.4s, v24.4s, v0.s[3] // .......................................*...........................................................................|...........................................*......................................................................... + // mls v12.4s, v24.4s, v8.s[0] // ................................................*..................................................................|....................................................*................................................................ + // sub v24.4s, v13.4s, v15.4s // .*.................................................................................................................|.....*............................................................................................................... + // add v13.4s, v13.4s, v15.4s // ....*..............................................................................................................|........*............................................................................................................ + // mul v15.4s, v24.4s, v1.s[0] // .........*.........................................................................................................|.............*....................................................................................................... + // sqrdmulh v24.4s, v24.4s, v1.s[1] // ........*..........................................................................................................|............*........................................................................................................ + // mls v15.4s, v24.4s, v8.s[0] // ..................................*................................................................................|......................................*.............................................................................. 
+ // sub v24.4s, v14.4s, v16.4s // ...................*...............................................................................................|.......................*............................................................................................. + // add v14.4s, v14.4s, v16.4s // ............................*......................................................................................|................................*.................................................................................... + // mul v16.4s, v24.4s, v1.s[0] // ......................*............................................................................................|..........................*.......................................................................................... + // sqrdmulh v24.4s, v24.4s, v1.s[1] // .......................*...........................................................................................|...........................*......................................................................................... + // mls v16.4s, v24.4s, v8.s[0] // .............................*.....................................................................................|.................................*................................................................................... + // sub v24.4s, v9.4s, v13.4s // ..................*................................................................................................|......................*.............................................................................................. + // add v9.4s, v9.4s, v13.4s // ................*..................................................................................................|....................*................................................................................................ 
+ // mul v13.4s, v24.4s, v0.s[0] // ..................................................*................................................................|......................................................*.............................................................. + // sqrdmulh v24.4s, v24.4s, v0.s[1] // .........................*.........................................................................................|.............................*....................................................................................... + // mls v13.4s, v24.4s, v8.s[0] // ........................................................*..........................................................|............................................................*........................................................ + // sub v24.4s, v10.4s, v14.4s // ...................................*...............................................................................|.......................................*............................................................................. + // add v10.4s, v10.4s, v14.4s // .................................*.................................................................................|.....................................*............................................................................... + // mul v14.4s, v24.4s, v0.s[0] // .........................................*.........................................................................|.............................................*....................................................................... + // sqrdmulh v24.4s, v24.4s, v0.s[1] // ...............................................*...................................................................|...................................................*................................................................. 
+ // mls v14.4s, v24.4s, v8.s[0] // ..............................................................*....................................................|..................................................................*.................................................. + // sub v24.4s, v11.4s, v15.4s // .............................................*.....................................................................|.................................................*................................................................... + // add v11.4s, v11.4s, v15.4s // ........................................*..........................................................................|............................................*........................................................................ + // mul v15.4s, v24.4s, v0.s[0] // .........................................................*.........................................................|.............................................................*....................................................... + // sqrdmulh v24.4s, v24.4s, v0.s[1] // ......................................................................*............................................|..........................................................................*.......................................... + // mls v15.4s, v24.4s, v8.s[0] // .............................................................................*.....................................|.................................................................................*................................... + // sub v24.4s, v12.4s, v16.4s // .......................................................*...........................................................|...........................................................*......................................................... 
+ // add v12.4s, v12.4s, v16.4s // ......................................................*............................................................|..........................................................*.......................................................... + // mul v16.4s, v24.4s, v0.s[0] // ...............................................................*...................................................|...................................................................*................................................. + // sqrdmulh v24.4s, v24.4s, v0.s[1] // ..................................................................*................................................|......................................................................*.............................................. + // mls v16.4s, v24.4s, v8.s[0] // ...............................................................................*...................................|...................................................................................*................................. + // cmge v27.4s, v31.4s, v13.4s // .....................................................................*.............................................|.........................................................................*........................................... + // cmge v28.4s, v13.4s, v30.4s // ..............................................................................*....................................|..................................................................................*.................................. + // sub v28.4s, v27.4s, v28.4s // ..................................................................................*................................|......................................................................................*.............................. 
+ // mls v13.4s, v28.4s, v29.4s // ..........................................................................................*........................|..............................................................................................*...................... + // cmge v27.4s, v31.4s, v14.4s // ........................................................................*..........................................|............................................................................*........................................ + // cmge v28.4s, v14.4s, v30.4s // .......................................................................*...........................................|...........................................................................*......................................... + // sub v28.4s, v27.4s, v28.4s // .................................................................................*.................................|.....................................................................................*............................... + // mls v14.4s, v28.4s, v29.4s // ..............................................................................................*....................|..................................................................................................*.................. + // cmge v27.4s, v31.4s, v15.4s // ...................................................................................*...............................|.......................................................................................*............................. + // cmge v28.4s, v15.4s, v30.4s // .......................................................................................*...........................|...........................................................................................*......................... 
+ // sub v28.4s, v27.4s, v28.4s // .............................................................................................*.....................|.................................................................................................*................... + // mls v15.4s, v28.4s, v29.4s // ...................................................................................................*...............|.......................................................................................................*............. + // cmge v27.4s, v31.4s, v16.4s // .....................................................................................*.............................|.........................................................................................*........................... + // cmge v28.4s, v16.4s, v30.4s // ........................................................................................*..........................|............................................................................................*........................ + // sub v28.4s, v27.4s, v28.4s // ...............................................................................................*...................|...................................................................................................*................. + // mls v16.4s, v28.4s, v29.4s // ....................................................................................................*..............|........................................................................................................*............ + // str q13, [x0, #(4*(1024/8))] // ..................................................................................................*................|......................................................................................................*.............. 
+ // str q14, [x0, #(5*(1024/8))] // ......................................................................................................*............|..........................................................................................................*.......... + // str q15, [x0, #(6*(1024/8))] // ..........................................................................................................*........|..............................................................................................................*...... + // str q16, [x0, #(7*(1024/8))] // .............................................................................................................*.....|.................................................................................................................*... + // mul v13.4s, v9.4s, v25.4s // ....................................*..............................................................................|........................................*............................................................................ + // sqrdmulh v9.4s, v9.4s, v26.4s // ...........................*.......................................................................................|...............................*..................................................................................... + // mls v13.4s, v9.4s, v8.s[0] // ............................................................................*......................................|................................................................................*.................................... + // mul v14.4s, v10.4s, v25.4s // .....................................*.............................................................................|.........................................*........................................................................... 
+ // sqrdmulh v10.4s, v10.4s, v26.4s // ......................................*............................................................................|..........................................*.......................................................................... + // mls v14.4s, v10.4s, v8.s[0] // ............................................*......................................................................|................................................*.................................................................... + // mul v15.4s, v11.4s, v25.4s // ..............................................*....................................................................|..................................................*.................................................................. + // sqrdmulh v11.4s, v11.4s, v26.4s // ...........................................*.......................................................................|...............................................*..................................................................... + // mls v15.4s, v11.4s, v8.s[0] // ....................................................*..............................................................|........................................................*............................................................ + // mul v16.4s, v12.4s, v25.4s // .............................................................*.....................................................|.................................................................*................................................... + // sqrdmulh v12.4s, v12.4s, v26.4s // ...........................................................*.......................................................|...............................................................*..................................................... 
+ // mls v16.4s, v12.4s, v8.s[0] // ...................................................................*...............................................|.......................................................................*............................................. + // cmge v27.4s, v31.4s, v13.4s // ....................................................................................*..............................|........................................................................................*............................ + // cmge v28.4s, v13.4s, v30.4s // .........................................................................................*.........................|.............................................................................................*....................... + // sub v28.4s, v27.4s, v28.4s // .................................................................................................*.................|.....................................................................................................*............... + // mls v13.4s, v28.4s, v29.4s // .......................................................................................................*...........|...........................................................................................................*......... + // cmge v27.4s, v31.4s, v14.4s // .................................................*.................................................................|.....................................................*............................................................... + // cmge v28.4s, v14.4s, v30.4s // .....................................................*.............................................................|.........................................................*........................................................... 
+ // sub v28.4s, v27.4s, v28.4s // .................................................................*.................................................|.....................................................................*............................................... + // mls v14.4s, v28.4s, v29.4s // ................................................................................................*..................|....................................................................................................*................ + // cmge v27.4s, v31.4s, v15.4s // ............................................................*......................................................|................................................................*.................................................... + // cmge v28.4s, v15.4s, v30.4s // ..........................................................*........................................................|..............................................................*...................................................... + // sub v28.4s, v27.4s, v28.4s // ................................................................*..................................................|....................................................................*................................................ + // mls v15.4s, v28.4s, v29.4s // ....................................................................*..............................................|........................................................................*............................................ + // cmge v27.4s, v31.4s, v16.4s // ...........................................................................*.......................................|...............................................................................*..................................... 
+ // cmge v28.4s, v16.4s, v30.4s // .........................................................................*.........................................|.............................................................................*....................................... + // sub v28.4s, v27.4s, v28.4s // ................................................................................*..................................|....................................................................................*................................ + // mls v16.4s, v28.4s, v29.4s // ......................................................................................*............................|..........................................................................................*.......................... + // str q13, [x0], #(16) // ................................................................................................................*..|....................................................................................................................* + // str q14, [x0, #(-16 + 1*(1024/8))] // .........................................................................................................*.........|.............................................................................................................*....... + // str q15, [x0, #(-16 + 2*(1024/8))] // ..........................................................................*........................................|..............................................................................*...................................... + // str q16, [x0, #(-16 + 3*(1024/8))] // ...........................................................................................*.......................|...............................................................................................*..................... 
+ + sub count, count, #1 + cbnz count, layer123_start + sqrdmulh v15.4S, v4.4S, v2.S[3] // *......................................................................................................... + ldr q13, [x0, #128] // .*........................................................................................................ + sub v16.4S, v22.4S, v16.4S // ..*....................................................................................................... + ldr q19, [x0, #0] // ...*...................................................................................................... + // gap // .......................................................................................................... + // gap // .......................................................................................................... + mls v17.4S, v21.4S, v8.S[0] // ....*..................................................................................................... + add v12.4S, v9.4S, v7.4S // .......*.................................................................................................. + sqrdmulh v18.4S, v16.4S, v3.S[1] // ......*................................................................................................... + mul v4.4S, v16.4S, v3.S[0] // ........*................................................................................................. + // gap // .......................................................................................................... + // gap // .......................................................................................................... + mls v28.4S, v15.4S, v8.S[0] // .........*................................................................................................ + sub v11.4S, v9.4S, v7.4S // .....*.................................................................................................... 
+ // gap // .......................................................................................................... + // gap // .......................................................................................................... + // gap // .......................................................................................................... + // gap // .......................................................................................................... + add v23.4S, v19.4S, v13.4S // ..........*............................................................................................... + sub v22.4S, v19.4S, v13.4S // .............*............................................................................................ + mul v7.4S, v11.4S, v1.S[0] // ............*............................................................................................. + // gap // .......................................................................................................... + // gap // .......................................................................................................... + mls v4.4S, v18.4S, v8.S[0] // ...............*.......................................................................................... + mul v14.4S, v22.4S, v1.S[2] // ................*......................................................................................... + // gap // .......................................................................................................... + sqrdmulh v21.4S, v22.4S, v1.S[3] // .................*........................................................................................ + // gap // .......................................................................................................... + add v24.4S, v23.4S, v6.4S // ..............*........................................................................................... 
+ // gap // .......................................................................................................... + // gap // .......................................................................................................... + sub v5.4S, v23.4S, v6.4S // ...................*...................................................................................... + // gap // .......................................................................................................... + sqrdmulh v9.4S, v11.4S, v1.S[1] // ...........*.............................................................................................. + // gap // .......................................................................................................... + sub v27.4S, v28.4S, v4.4S // .....................*.................................................................................... + sqrdmulh v23.4S, v5.4S, v0.S[3] // ......................*................................................................................... + // gap // .......................................................................................................... + mls v14.4S, v21.4S, v8.S[0] // .......................*.................................................................................. + // gap // .......................................................................................................... + // gap // .......................................................................................................... + mul v5.4S, v5.4S, v0.S[2] // ...........................*.............................................................................. + // gap // .......................................................................................................... + sqrdmulh v20.4S, v27.4S, v1.S[1] // .........................*................................................................................ 
+ // gap // .......................................................................................................... + mls v7.4S, v9.4S, v8.S[0] // ...................................*...................................................................... + mul v19.4S, v27.4S, v1.S[0] // ........................*................................................................................. + // gap // .......................................................................................................... + // gap // .......................................................................................................... + sub v16.4S, v14.4S, v17.4S // .................................*........................................................................ + // gap // .......................................................................................................... + add v18.4S, v24.4S, v12.4S // ..................*....................................................................................... + add v21.4S, v14.4S, v17.4S // ...............................*.......................................................................... + mls v5.4S, v23.4S, v8.S[0] // ................................*......................................................................... + // gap // .......................................................................................................... + // gap // .......................................................................................................... + // gap // .......................................................................................................... + mul v23.4S, v16.4S, v0.S[2] // ...........................................*.............................................................. + // gap // .......................................................................................................... 
+ sqrdmulh v15.4S, v16.4S, v0.S[3] // ........................................*................................................................. + mls v19.4S, v20.4S, v8.S[0] // ..............................*........................................................................... + // gap // .......................................................................................................... + add v28.4S, v28.4S, v4.4S // .............................*............................................................................ + // gap // .......................................................................................................... + add v4.4S, v5.4S, v7.4S // .........................................*................................................................ + sub v9.4S, v5.4S, v7.4S // ..............................................*........................................................... + // gap // .......................................................................................................... + // gap // .......................................................................................................... + mls v23.4S, v15.4S, v8.S[0] // .................................................*........................................................ + // gap // .......................................................................................................... + // gap // .......................................................................................................... + add v13.4S, v21.4S, v28.4S // ..................................*....................................................................... + mul v15.4S, v9.4S, v0.S[0] // .........................................................*................................................ + sqrdmulh v5.4S, v9.4S, v0.S[1] // ......................................................................*................................... 
+ // gap // .......................................................................................................... + // gap // .......................................................................................................... + sub v16.4S, v21.4S, v28.4S // ....................................*..................................................................... + // gap // .......................................................................................................... + // gap // .......................................................................................................... + sqrdmulh v10.4S, v4.4S, v26.4S // ............................................*............................................................. + sub v9.4S, v23.4S, v19.4S // .......................................................*.................................................. + sqrdmulh v17.4S, v18.4S, v26.4S // ............................*............................................................................. + // gap // .......................................................................................................... + // gap // .......................................................................................................... + mls v15.4S, v5.4S, v8.S[0] // .............................................................................*............................ + mul v7.4S, v18.4S, v25.4S // .....................................*.................................................................... + // gap // .......................................................................................................... + // gap // .......................................................................................................... + // gap // .......................................................................................................... 
+ sqrdmulh v5.4S, v9.4S, v0.S[1] // ..................................................................*....................................... + // gap // .......................................................................................................... + mul v27.4S, v9.4S, v0.S[0] // ...............................................................*.......................................... + // gap // .......................................................................................................... + sqrdmulh v28.4S, v16.4S, v0.S[1] // ................................................*......................................................... + // gap // .......................................................................................................... + mul v22.4S, v16.4S, v0.S[0] // ..........................................*............................................................... + sub v18.4S, v24.4S, v12.4S // ....................*..................................................................................... + // gap // .......................................................................................................... + mls v7.4S, v17.4S, v8.S[0] // ............................................................................*............................. + // gap // .......................................................................................................... + // gap // .......................................................................................................... + mul v21.4S, v13.4S, v25.4S // ......................................*................................................................... + // gap // .......................................................................................................... + mls v27.4S, v5.4S, v8.S[0] // ...............................................................................*.......................... 
+ mul v17.4S, v18.4S, v0.S[0] // ...................................................*...................................................... + mls v22.4S, v28.4S, v8.S[0] // ..............................................................*........................................... + // gap // .......................................................................................................... + // gap // .......................................................................................................... + sqrdmulh v16.4S, v13.4S, v26.4S // .......................................*.................................................................. + sqrdmulh v24.4S, v18.4S, v0.S[1] // ..........................*............................................................................... + // gap // .......................................................................................................... + // gap // .......................................................................................................... + // gap // .......................................................................................................... + cmge v11.4S, v27.4S, v30.4S // ........................................................................................*................. + // gap // .......................................................................................................... + cmge v5.4S, v7.4S, v30.4S // .........................................................................................*................ + cmge v9.4S, v31.4S, v27.4S // .....................................................................................*.................... + // gap // .......................................................................................................... + // gap // .......................................................................................................... 
+ cmge v12.4S, v22.4S, v30.4S // .......................................................................*.................................. + cmge v13.4S, v31.4S, v22.4S // ........................................................................*................................. + cmge v20.4S, v31.4S, v7.4S // ....................................................................................*..................... + // gap // .......................................................................................................... + // gap // .......................................................................................................... + sub v9.4S, v9.4S, v11.4S // ..............................................................................................*........... + mls v21.4S, v16.4S, v8.S[0] // .............................................*............................................................ + // gap // .......................................................................................................... + // gap // .......................................................................................................... + mul v6.4S, v4.4S, v25.4S // ...............................................*.......................................................... + sub v5.4S, v20.4S, v5.4S // ................................................................................................*......... + // gap // .......................................................................................................... + // gap // .......................................................................................................... + sub v12.4S, v13.4S, v12.4S // .................................................................................*........................ + mls v27.4S, v9.4S, v29.4S // ...................................................................................................*...... 
+ // gap // .......................................................................................................... + // gap // .......................................................................................................... + mls v17.4S, v24.4S, v8.S[0] // ........................................................*................................................. + mls v7.4S, v5.4S, v29.4S // .....................................................................................................*.... + // gap // .......................................................................................................... + // gap // .......................................................................................................... + cmge v14.4S, v21.4S, v30.4S // .....................................................*.................................................... + mls v22.4S, v12.4S, v29.4S // .............................................................................................*............ + // gap // .......................................................................................................... + // gap // .......................................................................................................... + str q27, [x0, #896] // ........................................................................................................*. + add v19.4S, v23.4S, v19.4S // ......................................................*................................................... + mls v6.4S, v10.4S, v8.S[0] // ....................................................*..................................................... + // gap // .......................................................................................................... 
+ str q7, [x0], #(16) // .........................................................................................................* + // gap // .......................................................................................................... + cmge v20.4S, v31.4S, v21.4S // ..................................................*....................................................... + cmge v12.4S, v31.4S, v17.4S // .....................................................................*.................................... + mul v16.4S, v19.4S, v25.4S // .............................................................*............................................ + str q22, [x0, #624] // ....................................................................................................*..... + // gap // .......................................................................................................... + sqrdmulh v13.4S, v19.4S, v26.4S // ...........................................................*.............................................. + // gap // .......................................................................................................... + cmge v27.4S, v31.4S, v6.4S // ............................................................*............................................. + // gap // .......................................................................................................... + cmge v10.4S, v6.4S, v30.4S // ..........................................................*............................................... + // gap // .......................................................................................................... + cmge v9.4S, v17.4S, v30.4S // ..............................................................................*........................... + // gap // .......................................................................................................... 
+ cmge v5.4S, v15.4S, v30.4S // .......................................................................................*.................. + mls v16.4S, v13.4S, v8.S[0] // ...................................................................*...................................... + // gap // .......................................................................................................... + // gap // .......................................................................................................... + sub v19.4S, v27.4S, v10.4S // ................................................................*......................................... + cmge v11.4S, v31.4S, v15.4S // ...................................................................................*...................... + sub v13.4S, v12.4S, v9.4S // ..................................................................................*....................... + // gap // .......................................................................................................... + // gap // .......................................................................................................... + sub v27.4S, v20.4S, v14.4S // .................................................................*........................................ + // gap // .......................................................................................................... + // gap // .......................................................................................................... + mls v6.4S, v19.4S, v29.4S // ....................................................................*..................................... + cmge v9.4S, v31.4S, v16.4S // ...........................................................................*.............................. + cmge v18.4S, v16.4S, v30.4S // .........................................................................*................................ 
+ // gap // .......................................................................................................... + // gap // .......................................................................................................... + sub v12.4S, v11.4S, v5.4S // ............................................................................................*............. + mls v17.4S, v13.4S, v29.4S // ..........................................................................................*............... + // gap // .......................................................................................................... + // gap // .......................................................................................................... + mls v21.4S, v27.4S, v29.4S // ...............................................................................................*.......... + sub v28.4S, v9.4S, v18.4S // ................................................................................*......................... + // gap // .......................................................................................................... + // gap // .......................................................................................................... + mls v15.4S, v12.4S, v29.4S // ..................................................................................................*....... + // gap // .......................................................................................................... + // gap // .......................................................................................................... + str q6, [x0, #240] // ..........................................................................*............................... + str q17, [x0, #496] // .................................................................................................*........ 
+ mls v16.4S, v28.4S, v29.4S // ......................................................................................*................... + // gap // .......................................................................................................... + // gap // .......................................................................................................... + str q21, [x0, #112] // ......................................................................................................*... + // gap // .......................................................................................................... + // gap // .......................................................................................................... + // gap // .......................................................................................................... + str q15, [x0, #752] // .......................................................................................................*.. + // gap // .......................................................................................................... + // gap // .......................................................................................................... + // gap // .......................................................................................................... + str q16, [x0, #368] // ...........................................................................................*.............. + // gap // .......................................................................................................... + // gap // .......................................................................................................... + // gap // .......................................................................................................... 
+ + // original source code + // sqrdmulh v19.4S, v4.4S, v2.S[3] // *......................................................................................................... + // ldr q20, [x0, #128] // .*........................................................................................................ + // sub v23.4S, v22.4S, v16.4S // ..*....................................................................................................... + // ldr q4, [x0, #0] // ...*...................................................................................................... + // mls v17.4S, v21.4S, v8.S[0] // ....*..................................................................................................... + // sub v10.4S, v9.4S, v7.4S // .........*................................................................................................ + // sqrdmulh v5.4S, v23.4S, v3.S[1] // ......*................................................................................................... + // add v24.4S, v9.4S, v7.4S // .....*.................................................................................................... + // mul v15.4S, v23.4S, v3.S[0] // .......*.................................................................................................. + // mls v28.4S, v19.4S, v8.S[0] // ........*................................................................................................. + // add v19.4S, v4.4S, v20.4S // ..........*............................................................................................... + // sqrdmulh v11.4S, v10.4S, v1.S[1] // ..................*....................................................................................... + // mul v13.4S, v10.4S, v1.S[0] // ............*............................................................................................. 
+ // sub v22.4S, v4.4S, v20.4S // ...........*.............................................................................................. + // add v4.4S, v19.4S, v6.4S // ................*......................................................................................... + // mls v15.4S, v5.4S, v8.S[0] // .............*............................................................................................ + // mul v12.4S, v22.4S, v1.S[2] // ..............*........................................................................................... + // sqrdmulh v10.4S, v22.4S, v1.S[3] // ...............*.......................................................................................... + // add v9.4S, v4.4S, v24.4S // ...........................*.............................................................................. + // sub v7.4S, v19.4S, v6.4S // .................*........................................................................................ + // sub v14.4S, v4.4S, v24.4S // ..................................................*....................................................... + // sub v6.4S, v28.4S, v15.4S // ...................*...................................................................................... + // sqrdmulh v21.4S, v7.4S, v0.S[3] // ....................*..................................................................................... + // mls v12.4S, v10.4S, v8.S[0] // .....................*.................................................................................... + // mul v18.4S, v6.4S, v1.S[0] // .........................*................................................................................ + // sqrdmulh v4.4S, v6.4S, v1.S[1] // .......................*.................................................................................. 
+ // sqrdmulh v10.4S, v14.4S, v0.S[1] // .........................................................*................................................ + // mul v5.4S, v7.4S, v0.S[2] // ......................*................................................................................... + // sqrdmulh v23.4S, v9.4S, v26.4S // ...........................................*.............................................................. + // add v24.4S, v28.4S, v15.4S // .................................*........................................................................ + // mls v18.4S, v4.4S, v8.S[0] // ................................*......................................................................... + // add v6.4S, v12.4S, v17.4S // ............................*............................................................................. + // mls v5.4S, v21.4S, v8.S[0] // .............................*............................................................................ + // sub v4.4S, v12.4S, v17.4S // ..........................*............................................................................... + // add v19.4S, v6.4S, v24.4S // .....................................*.................................................................... + // mls v13.4S, v11.4S, v8.S[0] // ........................*................................................................................. + // sub v7.4S, v6.4S, v24.4S // ........................................*................................................................. + // mul v12.4S, v9.4S, v25.4S // .............................................*............................................................ + // mul v15.4S, v19.4S, v25.4S // ....................................................*..................................................... + // sqrdmulh v24.4S, v19.4S, v26.4S // ........................................................*................................................. 
+ // sqrdmulh v6.4S, v4.4S, v0.S[3] // ...............................*.......................................................................... + // add v9.4S, v5.4S, v13.4S // ..................................*....................................................................... + // mul v19.4S, v7.4S, v0.S[0] // .................................................*........................................................ + // mul v28.4S, v4.4S, v0.S[2] // ..............................*........................................................................... + // sqrdmulh v4.4S, v9.4S, v26.4S // .........................................*................................................................ + // mls v15.4S, v24.4S, v8.S[0] // .................................................................*........................................ + // sub v21.4S, v5.4S, v13.4S // ...................................*...................................................................... + // mul v13.4S, v9.4S, v25.4S // ..................................................................*....................................... + // sqrdmulh v9.4S, v7.4S, v0.S[1] // ................................................*......................................................... + // mls v28.4S, v6.4S, v8.S[0] // ....................................*..................................................................... + // cmge v24.4S, v31.4S, v15.4S // ..............................................................................*........................... + // mul v14.4S, v14.4S, v0.S[0] // ......................................................*................................................... + // mls v13.4S, v4.4S, v8.S[0] // ............................................................................*............................. + // cmge v7.4S, v15.4S, v30.4S // ........................................................................*................................. 
+ // add v11.4S, v28.4S, v18.4S // ...........................................................................*.............................. + // sub v4.4S, v28.4S, v18.4S // ..........................................*............................................................... + // mls v14.4S, v10.4S, v8.S[0] // ......................................................................*................................... + // mul v5.4S, v21.4S, v0.S[0] // ......................................*................................................................... + // cmge v28.4S, v13.4S, v30.4S // ....................................................................................*..................... + // sqrdmulh v10.4S, v11.4S, v26.4S // ..................................................................................*....................... + // cmge v17.4S, v31.4S, v13.4S // ...................................................................................*...................... + // mul v18.4S, v11.4S, v25.4S // ................................................................................*......................... + // mls v19.4S, v9.4S, v8.S[0] // .......................................................*.................................................. + // mul v9.4S, v4.4S, v0.S[0] // ...............................................*.......................................................... + // sub v28.4S, v17.4S, v28.4S // ........................................................................................*................. + // sub v24.4S, v24.4S, v7.4S // ...........................................................................................*.............. + // sqrdmulh v17.4S, v4.4S, v0.S[1] // ..............................................*........................................................... + // mls v18.4S, v10.4S, v8.S[0] // .......................................................................................*.................. 
+ // mls v13.4S, v28.4S, v29.4S // ............................................................................................*............. + // cmge v10.4S, v31.4S, v14.4S // ...............................................................................*.......................... + // sqrdmulh v7.4S, v21.4S, v0.S[1] // .......................................*.................................................................. + // cmge v4.4S, v19.4S, v30.4S // .............................................................*............................................ + // cmge v11.4S, v31.4S, v19.4S // ..............................................................*........................................... + // cmge v28.4S, v18.4S, v30.4S // ..............................................................................................*........... + // str q13, [x0, #256] // ....................................................................................................*..... + // cmge v13.4S, v31.4S, v18.4S // .............................................................................................*............ + // mls v12.4S, v23.4S, v8.S[0] // ...................................................*...................................................... + // mls v5.4S, v7.4S, v8.S[0] // ............................................*............................................................. + // cmge v23.4S, v14.4S, v30.4S // .....................................................................................*.................... + // mls v9.4S, v17.4S, v8.S[0] // .....................................................*.................................................... + // sub v7.4S, v13.4S, v28.4S // ..................................................................................................*....... + // sub v28.4S, v11.4S, v4.4S // ....................................................................*..................................... 
+ // sub v23.4S, v10.4S, v23.4S // ..........................................................................................*............... + // cmge v13.4S, v31.4S, v5.4S // .........................................................................................*................ + // cmge v4.4S, v31.4S, v12.4S // ...............................................................*.......................................... + // cmge v21.4S, v31.4S, v9.4S // ............................................................*............................................. + // mls v18.4S, v7.4S, v29.4S // ......................................................................................................*... + // cmge v17.4S, v5.4S, v30.4S // ......................................................................................*................... + // cmge v7.4S, v9.4S, v30.4S // ..........................................................*............................................... + // cmge v11.4S, v12.4S, v30.4S // ...........................................................*.............................................. + // mls v14.4S, v23.4S, v29.4S // ................................................................................................*......... + // str q18, [x0, #384] // .........................................................................................................* + // sub v17.4S, v13.4S, v17.4S // ...............................................................................................*.......... + // mls v19.4S, v28.4S, v29.4S // .........................................................................*................................ + // sub v28.4S, v21.4S, v7.4S // ................................................................*......................................... + // mls v15.4S, v24.4S, v29.4S // .................................................................................................*........ 
+ // sub v21.4S, v4.4S, v11.4S // ...................................................................*...................................... + // str q14, [x0, #512] // .....................................................................................................*.... + // mls v5.4S, v17.4S, v29.4S // ...................................................................................................*...... + // mls v9.4S, v28.4S, v29.4S // .....................................................................*.................................... + // str q19, [x0, #640] // .................................................................................*........................ + // mls v12.4S, v21.4S, v29.4S // .......................................................................*.................................. + // str q15, [x0, #128] // .......................................................................................................*.. + // str q5, [x0, #768] // ........................................................................................................*. + // str q9, [x0, #896] // ..........................................................................*............................... + // str q12, [x0], #(16) // .............................................................................*............................ 
+ + + pop_stack + ret \ No newline at end of file diff --git a/tests/ntt_dilithium/manual/intt_dilithium_123_45678_opt_a55.s b/tests/ntt_dilithium/manual/intt_dilithium_123_45678_opt_a55.s index 7caf605..d8e53dc 100644 --- a/tests/ntt_dilithium/manual/intt_dilithium_123_45678_opt_a55.s +++ b/tests/ntt_dilithium/manual/intt_dilithium_123_45678_opt_a55.s @@ -67,7 +67,7 @@ xtmp1 .req x11 cmge \tmp1\().4s, \neg_modulus_half\().4s, \a\().4s cmge \tmp2\().4s, \a\().4s, \modulus_half\().4s sub \tmp2\().4s, \tmp1\().4s, \tmp2\().4s - vmls \a, \tmp2, modulus + vmls \a, \tmp2, consts .endm .macro gs_butterfly a, b, root, idx0, idx1 @@ -76,12 +76,6 @@ xtmp1 .req x11 mulmodq \b, tmp, \root, \idx0, \idx1 .endm -.macro mulmod_v dst, src, const, const_twisted - vmul \dst, \src, \const - vqrdmulh \src, \src, \const_twisted - vmls \dst, \src, modulus -.endm - .macro gs_butterfly_v a, b, root, root_twisted sub tmp.4s, \a\().4s, \b\().4s add \a\().4s, \a\().4s, \b\().4s @@ -393,1645 +387,1846 @@ _intt_dilithium_123_45678_opt_a55: qform_root3_tw .req q7 .p2align 2 - ld4 {v16.4S, v17.4S, v18.4S, v19.4S}, [x2] // .............*.......................................................................... - // gap // ........................................................................................ - // gap // ........................................................................................ - // gap // ........................................................................................ - // gap // ........................................................................................ - // gap // ........................................................................................ - // gap // ........................................................................................ - // gap // ........................................................................................ 
- // gap // ........................................................................................ - // gap // ........................................................................................ - // gap // ........................................................................................ - // gap // ........................................................................................ - // gap // ........................................................................................ - // gap // ........................................................................................ - // gap // ........................................................................................ - // gap // ........................................................................................ - // gap // ........................................................................................ - // gap // ........................................................................................ - ldr q0, [x5, #144] // .........*.............................................................................. - // gap // ........................................................................................ - // gap // ........................................................................................ - // gap // ........................................................................................ - sub v21.4S, v16.4S, v17.4S // .......................*................................................................ - // gap // ........................................................................................ - ldr q23, [x5, #160] // ..........*............................................................................. - // gap // ........................................................................................ - // gap // ........................................................................................ 
- // gap // ........................................................................................ - sqrdmulh v2.4S, v21.4S, v0.4S // .....................................*.................................................. - // gap // ........................................................................................ - sub v0.4S, v18.4S, v19.4S // ......................................*................................................. - // gap // ........................................................................................ - ldr q26, [x5, #176] // ...........*............................................................................ - // gap // ........................................................................................ - // gap // ........................................................................................ - // gap // ........................................................................................ - mul v9.4S, v0.4S, v23.4S // ..........................................*............................................. - // gap // ........................................................................................ - add v7.4S, v16.4S, v17.4S // ...........................*............................................................ - // gap // ........................................................................................ - sqrdmulh v0.4S, v0.4S, v26.4S // ...........................................*............................................ - // gap // ........................................................................................ - ldr q1, [x5], #(12*16) // *....................................................................................... - // gap // ........................................................................................ - // gap // ........................................................................................ 
- // gap // ........................................................................................ - add v4.4S, v18.4S, v19.4S // .......................................*................................................ - // gap // ........................................................................................ - mls v9.4S, v0.4S, v8.S[0] // ...............................................*........................................ - // gap // ........................................................................................ - ld4 {v11.4S, v12.4S, v13.4S, v14.4S}, [x1] // ............*........................................................................... - // gap // ........................................................................................ - // gap // ........................................................................................ - // gap // ........................................................................................ - // gap // ........................................................................................ - // gap // ........................................................................................ - // gap // ........................................................................................ - // gap // ........................................................................................ - // gap // ........................................................................................ - // gap // ........................................................................................ - // gap // ........................................................................................ - // gap // ........................................................................................ - // gap // ........................................................................................ - // gap // ........................................................................................ 
- // gap // ........................................................................................ - // gap // ........................................................................................ - // gap // ........................................................................................ - // gap // ........................................................................................ - ldr q3, [x5, #-80] // .......*................................................................................ - // gap // ........................................................................................ - // gap // ........................................................................................ - // gap // ........................................................................................ - ldr q10, [x5, #-144] // ...*.................................................................................... - // gap // ........................................................................................ - // gap // ........................................................................................ - // gap // ........................................................................................ - ldr q6, [x5, #-128] // ....*................................................................................... - // gap // ........................................................................................ - // gap // ........................................................................................ - // gap // ........................................................................................ - sub v16.4S, v13.4S, v14.4S // ................*....................................................................... - // gap // ........................................................................................ - ldr q0, [x5, #-112] // .....*.................................................................................. 
- // gap // ........................................................................................ - // gap // ........................................................................................ - // gap // ........................................................................................ - mul v17.4S, v16.4S, v6.4S // ...................*.................................................................... - // gap // ........................................................................................ - add v26.4S, v11.4S, v12.4S // ...............*........................................................................ - // gap // ........................................................................................ - sqrdmulh v16.4S, v16.4S, v0.4S // ....................*................................................................... - // gap // ........................................................................................ - add v0.4S, v13.4S, v14.4S // .....................*.................................................................. - // gap // ........................................................................................ - ldr q28, [x5, #-176] // .*...................................................................................... - // gap // ........................................................................................ - // gap // ........................................................................................ - // gap // ........................................................................................ - mls v17.4S, v16.4S, v8.S[0] // ........................*............................................................... - // gap // ........................................................................................ - sub v23.4S, v26.4S, v0.4S // .........................*.............................................................. 
- // gap // ........................................................................................ - sub v11.4S, v11.4S, v12.4S // ..............*......................................................................... - // gap // ........................................................................................ - add v6.4S, v26.4S, v0.4S // ..........................*............................................................. - // gap // ........................................................................................ - sqrdmulh v16.4S, v23.4S, v28.4S // ..............................*......................................................... - // gap // ........................................................................................ - sqrdmulh v30.4S, v11.4S, v10.4S // ..................*..................................................................... - // gap // ........................................................................................ - ldr q25, [x5, #-160] // ..*..................................................................................... - // gap // ........................................................................................ - // gap // ........................................................................................ - // gap // ........................................................................................ - mul v13.4S, v23.4S, v1.4S // .............................*.......................................................... - // gap // ........................................................................................ - sub v20.4S, v7.4S, v4.4S // ............................................*........................................... - // gap // ........................................................................................ - mul v0.4S, v11.4S, v25.4S // .................*...................................................................... 
- // gap // ........................................................................................ - ldr q18, [x5, #-64] // ........*............................................................................... - // gap // ........................................................................................ - // gap // ........................................................................................ - // gap // ........................................................................................ - sqrdmulh v26.4S, v20.4S, v3.4S // .................................................*...................................... - // gap // ........................................................................................ - mls v0.4S, v30.4S, v8.S[0] // ......................*................................................................. - // gap // ........................................................................................ - mul v11.4S, v21.4S, v18.4S // ...................................*.................................................... - // gap // ........................................................................................ - ldr q10, [x5, #-96] // ......*................................................................................. - // gap // ........................................................................................ - // gap // ........................................................................................ - // gap // ........................................................................................ - sub v30.4S, v0.4S, v17.4S // ............................*........................................................... - // gap // ........................................................................................ - mls v11.4S, v2.4S, v8.S[0] // .........................................*.............................................. 
- // gap // ........................................................................................ - add v21.4S, v0.4S, v17.4S // .................................*...................................................... - // gap // ........................................................................................ - sqrdmulh v2.4S, v30.4S, v28.4S // ................................*....................................................... - // gap // ........................................................................................ - mul v25.4S, v30.4S, v1.4S // ...............................*........................................................ - // gap // ........................................................................................ - sub v0.4S, v11.4S, v9.4S // ...................................................*.................................... - // gap // ........................................................................................ - trn2 v17.4S, v6.4S, v21.4S // ..............................................*......................................... - // gap // ........................................................................................ - mls v13.4S, v16.4S, v8.S[0] // ..................................*..................................................... - // gap // ........................................................................................ - mls v25.4S, v2.4S, v8.S[0] // ....................................*................................................... - // gap // ........................................................................................ - sqrdmulh v16.4S, v0.4S, v3.4S // .......................................................*................................ - // gap // ........................................................................................ 
- mul v23.4S, v0.4S, v10.4S // ......................................................*................................. - // gap // ........................................................................................ - trn1 v30.4S, v6.4S, v21.4S // ........................................*............................................... - // gap // ........................................................................................ - trn2 v2.4S, v13.4S, v25.4S // ........................................................*............................... - // gap // ........................................................................................ - trn1 v21.4S, v13.4S, v25.4S // ..................................................*..................................... - // gap // ........................................................................................ - mul v25.4S, v20.4S, v10.4S // ................................................*....................................... - // gap // ........................................................................................ - trn2 v20.2D, v17.2D, v2.2D // ............................................................*........................... - // gap // ........................................................................................ - trn1 v0.2D, v17.2D, v2.2D // .............................................................*.......................... - // gap // ........................................................................................ - trn2 v17.2D, v30.2D, v21.2D // .........................................................*.............................. - // gap // ........................................................................................ - mls v25.4S, v26.4S, v8.S[0] // .....................................................*.................................. - // gap // ........................................................................................ 
- mls v23.4S, v16.4S, v8.S[0] // ...........................................................*............................ - // gap // ........................................................................................ - add v26.4S, v7.4S, v4.4S // .............................................*.......................................... - // gap // ........................................................................................ - add v4.4S, v11.4S, v9.4S // ....................................................*................................... - // gap // ........................................................................................ - trn1 v30.2D, v30.2D, v21.2D // ..........................................................*............................. - // gap // ........................................................................................ - trn2 v16.4S, v25.4S, v23.4S // .................................................................*...................... - // gap // ........................................................................................ - trn2 v2.4S, v26.4S, v4.4S // ................................................................*....................... - // gap // ........................................................................................ - trn1 v26.4S, v26.4S, v4.4S // ..............................................................*......................... - // gap // ........................................................................................ - trn1 v23.4S, v25.4S, v23.4S // ...............................................................*........................ - // gap // ........................................................................................ - trn2 v4.2D, v2.2D, v16.2D // ....................................................................*................... - // gap // ........................................................................................ 
- trn1 v16.2D, v2.2D, v16.2D // .....................................................................*.................. - // gap // ........................................................................................ - trn1 v21.2D, v26.2D, v23.2D // ...................................................................*.................... - // gap // ........................................................................................ - trn2 v25.2D, v26.2D, v23.2D // ..................................................................*..................... - // gap // ........................................................................................ - add v23.4S, v21.4S, v16.4S // .......................................................................*................ - // gap // ........................................................................................ - add v2.4S, v25.4S, v4.4S // ......................................................................*................. - // gap // ........................................................................................ - add v11.4S, v30.4S, v0.4S // ........................................................................*............... - // gap // ........................................................................................ - add v13.4S, v17.4S, v20.4S // .........................................................................*.............. - // gap // ........................................................................................ - add v19.4S, v23.4S, v2.4S // ...........................................................................*............ - // gap // ........................................................................................ - sub v7.4S, v23.4S, v2.4S // ..........................................................................*............. - // gap // ........................................................................................ 
- add v23.4S, v11.4S, v13.4S // ............................................................................*........... - // gap // ........................................................................................ - ldr q3, [x4], #64 // .............................................................................*.......... - // gap // ........................................................................................ - // gap // ........................................................................................ - // gap // ........................................................................................ - sub v2.4S, v23.4S, v19.4S // ..............................................................................*......... - // gap // ........................................................................................ - ldr q10, [x4, #-48] // ...............................................................................*........ - // gap // ........................................................................................ - // gap // ........................................................................................ - // gap // ........................................................................................ - mul v18.4S, v2.4S, v3.S[0] // ................................................................................*....... - // gap // ........................................................................................ - sqrdmulh v26.4S, v2.4S, v3.S[1] // .................................................................................*...... - // gap // ........................................................................................ - add v2.4S, v23.4S, v19.4S // ..................................................................................*..... - // gap // ........................................................................................ 
- ldr q1, [x4, #-32] // ...................................................................................*.... - // gap // ........................................................................................ - // gap // ........................................................................................ - // gap // ........................................................................................ - mls v18.4S, v26.4S, v8.S[0] // ....................................................................................*... - // gap // ........................................................................................ - str q2, [x1], #(16*4) // .....................................................................................*.. - // gap // ........................................................................................ - ldr q6, [x4, #-16] // ......................................................................................*. - // gap // ........................................................................................ - // gap // ........................................................................................ - // gap // ........................................................................................ - str q18, [x2], #(16*4) // .......................................................................................* - // gap // ........................................................................................ + ldr q0, [x5], #(12*16) // .*........................................................................................................................................... + // gap // ............................................................................................................................................. + // gap // ............................................................................................................................................. 
+ // gap // ............................................................................................................................................. + ldr q17, [x5, #-128] // ......*...................................................................................................................................... + // gap // ............................................................................................................................................. + // gap // ............................................................................................................................................. + // gap // ............................................................................................................................................. + ldr q31, [x5, #-160] // ..........*.................................................................................................................................. + // gap // ............................................................................................................................................. + // gap // ............................................................................................................................................. + // gap // ............................................................................................................................................. + ld4 {v19.4S, v20.4S, v21.4S, v22.4S}, [x1] // *............................................................................................................................................ + // gap // ............................................................................................................................................. + // gap // ............................................................................................................................................. 
+ // gap // ............................................................................................................................................. + // gap // ............................................................................................................................................. + // gap // ............................................................................................................................................. + // gap // ............................................................................................................................................. + // gap // ............................................................................................................................................. + // gap // ............................................................................................................................................. + // gap // ............................................................................................................................................. + // gap // ............................................................................................................................................. + // gap // ............................................................................................................................................. + // gap // ............................................................................................................................................. + // gap // ............................................................................................................................................. + // gap // ............................................................................................................................................. 
+ // gap // ............................................................................................................................................. + // gap // ............................................................................................................................................. + // gap // ............................................................................................................................................. + ldr q18, [x5, #-144] // ..............*.............................................................................................................................. + // gap // ............................................................................................................................................. + // gap // ............................................................................................................................................. + // gap // ............................................................................................................................................. + sub v6.4S, v21.4S, v22.4S // .........*................................................................................................................................... + // gap // ............................................................................................................................................. + sub v14.4S, v19.4S, v20.4S // .............*............................................................................................................................... + // gap // ............................................................................................................................................. + ldr q11, [x5, #-112] // ...............*............................................................................................................................. 
+ // gap // ............................................................................................................................................. + // gap // ............................................................................................................................................. + // gap // ............................................................................................................................................. + mul v17.4S, v6.4S, v17.4S // ............*................................................................................................................................ + // gap // ............................................................................................................................................. + mul v31.4S, v14.4S, v31.4S // .................*........................................................................................................................... + // gap // ............................................................................................................................................. + sqrdmulh v18.4S, v14.4S, v18.4S // ................*............................................................................................................................ + // gap // ............................................................................................................................................. + sqrdmulh v6.4S, v6.4S, v11.4S // ..................*.......................................................................................................................... + // gap // ............................................................................................................................................. + add v19.4S, v19.4S, v20.4S // ...*......................................................................................................................................... 
+ // gap // ............................................................................................................................................. + add v14.4S, v21.4S, v22.4S // ..*.......................................................................................................................................... + // gap // ............................................................................................................................................. + mls v31.4S, v18.4S, v8.S[0] // ...................*......................................................................................................................... + // gap // ............................................................................................................................................. + mls v17.4S, v6.4S, v8.S[0] // ....................*........................................................................................................................ + // gap // ............................................................................................................................................. + sub v18.4S, v19.4S, v14.4S // .....*....................................................................................................................................... + // gap // ............................................................................................................................................. + ldr q6, [x5, #-176] // ....*........................................................................................................................................ + // gap // ............................................................................................................................................. + // gap // ............................................................................................................................................. 
+ // gap // ............................................................................................................................................. + sub v11.4S, v31.4S, v17.4S // .....................*....................................................................................................................... + // gap // ............................................................................................................................................. + add v19.4S, v19.4S, v14.4S // .......................................................*..................................................................................... + // gap // ............................................................................................................................................. + mul v14.4S, v18.4S, v0.4S // .......*..................................................................................................................................... + // gap // ............................................................................................................................................. + mul v0.4S, v11.4S, v0.4S // ...................................................*......................................................................................... + // gap // ............................................................................................................................................. + sqrdmulh v18.4S, v18.4S, v6.4S // ........*.................................................................................................................................... + // gap // ............................................................................................................................................. + sqrdmulh v6.4S, v11.4S, v6.4S // .......................*..................................................................................................................... 
+ // gap // ............................................................................................................................................. + add v17.4S, v31.4S, v17.4S // .........................................*................................................................................................... + // gap // ............................................................................................................................................. + ldr q31, [x5, #-64] // ......................*...................................................................................................................... + // gap // ............................................................................................................................................. + // gap // ............................................................................................................................................. + // gap // ............................................................................................................................................. + mls v14.4S, v18.4S, v8.S[0] // ...........*................................................................................................................................. + // gap // ............................................................................................................................................. + mls v0.4S, v6.4S, v8.S[0] // ........................................................*.................................................................................... + // gap // ............................................................................................................................................. + trn1 v18.4S, v19.4S, v17.4S // ..........................................................*.................................................................................. 
+ // gap // ............................................................................................................................................. + trn2 v17.4S, v19.4S, v17.4S // ...........................................................*................................................................................. + // gap // ............................................................................................................................................. + ldr q19, [x5, #-48] // ........................*.................................................................................................................... + // gap // ............................................................................................................................................. + // gap // ............................................................................................................................................. + // gap // ............................................................................................................................................. + trn1 v6.4S, v14.4S, v0.4S // ............................................................*................................................................................ + // gap // ............................................................................................................................................. + trn2 v0.4S, v14.4S, v0.4S // .............................................................*............................................................................... + // gap // ............................................................................................................................................. + ldr q14, [x5, #-32] // ..........................*.................................................................................................................. 
+ // gap // ............................................................................................................................................. + // gap // ............................................................................................................................................. + // gap // ............................................................................................................................................. + trn2 v11.2D, v18.2D, v6.2D // ...............................................................*............................................................................. + // gap // ............................................................................................................................................. + trn2 v28.2D, v17.2D, v0.2D // ................................................................*............................................................................ + // gap // ............................................................................................................................................. + trn1 v18.2D, v18.2D, v6.2D // ............................................................................*................................................................ + // gap // ............................................................................................................................................. + sub v6.4S, v11.4S, v28.4S // ..................................................................*.......................................................................... + // gap // ............................................................................................................................................. + trn1 v0.2D, v17.2D, v0.2D // .............................................................................*............................................................... 
+ // gap // ............................................................................................................................................. + add v17.4S, v11.4S, v28.4S // ....................................................................................................*........................................ + // gap // ............................................................................................................................................. + sub v11.4S, v18.4S, v0.4S // ................................................................................*............................................................ + // gap // ............................................................................................................................................. + add v0.4S, v18.4S, v0.4S // ...................................................................................................*......................................... + // gap // ............................................................................................................................................. + ldr q18, [x5, #-16] // ............................*................................................................................................................ + // gap // ............................................................................................................................................. + // gap // ............................................................................................................................................. + // gap // ............................................................................................................................................. + add v28.4S, v0.4S, v17.4S // .......................................................................................................*..................................... 
+ // gap // ............................................................................................................................................. + sub v0.4S, v0.4S, v17.4S // .......................................................................................................................*..................... + // gap // ............................................................................................................................................. + ldr q17, [x5, #-80] // .......................................*..................................................................................................... + // gap // ............................................................................................................................................. + // gap // ............................................................................................................................................. + // gap // ............................................................................................................................................. + srshr v26.4S, v28.4S, #23 // ..........................................................................................................*.................................. + // gap // ............................................................................................................................................. + ldr q29, [x5, #-96] // ............................................*................................................................................................ + // gap // ............................................................................................................................................. + // gap // ............................................................................................................................................. 
+ // gap // ............................................................................................................................................. + mls v28.4S, v26.4S, v8.4S // .............................................................................................................*............................... + // gap // ............................................................................................................................................. + ldr q26, [x4, #16] // .....................................................*....................................................................................... + // gap // ............................................................................................................................................. + // gap // ............................................................................................................................................. + // gap // ............................................................................................................................................. + ldr q9, [x4, #32] // ......................................................*...................................................................................... + // gap // ............................................................................................................................................. + // gap // ............................................................................................................................................. + // gap // ............................................................................................................................................. + mul v4.4S, v11.4S, v26.S[2] // ...................................................................................*......................................................... 
+ // gap // ............................................................................................................................................. + sqrdmulh v11.4S, v11.4S, v26.S[3] // ....................................................................................*........................................................ + // gap // ............................................................................................................................................. + mul v20.4S, v6.4S, v9.S[0] // .....................................................................*....................................................................... + // gap // ............................................................................................................................................. + sqrdmulh v6.4S, v6.4S, v9.S[1] // .................................................................................*........................................................... + // gap // ............................................................................................................................................. + ldr q25, [x4, #48] // .......................................................................*..................................................................... + // gap // ............................................................................................................................................. + // gap // ............................................................................................................................................. + // gap // ............................................................................................................................................. + ld4 {v21.4S, v22.4S, v23.4S, v24.4S}, [x2] // .........................*................................................................................................................... 
+ // gap // ............................................................................................................................................. + // gap // ............................................................................................................................................. + // gap // ............................................................................................................................................. + // gap // ............................................................................................................................................. + // gap // ............................................................................................................................................. + // gap // ............................................................................................................................................. + // gap // ............................................................................................................................................. + // gap // ............................................................................................................................................. + // gap // ............................................................................................................................................. + // gap // ............................................................................................................................................. + // gap // ............................................................................................................................................. + // gap // ............................................................................................................................................. 
+ // gap // ............................................................................................................................................. + // gap // ............................................................................................................................................. + // gap // ............................................................................................................................................. + // gap // ............................................................................................................................................. + // gap // ............................................................................................................................................. + mls v20.4S, v6.4S, v8.S[0] // .....................................................................................*....................................................... + // gap // ............................................................................................................................................. + mls v4.4S, v11.4S, v8.S[0] // ........................................................................................*.................................................... + // gap // ............................................................................................................................................. + sub v6.4S, v21.4S, v22.4S // .............................*............................................................................................................... + // gap // ............................................................................................................................................. + sub v11.4S, v23.4S, v24.4S // ...........................*................................................................................................................. 
+ // gap // ............................................................................................................................................. + add v13.4S, v21.4S, v22.4S // ..................................*.......................................................................................................... + // gap // ............................................................................................................................................. + mul v31.4S, v6.4S, v31.4S // .................................*........................................................................................................... + // gap // ............................................................................................................................................. + sqrdmulh v19.4S, v6.4S, v19.4S // ................................*............................................................................................................ + // gap // ............................................................................................................................................. + mul v6.4S, v11.4S, v14.4S // ..............................*.............................................................................................................. + // gap // ............................................................................................................................................. + sqrdmulh v18.4S, v11.4S, v18.4S // ...............................*............................................................................................................. + // gap // ............................................................................................................................................. + add v14.4S, v23.4S, v24.4S // ...................................*......................................................................................................... 
+ // gap // ............................................................................................................................................. + mls v31.4S, v19.4S, v8.S[0] // .....................................*....................................................................................................... + // gap // ............................................................................................................................................. + add v27.4S, v4.4S, v20.4S // ............................................................................................*................................................ + // gap // ............................................................................................................................................. + mls v6.4S, v18.4S, v8.S[0] // ....................................*........................................................................................................ + // gap // ............................................................................................................................................. + sub v19.4S, v13.4S, v14.4S // ......................................*...................................................................................................... + // gap // ............................................................................................................................................. + add v18.4S, v13.4S, v14.4S // .............................................*............................................................................................... + // gap // ............................................................................................................................................. + srshr v14.4S, v27.4S, #23 // ...............................................................................................*............................................. 
+ // gap // ............................................................................................................................................. + sub v11.4S, v31.4S, v6.4S // ........................................*.................................................................................................... + // gap // ............................................................................................................................................. + sqrdmulh v13.4S, v19.4S, v17.4S // ..........................................*.................................................................................................. + // gap // ............................................................................................................................................. + add v31.4S, v31.4S, v6.4S // ..............................................*.............................................................................................. + // gap // ............................................................................................................................................. + sqrdmulh v17.4S, v11.4S, v17.4S // ...........................................*................................................................................................. + // gap // ............................................................................................................................................. + mul v6.4S, v11.4S, v29.4S // ...............................................*............................................................................................. + // gap // ............................................................................................................................................. + mul v19.4S, v19.4S, v29.4S // ................................................*............................................................................................ 
+ // gap // ............................................................................................................................................. + trn2 v11.4S, v18.4S, v31.4S // .................................................*........................................................................................... + // gap // ............................................................................................................................................. + trn1 v31.4S, v18.4S, v31.4S // ..................................................*.......................................................................................... + // gap // ............................................................................................................................................. + mls v6.4S, v17.4S, v8.S[0] // .........................................................*................................................................................... + // gap // ............................................................................................................................................. + mls v19.4S, v13.4S, v8.S[0] // ....................................................*........................................................................................ + // gap // ............................................................................................................................................. + mls v27.4S, v14.4S, v8.4S // ..................................................................................................*.......................................... + // gap // ............................................................................................................................................. + sub v17.4S, v4.4S, v20.4S // ..............................................................................................................*.............................. 
+ // gap // ............................................................................................................................................. + ldr q29, [x4], #64 // ...............................................................................................................*............................. + // gap // ............................................................................................................................................. + // gap // ............................................................................................................................................. + // gap // ............................................................................................................................................. + trn1 v18.4S, v19.4S, v6.4S // ..............................................................*.............................................................................. + // gap // ............................................................................................................................................. + trn2 v19.4S, v19.4S, v6.4S // .................................................................*........................................................................... + // gap // ............................................................................................................................................. + mul v6.4S, v17.4S, v29.S[2] // ..................................................................................................................*.......................... + // gap // ............................................................................................................................................. + trn2 v14.2D, v31.2D, v18.2D // ...................................................................*......................................................................... 
+ // gap // ............................................................................................................................................. + trn2 v4.2D, v11.2D, v19.2D // ....................................................................*........................................................................ + // gap // ............................................................................................................................................. + trn1 v19.2D, v11.2D, v19.2D // ........................................................................*.................................................................... + // gap // ............................................................................................................................................. + sub v11.4S, v14.4S, v4.4S // ......................................................................*...................................................................... + // gap // ............................................................................................................................................. + mul v20.4S, v0.4S, v29.S[2] // .............................................................................................................................*............... + // gap // ............................................................................................................................................. + sqrdmulh v0.4S, v0.4S, v29.S[3] // ..............................................................................................................................*.............. + // gap // ............................................................................................................................................. + trn1 v31.2D, v31.2D, v18.2D // .........................................................................*................................................................... 
+ // gap // ............................................................................................................................................. + mul v18.4S, v11.4S, v25.S[0] // ..........................................................................*.................................................................. + // gap // ............................................................................................................................................. + sub v13.4S, v31.4S, v19.4S // ...........................................................................*................................................................. + // gap // ............................................................................................................................................. + sqrdmulh v11.4S, v11.4S, v25.S[1] // ..................................................................................*.......................................................... + // gap // ............................................................................................................................................. + add v31.4S, v31.4S, v19.4S // .........................................................................................*................................................... + // gap // ............................................................................................................................................. + mul v19.4S, v13.4S, v9.S[2] // ..............................................................................*.............................................................. + // gap // ............................................................................................................................................. + sqrdmulh v9.4S, v13.4S, v9.S[3] // ...............................................................................*............................................................. 
+ // gap // ............................................................................................................................................. + mls v18.4S, v11.4S, v8.S[0] // .......................................................................................*..................................................... + // gap // ............................................................................................................................................. + add v14.4S, v14.4S, v4.4S // ..........................................................................................*.................................................. + // gap // ............................................................................................................................................. + sqrdmulh v17.4S, v17.4S, v29.S[3] // ....................................................................................................................*........................ + // gap // ............................................................................................................................................. + mls v19.4S, v9.4S, v8.S[0] // ......................................................................................*...................................................... + // gap // ............................................................................................................................................. + add v11.4S, v31.4S, v14.4S // ................................................................................................*............................................ + // gap // ............................................................................................................................................. + sub v31.4S, v31.4S, v14.4S // .............................................................................................*............................................... 
+ // gap // ............................................................................................................................................. + mls v6.4S, v17.4S, v8.S[0] // ........................................................................................................................*.................... + // gap // ............................................................................................................................................. + srshr v17.4S, v11.4S, #23 // ........................................................................................................*.................................... + // gap // ............................................................................................................................................. + sub v14.4S, v19.4S, v18.4S // .....................................................................................................*....................................... + // gap // ............................................................................................................................................. + sqrdmulh v9.4S, v31.4S, v26.S[1] // ...........................................................................................................................*................. + // gap // ............................................................................................................................................. + mls v11.4S, v17.4S, v8.4S // ............................................................................................................*................................ + // gap // ............................................................................................................................................. + mul v17.4S, v14.4S, v26.S[0] // ...........................................................................................................*................................. 
+ // gap // ............................................................................................................................................. + sqrdmulh v14.4S, v14.4S, v26.S[1] // .................................................................................................................*........................... + // gap // ............................................................................................................................................. + mul v31.4S, v31.4S, v26.S[0] // .................................................................................................................................*........... + // gap // ............................................................................................................................................. + add v26.4S, v28.4S, v11.4S // ................................................................................................................*............................ + // gap // ............................................................................................................................................. + sub v11.4S, v28.4S, v11.4S // ......................................................................................................................*...................... + // gap // ............................................................................................................................................. + add v4.4S, v19.4S, v18.4S // ...........................................................................................*................................................. + // gap // ............................................................................................................................................. + str q26, [x1], #(16*4) // ...................................................................................................................*......................... 
+ // gap // ............................................................................................................................................. + mls v17.4S, v14.4S, v8.S[0] // .....................................................................................................................*....................... + // gap // ............................................................................................................................................. + srshr v19.4S, v4.4S, #23 // ..............................................................................................*.............................................. + // gap // ............................................................................................................................................. + sqrdmulh v18.4S, v11.4S, v29.S[1] // .........................................................................................................................*................... + // gap // ............................................................................................................................................. + mul v14.4S, v11.4S, v29.S[0] // ..........................................................................................................................*.................. + // gap // ............................................................................................................................................. + sub v11.4S, v6.4S, v17.4S // ............................................................................................................................*................ + // gap // ............................................................................................................................................. + mls v4.4S, v19.4S, v8.4S // .................................................................................................*........................................... 
+ // gap // ............................................................................................................................................. + add v17.4S, v6.4S, v17.4S // .......................................................................................................................................*..... + // gap // ............................................................................................................................................. + sqrdmulh v19.4S, v11.4S, v29.S[1] // ................................................................................................................................*............ + // gap // ............................................................................................................................................. + mul v6.4S, v11.4S, v29.S[0] // ...............................................................................................................................*............. + // gap // ............................................................................................................................................. + mls v14.4S, v18.4S, v8.S[0] // ..................................................................................................................................*.......... + // gap // ............................................................................................................................................. + mls v20.4S, v0.4S, v8.S[0] // ...................................................................................................................................*......... + // gap // ............................................................................................................................................. + str q17, [x1, #-16] // ..........................................................................................................................................*.. 
+ // gap // ............................................................................................................................................. + mls v6.4S, v19.4S, v8.S[0] // ....................................................................................................................................*........ + // gap // ............................................................................................................................................. + mls v31.4S, v9.4S, v8.S[0] // .....................................................................................................................................*....... + // gap // ............................................................................................................................................. + str q14, [x2], #(16*4) // ......................................................................................................................................*...... + // gap // ............................................................................................................................................. + add v0.4S, v27.4S, v4.4S // ......................................................................................................*...................................... + // gap // ............................................................................................................................................. + str q6, [x2, #-16] // ........................................................................................................................................*.... + // gap // ............................................................................................................................................. + add v17.4S, v20.4S, v31.4S // .........................................................................................................................................*... 
+ // gap // ............................................................................................................................................. + str q0, [x1, #-48] // .........................................................................................................*................................... + // gap // ............................................................................................................................................. + sub v10.4S, v20.4S, v31.4S // ...........................................................................................................................................*. + // gap // ............................................................................................................................................. + str q17, [x1, #-32] // ............................................................................................................................................* + // gap // ............................................................................................................................................. // original source code - // ldr q2, [x5], #(12*16) // ..........*............................................................................. - // ldr q21, [x5, #-176] // .......................*................................................................ - // ldr q0, [x5, #-160] // ..............................*......................................................... - // ldr q16, [x5, #-144] // ...............*........................................................................ - // ldr q23, [x5, #-128] // ................*....................................................................... - // ldr q26, [x5, #-112] // ..................*..................................................................... - // ldr q20, [x5, #-96] // ......................................*................................................. 
- // ldr q17, [x5, #-80] // ..............*......................................................................... - // ldr q30, [x5, #-64] // ..................................*..................................................... - // ldr q25, [x5, #-48] // .*...................................................................................... - // ldr q11, [x5, #-32] // ...*.................................................................................... - // ldr q7, [x5, #-16] // ......*................................................................................. - // ld4 {v12.4S, v13.4S, v14.4S, v15.4S}, [x1] // .............*.......................................................................... - // ld4 {v3.4S, v4.4S, v5.4S, v6.4S}, [x2] // *....................................................................................... - // sub v10.4S, v12.4S, v13.4S // ..........................*............................................................. - // add v13.4S, v12.4S, v13.4S // ....................*................................................................... - // sub v19.4S, v14.4S, v15.4S // .................*...................................................................... - // mul v0.4S, v10.4S, v0.4S // .................................*...................................................... - // sqrdmulh v16.4S, v10.4S, v16.4S // .............................*.......................................................... - // mul v23.4S, v19.4S, v23.4S // ...................*.................................................................... - // sqrdmulh v26.4S, v19.4S, v26.4S // .....................*.................................................................. - // add v10.4S, v14.4S, v15.4S // ......................*................................................................. - // mls v0.4S, v16.4S, v8.S[0] // ....................................*................................................... 
- // sub v16.4S, v3.4S, v4.4S // ..*..................................................................................... - // mls v23.4S, v26.4S, v8.S[0] // ........................*............................................................... - // sub v26.4S, v13.4S, v10.4S // .........................*.............................................................. - // add v13.4S, v13.4S, v10.4S // ...........................*............................................................ - // add v4.4S, v3.4S, v4.4S // ........*............................................................................... - // sub v3.4S, v0.4S, v23.4S // .......................................*................................................ - // mul v10.4S, v26.4S, v2.4S // ...............................*........................................................ - // sqrdmulh v26.4S, v26.4S, v21.4S // ............................*........................................................... - // mul v2.4S, v3.4S, v2.4S // ...........................................*............................................ - // sqrdmulh v21.4S, v3.4S, v21.4S // ..........................................*............................................. - // add v0.4S, v0.4S, v23.4S // .........................................*.............................................. - // mls v10.4S, v26.4S, v8.S[0] // ..............................................*......................................... - // mul v23.4S, v16.4S, v30.4S // .....................................*.................................................. - // mls v2.4S, v21.4S, v8.S[0] // ...............................................*........................................ - // sqrdmulh v16.4S, v16.4S, v25.4S // ....*................................................................................... - // sub v21.4S, v5.4S, v6.4S // .....*.................................................................................. 
- // add v26.4S, v5.4S, v6.4S // ...........*............................................................................ - // trn1 v30.4S, v13.4S, v0.4S // ..................................................*..................................... - // mls v23.4S, v16.4S, v8.S[0] // ........................................*............................................... - // mul v16.4S, v21.4S, v11.4S // .......*................................................................................ - // sqrdmulh v21.4S, v21.4S, v7.4S // .........*.............................................................................. - // sub v25.4S, v4.4S, v26.4S // ................................*....................................................... - // add v26.4S, v4.4S, v26.4S // ...........................................................*............................ - // trn2 v0.4S, v13.4S, v0.4S // .............................................*.......................................... - // mls v16.4S, v21.4S, v8.S[0] // ............*........................................................................... - // mul v21.4S, v25.4S, v20.4S // .....................................................*.................................. - // sqrdmulh v4.4S, v25.4S, v17.4S // ...................................*.................................................... - // trn1 v25.4S, v10.4S, v2.4S // ....................................................*................................... - // sub v11.4S, v23.4S, v16.4S // ............................................*........................................... - // add v16.4S, v23.4S, v16.4S // ............................................................*........................... - // mls v21.4S, v4.4S, v8.S[0] // .........................................................*.............................. - // mul v23.4S, v11.4S, v20.4S // .................................................*...................................... 
- // sqrdmulh v20.4S, v11.4S, v17.4S // ................................................*....................................... - // trn2 v2.4S, v10.4S, v2.4S // ...................................................*.................................... - // trn2 v17.2D, v30.2D, v25.2D // ........................................................*............................... - // trn1 v30.2D, v30.2D, v25.2D // .............................................................*.......................... - // mls v23.4S, v20.4S, v8.S[0] // ..........................................................*............................. - // trn2 v20.2D, v0.2D, v2.2D // ......................................................*................................. - // trn1 v0.2D, v0.2D, v2.2D // .......................................................*................................ - // trn1 v2.4S, v26.4S, v16.4S // ................................................................*....................... - // trn1 v4.4S, v21.4S, v23.4S // .................................................................*...................... - // trn2 v16.4S, v26.4S, v16.4S // ...............................................................*........................ - // trn2 v23.4S, v21.4S, v23.4S // ..............................................................*......................... - // trn2 v25.2D, v2.2D, v4.2D // .....................................................................*.................. - // trn1 v21.2D, v2.2D, v4.2D // ....................................................................*................... - // trn2 v4.2D, v16.2D, v23.2D // ..................................................................*..................... - // trn1 v16.2D, v16.2D, v23.2D // ...................................................................*.................... - // add v2.4S, v25.4S, v4.4S // .......................................................................*................ 
- // add v23.4S, v21.4S, v16.4S // ......................................................................*................. - // add v11.4S, v30.4S, v0.4S // ........................................................................*............... - // add v13.4S, v17.4S, v20.4S // .........................................................................*.............. - // sub v7.4S, v23.4S, v2.4S // ...........................................................................*............ - // add v2.4S, v23.4S, v2.4S // ..........................................................................*............. - // add v23.4S, v11.4S, v13.4S // ............................................................................*........... - // ldr q3, [x4], #64 // .............................................................................*.......... - // sub v26.4S, v23.4S, v2.4S // ..............................................................................*......... - // ldr q10, [x4, #-48] // ...............................................................................*........ - // mul v19.4S, v26.4S, v3.S[0] // ................................................................................*....... - // sqrdmulh v26.4S, v26.4S, v3.S[1] // .................................................................................*...... - // add v2.4S, v23.4S, v2.4S // ..................................................................................*..... - // ldr q1, [x4, #-32] // ...................................................................................*.... - // mls v19.4S, v26.4S, v8.S[0] // ....................................................................................*... - // str q2, [x1], #(16*4) // .....................................................................................*.. - // ldr q6, [x4, #-16] // ......................................................................................*. 
- // str q19, [x2], #(16*4) // .......................................................................................* + // ld4 {v16.4S, v17.4S, v18.4S, v19.4S}, [x1] // ...*......................................................................................................................................... + // ldr q2, [x5], #(12*16) // *............................................................................................................................................ + // add v0.4S, v18.4S, v19.4S // .............*............................................................................................................................... + // add v21.4S, v16.4S, v17.4S // ............*................................................................................................................................ + // ldr q27, [x5, #-176] // .................*........................................................................................................................... + // sub v24.4S, v21.4S, v0.4S // ................*............................................................................................................................ + // ldr q25, [x5, #-128] // .*........................................................................................................................................... + // mul v28.4S, v24.4S, v2.4S // ....................*........................................................................................................................ + // sqrdmulh v23.4S, v24.4S, v27.4S // ......................*...................................................................................................................... + // sub v12.4S, v18.4S, v19.4S // .....*....................................................................................................................................... 
+ // ldr q3, [x5, #-160] // ..*.......................................................................................................................................... + // mls v28.4S, v23.4S, v8.S[0] // ..........................*.................................................................................................................. + // mul v1.4S, v12.4S, v25.4S // ........*.................................................................................................................................... + // sub v13.4S, v16.4S, v17.4S // ......*...................................................................................................................................... + // ldr q22, [x5, #-144] // ....*........................................................................................................................................ + // ldr q5, [x5, #-112] // .......*..................................................................................................................................... + // sqrdmulh v20.4S, v13.4S, v22.4S // ..........*.................................................................................................................................. + // mul v11.4S, v13.4S, v3.4S // .........*................................................................................................................................... + // sqrdmulh v30.4S, v12.4S, v5.4S // ...........*................................................................................................................................. + // mls v11.4S, v20.4S, v8.S[0] // ..............*.............................................................................................................................. + // mls v1.4S, v30.4S, v8.S[0] // ...............*............................................................................................................................. 
+ // sub v31.4S, v11.4S, v1.4S // ..................*.......................................................................................................................... + // ldr q12, [x5, #-64] // .........................*................................................................................................................... + // sqrdmulh v24.4S, v31.4S, v27.4S // .......................*..................................................................................................................... + // ldr q4, [x5, #-48] // ..............................*.............................................................................................................. + // ld4 {v13.4S, v14.4S, v15.4S, v16.4S}, [x2] // ........................................................*.................................................................................... + // ldr q17, [x5, #-32] // .................................*........................................................................................................... + // sub v20.4S, v15.4S, v16.4S // ............................................................*................................................................................ + // ldr q5, [x5, #-16] // ..........................................*.................................................................................................. + // sub v30.4S, v13.4S, v14.4S // ...........................................................*................................................................................. + // mul v3.4S, v20.4S, v17.4S // ................................................................*............................................................................ + // sqrdmulh v27.4S, v20.4S, v5.4S // .................................................................*........................................................................... 
+ // sqrdmulh v29.4S, v30.4S, v4.4S // ...............................................................*............................................................................. + // mul v12.4S, v30.4S, v12.4S // ..............................................................*.............................................................................. + // add v14.4S, v13.4S, v14.4S // .............................................................*............................................................................... + // add v6.4S, v15.4S, v16.4S // ..................................................................*.......................................................................... + // mls v3.4S, v27.4S, v8.S[0] // .....................................................................*....................................................................... + // mls v12.4S, v29.4S, v8.S[0] // ...................................................................*......................................................................... + // sub v20.4S, v14.4S, v6.4S // ......................................................................*...................................................................... + // ldr q26, [x5, #-80] // .............................................*............................................................................................... + // sub v13.4S, v12.4S, v3.4S // .........................................................................*................................................................... + // add v22.4S, v11.4S, v1.4S // ........................*.................................................................................................................... + // sqrdmulh v4.4S, v20.4S, v26.4S // ..........................................................................*.................................................................. 
+ // sqrdmulh v7.4S, v13.4S, v26.4S // ............................................................................*................................................................ + // ldr q19, [x5, #-96] // ...............................................*............................................................................................. + // add v25.4S, v14.4S, v6.4S // .......................................................................*..................................................................... + // add v1.4S, v12.4S, v3.4S // ...........................................................................*................................................................. + // mul v5.4S, v13.4S, v19.4S // .............................................................................*............................................................... + // mul v29.4S, v20.4S, v19.4S // ..............................................................................*.............................................................. + // trn2 v18.4S, v25.4S, v1.4S // ...............................................................................*............................................................. + // trn1 v12.4S, v25.4S, v1.4S // ................................................................................*............................................................ + // mul v30.4S, v31.4S, v2.4S // .....................*....................................................................................................................... + // mls v29.4S, v4.4S, v8.S[0] // ..................................................................................*.......................................................... + // ldr q10, [x4, #16] // .................................................*........................................................................................... 
+ // ldr q15, [x4, #32] // ..................................................*.......................................................................................... + // add v27.4S, v21.4S, v0.4S // ...................*......................................................................................................................... + // mls v30.4S, v24.4S, v8.S[0] // ...........................*................................................................................................................. + // mls v5.4S, v7.4S, v8.S[0] // .................................................................................*........................................................... + // trn1 v17.4S, v27.4S, v22.4S // ............................*................................................................................................................ + // trn2 v2.4S, v27.4S, v22.4S // .............................*............................................................................................................... + // trn1 v11.4S, v28.4S, v30.4S // ...............................*............................................................................................................. + // trn2 v26.4S, v28.4S, v30.4S // ................................*............................................................................................................ + // trn1 v4.4S, v29.4S, v5.4S // ......................................................................................*...................................................... + // trn2 v7.2D, v17.2D, v11.2D // ..................................*.......................................................................................................... + // trn2 v3.2D, v2.2D, v26.2D // ...................................*......................................................................................................... 
+ // trn2 v0.4S, v29.4S, v5.4S // .......................................................................................*..................................................... + // sub v20.4S, v7.4S, v3.4S // .....................................*....................................................................................................... + // trn2 v14.2D, v12.2D, v4.2D // .........................................................................................*................................................... + // trn2 v27.2D, v18.2D, v0.2D // ..........................................................................................*.................................................. + // mul v31.4S, v20.4S, v15.S[0] // .....................................................*....................................................................................... + // sub v1.4S, v14.4S, v27.4S // ............................................................................................*................................................ + // ldr q23, [x4, #48] // .......................................................*..................................................................................... + // trn1 v19.2D, v18.2D, v0.2D // ...........................................................................................*................................................. + // trn1 v25.2D, v12.2D, v4.2D // ...............................................................................................*............................................. + // mul v29.4S, v1.4S, v23.S[0] // ................................................................................................*............................................ + // sub v5.4S, v25.4S, v19.4S // .................................................................................................*........................................... 
+ // trn1 v6.2D, v17.2D, v11.2D // ....................................*........................................................................................................ + // trn1 v9.2D, v2.2D, v26.2D // ......................................*...................................................................................................... + // mul v24.4S, v5.4S, v15.S[2] // ....................................................................................................*........................................ + // sqrdmulh v11.4S, v5.4S, v15.S[3] // .....................................................................................................*....................................... + // sub v16.4S, v6.4S, v9.4S // ........................................*.................................................................................................... + // sqrdmulh v13.4S, v20.4S, v15.S[1] // ......................................................*...................................................................................... + // sqrdmulh v30.4S, v1.4S, v23.S[1] // ..................................................................................................*.......................................... + // mul v22.4S, v16.4S, v10.S[2] // ...................................................*......................................................................................... + // sqrdmulh v17.4S, v16.4S, v10.S[3] // ....................................................*........................................................................................ + // mls v31.4S, v13.4S, v8.S[0] // .........................................................*................................................................................... + // mls v24.4S, v11.4S, v8.S[0] // .........................................................................................................*................................... 
+ // mls v29.4S, v30.4S, v8.S[0] // ......................................................................................................*...................................... + // mls v22.4S, v17.4S, v8.S[0] // ..........................................................*.................................................................................. + // add v17.4S, v25.4S, v19.4S // ...................................................................................................*......................................... + // add v18.4S, v14.4S, v27.4S // .......................................................................................................*..................................... + // add v4.4S, v24.4S, v29.4S // ......................................................................................................................*...................... + // add v27.4S, v22.4S, v31.4S // ....................................................................*........................................................................ + // sub v5.4S, v17.4S, v18.4S // ...........................................................................................................*................................. + // srshr v25.4S, v4.4S, #23 // .........................................................................................................................*................... + // srshr v26.4S, v27.4S, #23 // ........................................................................*.................................................................... + // add v21.4S, v17.4S, v18.4S // ..........................................................................................................*.................................. + // mls v4.4S, v25.4S, v8.4S // .............................................................................................................................*............... 
+ // mls v27.4S, v26.4S, v8.4S // ...................................................................................*......................................................... + // add v28.4S, v6.4S, v9.4S // .........................................*................................................................................................... + // add v23.4S, v7.4S, v3.4S // .......................................*..................................................................................................... + // sub v13.4S, v24.4S, v29.4S // ..............................................................................................................*.............................. + // add v18.4S, v27.4S, v4.4S // .......................................................................................................................................*..... + // add v2.4S, v28.4S, v23.4S // ...........................................*................................................................................................. + // srshr v26.4S, v21.4S, #23 // .............................................................................................................*............................... + // str q18, [x1, #16] // ..........................................................................................................................................*.. + // srshr v12.4S, v2.4S, #23 // ..............................................*.............................................................................................. + // mul v19.4S, v13.4S, v10.S[0] // .................................................................................................................*........................... + // mls v21.4S, v26.4S, v8.4S // ................................................................................................................*............................ 
+ // mls v2.4S, v12.4S, v8.4S // ................................................*............................................................................................ + // sub v6.4S, v22.4S, v31.4S // ....................................................................................*........................................................ + // ldr q29, [x4], #64 // .....................................................................................*....................................................... + // add v14.4S, v2.4S, v21.4S // ....................................................................................................................*........................ + // sqrdmulh v11.4S, v13.4S, v10.S[1] // ..................................................................................................................*.......................... + // mul v9.4S, v6.4S, v29.S[2] // ........................................................................................*.................................................... + // str q14, [x1], #(16*4) // .......................................................................................................................*..................... + // sqrdmulh v1.4S, v6.4S, v29.S[3] // ........................................................................................................*.................................... + // mls v19.4S, v11.4S, v8.S[0] // ........................................................................................................................*.................... + // sub v7.4S, v2.4S, v21.4S // .....................................................................................................................*....................... + // sub v14.4S, v28.4S, v23.4S // ............................................*................................................................................................ 
+ // mls v9.4S, v1.4S, v8.S[0] // ............................................................................................................*................................ + // sqrdmulh v1.4S, v7.4S, v29.S[1] // ..........................................................................................................................*.................. + // mul v23.4S, v7.4S, v29.S[0] // ...........................................................................................................................*................. + // sqrdmulh v0.4S, v5.4S, v10.S[1] // ...............................................................................................................*............................. + // sub v15.4S, v9.4S, v19.4S // ............................................................................................................................*................ + // mul v12.4S, v14.4S, v29.S[2] // .............................................................................................*............................................... + // sqrdmulh v17.4S, v14.4S, v29.S[3] // ..............................................................................................*.............................................. + // mul v30.4S, v15.4S, v29.S[0] // ................................................................................................................................*............ + // sqrdmulh v31.4S, v15.4S, v29.S[1] // ...............................................................................................................................*............. + // mul v16.4S, v5.4S, v10.S[0] // ...................................................................................................................*......................... + // mls v23.4S, v1.4S, v8.S[0] // .................................................................................................................................*........... 
+ // mls v12.4S, v17.4S, v8.S[0] // ..................................................................................................................................*.......... + // mls v30.4S, v31.4S, v8.S[0] // ....................................................................................................................................*........ + // mls v16.4S, v0.4S, v8.S[0] // .....................................................................................................................................*....... + // str q23, [x2], #(16*4) // ......................................................................................................................................*...... + // add v24.4S, v9.4S, v19.4S // ..............................................................................................................................*.............. + // str q30, [x2, #-16] // ........................................................................................................................................*.... + // add v22.4S, v12.4S, v16.4S // .........................................................................................................................................*... + // str q24, [x1, #-16] // ...................................................................................................................................*......... + // sub v10.4S, v12.4S, v16.4S // ...........................................................................................................................................*. + // str q22, [x1, #-32] // ............................................................................................................................................* sub count, count, #1 layer45678_start: - sub v0.4S, v30.4S, v0.4S // ..........................................................................*..................................................................... 
- // gap // ................................................................................................................................................ - sub v2.4S, v17.4S, v20.4S // ...............................................................................*................................................................ - // gap // ................................................................................................................................................ - sub v16.4S, v21.4S, v16.4S // ....................................................................................*........................................................... - // gap // ................................................................................................................................................ - mul v23.4S, v0.4S, v10.S[2] // ............................................................................*................................................................... - // gap // ................................................................................................................................................ - sqrdmulh v0.4S, v0.4S, v10.S[3] // .............................................................................*.................................................................. - // gap // ................................................................................................................................................ - mul v21.4S, v2.4S, v1.S[0] // .................................................................................*.............................................................. - // gap // ................................................................................................................................................ 
- sqrdmulh v2.4S, v2.4S, v1.S[1] // ..................................................................................*............................................................. - // gap // ................................................................................................................................................ - mul v26.4S, v16.4S, v1.S[2] // ......................................................................................*......................................................... - // gap // ................................................................................................................................................ - sqrdmulh v16.4S, v16.4S, v1.S[3] // .......................................................................................*........................................................ - // gap // ................................................................................................................................................ - mls v23.4S, v0.4S, v8.S[0] // ..............................................................................*................................................................. - // gap // ................................................................................................................................................ - mls v21.4S, v2.4S, v8.S[0] // ...................................................................................*............................................................ - // gap // ................................................................................................................................................ - sub v0.4S, v25.4S, v4.4S // .........................................................................................*...................................................... 
- // gap // ................................................................................................................................................ - mls v26.4S, v16.4S, v8.S[0] // ........................................................................................*....................................................... - // gap // ................................................................................................................................................ - sub v2.4S, v11.4S, v13.4S // ..............................................................................................*................................................. - // gap // ................................................................................................................................................ - mul v16.4S, v0.4S, v6.S[0] // ...........................................................................................*.................................................... - // gap // ................................................................................................................................................ - sqrdmulh v0.4S, v0.4S, v6.S[1] // ............................................................................................*................................................... - // gap // ................................................................................................................................................ - mul v20.4S, v2.4S, v3.S[2] // ................................................................................................*............................................... - // gap // ................................................................................................................................................ 
- sqrdmulh v2.4S, v2.4S, v3.S[3] // .................................................................................................*.............................................. - // gap // ................................................................................................................................................ - sub v17.4S, v23.4S, v21.4S // ...................................................................................................*............................................ - // gap // ................................................................................................................................................ - add v23.4S, v23.4S, v21.4S // ....................................................................................................*........................................... - // gap // ................................................................................................................................................ - mls v16.4S, v0.4S, v8.S[0] // .............................................................................................*.................................................. - // gap // ................................................................................................................................................ - mls v20.4S, v2.4S, v8.S[0] // ..................................................................................................*............................................. - // gap // ................................................................................................................................................ - mul v0.4S, v17.4S, v3.S[2] // .....................................................................................................*.......................................... 
- // gap // ................................................................................................................................................ - sqrdmulh v2.4S, v17.4S, v3.S[3] // ......................................................................................................*......................................... - // gap // ................................................................................................................................................ - mul v21.4S, v7.4S, v10.S[0] // ..........................................................................................................*..................................... - // gap // ................................................................................................................................................ - sub v17.4S, v26.4S, v16.4S // .............................................................................................................*.................................. - // gap // ................................................................................................................................................ - add v16.4S, v26.4S, v16.4S // ..............................................................................................................*................................. - // gap // ................................................................................................................................................ - mls v0.4S, v2.4S, v8.S[0] // .......................................................................................................*........................................ - // gap // ................................................................................................................................................ 
- sqrdmulh v2.4S, v7.4S, v10.S[1] // ...........................................................................................................*.................................... - // gap // ................................................................................................................................................ - mul v26.4S, v17.4S, v10.S[0] // ...............................................................................................................*................................ - // gap // ................................................................................................................................................ - sqrdmulh v17.4S, v17.4S, v10.S[1] // ................................................................................................................*............................... - // gap // ................................................................................................................................................ - sub v30.4S, v23.4S, v16.4S // .......................................................................................................................*........................ - // gap // ................................................................................................................................................ - mls v21.4S, v2.4S, v8.S[0] // ............................................................................................................*................................... - // gap // ................................................................................................................................................ - add v2.4S, v23.4S, v16.4S // ........................................................................................................................*....................... 
- // gap // ................................................................................................................................................ - mls v26.4S, v17.4S, v8.S[0] // .................................................................................................................*.............................. - // gap // ................................................................................................................................................ - mul v16.4S, v30.4S, v3.S[0] // .........................................................................................................................*...................... - // gap // ................................................................................................................................................ - sqrdmulh v23.4S, v30.4S, v3.S[1] // ..........................................................................................................................*..................... - // gap // ................................................................................................................................................ - sub v17.4S, v20.4S, v21.4S // ............................................................................................................................*................... - // gap // ................................................................................................................................................ - add v21.4S, v20.4S, v21.4S // .............................................................................................................................*.................. - // gap // ................................................................................................................................................ 
- sub v20.4S, v0.4S, v26.4S // .................................................................................................................................*.............. - // gap // ................................................................................................................................................ - mls v16.4S, v23.4S, v8.S[0] // ...........................................................................................................................*.................... - // gap // ................................................................................................................................................ - mul v23.4S, v17.4S, v3.S[0] // ..............................................................................................................................*................. - // gap // ................................................................................................................................................ - sqrdmulh v17.4S, v17.4S, v3.S[1] // ...............................................................................................................................*................ - // gap // ................................................................................................................................................ - add v0.4S, v0.4S, v26.4S // ..................................................................................................................................*............. - // gap // ................................................................................................................................................ - mul v26.4S, v20.4S, v3.S[0] // ...................................................................................................................................*............ 
- // gap // ................................................................................................................................................ - sqrdmulh v20.4S, v20.4S, v3.S[1] // ....................................................................................................................................*........... - // gap // ................................................................................................................................................ - mls v23.4S, v17.4S, v8.S[0] // ................................................................................................................................*............... - // gap // ................................................................................................................................................ - str q2, [x1, #-48] // .......................................................................................................................................*........ - // gap // ................................................................................................................................................ - ldr q2, [x5], #(12*16) // ..e............................................................................................................................................. - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - mls v26.4S, v20.4S, v8.S[0] // .....................................................................................................................................*.......... 
- // gap // ................................................................................................................................................ - str q21, [x1, #-32] // ........................................................................................................................................*....... - // gap // ................................................................................................................................................ - ldr q21, [x5, #-176] // ...e............................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - str q0, [x1, #-16] // .........................................................................................................................................*...... - add x1, x1, #64 // ..............................................................................................................................................*. - ldr q0, [x5, #-160] // ....e........................................................................................................................................... - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ 
- // gap // ................................................................................................................................................ - str q16, [x2, #-48] // ...........................................................................................................................................*.... - // gap // ................................................................................................................................................ - ldr q16, [x5, #-144] // .....e.......................................................................................................................................... - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - str q23, [x2, #-32] // ............................................................................................................................................*... - // gap // ................................................................................................................................................ - ldr q23, [x5, #-128] // ......e......................................................................................................................................... - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ 
- // gap // ................................................................................................................................................ - str q26, [x2, #-16] // .............................................................................................................................................*.. - add x2, x2, #64 // ...............................................................................................................................................* - ldr q26, [x5, #-112] // .......e........................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - ldr q20, [x5, #-96] // ............................e................................................................................................................... - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - ldr q17, [x5, #-80] // .............................e.................................................................................................................. 
- // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - ldr q30, [x5, #-64] // ..............................e................................................................................................................. - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - ldr q25, [x5, #-48] // ...............................e................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - ldr q11, [x5, #-32] // ................................e............................................................................................................... 
- // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - ldr q7, [x5, #-16] // .................................e.............................................................................................................. - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - ld4 {v12.4S, v13.4S, v14.4S, v15.4S}, [x1] // e............................................................................................................................................... - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ 
- // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ 
- // gap // ................................................................................................................................................ - ld4 {v3.4S, v4.4S, v5.4S, v6.4S}, [x2] // .e.............................................................................................................................................. - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ 
- // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - sub v10.4S, v12.4S, v13.4S // ........e....................................................................................................................................... - // gap // ................................................................................................................................................ - add v13.4S, v12.4S, v13.4S // .........e...................................................................................................................................... - // gap // ................................................................................................................................................ - sub v19.4S, v14.4S, v15.4S // .............e.................................................................................................................................. 
- // gap // ................................................................................................................................................ - mul v0.4S, v10.4S, v0.4S // ..........e..................................................................................................................................... - // gap // ................................................................................................................................................ - sqrdmulh v16.4S, v10.4S, v16.4S // ...........e.................................................................................................................................... - // gap // ................................................................................................................................................ - mul v23.4S, v19.4S, v23.4S // ...............e................................................................................................................................ - // gap // ................................................................................................................................................ - sqrdmulh v26.4S, v19.4S, v26.4S // ................e............................................................................................................................... - // gap // ................................................................................................................................................ - add v10.4S, v14.4S, v15.4S // ..............e................................................................................................................................. - // gap // ................................................................................................................................................ 
- mls v0.4S, v16.4S, v8.S[0] // ............e................................................................................................................................... - // gap // ................................................................................................................................................ - sub v16.4S, v3.4S, v4.4S // ..................................e............................................................................................................. - // gap // ................................................................................................................................................ - mls v23.4S, v26.4S, v8.S[0] // .................e.............................................................................................................................. - // gap // ................................................................................................................................................ - sub v26.4S, v13.4S, v10.4S // ..................e............................................................................................................................. - // gap // ................................................................................................................................................ - add v13.4S, v13.4S, v10.4S // ...................e............................................................................................................................ - // gap // ................................................................................................................................................ - add v4.4S, v3.4S, v4.4S // ...................................e............................................................................................................ - // gap // ................................................................................................................................................ 
- sub v3.4S, v0.4S, v23.4S // .......................e........................................................................................................................ - // gap // ................................................................................................................................................ - mul v10.4S, v26.4S, v2.4S // ....................e........................................................................................................................... - // gap // ................................................................................................................................................ - sqrdmulh v26.4S, v26.4S, v21.4S // .....................e.......................................................................................................................... - // gap // ................................................................................................................................................ - mul v2.4S, v3.4S, v2.4S // .........................e...................................................................................................................... - // gap // ................................................................................................................................................ - sqrdmulh v21.4S, v3.4S, v21.4S // ..........................e..................................................................................................................... - // gap // ................................................................................................................................................ - add v0.4S, v0.4S, v23.4S // ........................e....................................................................................................................... 
- // gap // ................................................................................................................................................ - mls v10.4S, v26.4S, v8.S[0] // ......................e......................................................................................................................... - // gap // ................................................................................................................................................ - mul v23.4S, v16.4S, v30.4S // ....................................e........................................................................................................... - // gap // ................................................................................................................................................ - mls v2.4S, v21.4S, v8.S[0] // ...........................e.................................................................................................................... - // gap // ................................................................................................................................................ - sqrdmulh v16.4S, v16.4S, v25.4S // .....................................e.......................................................................................................... - // gap // ................................................................................................................................................ - sub v21.4S, v5.4S, v6.4S // .......................................e........................................................................................................ - // gap // ................................................................................................................................................ 
- add v26.4S, v5.4S, v6.4S // ........................................e....................................................................................................... - // gap // ................................................................................................................................................ - trn1 v30.4S, v13.4S, v0.4S // ......................................................e......................................................................................... - // gap // ................................................................................................................................................ - mls v23.4S, v16.4S, v8.S[0] // ......................................e......................................................................................................... - // gap // ................................................................................................................................................ - mul v16.4S, v21.4S, v11.4S // .........................................e...................................................................................................... - // gap // ................................................................................................................................................ - sqrdmulh v21.4S, v21.4S, v7.4S // ..........................................e..................................................................................................... - // gap // ................................................................................................................................................ - sub v25.4S, v4.4S, v26.4S // ............................................e................................................................................................... 
- // gap // ................................................................................................................................................ - add v26.4S, v4.4S, v26.4S // .............................................e.................................................................................................. - // gap // ................................................................................................................................................ - trn2 v0.4S, v13.4S, v0.4S // .......................................................e........................................................................................ - // gap // ................................................................................................................................................ - mls v16.4S, v21.4S, v8.S[0] // ...........................................e.................................................................................................... - // gap // ................................................................................................................................................ - mul v21.4S, v25.4S, v20.4S // ..............................................e................................................................................................. - // gap // ................................................................................................................................................ - sqrdmulh v4.4S, v25.4S, v17.4S // ...............................................e................................................................................................ - // gap // ................................................................................................................................................ 
- trn1 v25.4S, v10.4S, v2.4S // ........................................................e....................................................................................... - // gap // ................................................................................................................................................ - sub v11.4S, v23.4S, v16.4S // .................................................e.............................................................................................. - // gap // ................................................................................................................................................ - add v16.4S, v23.4S, v16.4S // ..................................................e............................................................................................. - // gap // ................................................................................................................................................ - mls v21.4S, v4.4S, v8.S[0] // ................................................e............................................................................................... - // gap // ................................................................................................................................................ - mul v23.4S, v11.4S, v20.4S // ...................................................e............................................................................................ - // gap // ................................................................................................................................................ - sqrdmulh v20.4S, v11.4S, v17.4S // ....................................................e........................................................................................... 
- // gap // ................................................................................................................................................ - trn2 v2.4S, v10.4S, v2.4S // .........................................................e...................................................................................... - // gap // ................................................................................................................................................ - trn2 v17.2D, v30.2D, v25.2D // ..........................................................e..................................................................................... - // gap // ................................................................................................................................................ - trn1 v30.2D, v30.2D, v25.2D // ............................................................e................................................................................... - // gap // ................................................................................................................................................ - mls v23.4S, v20.4S, v8.S[0] // .....................................................e.......................................................................................... - // gap // ................................................................................................................................................ - trn2 v20.2D, v0.2D, v2.2D // ...........................................................e.................................................................................... - // gap // ................................................................................................................................................ 
- trn1 v0.2D, v0.2D, v2.2D // .............................................................e.................................................................................. - // gap // ................................................................................................................................................ - trn1 v2.4S, v26.4S, v16.4S // ..............................................................e................................................................................. - // gap // ................................................................................................................................................ - trn1 v4.4S, v21.4S, v23.4S // ................................................................e............................................................................... - // gap // ................................................................................................................................................ - trn2 v16.4S, v26.4S, v16.4S // ...............................................................e................................................................................ - // gap // ................................................................................................................................................ - trn2 v23.4S, v21.4S, v23.4S // .................................................................e.............................................................................. - // gap // ................................................................................................................................................ - trn2 v25.2D, v2.2D, v4.2D // ..................................................................e............................................................................. 
- // gap // ................................................................................................................................................ - trn1 v21.2D, v2.2D, v4.2D // ....................................................................e........................................................................... - // gap // ................................................................................................................................................ - trn2 v4.2D, v16.2D, v23.2D // ...................................................................e............................................................................ - // gap // ................................................................................................................................................ - trn1 v16.2D, v16.2D, v23.2D // .....................................................................e.......................................................................... - // gap // ................................................................................................................................................ - add v2.4S, v25.4S, v4.4S // ..........................................................................................e..................................................... - // gap // ................................................................................................................................................ - add v23.4S, v21.4S, v16.4S // .....................................................................................e.......................................................... - // gap // ................................................................................................................................................ 
- add v11.4S, v30.4S, v0.4S // ...........................................................................e.................................................................... - // gap // ................................................................................................................................................ - add v13.4S, v17.4S, v20.4S // ................................................................................e............................................................... - // gap // ................................................................................................................................................ - sub v7.4S, v23.4S, v2.4S // ........................................................................................................e....................................... - // gap // ................................................................................................................................................ - add v2.4S, v23.4S, v2.4S // .........................................................................................................e...................................... - // gap // ................................................................................................................................................ - add v23.4S, v11.4S, v13.4S // ...............................................................................................e................................................ - // gap // ................................................................................................................................................ - ldr q3, [x4], #64 // ......................................................................e......................................................................... - // gap // ................................................................................................................................................ 
- // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - sub v26.4S, v23.4S, v2.4S // ..................................................................................................................e............................. - // gap // ................................................................................................................................................ - ldr q10, [x4, #-48] // .......................................................................e........................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - mul v19.4S, v26.4S, v3.S[0] // ....................................................................................................................e........................... - // gap // ................................................................................................................................................ - sqrdmulh v26.4S, v26.4S, v3.S[1] // .....................................................................................................................e.......................... - // gap // ................................................................................................................................................ 
- add v2.4S, v23.4S, v2.4S // ...................................................................................................................e............................ - // gap // ................................................................................................................................................ - ldr q1, [x4, #-32] // ........................................................................e....................................................................... - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - mls v19.4S, v26.4S, v8.S[0] // ......................................................................................................................e......................... - // gap // ................................................................................................................................................ - str q2, [x1], #(16*4) // ......................................................................................................................................e......... - // gap // ................................................................................................................................................ - ldr q6, [x4, #-16] // .........................................................................e...................................................................... - // gap // ................................................................................................................................................ 
- // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - str q19, [x2], #(16*4) // ..........................................................................................................................................e..... - // gap // ................................................................................................................................................ + sub v6.4S, v27.4S, v4.4S // ...............................................................................................................................*........................ + add x1, x1, #64 // ......................................................................................................................................................*. + ld4 {v16.4S, v17.4S, v18.4S, v19.4S}, [x1] // e....................................................................................................................................................... + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ 
+ // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ 
+ // gap // ........................................................................................................................................................ + ldr q2, [x5], #(12*16) // ..e..................................................................................................................................................... + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + add v0.4S, v18.4S, v19.4S // ..............e......................................................................................................................................... + // gap // ........................................................................................................................................................ + add v21.4S, v16.4S, v17.4S // .........e.............................................................................................................................................. + // gap // ........................................................................................................................................................ + ldr q27, [x5, #-176] // ...e.................................................................................................................................................... + // gap // ........................................................................................................................................................ 
+ // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + sub v24.4S, v21.4S, v0.4S // ..................e..................................................................................................................................... + // gap // ........................................................................................................................................................ + ldr q25, [x5, #-128] // ......e................................................................................................................................................. + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + mul v28.4S, v24.4S, v2.4S // ....................e................................................................................................................................... + // gap // ........................................................................................................................................................ + sqrdmulh v23.4S, v24.4S, v27.4S // .....................e.................................................................................................................................. 
+ // gap // ........................................................................................................................................................ + sub v12.4S, v18.4S, v19.4S // .............e.......................................................................................................................................... + // gap // ........................................................................................................................................................ + ldr q3, [x5, #-160] // ....e................................................................................................................................................... + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + mls v28.4S, v23.4S, v8.S[0] // ......................e................................................................................................................................. + // gap // ........................................................................................................................................................ + mul v1.4S, v12.4S, v25.4S // ...............e........................................................................................................................................ + // gap // ........................................................................................................................................................ 
+ sub v13.4S, v16.4S, v17.4S // ........e............................................................................................................................................... + // gap // ........................................................................................................................................................ + ldr q22, [x5, #-144] // .....e.................................................................................................................................................. + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + ldr q5, [x5, #-112] // .......e................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + sqrdmulh v20.4S, v13.4S, v22.4S // ...........e............................................................................................................................................ 
+ // gap // ........................................................................................................................................................ + mul v11.4S, v13.4S, v3.4S // ..........e............................................................................................................................................. + // gap // ........................................................................................................................................................ + sqrdmulh v30.4S, v12.4S, v5.4S // ................e....................................................................................................................................... + // gap // ........................................................................................................................................................ + sqrdmulh v12.4S, v6.4S, v29.S[1] // ..................................................................................................................................*..................... + // gap // ........................................................................................................................................................ + mul v6.4S, v6.4S, v29.S[0] // .................................................................................................................................*...................... + // gap // ........................................................................................................................................................ + mls v11.4S, v20.4S, v8.S[0] // ............e........................................................................................................................................... + // gap // ........................................................................................................................................................ 
+ mls v1.4S, v30.4S, v8.S[0] // .................e...................................................................................................................................... + // gap // ........................................................................................................................................................ + sqrdmulh v9.4S, v10.4S, v29.S[1] // .......................................................................................................................................*................ + // gap // ........................................................................................................................................................ + mls v6.4S, v12.4S, v8.S[0] // ...................................................................................................................................*.................... + // gap // ........................................................................................................................................................ + mul v13.4S, v10.4S, v29.S[0] // ......................................................................................................................................*................. + // gap // ........................................................................................................................................................ + sub v31.4S, v11.4S, v1.4S // .......................e................................................................................................................................ + // gap // ........................................................................................................................................................ + ldr q12, [x5, #-64] // ..............................e......................................................................................................................... 
+ // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + sqrdmulh v24.4S, v31.4S, v27.4S // ..........................e............................................................................................................................. + // gap // ........................................................................................................................................................ + mls v13.4S, v9.4S, v8.S[0] // ........................................................................................................................................*............... + // gap // ........................................................................................................................................................ + str q6, [x2, #-48] // ...................................................................................................................................................*.... + // gap // ........................................................................................................................................................ + ldr q4, [x5, #-48] // ...............................e........................................................................................................................ + // gap // ........................................................................................................................................................ 
+ // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + str q13, [x2, #-32] // ....................................................................................................................................................*... + add x2, x2, #64 // .......................................................................................................................................................* + ld4 {v13.4S, v14.4S, v15.4S, v16.4S}, [x2] // .e...................................................................................................................................................... + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ 
+ // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + ldr q17, [x5, #-32] // ................................e....................................................................................................................... 
+ // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + sub v20.4S, v15.4S, v16.4S // .......................................e................................................................................................................ + // gap // ........................................................................................................................................................ + ldr q5, [x5, #-16] // .................................e...................................................................................................................... + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + sub v30.4S, v13.4S, v14.4S // ..................................e..................................................................................................................... + // gap // ........................................................................................................................................................ 
+ mul v3.4S, v20.4S, v17.4S // .........................................e.............................................................................................................. + // gap // ........................................................................................................................................................ + sqrdmulh v27.4S, v20.4S, v5.4S // ..........................................e............................................................................................................. + // gap // ........................................................................................................................................................ + sqrdmulh v29.4S, v30.4S, v4.4S // .....................................e.................................................................................................................. + // gap // ........................................................................................................................................................ + mul v12.4S, v30.4S, v12.4S // ....................................e................................................................................................................... + // gap // ........................................................................................................................................................ + add v14.4S, v13.4S, v14.4S // ...................................e.................................................................................................................... + // gap // ........................................................................................................................................................ + add v6.4S, v15.4S, v16.4S // ........................................e............................................................................................................... 
+ // gap // ........................................................................................................................................................ + mls v3.4S, v27.4S, v8.S[0] // ...........................................e............................................................................................................ + // gap // ........................................................................................................................................................ + mls v12.4S, v29.4S, v8.S[0] // ......................................e................................................................................................................. + // gap // ........................................................................................................................................................ + sub v20.4S, v14.4S, v6.4S // ............................................e........................................................................................................... + // gap // ........................................................................................................................................................ + ldr q26, [x5, #-80] // .............................e.......................................................................................................................... + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ 
+ sub v13.4S, v12.4S, v3.4S // .................................................e...................................................................................................... + // gap // ........................................................................................................................................................ + add v22.4S, v11.4S, v1.4S // ........................e............................................................................................................................... + // gap // ........................................................................................................................................................ + sqrdmulh v4.4S, v20.4S, v26.4S // ...............................................e........................................................................................................ + // gap // ........................................................................................................................................................ + sqrdmulh v7.4S, v13.4S, v26.4S // ....................................................e................................................................................................... + // gap // ........................................................................................................................................................ + ldr q19, [x5, #-96] // ............................e........................................................................................................................... + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ 
+ // gap // ........................................................................................................................................................ + add v25.4S, v14.4S, v6.4S // .............................................e.......................................................................................................... + // gap // ........................................................................................................................................................ + add v1.4S, v12.4S, v3.4S // ..................................................e..................................................................................................... + // gap // ........................................................................................................................................................ + mul v5.4S, v13.4S, v19.4S // ...................................................e.................................................................................................... + // gap // ........................................................................................................................................................ + mul v29.4S, v20.4S, v19.4S // ..............................................e......................................................................................................... + // gap // ........................................................................................................................................................ + trn2 v18.4S, v25.4S, v1.4S // ...............................................................e........................................................................................ + // gap // ........................................................................................................................................................ 
+ trn1 v12.4S, v25.4S, v1.4S // ..............................................................e......................................................................................... + // gap // ........................................................................................................................................................ + mul v30.4S, v31.4S, v2.4S // .........................e.............................................................................................................................. + // gap // ........................................................................................................................................................ + mls v29.4S, v4.4S, v8.S[0] // ................................................e....................................................................................................... + // gap // ........................................................................................................................................................ + ldr q10, [x4, #16] // .......................................................................e................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + ldr q15, [x4, #32] // ........................................................................e............................................................................... 
+ // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + add v27.4S, v21.4S, v0.4S // ...................e.................................................................................................................................... + // gap // ........................................................................................................................................................ + mls v30.4S, v24.4S, v8.S[0] // ...........................e............................................................................................................................ + // gap // ........................................................................................................................................................ + mls v5.4S, v7.4S, v8.S[0] // .....................................................e.................................................................................................. + // gap // ........................................................................................................................................................ + trn1 v17.4S, v27.4S, v22.4S // ......................................................e................................................................................................. + // gap // ........................................................................................................................................................ 
+ trn2 v2.4S, v27.4S, v22.4S // .......................................................e................................................................................................ + // gap // ........................................................................................................................................................ + trn1 v11.4S, v28.4S, v30.4S // ........................................................e............................................................................................... + // gap // ........................................................................................................................................................ + trn2 v26.4S, v28.4S, v30.4S // .........................................................e.............................................................................................. + // gap // ........................................................................................................................................................ + trn1 v4.4S, v29.4S, v5.4S // ................................................................e....................................................................................... + // gap // ........................................................................................................................................................ + trn2 v7.2D, v17.2D, v11.2D // ..........................................................e............................................................................................. + // gap // ........................................................................................................................................................ + trn2 v3.2D, v2.2D, v26.2D // ...........................................................e............................................................................................ 
+ // gap // ........................................................................................................................................................ + trn2 v0.4S, v29.4S, v5.4S // .................................................................e...................................................................................... + // gap // ........................................................................................................................................................ + sub v20.4S, v7.4S, v3.4S // ...............................................................................e........................................................................ + // gap // ........................................................................................................................................................ + trn2 v14.2D, v12.2D, v4.2D // ..................................................................e..................................................................................... + // gap // ........................................................................................................................................................ + trn2 v27.2D, v18.2D, v0.2D // ...................................................................e.................................................................................... + // gap // ........................................................................................................................................................ + mul v31.4S, v20.4S, v15.S[0] // .................................................................................e...................................................................... + // gap // ........................................................................................................................................................ 
+ sub v1.4S, v14.4S, v27.4S // .........................................................................................e.............................................................. + // gap // ........................................................................................................................................................ + ldr q23, [x4, #48] // .........................................................................e.............................................................................. + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + trn1 v19.2D, v18.2D, v0.2D // .....................................................................e.................................................................................. + // gap // ........................................................................................................................................................ + trn1 v25.2D, v12.2D, v4.2D // ....................................................................e................................................................................... + // gap // ........................................................................................................................................................ + mul v29.4S, v1.4S, v23.S[0] // ...........................................................................................e............................................................ 
+ // gap // ........................................................................................................................................................ + sub v5.4S, v25.4S, v19.4S // ....................................................................................e................................................................... + // gap // ........................................................................................................................................................ + trn1 v6.2D, v17.2D, v11.2D // ............................................................e........................................................................................... + // gap // ........................................................................................................................................................ + trn1 v9.2D, v2.2D, v26.2D // .............................................................e.......................................................................................... + // gap // ........................................................................................................................................................ + mul v24.4S, v5.4S, v15.S[2] // ......................................................................................e................................................................. + // gap // ........................................................................................................................................................ + sqrdmulh v11.4S, v5.4S, v15.S[3] // .......................................................................................e................................................................ + // gap // ........................................................................................................................................................ 
+ sub v16.4S, v6.4S, v9.4S // ..........................................................................e............................................................................. + // gap // ........................................................................................................................................................ + sqrdmulh v13.4S, v20.4S, v15.S[1] // ..................................................................................e..................................................................... + // gap // ........................................................................................................................................................ + sqrdmulh v30.4S, v1.4S, v23.S[1] // ............................................................................................e........................................................... + // gap // ........................................................................................................................................................ + mul v22.4S, v16.4S, v10.S[2] // ............................................................................e........................................................................... + // gap // ........................................................................................................................................................ + sqrdmulh v17.4S, v16.4S, v10.S[3] // .............................................................................e.......................................................................... + // gap // ........................................................................................................................................................ + mls v31.4S, v13.4S, v8.S[0] // ...................................................................................e.................................................................... 
+ // gap // ........................................................................................................................................................ + mls v24.4S, v11.4S, v8.S[0] // ........................................................................................e............................................................... + // gap // ........................................................................................................................................................ + mls v29.4S, v30.4S, v8.S[0] // .............................................................................................e.......................................................... + // gap // ........................................................................................................................................................ + mls v22.4S, v17.4S, v8.S[0] // ..............................................................................e......................................................................... + // gap // ........................................................................................................................................................ + add v17.4S, v25.4S, v19.4S // .....................................................................................e.................................................................. + // gap // ........................................................................................................................................................ + add v18.4S, v14.4S, v27.4S // ..........................................................................................e............................................................. + // gap // ........................................................................................................................................................ 
+ add v4.4S, v24.4S, v29.4S // ..............................................................................................................e......................................... + // gap // ........................................................................................................................................................ + add v27.4S, v22.4S, v31.4S // ....................................................................................................e................................................... + // gap // ........................................................................................................................................................ + sub v5.4S, v17.4S, v18.4S // ........................................................................................................e............................................... + // gap // ........................................................................................................................................................ + srshr v25.4S, v4.4S, #23 // ........................................................................................................................e............................... + // gap // ........................................................................................................................................................ + srshr v26.4S, v27.4S, #23 // ....................................................................................................................e................................... + // gap // ........................................................................................................................................................ + add v21.4S, v17.4S, v18.4S // .........................................................................................................e.............................................. 
+ // gap // ........................................................................................................................................................ + mls v4.4S, v25.4S, v8.4S // .........................................................................................................................e.............................. + // gap // ........................................................................................................................................................ + mls v27.4S, v26.4S, v8.4S // .....................................................................................................................e.................................. + // gap // ........................................................................................................................................................ + add v28.4S, v6.4S, v9.4S // ...........................................................................e............................................................................ + // gap // ........................................................................................................................................................ + add v23.4S, v7.4S, v3.4S // ................................................................................e....................................................................... + // gap // ........................................................................................................................................................ + sub v13.4S, v24.4S, v29.4S // .............................................................................................................e.......................................... + // gap // ........................................................................................................................................................ 
+ add v18.4S, v27.4S, v4.4S // ................................................................................................................................e....................... + // gap // ........................................................................................................................................................ + add v2.4S, v28.4S, v23.4S // ...............................................................................................e........................................................ + // gap // ........................................................................................................................................................ + srshr v26.4S, v21.4S, #23 // ......................................................................................................................e................................. + // gap // ........................................................................................................................................................ + str q18, [x1, #16] // ...............................................................................................................................................e........ + // gap // ........................................................................................................................................................ + srshr v12.4S, v2.4S, #23 // ..................................................................................................................e..................................... + // gap // ........................................................................................................................................................ + mul v19.4S, v13.4S, v10.S[0] // ...............................................................................................................e........................................ 
+ // gap // ........................................................................................................................................................ + mls v21.4S, v26.4S, v8.4S // .......................................................................................................................e................................ + // gap // ........................................................................................................................................................ + mls v2.4S, v12.4S, v8.4S // ...................................................................................................................e.................................... + // gap // ........................................................................................................................................................ + sub v6.4S, v22.4S, v31.4S // ...................................................................................................e.................................................... + // gap // ........................................................................................................................................................ + ldr q29, [x4], #64 // ......................................................................e................................................................................. + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ 
+ add v14.4S, v2.4S, v21.4S // ...........................................................................................................................e............................ + // gap // ........................................................................................................................................................ + sqrdmulh v11.4S, v13.4S, v10.S[1] // ................................................................................................................e....................................... + // gap // ........................................................................................................................................................ + mul v9.4S, v6.4S, v29.S[2] // .....................................................................................................e.................................................. + // gap // ........................................................................................................................................................ + str q14, [x1], #(16*4) // ..............................................................................................................................................e......... + // gap // ........................................................................................................................................................ + sqrdmulh v1.4S, v6.4S, v29.S[3] // ......................................................................................................e................................................. + // gap // ........................................................................................................................................................ + mls v19.4S, v11.4S, v8.S[0] // .................................................................................................................e...................................... 
+ // gap // ........................................................................................................................................................ + sub v7.4S, v2.4S, v21.4S // ..........................................................................................................................e............................. + // gap // ........................................................................................................................................................ + sub v14.4S, v28.4S, v23.4S // ..............................................................................................e......................................................... + // gap // ........................................................................................................................................................ + mls v9.4S, v1.4S, v8.S[0] // .......................................................................................................e................................................ + // gap // ........................................................................................................................................................ + sqrdmulh v1.4S, v7.4S, v29.S[1] // .............................................................................................................................e.......................... + // gap // ........................................................................................................................................................ + mul v23.4S, v7.4S, v29.S[0] // ............................................................................................................................e........................... + // gap // ........................................................................................................................................................ 
+ sqrdmulh v0.4S, v5.4S, v10.S[1] // ...........................................................................................................e............................................ + // gap // ........................................................................................................................................................ + sub v15.4S, v9.4S, v19.4S // .........................................................................................................................................e.............. + // gap // ........................................................................................................................................................ + mul v12.4S, v14.4S, v29.S[2] // ................................................................................................e....................................................... + // gap // ........................................................................................................................................................ + sqrdmulh v17.4S, v14.4S, v29.S[3] // .................................................................................................e...................................................... + // gap // ........................................................................................................................................................ + mul v30.4S, v15.4S, v29.S[0] // ...........................................................................................................................................e............ + // gap // ........................................................................................................................................................ + sqrdmulh v31.4S, v15.4S, v29.S[1] // ............................................................................................................................................e........... 
+ // gap // ........................................................................................................................................................ + mul v16.4S, v5.4S, v10.S[0] // ..........................................................................................................e............................................. + // gap // ........................................................................................................................................................ + mls v23.4S, v1.4S, v8.S[0] // ..............................................................................................................................e......................... + // gap // ........................................................................................................................................................ + mls v12.4S, v17.4S, v8.S[0] // ..................................................................................................e..................................................... + // gap // ........................................................................................................................................................ + mls v30.4S, v31.4S, v8.S[0] // .............................................................................................................................................e.......... + // gap // ........................................................................................................................................................ + mls v16.4S, v0.4S, v8.S[0] // ............................................................................................................e........................................... + // gap // ........................................................................................................................................................ 
+ str q23, [x2], #(16*4) // ..................................................................................................................................................e..... + // gap // ........................................................................................................................................................ + add v24.4S, v9.4S, v19.4S // ..........................................................................................................................................e............. + // gap // ........................................................................................................................................................ + str q30, [x2, #-16] // .....................................................................................................................................................e.. + // gap // ........................................................................................................................................................ + add v22.4S, v12.4S, v16.4S // .....................................................................................................................................e.................. + // gap // ........................................................................................................................................................ + str q24, [x1, #-16] // .................................................................................................................................................e...... + // gap // ........................................................................................................................................................ + sub v10.4S, v12.4S, v16.4S // ....................................................................................................................................e................... 
+ // gap // ........................................................................................................................................................ + str q22, [x1, #-32] // ................................................................................................................................................e....... + // gap // ........................................................................................................................................................ // original source code - // ld4 {v9.4S, v10.4S, v11.4S, v12.4S}, [x1] // ....................e...........................................................................|............................................................ - // ld4 {v13.4S, v14.4S, v15.4S, v16.4S}, [x2] // .....................e..........................................................................|............................................................ - // ldr q0, [x5], #(12*16) // e...............................................................................................|...............................................e............ - // ldr q4, [x5, #(-12*16 + 1*16)] // ...e............................................................................................|..................................................e......... - // ldr q1, [x5, #(-12*16 + 2*16)] // ......e.........................................................................................|.....................................................e...... - // ldr q5, [x5, #(-12*16 + 3*16)] // ........e.......................................................................................|.......................................................e.... - // ldr q2, [x5, #(-12*16 + 4*16)] // ..........e.....................................................................................|.........................................................e.. 
- // ldr q6, [x5, #(-12*16 + 5*16)] // .............e..................................................................................|............................................................ - // sub v24.4s, v9.4s, v10.4s // ......................e.........................................................................|............................................................ - // add v9.4s, v9.4s, v10.4s // .......................e........................................................................|............................................................ - // mul v10.4s, v24.4s, v1.4s // .........................e......................................................................|............................................................ - // sqrdmulh v24.4s, v24.4s, v5.4s // ..........................e.....................................................................|............................................................ - // mls v10.4s, v24.4s, v8.s[0] // ..............................e.................................................................|............................................................ - // sub v24.4s, v11.4s, v12.4s // ........................e.......................................................................|............................................................ - // add v11.4s, v11.4s, v12.4s // .............................e..................................................................|............................................................ - // mul v12.4s, v24.4s, v2.4s // ...........................e....................................................................|............................................................ - // sqrdmulh v24.4s, v24.4s, v6.4s // ............................e...................................................................|............................................................ 
- // mls v12.4s, v24.4s, v8.s[0] // ................................e...............................................................|............................................................ - // sub v24.4s, v9.4s, v11.4s // .................................e..............................................................|............................................................ - // add v9.4s, v9.4s, v11.4s // ..................................e.............................................................|............................................................ - // mul v11.4s, v24.4s, v0.4s // .....................................e..........................................................|............................................................ - // sqrdmulh v24.4s, v24.4s, v4.4s // ......................................e.........................................................|............................................................ - // mls v11.4s, v24.4s, v8.s[0] // ..........................................e.....................................................|............................................................ - // sub v24.4s, v10.4s, v12.4s // ....................................e...........................................................|............................................................ - // add v10.4s, v10.4s, v12.4s // .........................................e......................................................|............................................................ - // mul v12.4s, v24.4s, v0.4s // .......................................e........................................................|............................................................ - // sqrdmulh v24.4s, v24.4s, v4.4s // ........................................e.......................................................|............................................................ 
- // mls v12.4s, v24.4s, v8.s[0] // ............................................e...................................................|............................................................ - // ldr q0, [x5, #(-12*16 + 6*16)] // ..............e.................................................................................|............................................................ - // ldr q4, [x5, #(-12*16 + 7*16)] // ...............e................................................................................|............................................................ - // ldr q1, [x5, #(-12*16 + 8*16)] // ................e...............................................................................|............................................................ - // ldr q5, [x5, #(-12*16 + 9*16)] // .................e..............................................................................|............................................................ - // ldr q2, [x5, #(-12*16 + 10*16)] // ..................e.............................................................................|............................................................ - // ldr q6, [x5, #(-12*16 + 11*16)] // ...................e............................................................................|............................................................ - // sub v24.4s, v13.4s, v14.4s // ...............................e................................................................|............................................................ - // add v13.4s, v13.4s, v14.4s // ...................................e............................................................|............................................................ - // mul v14.4s, v24.4s, v1.4s // ...........................................e....................................................|............................................................ 
- // sqrdmulh v24.4s, v24.4s, v5.4s // .............................................e..................................................|............................................................ - // mls v14.4s, v24.4s, v8.s[0] // .................................................e..............................................|............................................................ - // sub v24.4s, v15.4s, v16.4s // ..............................................e.................................................|............................................................ - // add v15.4s, v15.4s, v16.4s // ...............................................e................................................|............................................................ - // mul v16.4s, v24.4s, v2.4s // ..................................................e.............................................|............................................................ - // sqrdmulh v24.4s, v24.4s, v6.4s // ...................................................e............................................|............................................................ - // mls v16.4s, v24.4s, v8.s[0] // .......................................................e........................................|............................................................ - // sub v24.4s, v13.4s, v15.4s // ....................................................e...........................................|............................................................ - // add v13.4s, v13.4s, v15.4s // .....................................................e..........................................|............................................................ - // mul v15.4s, v24.4s, v0.4s // ........................................................e.......................................|............................................................ 
- // sqrdmulh v24.4s, v24.4s, v4.4s // .........................................................e......................................|............................................................ - // mls v15.4s, v24.4s, v8.s[0] // .............................................................e..................................|............................................................ - // sub v24.4s, v14.4s, v16.4s // ...........................................................e....................................|............................................................ - // add v14.4s, v14.4s, v16.4s // ............................................................e...................................|............................................................ - // mul v16.4s, v24.4s, v0.4s // ..............................................................e.................................|............................................................ - // sqrdmulh v24.4s, v24.4s, v4.4s // ...............................................................e................................|............................................................ - // mls v16.4s, v24.4s, v8.s[0] // ...................................................................e............................|............................................................ - // trn1 v25.4s, v9.4s, v10.4s // ................................................e...............................................|............................................................ - // trn2 v26.4s, v9.4s, v10.4s // ......................................................e.........................................|............................................................ - // trn1 v27.4s, v11.4s, v12.4s // ..........................................................e.....................................|............................................................ 
- // trn2 v28.4s, v11.4s, v12.4s // ................................................................e...............................|............................................................ - // trn2 v11.2d, v25.2d, v27.2d // .................................................................e..............................|............................................................ - // trn2 v12.2d, v26.2d, v28.2d // ....................................................................e...........................|............................................................ - // trn1 v9.2d, v25.2d, v27.2d // ..................................................................e.............................|............................................................ - // trn1 v10.2d, v26.2d, v28.2d // .....................................................................e..........................|............................................................ - // trn1 v25.4s, v13.4s, v14.4s // ......................................................................e.........................|............................................................ - // trn2 v26.4s, v13.4s, v14.4s // ........................................................................e.......................|............................................................ - // trn1 v27.4s, v15.4s, v16.4s // .......................................................................e........................|............................................................ - // trn2 v28.4s, v15.4s, v16.4s // .........................................................................e......................|............................................................ - // trn2 v15.2d, v25.2d, v27.2d // ..........................................................................e.....................|............................................................ 
- // trn2 v16.2d, v26.2d, v28.2d // ............................................................................e...................|............................................................ - // trn1 v13.2d, v25.2d, v27.2d // ...........................................................................e....................|............................................................ - // trn1 v14.2d, v26.2d, v28.2d // .............................................................................e..................|............................................................ - // ldr q0, [x4], #64 // .....................................................................................e..........|............................................................ - // ldr q1, [x4, #(-64 + 16)] // .......................................................................................e........|............................................................ - // ldr q2, [x4, #(-64 + 32)] // ...........................................................................................e....|............................................................ - // ldr q3, [x4, #(-64 + 48)] // ..............................................................................................e.|............................................................ - // sub v24.4s, v9.4s, v10.4s // ................................................................................................*............................................................ - // add v9.4s, v9.4s, v10.4s // ................................................................................e...............|............................................................ - // mul v10.4s, v24.4s, v1.s[2] // ................................................................................................|..*......................................................... 
- // sqrdmulh v24.4s, v24.4s, v1.s[3] // ................................................................................................|...*........................................................ - // mls v10.4s, v24.4s, v8.s[0] // ................................................................................................|........*................................................... - // sub v24.4s, v11.4s, v12.4s // ................................................................................................|*........................................................... - // add v11.4s, v11.4s, v12.4s // .................................................................................e..............|............................................................ - // mul v12.4s, v24.4s, v2.s[0] // ................................................................................................|....*....................................................... - // sqrdmulh v24.4s, v24.4s, v2.s[1] // ................................................................................................|.....*...................................................... - // mls v12.4s, v24.4s, v8.s[0] // ................................................................................................|.........*.................................................. - // sub v24.4s, v13.4s, v14.4s // ................................................................................................|.*.......................................................... - // add v13.4s, v13.4s, v14.4s // ...............................................................................e................|............................................................ - // mul v14.4s, v24.4s, v2.s[2] // ................................................................................................|......*..................................................... 
- // sqrdmulh v24.4s, v24.4s, v2.s[3] // ................................................................................................|.......*.................................................... - // mls v14.4s, v24.4s, v8.s[0] // ................................................................................................|...........*................................................ - // sub v24.4s, v15.4s, v16.4s // ................................................................................................|..........*................................................. - // add v15.4s, v15.4s, v16.4s // ..............................................................................e.................|............................................................ - // mul v16.4s, v24.4s, v3.s[0] // ................................................................................................|.............*.............................................. - // sqrdmulh v24.4s, v24.4s, v3.s[1] // ................................................................................................|..............*............................................. - // mls v16.4s, v24.4s, v8.s[0] // ................................................................................................|...................*........................................ - // sub v24.4s, v9.4s, v11.4s // ................................................................................................|............*............................................... - // add v9.4s, v9.4s, v11.4s // ....................................................................................e...........|............................................................ - // mul v11.4s, v24.4s, v0.s[2] // ................................................................................................|...............*............................................ 
- // sqrdmulh v24.4s, v24.4s, v0.s[3] // ................................................................................................|................*........................................... - // mls v11.4s, v24.4s, v8.s[0] // ................................................................................................|....................*....................................... - // sub v24.4s, v10.4s, v12.4s // ................................................................................................|.................*.......................................... - // add v10.4s, v10.4s, v12.4s // ................................................................................................|..................*......................................... - // mul v12.4s, v24.4s, v0.s[2] // ................................................................................................|.....................*...................................... - // sqrdmulh v24.4s, v24.4s, v0.s[3] // ................................................................................................|......................*..................................... - // mls v12.4s, v24.4s, v8.s[0] // ................................................................................................|..........................*................................. - // sub v24.4s, v13.4s, v15.4s // ..................................................................................e.............|............................................................ - // add v13.4s, v13.4s, v15.4s // ...................................................................................e............|............................................................ - // mul v15.4s, v24.4s, v1.s[0] // ................................................................................................|.......................*.................................... 
- // sqrdmulh v24.4s, v24.4s, v1.s[1] // ................................................................................................|...........................*................................ - // mls v15.4s, v24.4s, v8.s[0] // ................................................................................................|...............................*............................ - // sub v24.4s, v14.4s, v16.4s // ................................................................................................|........................*................................... - // add v14.4s, v14.4s, v16.4s // ................................................................................................|.........................*.................................. - // mul v16.4s, v24.4s, v1.s[0] // ................................................................................................|............................*............................... - // sqrdmulh v24.4s, v24.4s, v1.s[1] // ................................................................................................|.............................*.............................. - // mls v16.4s, v24.4s, v8.s[0] // ................................................................................................|.................................*.......................... - // sub v24.4s, v9.4s, v13.4s // ......................................................................................e.........|............................................................ - // add v9.4s, v9.4s, v13.4s // ..........................................................................................e.....|............................................................ - // mul v13.4s, v24.4s, v0.s[0] // ........................................................................................e.......|............................................................ 
- // sqrdmulh v24.4s, v24.4s, v0.s[1] // .........................................................................................e......|............................................................ - // mls v13.4s, v24.4s, v8.s[0] // ............................................................................................e...|............................................................ - // sub v24.4s, v10.4s, v14.4s // ................................................................................................|..............................*............................. - // add v10.4s, v10.4s, v14.4s // ................................................................................................|................................*........................... - // mul v14.4s, v24.4s, v0.s[0] // ................................................................................................|..................................*......................... - // sqrdmulh v24.4s, v24.4s, v0.s[1] // ................................................................................................|...................................*........................ - // mls v14.4s, v24.4s, v8.s[0] // ................................................................................................|.......................................*.................... - // sub v24.4s, v11.4s, v15.4s // ................................................................................................|....................................*....................... - // add v11.4s, v11.4s, v15.4s // ................................................................................................|.....................................*...................... - // mul v15.4s, v24.4s, v0.s[0] // ................................................................................................|........................................*................... 
- // sqrdmulh v24.4s, v24.4s, v0.s[1] // ................................................................................................|.........................................*.................. - // mls v15.4s, v24.4s, v8.s[0] // ................................................................................................|.............................................*.............. - // sub v24.4s, v12.4s, v16.4s // ................................................................................................|......................................*..................... - // add v12.4s, v12.4s, v16.4s // ................................................................................................|..........................................*................. - // mul v16.4s, v24.4s, v0.s[0] // ................................................................................................|...........................................*................ - // sqrdmulh v24.4s, v24.4s, v0.s[1] // ................................................................................................|............................................*............... - // mls v16.4s, v24.4s, v8.s[0] // .*..............................................................................................|................................................*........... - // str q9, [x1], #(16*4) // .............................................................................................e..|............................................................ - // str q10, [x1, #(-16*4 + 1*16)] // ................................................................................................|..............................................*............. - // str q11, [x1, #(-16*4 + 2*16)] // ..*.............................................................................................|.................................................*.......... 
- // str q12, [x1, #(-16*4 + 3*16)] // ....*...........................................................................................|...................................................*........ - // str q13, [x2], #(16*4) // ...............................................................................................e|............................................................ - // str q14, [x2, #(-16*4 + 1*16)] // .......*........................................................................................|......................................................*..... - // str q15, [x2, #(-16*4 + 2*16)] // .........*......................................................................................|........................................................*... - // str q16, [x2, #(-16*4 + 3*16)] // ...........*....................................................................................|..........................................................*. - // add x1, x1, #64 // .....*..........................................................................................|....................................................*....... - // add x2, x2, #64 // ............*...................................................................................|...........................................................* + // ld4 {v9.4S, v10.4S, v11.4S, v12.4S}, [x1] // e.....................................................................................................................................................|.e................................. + // ld4 {v13.4S, v14.4S, v15.4S, v16.4S}, [x2] // ..................................e...................................................................................................................|................................... 
+ // ldr q0, [x5], #(12*16) // .e....................................................................................................................................................|..e................................ + // ldr q4, [x5, #(-12*16 + 1*16)] // ....e.................................................................................................................................................|.....e............................. + // ldr q1, [x5, #(-12*16 + 2*16)] // ..........e...........................................................................................................................................|...........e....................... + // ldr q5, [x5, #(-12*16 + 3*16)] // ..............e.......................................................................................................................................|...............e................... + // ldr q2, [x5, #(-12*16 + 4*16)] // ......e...............................................................................................................................................|.......e........................... + // ldr q6, [x5, #(-12*16 + 5*16)] // ...............e......................................................................................................................................|................e.................. + // sub v24.4s, v9.4s, v10.4s // .............e........................................................................................................................................|..............e.................... + // add v9.4s, v9.4s, v10.4s // ...e..................................................................................................................................................|....e.............................. 
+ // mul v10.4s, v24.4s, v1.4s // .................e....................................................................................................................................|..................e................ + // sqrdmulh v24.4s, v24.4s, v5.4s // ................e.....................................................................................................................................|.................e................. + // mls v10.4s, v24.4s, v8.s[0] // .....................e................................................................................................................................|......................e............ + // sub v24.4s, v11.4s, v12.4s // .........e............................................................................................................................................|..........e........................ + // add v11.4s, v11.4s, v12.4s // ..e...................................................................................................................................................|...e............................... + // mul v12.4s, v24.4s, v2.4s // ............e.........................................................................................................................................|.............e..................... + // sqrdmulh v24.4s, v24.4s, v6.4s // ..................e...................................................................................................................................|...................e............... + // mls v12.4s, v24.4s, v8.s[0] // ......................e...............................................................................................................................|.......................e........... 
+ // sub v24.4s, v9.4s, v11.4s // .....e................................................................................................................................................|......e............................ + // add v9.4s, v9.4s, v11.4s // ................................................................e.....................................................................................|................................... + // mul v11.4s, v24.4s, v0.4s // .......e..............................................................................................................................................|........e.......................... + // sqrdmulh v24.4s, v24.4s, v4.4s // ........e.............................................................................................................................................|.........e......................... + // mls v11.4s, v24.4s, v8.s[0] // ...........e..........................................................................................................................................|............e...................... + // sub v24.4s, v10.4s, v12.4s // ..........................e...........................................................................................................................|...........................e....... + // add v10.4s, v10.4s, v12.4s // ..................................................e...................................................................................................|................................... + // mul v12.4s, v24.4s, v0.4s // ............................................................e.........................................................................................|................................... 
+ // sqrdmulh v24.4s, v24.4s, v4.4s // ............................e.........................................................................................................................|.............................e..... + // mls v12.4s, v24.4s, v8.s[0] // .................................................................e....................................................................................|................................... + // ldr q0, [x5, #(-12*16 + 6*16)] // .....................................................e................................................................................................|................................... + // ldr q4, [x5, #(-12*16 + 7*16)] // ................................................e.....................................................................................................|................................... + // ldr q1, [x5, #(-12*16 + 8*16)] // ...........................e..........................................................................................................................|............................e...... + // ldr q5, [x5, #(-12*16 + 9*16)] // ...............................e......................................................................................................................|................................e.. + // ldr q2, [x5, #(-12*16 + 10*16)] // ...................................e..................................................................................................................|................................... + // ldr q6, [x5, #(-12*16 + 11*16)] // .....................................e................................................................................................................|................................... 
+ // sub v24.4s, v13.4s, v14.4s // ......................................e...............................................................................................................|................................... + // add v13.4s, v13.4s, v14.4s // ...........................................e..........................................................................................................|................................... + // mul v14.4s, v24.4s, v1.4s // ..........................................e...........................................................................................................|................................... + // sqrdmulh v24.4s, v24.4s, v5.4s // .........................................e............................................................................................................|................................... + // mls v14.4s, v24.4s, v8.s[0] // ..............................................e.......................................................................................................|................................... + // sub v24.4s, v15.4s, v16.4s // ....................................e.................................................................................................................|................................... + // add v15.4s, v15.4s, v16.4s // ............................................e.........................................................................................................|................................... + // mul v16.4s, v24.4s, v2.4s // .......................................e..............................................................................................................|................................... 
+ // sqrdmulh v24.4s, v24.4s, v6.4s // ........................................e.............................................................................................................|................................... + // mls v16.4s, v24.4s, v8.s[0] // .............................................e........................................................................................................|................................... + // sub v24.4s, v13.4s, v15.4s // ...............................................e......................................................................................................|................................... + // add v13.4s, v13.4s, v15.4s // ......................................................e...............................................................................................|................................... + // mul v15.4s, v24.4s, v0.4s // .........................................................e............................................................................................|................................... + // sqrdmulh v24.4s, v24.4s, v4.4s // ...................................................e..................................................................................................|................................... + // mls v15.4s, v24.4s, v8.s[0] // .............................................................e........................................................................................|................................... + // sub v24.4s, v14.4s, v16.4s // .................................................e....................................................................................................|................................... 
+ // add v14.4s, v14.4s, v16.4s // .......................................................e..............................................................................................|................................... + // mul v16.4s, v24.4s, v0.4s // ........................................................e.............................................................................................|................................... + // sqrdmulh v24.4s, v24.4s, v4.4s // ....................................................e.................................................................................................|................................... + // mls v16.4s, v24.4s, v8.s[0] // ..................................................................e...................................................................................|................................... + // trn1 v25.4s, v9.4s, v10.4s // ...................................................................e..................................................................................|................................... + // trn2 v26.4s, v9.4s, v10.4s // ....................................................................e.................................................................................|................................... + // trn1 v27.4s, v11.4s, v12.4s // .....................................................................e................................................................................|................................... + // trn2 v28.4s, v11.4s, v12.4s // ......................................................................e...............................................................................|................................... 
+ // trn2 v11.2d, v25.2d, v27.2d // ........................................................................e.............................................................................|................................... + // trn2 v12.2d, v26.2d, v28.2d // .........................................................................e............................................................................|................................... + // trn1 v9.2d, v25.2d, v27.2d // .....................................................................................e................................................................|................................... + // trn1 v10.2d, v26.2d, v28.2d // ......................................................................................e...............................................................|................................... + // trn1 v25.4s, v13.4s, v14.4s // ...........................................................e..........................................................................................|................................... + // trn2 v26.4s, v13.4s, v14.4s // ..........................................................e...........................................................................................|................................... + // trn1 v27.4s, v15.4s, v16.4s // .......................................................................e..............................................................................|................................... + // trn2 v28.4s, v15.4s, v16.4s // ..........................................................................e...........................................................................|................................... 
+ // trn2 v15.2d, v25.2d, v27.2d // ............................................................................e.........................................................................|................................... + // trn2 v16.2d, v26.2d, v28.2d // .............................................................................e........................................................................|................................... + // trn1 v13.2d, v25.2d, v27.2d // ..................................................................................e...................................................................|................................... + // trn1 v14.2d, v26.2d, v28.2d // .................................................................................e....................................................................|................................... + // ldr q0, [x4], #64 // ........................................................................................................................e.............................|................................... + // ldr q1, [x4, #(-64 + 16)] // ..............................................................e.......................................................................................|................................... + // ldr q2, [x4, #(-64 + 32)] // ...............................................................e......................................................................................|................................... + // ldr q3, [x4, #(-64 + 48)] // ................................................................................e.....................................................................|................................... + // sub v24.4s, v9.4s, v10.4s // .........................................................................................e............................................................|................................... 
+ // add v9.4s, v9.4s, v10.4s // ............................................................................................................e.........................................|................................... + // mul v10.4s, v24.4s, v1.s[2] // ............................................................................................e.........................................................|................................... + // sqrdmulh v24.4s, v24.4s, v1.s[3] // .............................................................................................e........................................................|................................... + // mls v10.4s, v24.4s, v8.s[0] // .................................................................................................e....................................................|................................... + // sub v24.4s, v11.4s, v12.4s // ...........................................................................e..........................................................................|................................... + // add v11.4s, v11.4s, v12.4s // .............................................................................................................e........................................|................................... + // mul v12.4s, v24.4s, v2.s[0] // ..............................................................................e.......................................................................|................................... + // sqrdmulh v24.4s, v24.4s, v2.s[1] // ..........................................................................................e...........................................................|................................... 
+ // mls v12.4s, v24.4s, v8.s[0] // ..............................................................................................e.......................................................|................................... + // sub v24.4s, v13.4s, v14.4s // ....................................................................................e.................................................................|................................... + // add v13.4s, v13.4s, v14.4s // ..................................................................................................e...................................................|................................... + // mul v14.4s, v24.4s, v2.s[2] // .......................................................................................e..............................................................|................................... + // sqrdmulh v24.4s, v24.4s, v2.s[3] // ........................................................................................e.............................................................|................................... + // mls v14.4s, v24.4s, v8.s[0] // ...............................................................................................e......................................................|................................... + // sub v24.4s, v15.4s, v16.4s // ...............................................................................e......................................................................|................................... + // add v15.4s, v15.4s, v16.4s // ...................................................................................................e..................................................|................................... 
+ // mul v16.4s, v24.4s, v3.s[0] // ...................................................................................e..................................................................|................................... + // sqrdmulh v24.4s, v24.4s, v3.s[1] // ...........................................................................................e..........................................................|................................... + // mls v16.4s, v24.4s, v8.s[0] // ................................................................................................e.....................................................|................................... + // sub v24.4s, v9.4s, v11.4s // ................................................................................................................................e.....................|................................... + // add v9.4s, v9.4s, v11.4s // ................................................................................................................e.....................................|................................... + // mul v11.4s, v24.4s, v0.s[2] // ......................................................................................................................................e...............|................................... + // sqrdmulh v24.4s, v24.4s, v0.s[3] // .......................................................................................................................................e..............|................................... + // mls v11.4s, v24.4s, v8.s[0] // ............................................................................................................................................e.........|................................... 
+ // sub v24.4s, v10.4s, v12.4s // .......................................................................................................................e..............................|................................... + // add v10.4s, v10.4s, v12.4s // .....................................................................................................e................................................|................................... + // mul v12.4s, v24.4s, v0.s[2] // ...........................................................................................................................e..........................|................................... + // sqrdmulh v24.4s, v24.4s, v0.s[3] // .............................................................................................................................e........................|................................... + // mls v12.4s, v24.4s, v8.s[0] // .................................................................................................................................e....................|................................... + // sub v24.4s, v13.4s, v15.4s // ......................................................................................................e...............................................|................................... + // add v13.4s, v13.4s, v15.4s // .........................................................................................................e............................................|................................... + // mul v15.4s, v24.4s, v1.s[0] // ..........................................................................................................................................e...........|................................... 
+ // sqrdmulh v24.4s, v24.4s, v1.s[1] // ....................................................................................................................................e.................|................................... + // mls v15.4s, v24.4s, v8.s[0] // ..............................................................................................................................................e.......|................................... + // sub v24.4s, v14.4s, v16.4s // ..............................................................................................................e.......................................|................................... + // add v14.4s, v14.4s, v16.4s // ....................................................................................................e.................................................|................................... + // mul v16.4s, v24.4s, v1.s[0] // ....................................................................................................................e.................................|................................... + // sqrdmulh v24.4s, v24.4s, v1.s[1] // ..........................................................................................................................e...........................|................................... + // mls v16.4s, v24.4s, v8.s[0] // ..............................................................................................................................e.......................|................................... + // srshr v24.4S, v9.4S, #23 // ...................................................................................................................e..................................|................................... 
+ // mls v9.4s, v24.4s, v8.4s // ......................................................................................................................e...............................|................................... + // srshr v24.4S, v10.4S, #23 // ........................................................................................................e.............................................|................................... + // mls v10.4s, v24.4s, v8.4s // ...........................................................................................................e..........................................|................................... + // srshr v24.4S, v13.4S, #23 // .................................................................................................................e....................................|................................... + // mls v13.4s, v24.4s, v8.4s // .....................................................................................................................e................................|................................... + // srshr v24.4S, v14.4S, #23 // .......................................................................................................e..............................................|................................... + // mls v14.4s, v24.4s, v8.4s // ..........................................................................................................e...........................................|................................... + // sub v24.4s, v9.4s, v13.4s // ...............................................................................................................................e......................|................................... + // add v9.4s, v9.4s, v13.4s // .........................................................................................................................e............................|................................... 
+ // mul v13.4s, v24.4s, v0.s[0] // ...................................................................................................................................e..................|................................... + // sqrdmulh v24.4s, v24.4s, v0.s[1] // ..................................................................................................................................e...................|................................... + // mls v13.4s, v24.4s, v8.s[0] // ...........................................................................................................................................e..........|................................... + // sub v24.4s, v10.4s, v14.4s // ......................................................................................................................................................*................................... + // add v10.4s, v10.4s, v14.4s // ...............................................................................................................e......................................|................................... + // mul v14.4s, v24.4s, v0.s[0] // ....................*.................................................................................................................................|.....................*............. + // sqrdmulh v24.4s, v24.4s, v0.s[1] // ...................*..................................................................................................................................|....................*.............. + // mls v14.4s, v24.4s, v8.s[0] // ........................*.............................................................................................................................|.........................*......... 
+ // sub v24.4s, v11.4s, v15.4s // ....................................................................................................................................................e.|................................... + // add v11.4s, v11.4s, v15.4s // ..................................................................................................................................................e...|................................... + // mul v15.4s, v24.4s, v0.s[0] // .........................*............................................................................................................................|..........................*........ + // sqrdmulh v24.4s, v24.4s, v0.s[1] // .......................*..............................................................................................................................|........................*.......... + // mls v15.4s, v24.4s, v8.s[0] // .............................*........................................................................................................................|..............................*.... + // sub v24.4s, v12.4s, v16.4s // .....................................................................................................................................e................|................................... + // add v12.4s, v12.4s, v16.4s // ................................................................................................................................................e.....|................................... + // mul v16.4s, v24.4s, v0.s[0] // ........................................................................................................................................e.............|................................... 
+ // sqrdmulh v24.4s, v24.4s, v0.s[1] // .........................................................................................................................................e............|................................... + // mls v16.4s, v24.4s, v8.s[0] // .............................................................................................................................................e........|................................... + // str q9, [x1], #(16*4) // ............................................................................................................................e.........................|................................... + // str q10, [x1, #(-16*4 + 1*16)] // ..................................................................................................................e...................................|................................... + // str q11, [x1, #(-16*4 + 2*16)] // .....................................................................................................................................................e|................................... + // str q12, [x1, #(-16*4 + 3*16)] // ...................................................................................................................................................e..|................................... + // str q13, [x2], #(16*4) // ...............................................................................................................................................e......|................................... + // str q14, [x2, #(-16*4 + 1*16)] // ..............................*.......................................................................................................................|...............................*... 
+ // str q15, [x2, #(-16*4 + 2*16)] // ................................*.....................................................................................................................|.................................*. + // str q16, [x2, #(-16*4 + 3*16)] // .................................................................................................................................................e....|................................... + // add x1, x1, #64 // ......................................................................................................................................................|*.................................. + // add x2, x2, #64 // .................................*....................................................................................................................|..................................* sub count, count, #1 cbnz count, layer45678_start - sub v29.4S, v21.4S, v16.4S // ..*..................................................... - // gap // ........................................................ - sub v19.4S, v25.4S, v4.4S // ...........*............................................ - // gap // ........................................................ - sub v26.4S, v17.4S, v20.4S // .*...................................................... - // gap // ........................................................ - mul v21.4S, v29.4S, v1.S[2] // .......*................................................ - // gap // ........................................................ - sqrdmulh v16.4S, v29.4S, v1.S[3] // ........*............................................... - // gap // ........................................................ - sqrdmulh v23.4S, v19.4S, v6.S[1] // ...............*........................................ - // gap // ........................................................ - sub v29.4S, v30.4S, v0.4S // *....................................................... 
- // gap // ........................................................ - mul v0.4S, v19.4S, v6.S[0] // ..............*......................................... - // gap // ........................................................ - sqrdmulh v20.4S, v26.4S, v1.S[1] // ......*................................................. - // gap // ........................................................ - mul v4.4S, v29.4S, v10.S[2] // ...*.................................................... - // gap // ........................................................ - sqrdmulh v27.4S, v29.4S, v10.S[3] // ....*................................................... - // gap // ........................................................ - mul v6.4S, v26.4S, v1.S[0] // .....*.................................................. - // gap // ........................................................ - mls v21.4S, v16.4S, v8.S[0] // ............*........................................... - // gap // ........................................................ - mls v0.4S, v23.4S, v8.S[0] // ....................*................................... - // gap // ........................................................ - mls v4.4S, v27.4S, v8.S[0] // .........*.............................................. - // gap // ........................................................ - mls v6.4S, v20.4S, v8.S[0] // ..........*............................................. - // gap // ........................................................ - sub v20.4S, v11.4S, v13.4S // .............*.......................................... - // gap // ........................................................ - sub v2.4S, v21.4S, v0.4S // .........................*.............................. - // gap // ........................................................ - add v17.4S, v21.4S, v0.4S // ..........................*............................. - // gap // ........................................................ 
- sub v23.4S, v4.4S, v6.4S // ..................*..................................... - // gap // ........................................................ - mul v26.4S, v2.4S, v10.S[0] // .............................*.......................... - // gap // ........................................................ - sqrdmulh v30.4S, v2.4S, v10.S[1] // ..............................*......................... - // gap // ........................................................ - sqrdmulh v0.4S, v23.4S, v3.S[3] // .......................*................................ - // gap // ........................................................ - mul v1.4S, v23.4S, v3.S[2] // ......................*................................. - // gap // ........................................................ - add v14.4S, v4.4S, v6.4S // ...................*.................................... - // gap // ........................................................ - sqrdmulh v11.4S, v20.4S, v3.S[3] // .................*...................................... - // gap // ........................................................ - mls v26.4S, v30.4S, v8.S[0] // ..................................*..................... - // gap // ........................................................ - mls v1.4S, v0.4S, v8.S[0] // ...........................*............................ - // gap // ........................................................ - mul v30.4S, v7.4S, v10.S[0] // ........................*............................... - // gap // ........................................................ - sub v21.4S, v14.4S, v17.4S // ...............................*........................ - // gap // ........................................................ - sqrdmulh v25.4S, v7.4S, v10.S[1] // ............................*........................... - // gap // ........................................................ 
- sub v16.4S, v1.4S, v26.4S // .......................................*................ - // gap // ........................................................ - mul v23.4S, v21.4S, v3.S[0] // ...................................*.................... - // gap // ........................................................ - mul v13.4S, v20.4S, v3.S[2] // ................*....................................... - // gap // ........................................................ - sqrdmulh v2.4S, v16.4S, v3.S[1] // .............................................*.......... - // gap // ........................................................ - mul v0.4S, v16.4S, v3.S[0] // ............................................*........... - // gap // ........................................................ - sqrdmulh v20.4S, v21.4S, v3.S[1] // ....................................*................... - // gap // ........................................................ - mls v30.4S, v25.4S, v8.S[0] // ................................*....................... - // gap // ........................................................ - mls v13.4S, v11.4S, v8.S[0] // .....................*.................................. - // gap // ........................................................ - mls v0.4S, v2.4S, v8.S[0] // ................................................*....... - // gap // ........................................................ - add v2.4S, v1.4S, v26.4S // ...........................................*............ - // gap // ........................................................ - mls v23.4S, v20.4S, v8.S[0] // ........................................*............... - // gap // ........................................................ - sub v16.4S, v13.4S, v30.4S // .....................................*.................. - // gap // ........................................................ - str q0, [x2, #-16] // ......................................................*. 
- // gap // ........................................................ - add v0.4S, v13.4S, v30.4S // ......................................*................. - // gap // ........................................................ - str q23, [x2, #-48] // ....................................................*... - // gap // ........................................................ - sqrdmulh v23.4S, v16.4S, v3.S[1] // ..........................................*............. - // gap // ........................................................ - mul v30.4S, v16.4S, v3.S[0] // .........................................*.............. - // gap // ........................................................ - str q0, [x1, #-32] // .................................................*...... - // gap // ........................................................ - add v0.4S, v14.4S, v17.4S // .................................*...................... - // gap // ........................................................ - str q2, [x1, #-16] // ..................................................*..... - // gap // ........................................................ - mls v30.4S, v23.4S, v8.S[0] // ..............................................*......... - // gap // ........................................................ - str q0, [x1, #-48] // ...............................................*........ - add x1, x1, #64 // ...................................................*.... - // gap // ........................................................ - // gap // ........................................................ - // gap // ........................................................ - // gap // ........................................................ - str q30, [x2, #-32] // .....................................................*.. - add x2, x2, #64 // .......................................................* + sub v4.4S, v27.4S, v4.4S // *.......... + add x1, x1, #64 // .*......... 
+ sqrdmulh v17.4S, v10.4S, v29.S[1] // ....*...... + // gap // ........... + mul v0.4S, v10.4S, v29.S[0] // ......*.... + // gap // ........... + sqrdmulh v31.4S, v4.4S, v29.S[1] // ..*........ + // gap // ........... + mul v29.4S, v4.4S, v29.S[0] // ...*....... + // gap // ........... + // gap // ........... + // gap // ........... + mls v0.4S, v17.4S, v8.S[0] // .......*... + // gap // ........... + // gap // ........... + // gap // ........... + mls v29.4S, v31.4S, v8.S[0] // .....*..... + // gap // ........... + // gap // ........... + // gap // ........... + str q0, [x2, #-32] // .........*. + // gap // ........... + // gap // ........... + // gap // ........... + str q29, [x2, #-48] // ........*.. + add x2, x2, #64 // ..........* // original source code - // sub v0.4S, v30.4S, v0.4S // ......*................................................. - // sub v2.4S, v17.4S, v20.4S // ..*..................................................... - // sub v16.4S, v21.4S, v16.4S // *....................................................... - // mul v23.4S, v0.4S, v10.S[2] // .........*.............................................. - // sqrdmulh v0.4S, v0.4S, v10.S[3] // ..........*............................................. - // mul v21.4S, v2.4S, v1.S[0] // ...........*............................................ - // sqrdmulh v2.4S, v2.4S, v1.S[1] // ........*............................................... - // mul v26.4S, v16.4S, v1.S[2] // ...*.................................................... - // sqrdmulh v16.4S, v16.4S, v1.S[3] // ....*................................................... - // mls v23.4S, v0.4S, v8.S[0] // ..............*......................................... - // mls v21.4S, v2.4S, v8.S[0] // ...............*........................................ - // sub v0.4S, v25.4S, v4.4S // .*...................................................... - // mls v26.4S, v16.4S, v8.S[0] // ............*........................................... 
- // sub v2.4S, v11.4S, v13.4S // ................*....................................... - // mul v16.4S, v0.4S, v6.S[0] // .......*................................................ - // sqrdmulh v0.4S, v0.4S, v6.S[1] // .....*.................................................. - // mul v20.4S, v2.4S, v3.S[2] // .................................*...................... - // sqrdmulh v2.4S, v2.4S, v3.S[3] // .........................*.............................. - // sub v17.4S, v23.4S, v21.4S // ...................*.................................... - // add v23.4S, v23.4S, v21.4S // ........................*............................... - // mls v16.4S, v0.4S, v8.S[0] // .............*.......................................... - // mls v20.4S, v2.4S, v8.S[0] // ......................................*................. - // mul v0.4S, v17.4S, v3.S[2] // .......................*................................ - // sqrdmulh v2.4S, v17.4S, v3.S[3] // ......................*................................. - // mul v21.4S, v7.4S, v10.S[0] // ............................*........................... - // sub v17.4S, v26.4S, v16.4S // .................*...................................... - // add v16.4S, v26.4S, v16.4S // ..................*..................................... - // mls v0.4S, v2.4S, v8.S[0] // ...........................*............................ - // sqrdmulh v2.4S, v7.4S, v10.S[1] // ..............................*......................... - // mul v26.4S, v17.4S, v10.S[0] // ....................*................................... - // sqrdmulh v17.4S, v17.4S, v10.S[1] // .....................*.................................. - // sub v30.4S, v23.4S, v16.4S // .............................*.......................... - // mls v21.4S, v2.4S, v8.S[0] // .....................................*.................. - // add v2.4S, v23.4S, v16.4S // .................................................*...... 
- // mls v26.4S, v17.4S, v8.S[0] // ..........................*............................. - // mul v16.4S, v30.4S, v3.S[0] // ................................*....................... - // sqrdmulh v23.4S, v30.4S, v3.S[1] // ....................................*................... - // sub v17.4S, v20.4S, v21.4S // ..........................................*............. - // add v21.4S, v20.4S, v21.4S // ............................................*........... - // sub v20.4S, v0.4S, v26.4S // ...............................*........................ - // mls v16.4S, v23.4S, v8.S[0] // .........................................*.............. - // mul v23.4S, v17.4S, v3.S[0] // ...............................................*........ - // sqrdmulh v17.4S, v17.4S, v3.S[1] // ..............................................*......... - // add v0.4S, v0.4S, v26.4S // ........................................*............... - // mul v26.4S, v20.4S, v3.S[0] // ...................................*.................... - // sqrdmulh v20.4S, v20.4S, v3.S[1] // ..................................*..................... - // mls v23.4S, v17.4S, v8.S[0] // ...................................................*.... - // str q2, [x1, #-48] // ....................................................*... - // mls v26.4S, v20.4S, v8.S[0] // .......................................*................ - // str q21, [x1, #-32] // ................................................*....... - // str q0, [x1, #-16] // ..................................................*..... - // add x1, x1, #64 // .....................................................*.. - // str q16, [x2, #-48] // .............................................*.......... - // str q23, [x2, #-32] // ......................................................*. - // str q26, [x2, #-16] // ...........................................*............ 
- // add x2, x2, #64 // .......................................................* + // sub v6.4S, v27.4S, v4.4S // *.......... + // add x1, x1, #64 // .*......... + // sqrdmulh v12.4S, v6.4S, v29.S[1] // ....*...... + // mul v6.4S, v6.4S, v29.S[0] // .....*..... + // sqrdmulh v9.4S, v10.4S, v29.S[1] // ..*........ + // mls v6.4S, v12.4S, v8.S[0] // .......*... + // mul v13.4S, v10.4S, v29.S[0] // ...*....... + // mls v13.4S, v9.4S, v8.S[0] // ......*.... + // str q6, [x2, #-48] // .........*. + // str q13, [x2, #-32] // ........*.. + // add x2, x2, #64 // ..........* // ----------------------------------------------------------------------------- ninv .req v25 ninv_tw .req v26 + modulus_half .req v30 + neg_modulus_half .req v31 ASM_LOAD(xtmp, ninv_addr) ld1r {ninv.4s}, [xtmp] ASM_LOAD(xtmp, ninv_tw_addr) ld1r {ninv_tw.4s}, [xtmp] + ushr modulus_half.4S, consts.4S, #1 + neg neg_modulus_half.4S, modulus_half.4S + mov count, #8 ASM_LOAD(r_ptr0, roots_l012) load_roots_123 .p2align 2 - ldr q13, [x0, #768] // .....*...... + ldr q11, [x0, #256] // ..*......... // gap // ............ // gap // ............ // gap // ............ - ldr q7, [x0, #896] // .......*.... + ldr q20, [x0, #384] // ........*... // gap // ............ // gap // ............ // gap // ............ - ldr q4, [x0, #512] // ...*........ + ldr q4, [x0, #896] // .......*.... // gap // ............ // gap // ............ // gap // ............ - ldr q11, [x0, #640] // ....*....... + ldr q28, [x0, #512] // ...*........ // gap // ............ // gap // ............ // gap // ............ - ldr q20, [x0, #384] // ........*... + ldr q9, [x0, #768] // .....*...... // gap // ............ // gap // ............ // gap // ............ - ldr q21, [x0, #256] // ..*......... + ldr q29, [x0, #640] // ....*....... // gap // ............ // gap // ............ // gap // ............ - add v19.4S, v4.4S, v11.4S // ......*..... + add v15.4S, v11.4S, v20.4S // ..........*. // gap // ............ 
- add v6.4S, v13.4S, v7.4S // .........*.. + add v22.4S, v9.4S, v4.4S // .........*.. // gap // ............ - ldr q17, [x0, #0] // *........... + add v13.4S, v28.4S, v29.4S // ......*..... // gap // ............ + ldr q18, [x0, #0] // *........... // gap // ............ // gap // ............ - add v30.4S, v21.4S, v20.4S // ..........*. // gap // ............ - add v10.4S, v19.4S, v6.4S // ...........* + add v23.4S, v13.4S, v22.4S // ...........* // gap // ............ - ldr q23, [x0, #128] // .*.......... + ldr q14, [x0, #128] // .*.......... // gap // ............ // original source code - // ldr q17, [x0, #0] // ........*... - // ldr q23, [x0, #128] // ...........* - // ldr q21, [x0, #256] // .....*...... - // ldr q4, [x0, #512] // ..*......... - // ldr q11, [x0, #640] // ...*........ - // ldr q13, [x0, #768] // *........... - // add v19.4S, v4.4S, v11.4S // ......*..... - // ldr q7, [x0, #896] // .*.......... - // ldr q20, [x0, #384] // ....*....... - // add v6.4S, v13.4S, v7.4S // .......*.... - // add v30.4S, v21.4S, v20.4S // .........*.. - // add v10.4S, v19.4S, v6.4S // ..........*. + // ldr q18, [x0, #0] // .........*.. + // ldr q14, [x0, #128] // ...........* + // ldr q11, [x0, #256] // *........... + // ldr q28, [x0, #512] // ...*........ + // ldr q29, [x0, #640] // .....*...... + // ldr q9, [x0, #768] // ....*....... + // add v13.4S, v28.4S, v29.4S // ........*... + // ldr q4, [x0, #896] // ..*......... + // ldr q20, [x0, #384] // .*.......... + // add v22.4S, v9.4S, v4.4S // .......*.... + // add v15.4S, v11.4S, v20.4S // ......*..... + // add v23.4S, v13.4S, v22.4S // ..........*. sub count, count, #1 layer123_start: - sub v16.4S, v17.4S, v23.4S // ........*....................................................................................... - // gap // ................................................................................................ 
- add v23.4S, v17.4S, v23.4S // .........*...................................................................................... - // gap // ................................................................................................ - sub v21.4S, v21.4S, v20.4S // .............*.................................................................................. - // gap // ................................................................................................ - mul v20.4S, v16.4S, v1.S[2] // ..........*..................................................................................... - // gap // ................................................................................................ - sqrdmulh v16.4S, v16.4S, v1.S[3] // ...........*.................................................................................... - // gap // ................................................................................................ - sub v17.4S, v23.4S, v30.4S // ............................*................................................................... - // gap // ................................................................................................ - add v23.4S, v23.4S, v30.4S // .............................*.................................................................. - // gap // ................................................................................................ - mul v30.4S, v21.4S, v2.S[0] // ...............*................................................................................ - // gap // ................................................................................................ - sqrdmulh v21.4S, v21.4S, v2.S[1] // ................*............................................................................... - // gap // ................................................................................................ 
- mls v20.4S, v16.4S, v8.S[0] // ............*................................................................................... - // gap // ................................................................................................ - sub v16.4S, v4.4S, v11.4S // ..................*............................................................................. - // gap // ................................................................................................ - mul v4.4S, v17.4S, v0.S[2] // ..............................*................................................................. - // gap // ................................................................................................ - sqrdmulh v17.4S, v17.4S, v0.S[3] // ...............................*................................................................ - // gap // ................................................................................................ - sub v11.4S, v23.4S, v10.4S // ................................................*............................................... - // gap // ................................................................................................ - add v23.4S, v23.4S, v10.4S // .................................................*.............................................. - // gap // ................................................................................................ - mls v30.4S, v21.4S, v8.S[0] // .................*.............................................................................. - // gap // ................................................................................................ - mul v21.4S, v16.4S, v2.S[2] // ....................*........................................................................... - // gap // ................................................................................................ 
- sqrdmulh v16.4S, v16.4S, v2.S[3] // .....................*.......................................................................... - // gap // ................................................................................................ - sub v13.4S, v13.4S, v7.4S // .......................*........................................................................ - // gap // ................................................................................................ - sub v7.4S, v20.4S, v30.4S // .................................*.............................................................. - // gap // ................................................................................................ - add v20.4S, v20.4S, v30.4S // ..................................*............................................................. - // gap // ................................................................................................ - mls v21.4S, v16.4S, v8.S[0] // ......................*......................................................................... - // gap // ................................................................................................ - mul v16.4S, v13.4S, v3.S[0] // .........................*...................................................................... - // gap // ................................................................................................ - mls v4.4S, v17.4S, v8.S[0] // ................................*............................................................... - // gap // ................................................................................................ - sqrdmulh v17.4S, v13.4S, v3.S[1] // ..........................*..................................................................... - // gap // ................................................................................................ 
- mul v30.4S, v7.4S, v0.S[2] // ...................................*............................................................ - // gap // ................................................................................................ - sqrdmulh v13.4S, v7.4S, v0.S[3] // ....................................*........................................................... - // gap // ................................................................................................ - mul v7.4S, v11.4S, v0.S[0] // ..................................................*............................................. - // gap // ................................................................................................ - sqrdmulh v11.4S, v11.4S, v0.S[1] // ...................................................*............................................ - // gap // ................................................................................................ - mul v10.4S, v23.4S, v25.4S // ................................................................................*............... - // gap // ................................................................................................ - sqrdmulh v23.4S, v23.4S, v26.4S // .................................................................................*.............. - // gap // ................................................................................................ - mls v16.4S, v17.4S, v8.S[0] // ...........................*.................................................................... - // gap // ................................................................................................ - mls v30.4S, v13.4S, v8.S[0] // .....................................*.......................................................... - // gap // ................................................................................................ 
- sub v17.4S, v19.4S, v6.4S // ......................................*......................................................... - // gap // ................................................................................................ - mls v7.4S, v11.4S, v8.S[0] // ....................................................*........................................... - // gap // ................................................................................................ - sub v11.4S, v21.4S, v16.4S // ...........................................*.................................................... - // gap // ................................................................................................ - mul v13.4S, v17.4S, v1.S[0] // ........................................*....................................................... - // gap // ................................................................................................ - sqrdmulh v17.4S, v17.4S, v1.S[1] // .........................................*...................................................... - // gap // ................................................................................................ - add v16.4S, v21.4S, v16.4S // ............................................*................................................... - // gap // ................................................................................................ - mul v21.4S, v11.4S, v1.S[0] // .............................................*.................................................. - // gap // ................................................................................................ - sqrdmulh v11.4S, v11.4S, v1.S[1] // ..............................................*................................................. - // gap // ................................................................................................ 
- sub v19.4S, v20.4S, v16.4S // .....................................................*.......................................... - // gap // ................................................................................................ - add v16.4S, v20.4S, v16.4S // ......................................................*......................................... - // gap // ................................................................................................ - mls v13.4S, v17.4S, v8.S[0] // ..........................................*..................................................... - // gap // ................................................................................................ - mls v21.4S, v11.4S, v8.S[0] // ...............................................*................................................ - // gap // ................................................................................................ - mul v20.4S, v19.4S, v0.S[0] // .......................................................*........................................ - // gap // ................................................................................................ - sqrdmulh v17.4S, v19.4S, v0.S[1] // ........................................................*....................................... - // gap // ................................................................................................ - sub v11.4S, v4.4S, v13.4S // ..........................................................*..................................... - // gap // ................................................................................................ - add v4.4S, v4.4S, v13.4S // ...........................................................*.................................... - // gap // ................................................................................................ 
- sub v13.4S, v30.4S, v21.4S // ...............................................................*................................ - // gap // ................................................................................................ - mls v20.4S, v17.4S, v8.S[0] // .........................................................*...................................... - // gap // ................................................................................................ - mul v17.4S, v11.4S, v0.S[0] // ............................................................*................................... - // gap // ................................................................................................ - sqrdmulh v11.4S, v11.4S, v0.S[1] // .............................................................*.................................. - // gap // ................................................................................................ - add v21.4S, v30.4S, v21.4S // ................................................................*............................... - // gap // ................................................................................................ - mul v30.4S, v13.4S, v0.S[0] // .................................................................*.............................. - // gap // ................................................................................................ - sqrdmulh v13.4S, v13.4S, v0.S[1] // ..................................................................*............................. - // gap // ................................................................................................ - mls v17.4S, v11.4S, v8.S[0] // ..............................................................*................................. - // gap // ................................................................................................ 
- srshr v11.4S, v7.4S, #23 // ....................................................................*........................... - // gap // ................................................................................................ - srshr v19.4S, v20.4S, #23 // ......................................................................*......................... - // gap // ................................................................................................ - mls v10.4S, v23.4S, v8.S[0] // ..................................................................................*............. - // gap // ................................................................................................ - mls v30.4S, v13.4S, v8.S[0] // ...................................................................*............................ - // gap // ................................................................................................ - mls v7.4S, v11.4S, v8.4S // .....................................................................*.......................... - // gap // ................................................................................................ - mls v20.4S, v19.4S, v8.4S // .......................................................................*........................ - // gap // ................................................................................................ - srshr v23.4S, v17.4S, #23 // ........................................................................*....................... - // gap // ................................................................................................ - srshr v11.4S, v30.4S, #23 // ..........................................................................*..................... - // gap // ................................................................................................ 
- str q7, [x0, #512] // ............................................................................*................... - // gap // ................................................................................................ - mls v17.4S, v23.4S, v8.4S // .........................................................................*...................... - // gap // ................................................................................................ - mls v30.4S, v11.4S, v8.4S // ...........................................................................*.................... - // gap // ................................................................................................ - str q20, [x0, #640] // .............................................................................*.................. - // gap // ................................................................................................ - mul v23.4S, v16.4S, v25.4S // ...................................................................................*............ - // gap // ................................................................................................ - str q17, [x0, #768] // ..............................................................................*................. - // gap // ................................................................................................ - sqrdmulh v16.4S, v16.4S, v26.4S // ....................................................................................*........... - // gap // ................................................................................................ - str q30, [x0, #896] // ...............................................................................*................ - // gap // ................................................................................................ 
- mul v20.4S, v4.4S, v25.4S // ......................................................................................*......... - // gap // ................................................................................................ - sqrdmulh v17.4S, v4.4S, v26.4S // .......................................................................................*........ - // gap // ................................................................................................ - mls v23.4S, v16.4S, v8.S[0] // .....................................................................................*.......... - // gap // ................................................................................................ - mul v16.4S, v21.4S, v25.4S // .........................................................................................*...... - // gap // ................................................................................................ - sqrdmulh v21.4S, v21.4S, v26.4S // ..........................................................................................*..... - // gap // ................................................................................................ - mls v20.4S, v17.4S, v8.S[0] // ........................................................................................*....... - // gap // ................................................................................................ - str q10, [x0], #(16) // ............................................................................................*... - // gap // ................................................................................................ - ldr q17, [x0, #0] // e............................................................................................... - // gap // ................................................................................................ 
- // gap // ................................................................................................ - // gap // ................................................................................................ - mls v16.4S, v21.4S, v8.S[0] // ...........................................................................................*.... - // gap // ................................................................................................ - str q23, [x0, #112] // .............................................................................................*.. - // gap // ................................................................................................ - ldr q23, [x0, #128] // .e.............................................................................................. - // gap // ................................................................................................ - // gap // ................................................................................................ - // gap // ................................................................................................ - str q20, [x0, #240] // ..............................................................................................*. - // gap // ................................................................................................ - ldr q21, [x0, #256] // ..e............................................................................................. - // gap // ................................................................................................ - // gap // ................................................................................................ - // gap // ................................................................................................ - ldr q4, [x0, #512] // ....e........................................................................................... 
- // gap // ................................................................................................ - // gap // ................................................................................................ - // gap // ................................................................................................ - ldr q11, [x0, #640] // .....e.......................................................................................... - // gap // ................................................................................................ - // gap // ................................................................................................ - // gap // ................................................................................................ - ldr q13, [x0, #768] // ......e......................................................................................... - // gap // ................................................................................................ - // gap // ................................................................................................ - // gap // ................................................................................................ - add v19.4S, v4.4S, v11.4S // ...................e............................................................................ - // gap // ................................................................................................ - ldr q7, [x0, #896] // .......e........................................................................................ - // gap // ................................................................................................ - // gap // ................................................................................................ - // gap // ................................................................................................ 
- ldr q20, [x0, #384] // ...e............................................................................................ - // gap // ................................................................................................ - // gap // ................................................................................................ - // gap // ................................................................................................ - add v6.4S, v13.4S, v7.4S // ........................e....................................................................... - // gap // ................................................................................................ - str q16, [x0, #368] // ...............................................................................................* - // gap // ................................................................................................ - add v30.4S, v21.4S, v20.4S // ..............e................................................................................. - // gap // ................................................................................................ - add v10.4S, v19.4S, v6.4S // .......................................e........................................................ - // gap // ................................................................................................ + sub v17.4S, v18.4S, v14.4S // ........*............................................................................................................... + // gap // ........................................................................................................................ + add v19.4S, v18.4S, v14.4S // .........*.............................................................................................................. + // gap // ........................................................................................................................ 
+ sub v18.4S, v11.4S, v20.4S // .............*.......................................................................................................... + // gap // ........................................................................................................................ + mul v6.4S, v17.4S, v1.S[2] // ..........*............................................................................................................. + // gap // ........................................................................................................................ + sqrdmulh v17.4S, v17.4S, v1.S[3] // ...........*............................................................................................................ + // gap // ........................................................................................................................ + sub v14.4S, v19.4S, v15.4S // ............................*........................................................................................... + // gap // ........................................................................................................................ + add v19.4S, v19.4S, v15.4S // .............................*.......................................................................................... + // gap // ........................................................................................................................ + mul v11.4S, v18.4S, v2.S[0] // ...............*........................................................................................................ + // gap // ........................................................................................................................ + sqrdmulh v18.4S, v18.4S, v2.S[1] // ................*....................................................................................................... 
+ // gap // ........................................................................................................................ + mls v6.4S, v17.4S, v8.S[0] // ............*........................................................................................................... + // gap // ........................................................................................................................ + sub v17.4S, v28.4S, v29.4S // ..................*..................................................................................................... + // gap // ........................................................................................................................ + mul v28.4S, v14.4S, v0.S[2] // ..............................*......................................................................................... + // gap // ........................................................................................................................ + sqrdmulh v14.4S, v14.4S, v0.S[3] // ...............................*........................................................................................ + // gap // ........................................................................................................................ + sub v29.4S, v19.4S, v23.4S // ................................................*....................................................................... + // gap // ........................................................................................................................ + add v19.4S, v19.4S, v23.4S // .................................................*...................................................................... + // gap // ........................................................................................................................ 
+ mls v11.4S, v18.4S, v8.S[0] // .................*...................................................................................................... + // gap // ........................................................................................................................ + mul v18.4S, v17.4S, v2.S[2] // ....................*................................................................................................... + // gap // ........................................................................................................................ + sqrdmulh v17.4S, v17.4S, v2.S[3] // .....................*.................................................................................................. + // gap // ........................................................................................................................ + sub v9.4S, v9.4S, v4.4S // .......................*................................................................................................ + // gap // ........................................................................................................................ + sub v4.4S, v6.4S, v11.4S // .................................*...................................................................................... + // gap // ........................................................................................................................ + add v6.4S, v6.4S, v11.4S // ..................................*..................................................................................... + // gap // ........................................................................................................................ + mls v18.4S, v17.4S, v8.S[0] // ......................*................................................................................................. 
+ // gap // ........................................................................................................................ + mul v17.4S, v9.4S, v3.S[0] // .........................*.............................................................................................. + // gap // ........................................................................................................................ + mls v28.4S, v14.4S, v8.S[0] // ................................*....................................................................................... + // gap // ........................................................................................................................ + sqrdmulh v14.4S, v9.4S, v3.S[1] // ..........................*............................................................................................. + // gap // ........................................................................................................................ + mul v11.4S, v4.4S, v0.S[2] // ...................................*.................................................................................... + // gap // ........................................................................................................................ + sqrdmulh v9.4S, v4.4S, v0.S[3] // ....................................*................................................................................... + // gap // ........................................................................................................................ + mul v4.4S, v29.4S, v0.S[0] // ..................................................*..................................................................... + // gap // ........................................................................................................................ 
+ sqrdmulh v29.4S, v29.4S, v0.S[1] // ...................................................*.................................................................... + // gap // ........................................................................................................................ + mul v20.4S, v19.4S, v25.4S // ........................................................................................*............................... + // gap // ........................................................................................................................ + sqrdmulh v19.4S, v19.4S, v26.4S // .........................................................................................*.............................. + // gap // ........................................................................................................................ + mls v17.4S, v14.4S, v8.S[0] // ...........................*............................................................................................ + // gap // ........................................................................................................................ + mls v11.4S, v9.4S, v8.S[0] // .....................................*.................................................................................. + // gap // ........................................................................................................................ + sub v14.4S, v13.4S, v22.4S // ......................................*................................................................................. + // gap // ........................................................................................................................ + mls v4.4S, v29.4S, v8.S[0] // ....................................................*................................................................... 
+ // gap // ........................................................................................................................ + sub v29.4S, v18.4S, v17.4S // ...........................................*............................................................................ + // gap // ........................................................................................................................ + mul v9.4S, v14.4S, v1.S[0] // ........................................*............................................................................... + // gap // ........................................................................................................................ + sqrdmulh v14.4S, v14.4S, v1.S[1] // .........................................*.............................................................................. + // gap // ........................................................................................................................ + add v17.4S, v18.4S, v17.4S // ............................................*........................................................................... + // gap // ........................................................................................................................ + mul v18.4S, v29.4S, v1.S[0] // .............................................*.......................................................................... + // gap // ........................................................................................................................ + sqrdmulh v29.4S, v29.4S, v1.S[1] // ..............................................*......................................................................... + // gap // ........................................................................................................................ 
+ sub v13.4S, v6.4S, v17.4S // .....................................................*.................................................................. + // gap // ........................................................................................................................ + add v17.4S, v6.4S, v17.4S // ......................................................*................................................................. + // gap // ........................................................................................................................ + mls v9.4S, v14.4S, v8.S[0] // ..........................................*............................................................................. + // gap // ........................................................................................................................ + mls v18.4S, v29.4S, v8.S[0] // ...............................................*........................................................................ + // gap // ........................................................................................................................ + mul v6.4S, v13.4S, v0.S[0] // .......................................................*................................................................ + // gap // ........................................................................................................................ + sqrdmulh v14.4S, v13.4S, v0.S[1] // ........................................................*............................................................... + // gap // ........................................................................................................................ + sub v29.4S, v28.4S, v9.4S // ..........................................................*............................................................. 
+ // gap // ........................................................................................................................ + add v28.4S, v28.4S, v9.4S // ...........................................................*............................................................ + // gap // ........................................................................................................................ + sub v9.4S, v11.4S, v18.4S // ...............................................................*........................................................ + // gap // ........................................................................................................................ + mls v6.4S, v14.4S, v8.S[0] // .........................................................*.............................................................. + // gap // ........................................................................................................................ + mul v14.4S, v29.4S, v0.S[0] // ............................................................*........................................................... + // gap // ........................................................................................................................ + sqrdmulh v29.4S, v29.4S, v0.S[1] // .............................................................*.......................................................... + // gap // ........................................................................................................................ + add v18.4S, v11.4S, v18.4S // ................................................................*....................................................... + // gap // ........................................................................................................................ 
+ mul v11.4S, v9.4S, v0.S[0] // .................................................................*...................................................... + // gap // ........................................................................................................................ + sqrdmulh v9.4S, v9.4S, v0.S[1] // ..................................................................*..................................................... + // gap // ........................................................................................................................ + mls v14.4S, v29.4S, v8.S[0] // ..............................................................*......................................................... + // gap // ........................................................................................................................ + cmge v29.4S, v31.4S, v4.4S // ....................................................................*................................................... + // gap // ........................................................................................................................ + cmge v13.4S, v4.4S, v30.4S // .....................................................................*.................................................. + // gap // ........................................................................................................................ + mls v20.4S, v19.4S, v8.S[0] // ..........................................................................................*............................. + // gap // ........................................................................................................................ + mls v11.4S, v9.4S, v8.S[0] // ...................................................................*.................................................... 
+ // gap // ........................................................................................................................ + sub v19.4S, v29.4S, v13.4S // ......................................................................*................................................. + // gap // ........................................................................................................................ + cmge v29.4S, v31.4S, v6.4S // ........................................................................*............................................... + // gap // ........................................................................................................................ + cmge v9.4S, v6.4S, v30.4S // .........................................................................*.............................................. + // gap // ........................................................................................................................ + mls v4.4S, v19.4S, v8.4S // .......................................................................*................................................ + // gap // ........................................................................................................................ + sub v19.4S, v29.4S, v9.4S // ..........................................................................*............................................. + // gap // ........................................................................................................................ + cmge v29.4S, v31.4S, v14.4S // ............................................................................*........................................... + // gap // ........................................................................................................................ + cmge v9.4S, v14.4S, v30.4S // .............................................................................*.......................................... 
+ // gap // ........................................................................................................................ + mls v6.4S, v19.4S, v8.4S // ...........................................................................*............................................ + // gap // ........................................................................................................................ + sub v19.4S, v29.4S, v9.4S // ..............................................................................*......................................... + // gap // ........................................................................................................................ + cmge v29.4S, v31.4S, v11.4S // ................................................................................*....................................... + // gap // ........................................................................................................................ + cmge v9.4S, v11.4S, v30.4S // .................................................................................*...................................... + // gap // ........................................................................................................................ + mls v14.4S, v19.4S, v8.4S // ...............................................................................*........................................ + // gap // ........................................................................................................................ + sub v19.4S, v29.4S, v9.4S // ..................................................................................*..................................... + // gap // ........................................................................................................................ + str q4, [x0, #512] // ....................................................................................*................................... 
+ // gap // ........................................................................................................................ + mul v29.4S, v17.4S, v25.4S // ...........................................................................................*............................ + // gap // ........................................................................................................................ + mls v11.4S, v19.4S, v8.4S // ...................................................................................*.................................... + // gap // ........................................................................................................................ + str q6, [x0, #640] // .....................................................................................*.................................. + // gap // ........................................................................................................................ + sqrdmulh v17.4S, v17.4S, v26.4S // ............................................................................................*........................... + // gap // ........................................................................................................................ + str q14, [x0, #768] // ......................................................................................*................................. + // gap // ........................................................................................................................ + mul v19.4S, v28.4S, v25.4S // ..............................................................................................*......................... + // gap // ........................................................................................................................ + str q11, [x0, #896] // .......................................................................................*................................ 
+ // gap // ........................................................................................................................ + mls v29.4S, v17.4S, v8.S[0] // .............................................................................................*.......................... + // gap // ........................................................................................................................ + sqrdmulh v17.4S, v28.4S, v26.4S // ...............................................................................................*........................ + // gap // ........................................................................................................................ + mul v6.4S, v18.4S, v25.4S // .................................................................................................*...................... + // gap // ........................................................................................................................ + sqrdmulh v18.4S, v18.4S, v26.4S // ..................................................................................................*..................... + // gap // ........................................................................................................................ + cmge v14.4S, v31.4S, v20.4S // ....................................................................................................*................... + // gap // ........................................................................................................................ + mls v19.4S, v17.4S, v8.S[0] // ................................................................................................*....................... + // gap // ........................................................................................................................ 
+ cmge v17.4S, v20.4S, v30.4S // .....................................................................................................*.................. + // gap // ........................................................................................................................ + mls v6.4S, v18.4S, v8.S[0] // ...................................................................................................*.................... + // gap // ........................................................................................................................ + sub v17.4S, v14.4S, v17.4S // ......................................................................................................*................. + // gap // ........................................................................................................................ + cmge v18.4S, v31.4S, v29.4S // ........................................................................................................*............... + // gap // ........................................................................................................................ + cmge v14.4S, v29.4S, v30.4S // .........................................................................................................*.............. + // gap // ........................................................................................................................ + mls v20.4S, v17.4S, v8.4S // .......................................................................................................*................ + // gap // ........................................................................................................................ + sub v17.4S, v18.4S, v14.4S // ..........................................................................................................*............. 
+ // gap // ........................................................................................................................ + cmge v18.4S, v31.4S, v19.4S // ............................................................................................................*........... + // gap // ........................................................................................................................ + cmge v14.4S, v19.4S, v30.4S // .............................................................................................................*.......... + // gap // ........................................................................................................................ + mls v29.4S, v17.4S, v8.4S // ...........................................................................................................*............ + // gap // ........................................................................................................................ + sub v17.4S, v18.4S, v14.4S // ..............................................................................................................*......... + // gap // ........................................................................................................................ + cmge v18.4S, v31.4S, v6.4S // ................................................................................................................*....... + // gap // ........................................................................................................................ + cmge v14.4S, v6.4S, v30.4S // .................................................................................................................*...... + // gap // ........................................................................................................................ 
+ mls v19.4S, v17.4S, v8.4S // ...............................................................................................................*........ + // gap // ........................................................................................................................ + sub v17.4S, v18.4S, v14.4S // ..................................................................................................................*..... + // gap // ........................................................................................................................ + str q20, [x0], #(16) // ....................................................................................................................*... + // gap // ........................................................................................................................ + ldr q18, [x0, #0] // e....................................................................................................................... + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + mls v6.4S, v17.4S, v8.4S // ...................................................................................................................*.... + // gap // ........................................................................................................................ + str q29, [x0, #112] // .....................................................................................................................*.. + // gap // ........................................................................................................................ 
+ ldr q14, [x0, #128] // .e...................................................................................................................... + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + str q19, [x0, #240] // ......................................................................................................................*. + // gap // ........................................................................................................................ + ldr q11, [x0, #256] // ..e..................................................................................................................... + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + ldr q28, [x0, #512] // ....e................................................................................................................... + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ 
+ ldr q29, [x0, #640] // .....e.................................................................................................................. + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + ldr q9, [x0, #768] // ......e................................................................................................................. + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + add v13.4S, v28.4S, v29.4S // ...................e.................................................................................................... + // gap // ........................................................................................................................ + ldr q4, [x0, #896] // .......e................................................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ 
+ ldr q20, [x0, #384] // ...e.................................................................................................................... + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + add v22.4S, v9.4S, v4.4S // ........................e............................................................................................... + // gap // ........................................................................................................................ + str q6, [x0, #368] // .......................................................................................................................* + // gap // ........................................................................................................................ + add v15.4S, v11.4S, v20.4S // ..............e......................................................................................................... + // gap // ........................................................................................................................ + add v23.4S, v13.4S, v22.4S // .......................................e................................................................................ + // gap // ........................................................................................................................ // original source code - // ldr q9, [x0, #0] // e...............|...............................................................................e............. - // ldr q10, [x0, #(1*(1024/8))] // ...e............|..................................................................................e.......... 
- // ldr q11, [x0, #(2*(1024/8))] // .....e..........|....................................................................................e........ - // ldr q12, [x0, #(3*(1024/8))] // ...........e....|..........................................................................................e.. - // ldr q13, [x0, #(4*(1024/8))] // ......e.........|.....................................................................................e....... - // ldr q14, [x0, #(5*(1024/8))] // .......e........|......................................................................................e...... - // ldr q15, [x0, #(6*(1024/8))] // ........e.......|.......................................................................................e..... - // ldr q16, [x0, #(7*(1024/8))] // ..........e.....|.........................................................................................e... - // sub v24.4s, v9.4s, v10.4s // ................*............................................................................................. - // add v9.4s, v9.4s, v10.4s // ................|*............................................................................................ - // mul v10.4s, v24.4s, v1.s[2] // ................|..*.......................................................................................... - // sqrdmulh v24.4s, v24.4s, v1.s[3] // ................|...*......................................................................................... - // mls v10.4s, v24.4s, v8.s[0] // ................|........*.................................................................................... - // sub v24.4s, v11.4s, v12.4s // ................|.*........................................................................................... - // add v11.4s, v11.4s, v12.4s // ..............e.|............................................................................................. 
- // mul v12.4s, v24.4s, v2.s[0] // ................|......*...................................................................................... - // sqrdmulh v24.4s, v24.4s, v2.s[1] // ................|.......*..................................................................................... - // mls v12.4s, v24.4s, v8.s[0] // ................|..............*.............................................................................. - // sub v24.4s, v13.4s, v14.4s // ................|.........*................................................................................... - // add v13.4s, v13.4s, v14.4s // .........e......|........................................................................................e.... - // mul v14.4s, v24.4s, v2.s[2] // ................|...............*............................................................................. - // sqrdmulh v24.4s, v24.4s, v2.s[3] // ................|................*............................................................................ - // mls v14.4s, v24.4s, v8.s[0] // ................|....................*........................................................................ - // sub v24.4s, v15.4s, v16.4s // ................|.................*........................................................................... - // add v15.4s, v15.4s, v16.4s // ............e...|...........................................................................................e. - // mul v16.4s, v24.4s, v3.s[0] // ................|.....................*....................................................................... - // sqrdmulh v24.4s, v24.4s, v3.s[1] // ................|.......................*..................................................................... - // mls v16.4s, v24.4s, v8.s[0] // ................|..............................*.............................................................. 
- // sub v24.4s, v9.4s, v11.4s // ................|....*........................................................................................ - // add v9.4s, v9.4s, v11.4s // ................|.....*....................................................................................... - // mul v11.4s, v24.4s, v0.s[2] // ................|..........*.................................................................................. - // sqrdmulh v24.4s, v24.4s, v0.s[3] // ................|...........*................................................................................. - // mls v11.4s, v24.4s, v8.s[0] // ................|......................*...................................................................... - // sub v24.4s, v10.4s, v12.4s // ................|..................*.......................................................................... - // add v10.4s, v10.4s, v12.4s // ................|...................*......................................................................... - // mul v12.4s, v24.4s, v0.s[2] // ................|........................*.................................................................... - // sqrdmulh v24.4s, v24.4s, v0.s[3] // ................|.........................*................................................................... - // mls v12.4s, v24.4s, v8.s[0] // ................|...............................*............................................................. - // sub v24.4s, v13.4s, v15.4s // ................|................................*............................................................ - // add v13.4s, v13.4s, v15.4s // ...............e|............................................................................................. - // mul v15.4s, v24.4s, v1.s[0] // ................|...................................*......................................................... 
- // sqrdmulh v24.4s, v24.4s, v1.s[1] // ................|....................................*........................................................ - // mls v15.4s, v24.4s, v8.s[0] // ................|..........................................*.................................................. - // sub v24.4s, v14.4s, v16.4s // ................|..................................*.......................................................... - // add v14.4s, v14.4s, v16.4s // ................|.....................................*....................................................... - // mul v16.4s, v24.4s, v1.s[0] // ................|......................................*...................................................... - // sqrdmulh v24.4s, v24.4s, v1.s[1] // ................|.......................................*..................................................... - // mls v16.4s, v24.4s, v8.s[0] // ................|...........................................*................................................. - // sub v24.4s, v9.4s, v13.4s // ................|............*................................................................................ - // add v9.4s, v9.4s, v13.4s // ................|.............*............................................................................... - // mul v13.4s, v24.4s, v0.s[0] // ................|..........................*.................................................................. - // sqrdmulh v24.4s, v24.4s, v0.s[1] // ................|...........................*................................................................. - // mls v13.4s, v24.4s, v8.s[0] // ................|.................................*........................................................... - // sub v24.4s, v10.4s, v14.4s // ................|........................................*.................................................... 
- // add v10.4s, v10.4s, v14.4s // ................|.........................................*................................................... - // mul v14.4s, v24.4s, v0.s[0] // ................|............................................*................................................ - // sqrdmulh v24.4s, v24.4s, v0.s[1] // ................|.............................................*............................................... - // mls v14.4s, v24.4s, v8.s[0] // ................|.................................................*........................................... - // sub v24.4s, v11.4s, v15.4s // ................|..............................................*.............................................. - // add v11.4s, v11.4s, v15.4s // ................|...............................................*............................................. - // mul v15.4s, v24.4s, v0.s[0] // ................|..................................................*.......................................... - // sqrdmulh v24.4s, v24.4s, v0.s[1] // ................|...................................................*......................................... - // mls v15.4s, v24.4s, v8.s[0] // ................|.......................................................*..................................... - // sub v24.4s, v12.4s, v16.4s // ................|................................................*............................................ - // add v12.4s, v12.4s, v16.4s // ................|....................................................*........................................ - // mul v16.4s, v24.4s, v0.s[0] // ................|.....................................................*....................................... - // sqrdmulh v24.4s, v24.4s, v0.s[1] // ................|......................................................*...................................... 
- // mls v16.4s, v24.4s, v8.s[0] // ................|...........................................................*................................. - // srshr v24.4S, v13.4S, #23 // ................|........................................................*.................................... - // mls v13.4s, v24.4s, v8.4s // ................|............................................................*................................ - // srshr v24.4S, v14.4S, #23 // ................|.........................................................*................................... - // mls v14.4s, v24.4s, v8.4s // ................|.............................................................*............................... - // srshr v24.4S, v15.4S, #23 // ................|..............................................................*.............................. - // mls v15.4s, v24.4s, v8.4s // ................|.................................................................*........................... - // srshr v24.4S, v16.4S, #23 // ................|...............................................................*............................. - // mls v16.4s, v24.4s, v8.4s // ................|..................................................................*.......................... - // str q13, [x0, #(4*(1024/8))] // ................|................................................................*............................ - // str q14, [x0, #(5*(1024/8))] // ................|...................................................................*......................... - // str q15, [x0, #(6*(1024/8))] // ................|.....................................................................*....................... - // str q16, [x0, #(7*(1024/8))] // ................|.......................................................................*..................... 
- // mul v13.4s, v9.4s, v25.4s // ................|............................*................................................................ - // sqrdmulh v9.4s, v9.4s, v26.4s // ................|.............................*............................................................... - // mls v13.4s, v9.4s, v8.s[0] // ................|..........................................................*.................................. - // mul v14.4s, v10.4s, v25.4s // ................|....................................................................*........................ - // sqrdmulh v10.4s, v10.4s, v26.4s // ................|......................................................................*...................... - // mls v14.4s, v10.4s, v8.s[0] // ................|..........................................................................*.................. - // mul v15.4s, v11.4s, v25.4s // ................|........................................................................*.................... - // sqrdmulh v11.4s, v11.4s, v26.4s // ................|.........................................................................*................... - // mls v15.4s, v11.4s, v8.s[0] // ................|.............................................................................*............... - // mul v16.4s, v12.4s, v25.4s // ................|...........................................................................*................. - // sqrdmulh v12.4s, v12.4s, v26.4s // ................|............................................................................*................ - // mls v16.4s, v12.4s, v8.s[0] // .*..............|................................................................................*............ - // str q13, [x0], #(16) // ................|..............................................................................*.............. 
- // str q14, [x0, #(-16 + 1*(1024/8))] // ..*.............|.................................................................................*........... - // str q15, [x0, #(-16 + 2*(1024/8))] // ....*...........|...................................................................................*......... - // str q16, [x0, #(-16 + 3*(1024/8))] // .............*..|............................................................................................* + // ldr q9, [x0, #0] // e...............|.......................................................................................................e............. + // ldr q10, [x0, #(1*(1024/8))] // ...e............|..........................................................................................................e.......... + // ldr q11, [x0, #(2*(1024/8))] // .....e..........|............................................................................................................e........ + // ldr q12, [x0, #(3*(1024/8))] // ...........e....|..................................................................................................................e.. + // ldr q13, [x0, #(4*(1024/8))] // ......e.........|.............................................................................................................e....... + // ldr q14, [x0, #(5*(1024/8))] // .......e........|..............................................................................................................e...... + // ldr q15, [x0, #(6*(1024/8))] // ........e.......|...............................................................................................................e..... + // ldr q16, [x0, #(7*(1024/8))] // ..........e.....|.................................................................................................................e... + // sub v24.4s, v9.4s, v10.4s // ................*..................................................................................................................... 
+ // add v9.4s, v9.4s, v10.4s // ................|*.................................................................................................................... + // mul v10.4s, v24.4s, v1.s[2] // ................|..*.................................................................................................................. + // sqrdmulh v24.4s, v24.4s, v1.s[3] // ................|...*................................................................................................................. + // mls v10.4s, v24.4s, v8.s[0] // ................|........*............................................................................................................ + // sub v24.4s, v11.4s, v12.4s // ................|.*................................................................................................................... + // add v11.4s, v11.4s, v12.4s // ..............e.|..................................................................................................................... + // mul v12.4s, v24.4s, v2.s[0] // ................|......*.............................................................................................................. + // sqrdmulh v24.4s, v24.4s, v2.s[1] // ................|.......*............................................................................................................. + // mls v12.4s, v24.4s, v8.s[0] // ................|..............*...................................................................................................... + // sub v24.4s, v13.4s, v14.4s // ................|.........*........................................................................................................... + // add v13.4s, v13.4s, v14.4s // .........e......|................................................................................................................e.... 
+ // mul v14.4s, v24.4s, v2.s[2] // ................|...............*..................................................................................................... + // sqrdmulh v24.4s, v24.4s, v2.s[3] // ................|................*.................................................................................................... + // mls v14.4s, v24.4s, v8.s[0] // ................|....................*................................................................................................ + // sub v24.4s, v15.4s, v16.4s // ................|.................*................................................................................................... + // add v15.4s, v15.4s, v16.4s // ............e...|...................................................................................................................e. + // mul v16.4s, v24.4s, v3.s[0] // ................|.....................*............................................................................................... + // sqrdmulh v24.4s, v24.4s, v3.s[1] // ................|.......................*............................................................................................. + // mls v16.4s, v24.4s, v8.s[0] // ................|..............................*...................................................................................... + // sub v24.4s, v9.4s, v11.4s // ................|....*................................................................................................................ + // add v9.4s, v9.4s, v11.4s // ................|.....*............................................................................................................... + // mul v11.4s, v24.4s, v0.s[2] // ................|..........*.......................................................................................................... 
+ // sqrdmulh v24.4s, v24.4s, v0.s[3] // ................|...........*......................................................................................................... + // mls v11.4s, v24.4s, v8.s[0] // ................|......................*.............................................................................................. + // sub v24.4s, v10.4s, v12.4s // ................|..................*.................................................................................................. + // add v10.4s, v10.4s, v12.4s // ................|...................*................................................................................................. + // mul v12.4s, v24.4s, v0.s[2] // ................|........................*............................................................................................ + // sqrdmulh v24.4s, v24.4s, v0.s[3] // ................|.........................*........................................................................................... + // mls v12.4s, v24.4s, v8.s[0] // ................|...............................*..................................................................................... + // sub v24.4s, v13.4s, v15.4s // ................|................................*.................................................................................... + // add v13.4s, v13.4s, v15.4s // ...............e|..................................................................................................................... + // mul v15.4s, v24.4s, v1.s[0] // ................|...................................*................................................................................. + // sqrdmulh v24.4s, v24.4s, v1.s[1] // ................|....................................*................................................................................ 
+ // mls v15.4s, v24.4s, v8.s[0] // ................|..........................................*.......................................................................... + // sub v24.4s, v14.4s, v16.4s // ................|..................................*.................................................................................. + // add v14.4s, v14.4s, v16.4s // ................|.....................................*............................................................................... + // mul v16.4s, v24.4s, v1.s[0] // ................|......................................*.............................................................................. + // sqrdmulh v24.4s, v24.4s, v1.s[1] // ................|.......................................*............................................................................. + // mls v16.4s, v24.4s, v8.s[0] // ................|...........................................*......................................................................... + // sub v24.4s, v9.4s, v13.4s // ................|............*........................................................................................................ + // add v9.4s, v9.4s, v13.4s // ................|.............*....................................................................................................... + // mul v13.4s, v24.4s, v0.s[0] // ................|..........................*.......................................................................................... + // sqrdmulh v24.4s, v24.4s, v0.s[1] // ................|...........................*......................................................................................... + // mls v13.4s, v24.4s, v8.s[0] // ................|.................................*................................................................................... 
+ // sub v24.4s, v10.4s, v14.4s // ................|........................................*............................................................................ + // add v10.4s, v10.4s, v14.4s // ................|.........................................*........................................................................... + // mul v14.4s, v24.4s, v0.s[0] // ................|............................................*........................................................................ + // sqrdmulh v24.4s, v24.4s, v0.s[1] // ................|.............................................*....................................................................... + // mls v14.4s, v24.4s, v8.s[0] // ................|.................................................*................................................................... + // sub v24.4s, v11.4s, v15.4s // ................|..............................................*...................................................................... + // add v11.4s, v11.4s, v15.4s // ................|...............................................*..................................................................... + // mul v15.4s, v24.4s, v0.s[0] // ................|..................................................*.................................................................. + // sqrdmulh v24.4s, v24.4s, v0.s[1] // ................|...................................................*................................................................. + // mls v15.4s, v24.4s, v8.s[0] // ................|.......................................................*............................................................. + // sub v24.4s, v12.4s, v16.4s // ................|................................................*.................................................................... 
+ // add v12.4s, v12.4s, v16.4s // ................|....................................................*................................................................ + // mul v16.4s, v24.4s, v0.s[0] // ................|.....................................................*............................................................... + // sqrdmulh v24.4s, v24.4s, v0.s[1] // ................|......................................................*.............................................................. + // mls v16.4s, v24.4s, v8.s[0] // ................|...........................................................*......................................................... + // cmge v27.4s, v31.4s, v13.4s // ................|........................................................*............................................................ + // cmge v28.4s, v13.4s, v30.4s // ................|.........................................................*........................................................... + // sub v28.4s, v27.4s, v28.4s // ................|............................................................*........................................................ + // mls v13.4s, v28.4s, v8.4s // ................|...............................................................*..................................................... + // cmge v27.4s, v31.4s, v14.4s // ................|.............................................................*....................................................... + // cmge v28.4s, v14.4s, v30.4s // ................|..............................................................*...................................................... + // sub v28.4s, v27.4s, v28.4s // ................|................................................................*.................................................... 
+ // mls v14.4s, v28.4s, v8.4s // ................|...................................................................*................................................. + // cmge v27.4s, v31.4s, v15.4s // ................|.................................................................*................................................... + // cmge v28.4s, v15.4s, v30.4s // ................|..................................................................*.................................................. + // sub v28.4s, v27.4s, v28.4s // ................|....................................................................*................................................ + // mls v15.4s, v28.4s, v8.4s // ................|.......................................................................*............................................. + // cmge v27.4s, v31.4s, v16.4s // ................|.....................................................................*............................................... + // cmge v28.4s, v16.4s, v30.4s // ................|......................................................................*.............................................. + // sub v28.4s, v27.4s, v28.4s // ................|........................................................................*............................................ + // mls v16.4s, v28.4s, v8.4s // ................|...........................................................................*......................................... + // str q13, [x0, #(4*(1024/8))] // ................|.........................................................................*........................................... + // str q14, [x0, #(5*(1024/8))] // ................|............................................................................*........................................ 
+ // str q15, [x0, #(6*(1024/8))] // ................|..............................................................................*...................................... + // str q16, [x0, #(7*(1024/8))] // ................|................................................................................*.................................... + // mul v13.4s, v9.4s, v25.4s // ................|............................*........................................................................................ + // sqrdmulh v9.4s, v9.4s, v26.4s // ................|.............................*....................................................................................... + // mls v13.4s, v9.4s, v8.s[0] // ................|..........................................................*.......................................................... + // mul v14.4s, v10.4s, v25.4s // ................|..........................................................................*.......................................... + // sqrdmulh v10.4s, v10.4s, v26.4s // ................|.............................................................................*....................................... + // mls v14.4s, v10.4s, v8.s[0] // ................|.................................................................................*................................... + // mul v15.4s, v11.4s, v25.4s // ................|...............................................................................*..................................... + // sqrdmulh v11.4s, v11.4s, v26.4s // ................|..................................................................................*.................................. + // mls v15.4s, v11.4s, v8.s[0] // ................|......................................................................................*.............................. 
+ // mul v16.4s, v12.4s, v25.4s // ................|...................................................................................*................................. + // sqrdmulh v12.4s, v12.4s, v26.4s // ................|....................................................................................*................................ + // mls v16.4s, v12.4s, v8.s[0] // ................|........................................................................................*............................ + // cmge v27.4s, v31.4s, v13.4s // ................|.....................................................................................*............................... + // cmge v28.4s, v13.4s, v30.4s // ................|.......................................................................................*............................. + // sub v28.4s, v27.4s, v28.4s // ................|.........................................................................................*........................... + // mls v13.4s, v28.4s, v8.4s // ................|............................................................................................*........................ + // cmge v27.4s, v31.4s, v14.4s // ................|..........................................................................................*.......................... + // cmge v28.4s, v14.4s, v30.4s // ................|...........................................................................................*......................... + // sub v28.4s, v27.4s, v28.4s // ................|.............................................................................................*....................... + // mls v14.4s, v28.4s, v8.4s // ................|................................................................................................*.................... 
+ // cmge v27.4s, v31.4s, v15.4s // ................|..............................................................................................*...................... + // cmge v28.4s, v15.4s, v30.4s // ................|...............................................................................................*..................... + // sub v28.4s, v27.4s, v28.4s // ................|.................................................................................................*................... + // mls v15.4s, v28.4s, v8.4s // ................|....................................................................................................*................ + // cmge v27.4s, v31.4s, v16.4s // ................|..................................................................................................*.................. + // cmge v28.4s, v16.4s, v30.4s // ................|...................................................................................................*................. + // sub v28.4s, v27.4s, v28.4s // ................|.....................................................................................................*............... + // mls v16.4s, v28.4s, v8.4s // .*..............|........................................................................................................*............ + // str q13, [x0], #(16) // ................|......................................................................................................*.............. + // str q14, [x0, #(-16 + 1*(1024/8))] // ..*.............|.........................................................................................................*........... + // str q15, [x0, #(-16 + 2*(1024/8))] // ....*...........|...........................................................................................................*......... 
+ // str q16, [x0, #(-16 + 3*(1024/8))] // .............*..|....................................................................................................................* sub count, count, #1 cbnz count, layer123_start - sub v27.4S, v4.4S, v11.4S // ..........*......................................................................... - // gap // .................................................................................... - sub v22.4S, v17.4S, v23.4S // *................................................................................... - // gap // .................................................................................... - sub v16.4S, v13.4S, v7.4S // ..................*................................................................. - // gap // .................................................................................... - mul v7.4S, v27.4S, v2.S[2] // ................*................................................................... - // gap // .................................................................................... - mul v29.4S, v22.4S, v1.S[2] // ...*................................................................................ - // gap // .................................................................................... - sqrdmulh v13.4S, v22.4S, v1.S[3] // ....*............................................................................... - // gap // .................................................................................... - sqrdmulh v5.4S, v16.4S, v3.S[1] // ........................*........................................................... - // gap // .................................................................................... - sub v18.4S, v21.4S, v20.4S // ..*................................................................................. - // gap // .................................................................................... 
- sqrdmulh v4.4S, v27.4S, v2.S[3] // .................*.................................................................. - // gap // .................................................................................... - mul v14.4S, v16.4S, v3.S[0] // ......................*............................................................. - // gap // .................................................................................... - mul v11.4S, v18.4S, v2.S[0] // .......*............................................................................ - // gap // .................................................................................... - sqrdmulh v16.4S, v18.4S, v2.S[1] // ........*........................................................................... - // gap // .................................................................................... - mls v7.4S, v4.4S, v8.S[0] // .....................*.............................................................. - // gap // .................................................................................... - mls v14.4S, v5.4S, v8.S[0] // ...............................*.................................................... - // gap // .................................................................................... - mls v29.4S, v13.4S, v8.S[0] // .........*.......................................................................... - // gap // .................................................................................... - mls v11.4S, v16.4S, v8.S[0] // ...............*.................................................................... - // gap // .................................................................................... - sub v16.4S, v19.4S, v6.4S // .................................*.................................................. - // gap // .................................................................................... 
- sub v9.4S, v7.4S, v14.4S // ...................................*................................................ - // gap // .................................................................................... - add v27.4S, v7.4S, v14.4S // ......................................*............................................. - // gap // .................................................................................... - add v5.4S, v29.4S, v11.4S // ....................*............................................................... - // gap // .................................................................................... - sqrdmulh v20.4S, v9.4S, v1.S[1] // ........................................*........................................... - // gap // .................................................................................... - mul v6.4S, v9.4S, v1.S[0] // .......................................*............................................ - // gap // .................................................................................... - sub v21.4S, v5.4S, v27.4S // .........................................*.......................................... - // gap // .................................................................................... - add v22.4S, v5.4S, v27.4S // ..........................................*......................................... - // gap // .................................................................................... - sub v27.4S, v29.4S, v11.4S // ...................*................................................................ - // gap // .................................................................................... - sqrdmulh v7.4S, v21.4S, v0.S[1] // ..............................................*..................................... - // gap // .................................................................................... 
- mul v4.4S, v21.4S, v0.S[0] // .............................................*...................................... - // gap // .................................................................................... - mls v6.4S, v20.4S, v8.S[0] // ............................................*....................................... - // gap // .................................................................................... - sqrdmulh v11.4S, v27.4S, v0.S[3] // ..........................*......................................................... - // gap // .................................................................................... - sqrdmulh v28.4S, v22.4S, v26.4S // .......................................................................*............ - // gap // .................................................................................... - mls v4.4S, v7.4S, v8.S[0] // ..................................................*................................. - // gap // .................................................................................... - mul v13.4S, v27.4S, v0.S[2] // .........................*.......................................................... - // gap // .................................................................................... - sqrdmulh v7.4S, v16.4S, v1.S[1] // .....................................*.............................................. - // gap // .................................................................................... - add v27.4S, v17.4S, v23.4S // .*.................................................................................. - // gap // .................................................................................... - srshr v15.4S, v4.4S, #23 // ..........................................................*......................... - // gap // .................................................................................... 
- mls v13.4S, v11.4S, v8.S[0] // ................................*................................................... - // gap // .................................................................................... - mul v11.4S, v22.4S, v25.4S // .....................................................................*.............. - // gap // .................................................................................... - mls v4.4S, v15.4S, v8.4S // ..............................................................*..................... - // gap // .................................................................................... - sub v29.4S, v27.4S, v30.4S // .....*.............................................................................. - // gap // .................................................................................... - sub v20.4S, v13.4S, v6.4S // .................................................*.................................. - // gap // .................................................................................... - mls v11.4S, v28.4S, v8.S[0] // ...........................................................................*........ - // gap // .................................................................................... - str q4, [x0, #640] // ....................................................................*............... - // gap // .................................................................................... - sqrdmulh v14.4S, v20.4S, v0.S[1] // .......................................................*............................ - // gap // .................................................................................... - mul v21.4S, v20.4S, v0.S[0] // ......................................................*............................. - // gap // .................................................................................... 
- str q11, [x0, #128] // .................................................................................*.. - // gap // .................................................................................... - sqrdmulh v17.4S, v29.4S, v0.S[3] // ............*....................................................................... - // gap // .................................................................................... - mul v18.4S, v16.4S, v1.S[0] // ....................................*............................................... - // gap // .................................................................................... - mls v21.4S, v14.4S, v8.S[0] // ............................................................*....................... - // gap // .................................................................................... - mul v29.4S, v29.4S, v0.S[2] // ...........*........................................................................ - // gap // .................................................................................... - add v23.4S, v13.4S, v6.4S // .....................................................*.............................. - // gap // .................................................................................... - mls v18.4S, v7.4S, v8.S[0] // ...........................................*........................................ - // gap // .................................................................................... - srshr v28.4S, v21.4S, #23 // ................................................................*................... - // gap // .................................................................................... - mls v29.4S, v17.4S, v8.S[0] // .......................*............................................................ - // gap // .................................................................................... 
- sqrdmulh v12.4S, v23.4S, v26.4S // .............................................................................*...... - // gap // .................................................................................... - mls v21.4S, v28.4S, v8.4S // ...................................................................*................ - // gap // .................................................................................... - add v15.4S, v27.4S, v30.4S // ......*............................................................................. - // gap // .................................................................................... - add v17.4S, v29.4S, v18.4S // ................................................*................................... - // gap // .................................................................................... - sub v20.4S, v29.4S, v18.4S // ...............................................*.................................... - // gap // .................................................................................... - str q21, [x0, #896] // ........................................................................*........... - // gap // .................................................................................... - sqrdmulh v16.4S, v17.4S, v26.4S // ..........................................................................*......... - // gap // .................................................................................... - mul v17.4S, v17.4S, v25.4S // .........................................................................*.......... - // gap // .................................................................................... - sqrdmulh v6.4S, v20.4S, v0.S[1] // ....................................................*............................... - // gap // .................................................................................... 
- mul v30.4S, v20.4S, v0.S[0] // ...................................................*................................ - // gap // .................................................................................... - sub v20.4S, v15.4S, v10.4S // .............*...................................................................... - // gap // .................................................................................... - mls v17.4S, v16.4S, v8.S[0] // ..............................................................................*..... - // gap // .................................................................................... - add v18.4S, v15.4S, v10.4S // ..............*..................................................................... - // gap // .................................................................................... - mls v30.4S, v6.4S, v8.S[0] // ........................................................*........................... - // gap // .................................................................................... - mul v21.4S, v20.4S, v0.S[0] // ...........................*........................................................ - // gap // .................................................................................... - str q17, [x0, #256] // ..................................................................................*. - // gap // .................................................................................... - sqrdmulh v24.4S, v20.4S, v0.S[1] // ............................*....................................................... - // gap // .................................................................................... - srshr v19.4S, v30.4S, #23 // ...............................................................*.................... - // gap // .................................................................................... 
- sqrdmulh v20.4S, v18.4S, v26.4S // ..............................*..................................................... - // gap // .................................................................................... - mul v5.4S, v18.4S, v25.4S // .............................*...................................................... - // gap // .................................................................................... - mls v30.4S, v19.4S, v8.4S // ..................................................................*................. - // gap // .................................................................................... - mls v21.4S, v24.4S, v8.S[0] // ..................................*................................................. - // gap // .................................................................................... - mul v15.4S, v23.4S, v25.4S // ............................................................................*....... - // gap // .................................................................................... - mls v5.4S, v20.4S, v8.S[0] // ...........................................................*........................ - // gap // .................................................................................... - str q30, [x0, #768] // ......................................................................*............. - // gap // .................................................................................... - srshr v16.4S, v21.4S, #23 // .........................................................*.......................... - // gap // .................................................................................... - mls v15.4S, v12.4S, v8.S[0] // ................................................................................*... - // gap // .................................................................................... 
- str q5, [x0], #(16) // ...............................................................................*.... - // gap // .................................................................................... - mls v21.4S, v16.4S, v8.4S // .............................................................*...................... - // gap // .................................................................................... - // gap // .................................................................................... - // gap // .................................................................................... - str q15, [x0, #368] // ...................................................................................* - // gap // .................................................................................... - // gap // .................................................................................... - // gap // .................................................................................... - str q21, [x0, #496] // .................................................................*.................. - // gap // .................................................................................... + sub v20.4S, v11.4S, v20.4S // ..*......................................................................................................... + // gap // ............................................................................................................ + sub v5.4S, v18.4S, v14.4S // *........................................................................................................... + // gap // ............................................................................................................ + sub v11.4S, v28.4S, v29.4S // ..........*................................................................................................. 
+ // gap // ............................................................................................................ + sqrdmulh v29.4S, v20.4S, v2.S[1] // ........*................................................................................................... + // gap // ............................................................................................................ + mul v17.4S, v20.4S, v2.S[0] // .......*.................................................................................................... + // gap // ............................................................................................................ + sub v9.4S, v9.4S, v4.4S // ..................*......................................................................................... + // gap // ............................................................................................................ + sqrdmulh v24.4S, v5.4S, v1.S[3] // ....*....................................................................................................... + // gap // ............................................................................................................ + sqrdmulh v21.4S, v11.4S, v2.S[3] // .................*.......................................................................................... + // gap // ............................................................................................................ + mul v19.4S, v9.4S, v3.S[0] // ......................*..................................................................................... + // gap // ............................................................................................................ + mul v27.4S, v5.4S, v1.S[2] // ...*........................................................................................................ + // gap // ............................................................................................................ 
+ mul v28.4S, v11.4S, v2.S[2] // ................*........................................................................................... + // gap // ............................................................................................................ + sqrdmulh v6.4S, v9.4S, v3.S[1] // ........................*................................................................................... + // gap // ............................................................................................................ + mls v17.4S, v29.4S, v8.S[0] // ...............*............................................................................................ + // gap // ............................................................................................................ + mls v27.4S, v24.4S, v8.S[0] // .........*.................................................................................................. + // gap // ............................................................................................................ + mls v28.4S, v21.4S, v8.S[0] // .....................*...................................................................................... + // gap // ............................................................................................................ + mls v19.4S, v6.4S, v8.S[0] // ...............................*............................................................................ + // gap // ............................................................................................................ + add v16.4S, v18.4S, v14.4S // .*.......................................................................................................... + // gap // ............................................................................................................ + sub v18.4S, v27.4S, v17.4S // ...................*........................................................................................ 
+ // gap // ............................................................................................................ + add v7.4S, v27.4S, v17.4S // ....................*....................................................................................... + // gap // ............................................................................................................ + sub v17.4S, v28.4S, v19.4S // ...................................*........................................................................ + // gap // ............................................................................................................ + sqrdmulh v14.4S, v18.4S, v0.S[3] // ..........................*................................................................................. + // gap // ............................................................................................................ + mul v4.4S, v18.4S, v0.S[2] // .........................*.................................................................................. + // gap // ............................................................................................................ + sqrdmulh v18.4S, v17.4S, v1.S[1] // ........................................*................................................................... + // gap // ............................................................................................................ + mul v11.4S, v17.4S, v1.S[0] // .......................................*.................................................................... + // gap // ............................................................................................................ + add v27.4S, v28.4S, v19.4S // ......................................*..................................................................... + // gap // ............................................................................................................ 
+ sub v28.4S, v16.4S, v15.4S // .....*...................................................................................................... + // gap // ............................................................................................................ + mls v4.4S, v14.4S, v8.S[0] // ................................*........................................................................... + // gap // ............................................................................................................ + mls v11.4S, v18.4S, v8.S[0] // ............................................*............................................................... + // gap // ............................................................................................................ + mul v20.4S, v28.4S, v0.S[2] // ...........*................................................................................................ + // gap // ............................................................................................................ + add v19.4S, v7.4S, v27.4S // ..........................................*................................................................. + // gap // ............................................................................................................ + sqrdmulh v21.4S, v28.4S, v0.S[3] // ............*............................................................................................... + // gap // ............................................................................................................ + add v17.4S, v4.4S, v11.4S // .....................................................*...................................................... + // gap // ............................................................................................................ + mul v28.4S, v19.4S, v25.4S // ...........................................................................*................................ 
+ // gap // ............................................................................................................ + sub v14.4S, v13.4S, v22.4S // .................................*.......................................................................... + // gap // ............................................................................................................ + mul v9.4S, v17.4S, v25.4S // ....................................................................................*....................... + // gap // ............................................................................................................ + sqrdmulh v17.4S, v17.4S, v26.4S // .....................................................................................*...................... + // gap // ............................................................................................................ + mls v20.4S, v21.4S, v8.S[0] // .......................*.................................................................................... + // gap // ............................................................................................................ + sqrdmulh v18.4S, v14.4S, v1.S[1] // .....................................*...................................................................... + // gap // ............................................................................................................ + mul v13.4S, v14.4S, v1.S[0] // ....................................*....................................................................... + // gap // ............................................................................................................ + mls v9.4S, v17.4S, v8.S[0] // .........................................................................................*.................. + // gap // ............................................................................................................ 
+ add v21.4S, v16.4S, v15.4S // ......*..................................................................................................... + // gap // ............................................................................................................ + sqrdmulh v10.4S, v19.4S, v26.4S // ..............................................................................*............................. + // gap // ............................................................................................................ + mls v13.4S, v18.4S, v8.S[0] // ...........................................*................................................................ + // gap // ............................................................................................................ + cmge v17.4S, v9.4S, v30.4S // ....................................................................................................*....... + // gap // ............................................................................................................ + cmge v19.4S, v31.4S, v9.4S // ...................................................................................................*........ + // gap // ............................................................................................................ + add v18.4S, v21.4S, v23.4S // ..............*............................................................................................. + // gap // ............................................................................................................ + sub v12.4S, v19.4S, v17.4S // ......................................................................................................*..... + // gap // ............................................................................................................ + add v17.4S, v20.4S, v13.4S // ................................................*........................................................... 
+ // gap // ............................................................................................................ + sqrdmulh v16.4S, v18.4S, v26.4S // ..............................*............................................................................. + // gap // ............................................................................................................ + mls v9.4S, v12.4S, v8.4S // ........................................................................................................*... + // gap // ............................................................................................................ + mul v6.4S, v17.4S, v25.4S // ................................................................................*........................... + // gap // ............................................................................................................ + sqrdmulh v14.4S, v17.4S, v26.4S // ...................................................................................*........................ + // gap // ............................................................................................................ + mls v28.4S, v10.4S, v8.S[0] // ..................................................................................*......................... + // gap // ............................................................................................................ + mul v15.4S, v18.4S, v25.4S // .............................*.............................................................................. + // gap // ............................................................................................................ + sub v4.4S, v4.4S, v11.4S // .................................................*.......................................................... + // gap // ............................................................................................................ 
+ mls v6.4S, v14.4S, v8.S[0] // .......................................................................................*.................... + // gap // ............................................................................................................ + cmge v11.4S, v31.4S, v28.4S // ...........................................................................................*................ + // gap // ............................................................................................................ + cmge v17.4S, v28.4S, v30.4S // ............................................................................................*............... + // gap // ............................................................................................................ + mls v15.4S, v16.4S, v8.S[0] // ...........................................................*................................................ + // gap // ............................................................................................................ + sub v18.4S, v11.4S, v17.4S // ..............................................................................................*............. + // gap // ............................................................................................................ + cmge v17.4S, v6.4S, v30.4S // ................................................................................................*........... + // gap // ............................................................................................................ + cmge v19.4S, v31.4S, v6.4S // ...............................................................................................*............ + // gap // ............................................................................................................ + mls v28.4S, v18.4S, v8.4S // .................................................................................................*.......... 
+ // gap // ............................................................................................................ + cmge v18.4S, v31.4S, v15.4S // ......................................................................................*..................... + // gap // ............................................................................................................ + cmge v29.4S, v15.4S, v30.4S // ........................................................................................*................... + // gap // ............................................................................................................ + sub v14.4S, v20.4S, v13.4S // ...............................................*............................................................ + // gap // ............................................................................................................ + sub v29.4S, v18.4S, v29.4S // ..........................................................................................*................. + // gap // ............................................................................................................ + sqrdmulh v18.4S, v4.4S, v0.S[1] // .......................................................*.................................................... + // gap // ............................................................................................................ + mul v11.4S, v4.4S, v0.S[0] // ......................................................*..................................................... + // gap // ............................................................................................................ + mls v15.4S, v29.4S, v8.4S // .............................................................................................*.............. + // gap // ............................................................................................................ 
+ sub v4.4S, v7.4S, v27.4S // .........................................*.................................................................. + // gap // ............................................................................................................ + sqrdmulh v29.4S, v14.4S, v0.S[1] // ....................................................*....................................................... + // gap // ............................................................................................................ + mls v11.4S, v18.4S, v8.S[0] // ............................................................*............................................... + // gap // ............................................................................................................ + sqrdmulh v22.4S, v4.4S, v0.S[1] // ..............................................*............................................................. + // gap // ............................................................................................................ + mul v20.4S, v4.4S, v0.S[0] // .............................................*.............................................................. + // gap // ............................................................................................................ + mul v4.4S, v14.4S, v0.S[0] // ...................................................*........................................................ + // gap // ............................................................................................................ + cmge v18.4S, v31.4S, v11.4S // ......................................................................*..................................... + // gap // ............................................................................................................ + cmge v14.4S, v11.4S, v30.4S // .......................................................................*.................................... 
+ // gap // ............................................................................................................ + mls v20.4S, v22.4S, v8.S[0] // ..................................................*......................................................... + // gap // ............................................................................................................ + sub v24.4S, v18.4S, v14.4S // .........................................................................*.................................. + // gap // ............................................................................................................ + mls v4.4S, v29.4S, v8.S[0] // ........................................................*................................................... + // gap // ............................................................................................................ + sub v13.4S, v21.4S, v23.4S // .............*.............................................................................................. + // gap // ............................................................................................................ + mls v11.4S, v24.4S, v8.4S // ............................................................................*............................... + // gap // ............................................................................................................ + cmge v14.4S, v31.4S, v20.4S // ..............................................................*............................................. + // gap // ............................................................................................................ + cmge v22.4S, v31.4S, v4.4S // ..................................................................*......................................... + // gap // ............................................................................................................ 
+ cmge v18.4S, v20.4S, v30.4S // ...............................................................*............................................ + // gap // ............................................................................................................ + str q11, [x0, #896] // .................................................................................*.......................... + // gap // ............................................................................................................ + sub v11.4S, v14.4S, v18.4S // .................................................................*.......................................... + // gap // ............................................................................................................ + cmge v18.4S, v4.4S, v30.4S // ...................................................................*........................................ + // gap // ............................................................................................................ + str q15, [x0], #(16) // .......................................................................................................*.... + // gap // ............................................................................................................ + mul v14.4S, v13.4S, v0.S[0] // ...........................*................................................................................ + // gap // ............................................................................................................ + sqrdmulh v27.4S, v13.4S, v0.S[1] // ............................*............................................................................... + // gap // ............................................................................................................ + sub v18.4S, v22.4S, v18.4S // .....................................................................*...................................... 
+ // gap // ............................................................................................................ + mls v20.4S, v11.4S, v8.4S // ....................................................................*....................................... + // gap // ............................................................................................................ + str q28, [x0, #112] // .........................................................................................................*.. + // gap // ............................................................................................................ + mls v14.4S, v27.4S, v8.S[0] // ..................................*......................................................................... + // gap // ............................................................................................................ + mls v4.4S, v18.4S, v8.4S // ........................................................................*................................... + // gap // ............................................................................................................ + str q20, [x0, #624] // .............................................................................*.............................. + // gap // ............................................................................................................ + sub v17.4S, v19.4S, v17.4S // ..................................................................................................*......... + // gap // ............................................................................................................ + cmge v19.4S, v14.4S, v30.4S // ..........................................................*................................................. + // gap // ............................................................................................................ 
+ cmge v18.4S, v31.4S, v14.4S // .........................................................*.................................................. + // gap // ............................................................................................................ + str q9, [x0, #368] // ...........................................................................................................* + // gap // ............................................................................................................ + sub v19.4S, v18.4S, v19.4S // .............................................................*.............................................. + // gap // ............................................................................................................ + mls v6.4S, v17.4S, v8.4S // .....................................................................................................*...... + // gap // ............................................................................................................ + str q4, [x0, #752] // ...............................................................................*............................ + // gap // ............................................................................................................ + mls v14.4S, v19.4S, v8.4S // ................................................................*........................................... + // gap // ............................................................................................................ + // gap // ............................................................................................................ + // gap // ............................................................................................................ + str q6, [x0, #240] // ..........................................................................................................*. 
+ // gap // ............................................................................................................ + // gap // ............................................................................................................ + // gap // ............................................................................................................ + str q14, [x0, #496] // ..........................................................................*................................. + // gap // ............................................................................................................ // original source code - // sub v16.4S, v17.4S, v23.4S // .*.................................................................................. - // add v23.4S, v17.4S, v23.4S // .................................*.................................................. - // sub v21.4S, v21.4S, v20.4S // .......*............................................................................ - // mul v20.4S, v16.4S, v1.S[2] // ....*............................................................................... - // sqrdmulh v16.4S, v16.4S, v1.S[3] // .....*.............................................................................. - // sub v17.4S, v23.4S, v30.4S // ......................................*............................................. - // add v23.4S, v23.4S, v30.4S // .......................................................*............................ - // mul v30.4S, v21.4S, v2.S[0] // ..........*......................................................................... - // sqrdmulh v21.4S, v21.4S, v2.S[1] // ...........*........................................................................ - // mls v20.4S, v16.4S, v8.S[0] // ..............*..................................................................... - // sub v16.4S, v4.4S, v11.4S // *................................................................................... 
- // mul v4.4S, v17.4S, v0.S[2] // ................................................*................................... - // sqrdmulh v17.4S, v17.4S, v0.S[3] // .............................................*...................................... - // sub v11.4S, v23.4S, v10.4S // ...............................................................*.................... - // add v23.4S, v23.4S, v10.4S // .................................................................*.................. - // mls v30.4S, v21.4S, v8.S[0] // ...............*.................................................................... - // mul v21.4S, v16.4S, v2.S[2] // ...*................................................................................ - // sqrdmulh v16.4S, v16.4S, v2.S[3] // ........*........................................................................... - // sub v13.4S, v13.4S, v7.4S // ..*................................................................................. - // sub v7.4S, v20.4S, v30.4S // ........................*........................................................... - // add v20.4S, v20.4S, v30.4S // ...................*................................................................ - // mls v21.4S, v16.4S, v8.S[0] // ............*....................................................................... - // mul v16.4S, v13.4S, v3.S[0] // .........*.......................................................................... - // mls v4.4S, v17.4S, v8.S[0] // ....................................................*............................... - // sqrdmulh v17.4S, v13.4S, v3.S[1] // ......*............................................................................. - // mul v30.4S, v7.4S, v0.S[2] // ...............................*.................................................... - // sqrdmulh v13.4S, v7.4S, v0.S[3] // ............................*....................................................... 
- // mul v7.4S, v11.4S, v0.S[0] // ...................................................................*................ - // sqrdmulh v11.4S, v11.4S, v0.S[1] // .....................................................................*.............. - // mul v10.4S, v23.4S, v25.4S // ........................................................................*........... - // sqrdmulh v23.4S, v23.4S, v26.4S // .......................................................................*............ - // mls v16.4S, v17.4S, v8.S[0] // .............*...................................................................... - // mls v30.4S, v13.4S, v8.S[0] // ...................................*................................................ - // sub v17.4S, v19.4S, v6.4S // ................*................................................................... - // mls v7.4S, v11.4S, v8.S[0] // ..........................................................................*......... - // sub v11.4S, v21.4S, v16.4S // .................*.................................................................. - // mul v13.4S, v17.4S, v1.S[0] // ..............................................*..................................... - // sqrdmulh v17.4S, v17.4S, v1.S[1] // ................................*................................................... - // add v16.4S, v21.4S, v16.4S // ..................*................................................................. - // mul v21.4S, v11.4S, v1.S[0] // .....................*.............................................................. - // sqrdmulh v11.4S, v11.4S, v1.S[1] // ....................*............................................................... - // sub v19.4S, v20.4S, v16.4S // ......................*............................................................. - // add v16.4S, v20.4S, v16.4S // .......................*............................................................ 
- // mls v13.4S, v17.4S, v8.S[0] // ..................................................*................................. - // mls v21.4S, v11.4S, v8.S[0] // ...........................*........................................................ - // mul v20.4S, v19.4S, v0.S[0] // ..........................*......................................................... - // sqrdmulh v17.4S, v19.4S, v0.S[1] // .........................*.......................................................... - // sub v11.4S, v4.4S, v13.4S // .........................................................*.......................... - // add v4.4S, v4.4S, v13.4S // ........................................................*........................... - // sub v13.4S, v30.4S, v21.4S // .......................................*............................................ - // mls v20.4S, v17.4S, v8.S[0] // ..............................*..................................................... - // mul v17.4S, v11.4S, v0.S[0] // ..............................................................*..................... - // sqrdmulh v11.4S, v11.4S, v0.S[1] // .............................................................*...................... - // add v21.4S, v30.4S, v21.4S // .................................................*.................................. - // mul v30.4S, v13.4S, v0.S[0] // ...........................................*........................................ - // sqrdmulh v13.4S, v13.4S, v0.S[1] // ..........................................*......................................... - // mls v17.4S, v11.4S, v8.S[0] // ..................................................................*................. - // srshr v11.4S, v7.4S, #23 // ..............................................................................*..... - // srshr v19.4S, v20.4S, #23 // ..................................*................................................. 
- // mls v10.4S, v23.4S, v8.S[0] // ............................................................................*....... - // mls v30.4S, v13.4S, v8.S[0] // ...............................................*.................................... - // mls v7.4S, v11.4S, v8.4S // .................................................................................*.. - // mls v20.4S, v19.4S, v8.4S // .....................................*.............................................. - // srshr v23.4S, v17.4S, #23 // ......................................................................*............. - // srshr v11.4S, v30.4S, #23 // ...................................................*................................ - // str q7, [x0, #512] // ...................................................................................* - // mls v17.4S, v23.4S, v8.4S // .........................................................................*.......... - // mls v30.4S, v11.4S, v8.4S // ......................................................*............................. - // str q20, [x0, #640] // .........................................*.......................................... - // mul v23.4S, v16.4S, v25.4S // ....................................*............................................... - // str q17, [x0, #768] // .............................................................................*...... - // sqrdmulh v16.4S, v16.4S, v26.4S // .............................*...................................................... - // str q30, [x0, #896] // ..........................................................*......................... - // mul v20.4S, v4.4S, v25.4S // ............................................................*....................... - // sqrdmulh v17.4S, v4.4S, v26.4S // ...........................................................*........................ 
- // mls v23.4S, v16.4S, v8.S[0] // ........................................*........................................... - // mul v16.4S, v21.4S, v25.4S // ...........................................................................*........ - // sqrdmulh v21.4S, v21.4S, v26.4S // .....................................................*.............................. - // mls v20.4S, v17.4S, v8.S[0] // ................................................................*................... - // str q10, [x0], #(16) // ................................................................................*... - // mls v16.4S, v21.4S, v8.S[0] // ...............................................................................*.... - // str q23, [x0, #112] // ............................................*....................................... - // str q20, [x0, #240] // ....................................................................*............... - // str q16, [x0, #368] // ..................................................................................*. + // sub v17.4S, v18.4S, v14.4S // .*.......................................................................................................... + // add v19.4S, v18.4S, v14.4S // ................*........................................................................................... + // sub v18.4S, v11.4S, v20.4S // *........................................................................................................... + // mul v6.4S, v17.4S, v1.S[2] // .........*.................................................................................................. + // sqrdmulh v17.4S, v17.4S, v1.S[3] // ......*..................................................................................................... + // sub v14.4S, v19.4S, v15.4S // .........................*.................................................................................. 
+ // add v19.4S, v19.4S, v15.4S // ........................................*................................................................... + // mul v11.4S, v18.4S, v2.S[0] // ....*....................................................................................................... + // sqrdmulh v18.4S, v18.4S, v2.S[1] // ...*........................................................................................................ + // mls v6.4S, v17.4S, v8.S[0] // .............*.............................................................................................. + // sub v17.4S, v28.4S, v29.4S // ..*......................................................................................................... + // mul v28.4S, v14.4S, v0.S[2] // ............................*............................................................................... + // sqrdmulh v14.4S, v14.4S, v0.S[3] // ..............................*............................................................................. + // sub v29.4S, v19.4S, v23.4S // .................................................................................*.......................... + // add v19.4S, v19.4S, v23.4S // .............................................*.............................................................. + // mls v11.4S, v18.4S, v8.S[0] // ............*............................................................................................... + // mul v18.4S, v17.4S, v2.S[2] // ..........*................................................................................................. + // sqrdmulh v17.4S, v17.4S, v2.S[3] // .......*.................................................................................................... + // sub v9.4S, v9.4S, v4.4S // .....*...................................................................................................... 
+ // sub v4.4S, v6.4S, v11.4S // .................*.......................................................................................... + // add v6.4S, v6.4S, v11.4S // ..................*......................................................................................... + // mls v18.4S, v17.4S, v8.S[0] // ..............*............................................................................................. + // mul v17.4S, v9.4S, v3.S[0] // ........*................................................................................................... + // mls v28.4S, v14.4S, v8.S[0] // ....................................*....................................................................... + // sqrdmulh v14.4S, v9.4S, v3.S[1] // ...........*................................................................................................ + // mul v11.4S, v4.4S, v0.S[2] // .....................*...................................................................................... + // sqrdmulh v9.4S, v4.4S, v0.S[3] // ....................*....................................................................................... + // mul v4.4S, v29.4S, v0.S[0] // ..........................................................................................*................. + // sqrdmulh v29.4S, v29.4S, v0.S[1] // ...........................................................................................*................ + // mul v20.4S, v19.4S, v25.4S // .....................................................*...................................................... + // sqrdmulh v19.4S, v19.4S, v26.4S // ................................................*........................................................... + // mls v17.4S, v14.4S, v8.S[0] // ...............*............................................................................................ 
+ // mls v11.4S, v9.4S, v8.S[0] // ..........................*................................................................................. + // sub v14.4S, v13.4S, v22.4S // .................................*.......................................................................... + // mls v4.4S, v29.4S, v8.S[0] // ...............................................................................................*............ + // sub v29.4S, v18.4S, v17.4S // ...................*........................................................................................ + // mul v9.4S, v14.4S, v1.S[0] // ......................................*..................................................................... + // sqrdmulh v14.4S, v14.4S, v1.S[1] // .....................................*...................................................................... + // add v17.4S, v18.4S, v17.4S // ........................*................................................................................... + // mul v18.4S, v29.4S, v1.S[0] // .......................*.................................................................................... + // sqrdmulh v29.4S, v29.4S, v1.S[1] // ......................*..................................................................................... + // sub v13.4S, v6.4S, v17.4S // ......................................................................*..................................... + // add v17.4S, v6.4S, v17.4S // .............................*.............................................................................. + // mls v9.4S, v14.4S, v8.S[0] // ..........................................*................................................................. + // mls v18.4S, v29.4S, v8.S[0] // ...........................*................................................................................ 
+ // mul v6.4S, v13.4S, v0.S[0] // ..........................................................................*................................. + // sqrdmulh v14.4S, v13.4S, v0.S[1] // .........................................................................*.................................. + // sub v29.4S, v28.4S, v9.4S // .................................................................*.......................................... + // add v28.4S, v28.4S, v9.4S // ...............................................*............................................................ + // sub v9.4S, v11.4S, v18.4S // ......................................................*..................................................... + // mls v6.4S, v14.4S, v8.S[0] // ..............................................................................*............................. + // mul v14.4S, v29.4S, v0.S[0] // ...........................................................................*................................ + // sqrdmulh v29.4S, v29.4S, v0.S[1] // .......................................................................*.................................... + // add v18.4S, v11.4S, v18.4S // ...............................*............................................................................ + // mul v11.4S, v9.4S, v0.S[0] // ....................................................................*....................................... + // sqrdmulh v9.4S, v9.4S, v0.S[1] // ...................................................................*........................................ + // mls v14.4S, v29.4S, v8.S[0] // ................................................................................*........................... + // cmge v29.4S, v31.4S, v4.4S // ....................................................................................................*....... 
+ // cmge v13.4S, v4.4S, v30.4S // ...................................................................................................*........ + // mls v20.4S, v19.4S, v8.S[0] // ..........................................................*................................................. + // mls v11.4S, v9.4S, v8.S[0] // ........................................................................*................................... + // sub v19.4S, v29.4S, v13.4S // ......................................................................................................*..... + // cmge v29.4S, v31.4S, v6.4S // ...................................................................................*........................ + // cmge v9.4S, v6.4S, v30.4S // .....................................................................................*...................... + // mls v4.4S, v19.4S, v8.4S // .........................................................................................................*.. + // sub v19.4S, v29.4S, v9.4S // .......................................................................................*.................... + // cmge v29.4S, v31.4S, v14.4S // ....................................................................................*....................... + // cmge v9.4S, v14.4S, v30.4S // ........................................................................................*................... + // mls v6.4S, v19.4S, v8.4S // .............................................................................................*.............. + // sub v19.4S, v29.4S, v9.4S // ............................................................................................*............... + // cmge v29.4S, v31.4S, v11.4S // ............................................................................*............................... 
+ // cmge v9.4S, v11.4S, v30.4S // .............................................................................*.............................. + // mls v14.4S, v19.4S, v8.4S // ................................................................................................*........... + // sub v19.4S, v29.4S, v9.4S // ...............................................................................*............................ + // str q4, [x0, #512] // ...........................................................................................................* + // mul v29.4S, v17.4S, v25.4S // ................................*........................................................................... + // mls v11.4S, v19.4S, v8.4S // ..................................................................................*......................... + // str q6, [x0, #640] // .................................................................................................*.......... + // sqrdmulh v17.4S, v17.4S, v26.4S // .........................................*.................................................................. + // str q14, [x0, #768] // ........................................................................................................*... + // mul v19.4S, v28.4S, v25.4S // ..................................................*......................................................... + // str q11, [x0, #896] // ......................................................................................*..................... + // mls v29.4S, v17.4S, v8.S[0] // ....................................................*....................................................... + // sqrdmulh v17.4S, v28.4S, v26.4S // ...................................................*........................................................ + // mul v6.4S, v18.4S, v25.4S // ..................................*......................................................................... 
+ // sqrdmulh v18.4S, v18.4S, v26.4S // ...................................*........................................................................ + // cmge v14.4S, v31.4S, v20.4S // ...............................................................*............................................ + // mls v19.4S, v17.4S, v8.S[0] // .......................................................*.................................................... + // cmge v17.4S, v20.4S, v30.4S // ................................................................*........................................... + // mls v6.4S, v18.4S, v8.S[0] // .......................................*.................................................................... + // sub v17.4S, v14.4S, v17.4S // ..................................................................*......................................... + // cmge v18.4S, v31.4S, v29.4S // ........................................................*................................................... + // cmge v14.4S, v29.4S, v30.4S // .........................................................*.................................................. + // mls v20.4S, v17.4S, v8.4S // .....................................................................*...................................... + // sub v17.4S, v18.4S, v14.4S // ...........................................................*................................................ + // cmge v18.4S, v31.4S, v19.4S // .............................................................*.............................................. + // cmge v14.4S, v19.4S, v30.4S // ............................................................*............................................... + // mls v29.4S, v17.4S, v8.4S // ..............................................................*............................................. 
+ // sub v17.4S, v18.4S, v14.4S // ..................................................................................................*......... + // cmge v18.4S, v31.4S, v6.4S // ............................................*............................................................... + // cmge v14.4S, v6.4S, v30.4S // ...........................................*................................................................ + // mls v19.4S, v17.4S, v8.4S // .......................................................................................................*.... + // sub v17.4S, v18.4S, v14.4S // ..............................................*............................................................. + // str q20, [x0], #(16) // .........................................................................................*.................. + // mls v6.4S, v17.4S, v8.4S // .................................................*.......................................................... + // str q29, [x0, #112] // ..............................................................................................*............. + // str q19, [x0, #240] // ..........................................................................................................*. + // str q6, [x0, #368] // .....................................................................................................*...... 
pop_stack diff --git a/tests/ntt_dilithium/manual/intt_dilithium_123_45678_opt_a72.s b/tests/ntt_dilithium/manual/intt_dilithium_123_45678_opt_a72.s index 702bac0..11cceb8 100644 --- a/tests/ntt_dilithium/manual/intt_dilithium_123_45678_opt_a72.s +++ b/tests/ntt_dilithium/manual/intt_dilithium_123_45678_opt_a72.s @@ -67,7 +67,7 @@ xtmp1 .req x11 cmge \tmp1\().4s, \neg_modulus_half\().4s, \a\().4s cmge \tmp2\().4s, \a\().4s, \modulus_half\().4s sub \tmp2\().4s, \tmp1\().4s, \tmp2\().4s - vmls \a, \tmp2, modulus + vmls \a, \tmp2, consts .endm .macro gs_butterfly a, b, root, idx0, idx1 @@ -76,12 +76,6 @@ xtmp1 .req x11 mulmodq \b, tmp, \root, \idx0, \idx1 .endm -.macro mulmod_v dst, src, const, const_twisted - vmul \dst, \src, \const - vqrdmulh \src, \src, \const_twisted - vmls \dst, \src, modulus -.endm - .macro gs_butterfly_v a, b, root, root_twisted sub tmp.4s, \a\().4s, \b\().4s add \a\().4s, \a\().4s, \b\().4s @@ -393,1934 +387,2096 @@ _intt_dilithium_123_45678_opt_a72: qform_root3_tw .req q7 .p2align 2 - ldr q21, [x5, #96] // ........*............................................................................................................................ - ld4 {v26.4S, v27.4S, v28.4S, v29.4S}, [x1] // ...*................................................................................................................................. - ld4 {v22.4S, v23.4S, v24.4S, v25.4S}, [x2] // ..................................*.................................................................................................. - ldr q20, [x5, #64] // ......*.............................................................................................................................. - // gap // ..................................................................................................................................... - // gap // ..................................................................................................................................... 
- ldr q31, [x5, #48] // .....*............................................................................................................................... - // gap // ..................................................................................................................................... - // gap // ..................................................................................................................................... - ldr q5, [x5, #80] // *.................................................................................................................................... - // gap // ..................................................................................................................................... - // gap // ..................................................................................................................................... - add v14.4S, v24.4S, v25.4S // ............................................*........................................................................................ - sub v13.4S, v28.4S, v29.4S // .........*........................................................................................................................... - ldr q10, [x4, #32] // ......................*.............................................................................................................. - add v30.4S, v26.4S, v27.4S // ................*.................................................................................................................... - ldr q0, [x5, #112] // ..........*.......................................................................................................................... - // gap // ..................................................................................................................................... 
- add v3.4S, v28.4S, v29.4S // ...........*......................................................................................................................... - ldr q4, [x4, #16] // ....................*................................................................................................................ - // gap // ..................................................................................................................................... - sub v6.4S, v22.4S, v23.4S // .......................................*............................................................................................. - ldr q18, [x4], #64 // ..................*.................................................................................................................. - sqrdmulh v7.4S, v13.4S, v5.4S // ...............*..................................................................................................................... - sub v1.4S, v26.4S, v27.4S // .............*....................................................................................................................... - ldr q11, [x5, #160] // ..............*...................................................................................................................... - // gap // ..................................................................................................................................... - ldr q15, [x5, #16] // .*................................................................................................................................... - sub v9.4S, v30.4S, v3.4S // .....................*............................................................................................................... - mul v26.4S, v13.4S, v20.4S // ..........................*.......................................................................................................... 
- add v27.4S, v30.4S, v3.4S // ........................*............................................................................................................ - ldr q16, [x5, #128] // .......*............................................................................................................................. - // gap // ..................................................................................................................................... - add v3.4S, v22.4S, v23.4S // .........................................*........................................................................................... - sqrdmulh v30.4S, v1.4S, v31.4S // .......................*............................................................................................................. - ldr q28, [x5, #32] // ....*................................................................................................................................ - sub v19.4S, v24.4S, v25.4S // ..........................................*.......................................................................................... - ldr q25, [x5], #(12*16) // ..*.................................................................................................................................. - // gap // ..................................................................................................................................... - mls v26.4S, v7.4S, v8.S[0] // ...........................*......................................................................................................... - ldr q23, [x5, #-48] // ............*........................................................................................................................ - // gap // ..................................................................................................................................... 
- ldr q17, [x5, #-16] // .................*................................................................................................................... - sub v2.4S, v3.4S, v14.4S // ..............................................*...................................................................................... - // gap // ..................................................................................................................................... - add v22.4S, v3.4S, v14.4S // ...................................................*................................................................................. - mul v28.4S, v1.4S, v28.4S // ...................*................................................................................................................. - // gap // ..................................................................................................................................... - // gap // ..................................................................................................................................... - // gap // ..................................................................................................................................... - // gap // ..................................................................................................................................... - mls v28.4S, v30.4S, v8.S[0] // ............................*........................................................................................................ - // gap // ..................................................................................................................................... - // gap // ..................................................................................................................................... - // gap // ..................................................................................................................................... 
- // gap // ..................................................................................................................................... - // gap // ..................................................................................................................................... - // gap // ..................................................................................................................................... - // gap // ..................................................................................................................................... - sqrdmulh v29.4S, v9.4S, v15.4S // ...............................*..................................................................................................... - // gap // ..................................................................................................................................... - // gap // ..................................................................................................................................... - // gap // ..................................................................................................................................... - // gap // ..................................................................................................................................... - // gap // ..................................................................................................................................... - mul v14.4S, v9.4S, v25.4S // .............................*....................................................................................................... - // gap // ..................................................................................................................................... - // gap // ..................................................................................................................................... 
- sub v12.4S, v28.4S, v26.4S // ..............................*...................................................................................................... - add v5.4S, v28.4S, v26.4S // ................................*.................................................................................................... - // gap // ..................................................................................................................................... - mul v24.4S, v19.4S, v11.4S // ..................................................*.................................................................................. - // gap // ..................................................................................................................................... - // gap // ..................................................................................................................................... - // gap // ..................................................................................................................................... - // gap // ..................................................................................................................................... - // gap // ..................................................................................................................................... - sqrdmulh v9.4S, v12.4S, v15.4S // .................................*................................................................................................... - trn1 v13.4S, v27.4S, v5.4S // ...................................*................................................................................................. - // gap // ..................................................................................................................................... 
- // gap // ..................................................................................................................................... - trn2 v7.4S, v27.4S, v5.4S // .....................................*............................................................................................... - mls v14.4S, v29.4S, v8.S[0] // ......................................*.............................................................................................. - // gap // ..................................................................................................................................... - // gap // ..................................................................................................................................... - // gap // ..................................................................................................................................... - // gap // ..................................................................................................................................... - mul v1.4S, v12.4S, v25.4S // ....................................*................................................................................................ - // gap // ..................................................................................................................................... - // gap // ..................................................................................................................................... - // gap // ..................................................................................................................................... - // gap // ..................................................................................................................................... - // gap // ..................................................................................................................................... 
- mls v1.4S, v9.4S, v8.S[0] // ........................................*............................................................................................ - // gap // ..................................................................................................................................... - // gap // ..................................................................................................................................... - // gap // ..................................................................................................................................... - // gap // ..................................................................................................................................... - // gap // ..................................................................................................................................... - sqrdmulh v31.4S, v6.4S, v23.4S // .............................................*....................................................................................... - // gap // ..................................................................................................................................... - // gap // ..................................................................................................................................... - // gap // ..................................................................................................................................... - // gap // ..................................................................................................................................... - // gap // ..................................................................................................................................... - sqrdmulh v19.4S, v19.4S, v17.4S // ...............................................*..................................................................................... 
- // gap // ..................................................................................................................................... - // gap // ..................................................................................................................................... - // gap // ..................................................................................................................................... - // gap // ..................................................................................................................................... - trn1 v9.4S, v14.4S, v1.4S // ................................................*.................................................................................... - mul v30.4S, v6.4S, v16.4S // ...........................................*......................................................................................... - trn2 v6.4S, v14.4S, v1.4S // .................................................*................................................................................... - // gap // ..................................................................................................................................... - // gap // ..................................................................................................................................... - // gap // ..................................................................................................................................... - // gap // ..................................................................................................................................... - mls v30.4S, v31.4S, v8.S[0] // .....................................................*............................................................................... - // gap // ..................................................................................................................................... 
- trn2 v1.2D, v13.2D, v9.2D // .......................................................*............................................................................. - // gap // ..................................................................................................................................... - // gap // ..................................................................................................................................... - trn2 v3.2D, v7.2D, v6.2D // ........................................................*............................................................................ - mls v24.4S, v19.4S, v8.S[0] // .........................................................*........................................................................... - trn1 v29.2D, v7.2D, v6.2D // ......................................................*.............................................................................. - // gap // ..................................................................................................................................... - trn1 v13.2D, v13.2D, v9.2D // ....................................................*................................................................................ - // gap // ..................................................................................................................................... - // gap // ..................................................................................................................................... - add v20.4S, v1.4S, v3.4S // ..............................................................*...................................................................... - // gap // ..................................................................................................................................... 
- mul v28.4S, v2.4S, v21.4S // ............................................................*........................................................................ - sub v14.4S, v1.4S, v3.4S // .............................................................*....................................................................... - // gap // ..................................................................................................................................... - // gap // ..................................................................................................................................... - // gap // ..................................................................................................................................... - add v31.4S, v13.4S, v29.4S // ...........................................................*......................................................................... - sqrdmulh v3.4S, v2.4S, v0.4S // .................................................................*................................................................... - // gap // ..................................................................................................................................... - // gap // ..................................................................................................................................... - sub v12.4S, v30.4S, v24.4S // ................................................................*.................................................................... - sub v15.4S, v13.4S, v29.4S // ..........................................................*.......................................................................... - // gap // ..................................................................................................................................... 
- mul v7.4S, v14.4S, v10.S[0] // ............................................................................*........................................................ - add v27.4S, v30.4S, v24.4S // ..................................................................*.................................................................. - // gap // ..................................................................................................................................... - // gap // ..................................................................................................................................... - // gap // ..................................................................................................................................... - // gap // ..................................................................................................................................... - sqrdmulh v16.4S, v12.4S, v0.4S // .......................................................................*............................................................. - // gap // ..................................................................................................................................... - // gap // ..................................................................................................................................... - // gap // ..................................................................................................................................... - mul v9.4S, v12.4S, v21.4S // ....................................................................*................................................................ - // gap // ..................................................................................................................................... - // gap // ..................................................................................................................................... 
- // gap // ..................................................................................................................................... - // gap // ..................................................................................................................................... - trn2 v19.4S, v22.4S, v27.4S // ........................................................................*............................................................ - mls v28.4S, v3.4S, v8.S[0] // .........................................................................*........................................................... - trn1 v23.4S, v22.4S, v27.4S // ......................................................................*.............................................................. - // gap // ..................................................................................................................................... - // gap // ..................................................................................................................................... - // gap // ..................................................................................................................................... - add v29.4S, v31.4S, v20.4S // .....................................................................*............................................................... - sqrdmulh v3.4S, v14.4S, v10.S[1] // ...............................................................................*..................................................... - sub v14.4S, v31.4S, v20.4S // ...................................................................*................................................................. - // gap // ..................................................................................................................................... 
- // gap // ..................................................................................................................................... - // gap // ..................................................................................................................................... - // gap // ..................................................................................................................................... - mls v9.4S, v16.4S, v8.S[0] // ...........................................................................*......................................................... - // gap // ..................................................................................................................................... - // gap // ..................................................................................................................................... - // gap // ..................................................................................................................................... - // gap // ..................................................................................................................................... - // gap // ..................................................................................................................................... - // gap // ..................................................................................................................................... - // gap // ..................................................................................................................................... - mul v31.4S, v15.4S, v4.S[2] // ...............................................................*..................................................................... - // gap // ..................................................................................................................................... 
- // gap // ..................................................................................................................................... - // gap // ..................................................................................................................................... - sqrdmulh v24.4S, v15.4S, v4.S[3] // ..........................................................................*.......................................................... - // gap // ..................................................................................................................................... - // gap // ..................................................................................................................................... - // gap // ..................................................................................................................................... - // gap // ..................................................................................................................................... - trn1 v21.4S, v28.4S, v9.4S // ..............................................................................*...................................................... - mls v7.4S, v3.4S, v8.S[0] // .......................................................................................*............................................. - trn2 v26.4S, v28.4S, v9.4S // ................................................................................*.................................................... - ldr q9, [x4, #-16] // .........................*........................................................................................................... - // gap // ..................................................................................................................................... - // gap // ..................................................................................................................................... 
- // gap // ..................................................................................................................................... - // gap // ..................................................................................................................................... - sqrdmulh v20.4S, v14.4S, v18.S[3] // .................................................................................*................................................... - trn1 v13.2D, v23.2D, v21.2D // ...................................................................................*................................................. - trn1 v3.2D, v19.2D, v26.2D // .....................................................................................*............................................... - // gap // ..................................................................................................................................... - // gap // ..................................................................................................................................... - trn2 v5.2D, v23.2D, v21.2D // ..................................................................................*.................................................. - mls v31.4S, v24.4S, v8.S[0] // .............................................................................*....................................................... - // gap // ..................................................................................................................................... - trn2 v21.2D, v19.2D, v26.2D // ......................................................................................*.............................................. - // gap // ..................................................................................................................................... 
- // gap // ..................................................................................................................................... - mul v24.4S, v14.4S, v18.S[2] // ....................................................................................*................................................ - // gap // ..................................................................................................................................... - add v26.4S, v13.4S, v3.4S // ..........................................................................................*.......................................... - sub v0.4S, v13.4S, v3.4S // ........................................................................................*............................................ - // gap // ..................................................................................................................................... - // gap // ..................................................................................................................................... - mls v24.4S, v20.4S, v8.S[0] // .........................................................................................*........................................... - sub v3.4S, v5.4S, v21.4S // ...........................................................................................*......................................... - // gap // ..................................................................................................................................... - add v11.4S, v5.4S, v21.4S // ............................................................................................*........................................ - // gap // ..................................................................................................................................... 
- // gap // ..................................................................................................................................... - // gap // ..................................................................................................................................... - // gap // ..................................................................................................................................... - mul v20.4S, v0.4S, v10.S[2] // .............................................................................................*....................................... - // gap // ..................................................................................................................................... - // gap // ..................................................................................................................................... - // gap // ..................................................................................................................................... - sqrdmulh v19.4S, v0.4S, v10.S[3] // ...............................................................................................*..................................... - sub v22.4S, v26.4S, v11.4S // .................................................................................................*................................... - // gap // ..................................................................................................................................... - add v16.4S, v26.4S, v11.4S // ..................................................................................................*.................................. - // gap // ..................................................................................................................................... - // gap // ..................................................................................................................................... 
- sqrdmulh v28.4S, v3.4S, v9.S[1] // ...................................................................................................*................................. - // gap // ..................................................................................................................................... - // gap // ..................................................................................................................................... - // gap // ..................................................................................................................................... - // gap // ..................................................................................................................................... - // gap // ..................................................................................................................................... - mul v9.4S, v3.4S, v9.S[0] // ....................................................................................................*................................ - add v3.4S, v31.4S, v7.4S // ................................................................................................*.................................... - // gap // ..................................................................................................................................... - add v12.4S, v29.4S, v16.4S // ......................................................................................................*.............................. - // gap // ..................................................................................................................................... - // gap // ..................................................................................................................................... - sub v10.4S, v29.4S, v16.4S // .....................................................................................................*............................... 
- mls v20.4S, v19.4S, v8.S[0] // .......................................................................................................*............................. - // gap // ..................................................................................................................................... - // gap // ..................................................................................................................................... - // gap // ..................................................................................................................................... - // gap // ..................................................................................................................................... - mls v9.4S, v28.4S, v8.S[0] // ........................................................................................................*............................ - str q12, [x1], #(16*4) // .........................................................................................................*........................... - // gap // ..................................................................................................................................... - // gap // ..................................................................................................................................... - // gap // ..................................................................................................................................... - // gap // ..................................................................................................................................... - // gap // ..................................................................................................................................... - // gap // ..................................................................................................................................... 
- sqrdmulh v29.4S, v10.4S, v18.S[1] // .........................................................................................................................*........... - // gap // ..................................................................................................................................... - // gap // ..................................................................................................................................... - // gap // ..................................................................................................................................... - // gap // ..................................................................................................................................... - // gap // ..................................................................................................................................... - sqrdmulh v26.4S, v22.4S, v4.S[1] // .............................................................................................................*....................... - // gap // ..................................................................................................................................... - // gap // ..................................................................................................................................... - sub v17.4S, v20.4S, v9.4S // ............................................................................................................*........................ - add v28.4S, v20.4S, v9.4S // ..............................................................................................................*...................... - mul v20.4S, v22.4S, v4.S[0] // ...........................................................................................................*......................... 
- // gap // ..................................................................................................................................... - // gap // ..................................................................................................................................... - // gap // ..................................................................................................................................... - // gap // ..................................................................................................................................... - // gap // ..................................................................................................................................... - // gap // ..................................................................................................................................... - mul v16.4S, v17.4S, v4.S[0] // ...............................................................................................................*..................... - // gap // ..................................................................................................................................... - // gap // ..................................................................................................................................... - // gap // ..................................................................................................................................... - sub v1.4S, v31.4S, v7.4S // ..............................................................................................*...................................... - sqrdmulh v21.4S, v17.4S, v4.S[1] // .................................................................................................................*................... - // gap // ..................................................................................................................................... 
- // gap // ..................................................................................................................................... - // gap // ..................................................................................................................................... - // gap // ..................................................................................................................................... - mls v20.4S, v26.4S, v8.S[0] // ....................................................................................................................*................ - // gap // ..................................................................................................................................... - // gap // ..................................................................................................................................... - // gap // ..................................................................................................................................... - // gap // ..................................................................................................................................... - // gap // ..................................................................................................................................... - // gap // ..................................................................................................................................... - // gap // ..................................................................................................................................... - mul v15.4S, v1.4S, v18.S[2] // ..........................................................................................................*.......................... - // gap // ..................................................................................................................................... 
- // gap // ..................................................................................................................................... - // gap // ..................................................................................................................................... - sqrdmulh v25.4S, v1.4S, v18.S[3] // ..................................................................................................................*.................. - // gap // ..................................................................................................................................... - // gap // ..................................................................................................................................... - // gap // ..................................................................................................................................... - // gap // ..................................................................................................................................... - sub v5.4S, v24.4S, v20.4S // .......................................................................................................................*............. - add v20.4S, v24.4S, v20.4S // ........................................................................................................................*............ - mul v6.4S, v10.4S, v18.S[0] // ..........................................................................................................................*.......... - // gap // ..................................................................................................................................... - // gap // ..................................................................................................................................... - // gap // ..................................................................................................................................... 
- // gap // ..................................................................................................................................... - mls v16.4S, v21.4S, v8.S[0] // .....................................................................................................................*............... - // gap // ..................................................................................................................................... - // gap // ..................................................................................................................................... - str q20, [x1, #-32] // ...........................................................................................................................*......... - // gap // ..................................................................................................................................... - // gap // ..................................................................................................................................... - mls v15.4S, v25.4S, v8.S[0] // ......................................................................................................................*.............. - // gap // ..................................................................................................................................... - // gap // ..................................................................................................................................... - // gap // ..................................................................................................................................... - // gap // ..................................................................................................................................... - // gap // ..................................................................................................................................... 
- mls v6.4S, v29.4S, v8.S[0] // ...............................................................................................................................*..... - // gap // ..................................................................................................................................... - // gap // ..................................................................................................................................... - // gap // ..................................................................................................................................... - // gap // ..................................................................................................................................... - // gap // ..................................................................................................................................... - // gap // ..................................................................................................................................... - // gap // ..................................................................................................................................... - sqrdmulh v20.4S, v5.4S, v18.S[1] // ..............................................................................................................................*...... - // gap // ..................................................................................................................................... - // gap // ..................................................................................................................................... - sub v21.4S, v15.4S, v16.4S // ............................................................................................................................*........ - mul v31.4S, v5.4S, v18.S[0] // .................................................................................................................................*... 
- // gap // ..................................................................................................................................... - add v16.4S, v15.4S, v16.4S // .............................................................................................................................*....... - str q6, [x2], #(16*4) // ...................................................................................................................................*. - // gap // ..................................................................................................................................... - add v5.4S, v3.4S, v28.4S // ................................................................................................................*.................... - // gap // ..................................................................................................................................... - // gap // ..................................................................................................................................... - sqrdmulh v9.4S, v21.4S, v18.S[1] // ....................................................................................................................................* - str q16, [x1, #-16] // ................................................................................................................................*.... - // gap // ..................................................................................................................................... - // gap // ..................................................................................................................................... - mls v31.4S, v20.4S, v8.S[0] // ..................................................................................................................................*.. 
- str q5, [x1, #-48] // ...................................................................................................................*................. - // gap // ..................................................................................................................................... + ldr q20, [x5, #16] // ..........*................................................................................................................................. + ld4 {v14.4S, v15.4S, v16.4S, v17.4S}, [x2] // ..............................*............................................................................................................. + ld4 {v21.4S, v22.4S, v23.4S, v24.4S}, [x1] // .......*.................................................................................................................................... + ldr q1, [x5, #96] // ..*......................................................................................................................................... + ldr q4, [x5, #112] // .......................................*.................................................................................................... + // gap // ............................................................................................................................................ + ldr q7, [x5, #48] // ........*................................................................................................................................... + ldr q5, [x5, #64] // .*.......................................................................................................................................... + // gap // ............................................................................................................................................ + ldr q6, [x5, #32] // ....*....................................................................................................................................... 
+ ldr q30, [x5, #144] // .....*...................................................................................................................................... + // gap // ............................................................................................................................................ + sub v10.4S, v23.4S, v24.4S // ...........*................................................................................................................................ + sub v2.4S, v14.4S, v15.4S // ..................................*......................................................................................................... + ldr q25, [x5, #128] // ......*..................................................................................................................................... + add v29.4S, v16.4S, v17.4S // ....................................*....................................................................................................... + add v3.4S, v14.4S, v15.4S // .....................................*...................................................................................................... + ldr q0, [x5, #160] // ......................................*..................................................................................................... + sub v13.4S, v16.4S, v17.4S // ..........................................*................................................................................................. + add v14.4S, v23.4S, v24.4S // ............*............................................................................................................................... + ldr q18, [x5, #176] // ...*........................................................................................................................................ 
+ sqrdmulh v24.4S, v2.4S, v30.4S // ...........................................*................................................................................................ + add v28.4S, v21.4S, v22.4S // .............*.............................................................................................................................. + ldr q15, [x5], #(12*16) // .....................*...................................................................................................................... + sub v9.4S, v3.4S, v29.4S // ............................................*............................................................................................... + ldr q16, [x5, #-112] // .........*.................................................................................................................................. + // gap // ............................................................................................................................................ + sub v22.4S, v21.4S, v22.4S // ..............*............................................................................................................................. + mul v19.4S, v2.4S, v25.4S // ........................................*................................................................................................... + // gap // ............................................................................................................................................ + sub v12.4S, v28.4S, v14.4S // ..................*......................................................................................................................... + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ 
+ sqrdmulh v2.4S, v13.4S, v18.4S // ...............................................*............................................................................................ + add v18.4S, v28.4S, v14.4S // .................*.......................................................................................................................... + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + sqrdmulh v14.4S, v9.4S, v4.4S // .................................................*.......................................................................................... + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ 
+ mul v13.4S, v13.4S, v0.4S // ..............................................................*............................................................................. + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + mls v19.4S, v24.4S, v8.S[0] // ...................................................*........................................................................................ + add v24.4S, v3.4S, v29.4S // ..............................................*............................................................................................. + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ 
+ mls v13.4S, v2.4S, v8.S[0] // ................................................................*........................................................................... + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + mul v31.4S, v9.4S, v1.4S // ......................................................*..................................................................................... + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ 
+ mls v31.4S, v14.4S, v8.S[0] // ..........................................................*................................................................................. + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + sub v26.4S, v19.4S, v13.4S // .......................................................................*.................................................................... + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + sqrdmulh v3.4S, v22.4S, v7.4S // ...................*........................................................................................................................ + add v11.4S, v19.4S, v13.4S // ...........................................................................*................................................................ + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ 
+ sqrdmulh v19.4S, v26.4S, v4.4S // ..........................................................................*................................................................. + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + trn1 v14.4S, v24.4S, v11.4S // ..............................................................................*............................................................. + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + trn2 v7.4S, v24.4S, v11.4S // ...................................................................................*........................................................ + sqrdmulh v11.4S, v12.4S, v20.4S // ........................*................................................................................................................... + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ 
+ mul v4.4S, v26.4S, v1.4S // ...............................................................................*............................................................ + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + mls v4.4S, v19.4S, v8.S[0] // ................................................................................*........................................................... + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ 
+ mul v29.4S, v10.4S, v5.4S // ...............*............................................................................................................................ + ldr q0, [x4, #32] // .........................................*.................................................................................................. + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + mul v26.4S, v12.4S, v15.4S // ..........................*................................................................................................................. + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ 
+ sqrdmulh v30.4S, v10.4S, v16.4S // ................*........................................................................................................................... + trn1 v17.4S, v31.4S, v4.4S // ......................................................................................*..................................................... + // gap // ............................................................................................................................................ + trn2 v23.4S, v31.4S, v4.4S // ........................................................................................*................................................... + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + mul v31.4S, v22.4S, v6.4S // ....................*....................................................................................................................... + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + trn2 v12.2D, v14.2D, v17.2D // ..........................................................................................*................................................. + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ 
+ mls v31.4S, v3.4S, v8.S[0] // .......................*.................................................................................................................... + trn2 v3.2D, v7.2D, v23.2D // ............................................................................................*............................................... + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + mls v29.4S, v30.4S, v8.S[0] // ......................*..................................................................................................................... + ldr q30, [x4, #48] // *........................................................................................................................................... + // gap // ............................................................................................................................................ + sub v25.4S, v12.4S, v3.4S // .................................................................................................*.......................................... + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ 
+ mls v26.4S, v11.4S, v8.S[0] // ...............................*............................................................................................................ + trn1 v11.2D, v7.2D, v23.2D // ...............................................................................................*............................................ + // gap // ............................................................................................................................................ + trn1 v23.2D, v14.2D, v17.2D // .............................................................................................*.............................................. + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + add v5.4S, v12.4S, v3.4S // ..................................................................................................*......................................... + sqrdmulh v4.4S, v25.4S, v30.S[1] // ..........................................................................................................*................................. + // gap // ............................................................................................................................................ + sub v3.4S, v31.4S, v29.4S // .........................*.................................................................................................................. + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ 
+ mul v22.4S, v25.4S, v30.S[0] // ......................................................................................................*..................................... + add v16.4S, v23.4S, v11.4S // .....................................................................................................*...................................... + // gap // ............................................................................................................................................ + add v13.4S, v31.4S, v29.4S // ...........................*................................................................................................................ + ldr q31, [x4, #16] // .....................................................*...................................................................................... + // gap // ............................................................................................................................................ + sub v24.4S, v23.4S, v11.4S // ....................................................................................................*....................................... + mul v2.4S, v3.4S, v15.4S // .................................*.......................................................................................................... + // gap // ............................................................................................................................................ + sub v1.4S, v16.4S, v5.4S // .........................................................................................................*.................................. + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ 
+ add v16.4S, v16.4S, v5.4S // ........................................................................................................*................................... + sqrdmulh v21.4S, v3.4S, v20.4S // ............................*............................................................................................................... + // gap // ............................................................................................................................................ + trn2 v3.4S, v18.4S, v13.4S // .............................*.............................................................................................................. + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + trn1 v18.4S, v18.4S, v13.4S // ................................*........................................................................................................... + mls v22.4S, v4.4S, v8.S[0] // ...............................................................................................................*............................ + // gap // ............................................................................................................................................ + srshr v29.4S, v16.4S, #23 // ...........................................................................................................*................................ + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ 
+ sqrdmulh v25.4S, v24.4S, v0.S[3] // .......................................................................................................*.................................... + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + mls v2.4S, v21.4S, v8.S[0] // ...................................*........................................................................................................ + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ 
+ mul v14.4S, v24.4S, v0.S[2] // ............................................................................................................*............................... + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + mls v14.4S, v25.4S, v8.S[0] // .............................................................................................................*.............................. + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + trn1 v23.4S, v26.4S, v2.4S // .............................................*.............................................................................................. + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ 
+ trn2 v30.4S, v26.4S, v2.4S // ................................................*........................................................................................... + sqrdmulh v12.4S, v1.4S, v31.S[1] // ..............................................................................................................*............................. + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + mul v9.4S, v1.4S, v31.S[0] // ................................................................................................................*........................... + trn1 v11.2D, v18.2D, v23.2D // ..................................................*......................................................................................... + // gap // ............................................................................................................................................ + sub v4.4S, v14.4S, v22.4S // ..................................................................................................................*......................... + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ 
+ mls v16.4S, v29.4S, v8.4S // .................................................................................................................*.......................... + add v22.4S, v14.4S, v22.4S // ....................................................................................................................*....................... + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + mul v6.4S, v4.4S, v31.S[0] // .....................................................................................................................*...................... + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + srshr v2.4S, v22.4S, #23 // .........................................................................................................................*.................. + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ 
+ mls v9.4S, v12.4S, v8.S[0] // ...................................................................................................................*........................ + trn1 v29.2D, v3.2D, v30.2D // ....................................................*....................................................................................... + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + sqrdmulh v12.4S, v4.4S, v31.S[1] // .......................................................................................................................*.................... + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + sub v19.4S, v11.4S, v29.4S // ........................................................*................................................................................... + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ 
+ trn2 v20.2D, v18.2D, v23.2D // .........................................................*.................................................................................. + mls v22.4S, v2.4S, v8.4S // ..............................................................................................................................*............. + // gap // ............................................................................................................................................ + trn2 v23.2D, v3.2D, v30.2D // .......................................................*.................................................................................... + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + mul v10.4S, v19.4S, v31.S[2] // .....................................................................*...................................................................... + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + add v2.4S, v11.4S, v29.4S // ...........................................................*................................................................................ + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ 
+ sqrdmulh v19.4S, v19.4S, v31.S[3] // ............................................................*............................................................................... + add v27.4S, v20.4S, v23.4S // .............................................................*.............................................................................. + // gap // ............................................................................................................................................ + sub v15.4S, v20.4S, v23.4S // ...............................................................*............................................................................ + ldr q20, [x4], #64 // ..................................................................*......................................................................... + // gap // ............................................................................................................................................ + mls v6.4S, v12.4S, v8.S[0] // ...............................................................................................................................*............ + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + sub v28.4S, v2.4S, v27.4S // .................................................................*.......................................................................... + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ 
+ add v26.4S, v2.4S, v27.4S // ...................................................................*........................................................................ + sqrdmulh v2.4S, v15.4S, v0.S[1] // ....................................................................*....................................................................... + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + mul v0.4S, v15.4S, v0.S[0] // ........................................................................*................................................................... + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + srshr v13.4S, v26.4S, #23 // ......................................................................*..................................................................... + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ 
+ sqrdmulh v23.4S, v28.4S, v20.S[3] // ............................................................................*............................................................... + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + mls v0.4S, v2.4S, v8.S[0] // .........................................................................*.................................................................. + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ 
+ mls v10.4S, v19.4S, v8.S[0] // .............................................................................*.............................................................. + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + mls v26.4S, v13.4S, v8.4S // ..................................................................................*......................................................... + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ 
+ mul v11.4S, v28.4S, v20.S[2] // .....................................................................................*...................................................... + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + sub v27.4S, v10.4S, v0.4S // .................................................................................*.......................................................... + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + mls v11.4S, v23.4S, v8.S[0] // .......................................................................................*.................................................... + add v14.4S, v10.4S, v0.4S // ....................................................................................*....................................................... + // gap // ............................................................................................................................................ + sub v2.4S, v26.4S, v16.4S // ......................................................................................................................*..................... + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ 
+ add v0.4S, v26.4S, v16.4S // ........................................................................................................................*................... + sqrdmulh v4.4S, v27.4S, v20.S[3] // ...........................................................................................*................................................ + // gap // ............................................................................................................................................ + srshr v23.4S, v14.4S, #23 // .........................................................................................*.................................................. + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + mul v7.4S, v27.4S, v20.S[2] // ..............................................................................................*............................................. + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + str q0, [x1], #(16*4) // ............................................................................................................................*............... + sub v19.4S, v11.4S, v9.4S // ...........................................................................................................................*................ + // gap // ............................................................................................................................................ 
+ add v9.4S, v11.4S, v9.4S // .............................................................................................................................*.............. + mls v14.4S, v23.4S, v8.4S // ................................................................................................*........................................... + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + sqrdmulh v16.4S, v2.4S, v20.S[1] // ..........................................................................................................................*................. + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ 
+ mls v7.4S, v4.4S, v8.S[0] // ...................................................................................................*........................................ + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + sub v15.4S, v14.4S, v22.4S // .................................................................................................................................*.......... + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + add v30.4S, v14.4S, v22.4S // ..................................................................................................................................*......... + mul v2.4S, v2.4S, v20.S[0] // ................................................................................................................................*........... + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ 
+ mls v2.4S, v16.4S, v8.S[0] // ...................................................................................................................................*........ + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + str q30, [x1, #-48] // .......................................................................................................................................*.... + sub v31.4S, v7.4S, v6.4S // ....................................................................................................................................*....... + // gap // ............................................................................................................................................ + add v4.4S, v7.4S, v6.4S // .....................................................................................................................................*...... + mul v23.4S, v19.4S, v20.S[0] // ......................................................................................................................................*..... + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ 
+ sqrdmulh v11.4S, v15.4S, v20.S[1] // ........................................................................................................................................*... + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + str q4, [x1, #-16] // ..........................................................................................................................................*. + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + str q2, [x2], #(16*4) // .........................................................................................................................................*.. + sqrdmulh v4.4S, v31.4S, v20.S[1] // ...........................................................................................................................................* + // gap // ............................................................................................................................................ // original source code - // ldr q14, [x5, #80] // .....*............................................................................................................................... - // ldr q2, [x5, #16] // ..................*.................................................................................................................. - // ldr q3, [x5], #(12*16) // ...........................*......................................................................................................... 
- // ld4 {v24.4S, v25.4S, v26.4S, v27.4S}, [x1] // .*................................................................................................................................... - // ldr q28, [x5, #-160] // .........................*........................................................................................................... - // ldr q6, [x5, #-144] // ....*................................................................................................................................ - // ldr q15, [x5, #-128] // ...*................................................................................................................................. - // ldr q29, [x5, #-64] // ......................*.............................................................................................................. - // ldr q1, [x5, #-96] // *.................................................................................................................................... - // sub v13.4S, v26.4S, v27.4S // .......*............................................................................................................................. - // ldr q16, [x5, #-80] // ..........*.......................................................................................................................... - // add v26.4S, v26.4S, v27.4S // ...........*......................................................................................................................... - // ldr q9, [x5, #-48] // .............................*....................................................................................................... - // sub v0.4S, v24.4S, v25.4S // ................*.................................................................................................................... - // ldr q30, [x5, #-32] // .................*................................................................................................................... 
- // sqrdmulh v14.4S, v13.4S, v14.4S // ...............*..................................................................................................................... - // add v24.4S, v24.4S, v25.4S // .........*........................................................................................................................... - // ldr q7, [x5, #-16] // ..............................*...................................................................................................... - // ldr q18, [x4], #64 // ..............*...................................................................................................................... - // mul v28.4S, v0.4S, v28.4S // .................................*................................................................................................... - // ldr q25, [x4, #-48] // ............*........................................................................................................................ - // sub v21.4S, v24.4S, v26.4S // ...................*................................................................................................................. - // ldr q12, [x4, #-32] // ........*............................................................................................................................ - // sqrdmulh v6.4S, v0.4S, v6.4S // ........................*............................................................................................................ - // add v24.4S, v24.4S, v26.4S // .....................*............................................................................................................... - // ldr q26, [x4, #-16] // ................................................................................*.................................................... - // mul v15.4S, v13.4S, v15.4S // ....................*................................................................................................................ 
- // mls v15.4S, v14.4S, v8.S[0] // ............................*........................................................................................................ - // mls v28.4S, v6.4S, v8.S[0] // ..................................*.................................................................................................. - // mul v17.4S, v21.4S, v3.4S // ....................................*................................................................................................ - // sub v14.4S, v28.4S, v15.4S // .....................................*............................................................................................... - // sqrdmulh v6.4S, v21.4S, v2.4S // ...................................*................................................................................................. - // add v28.4S, v28.4S, v15.4S // ......................................*.............................................................................................. - // sqrdmulh v2.4S, v14.4S, v2.4S // ........................................*............................................................................................ - // ld4 {v19.4S, v20.4S, v21.4S, v22.4S}, [x2] // ..*.................................................................................................................................. - // trn1 v31.4S, v24.4S, v28.4S // .........................................*........................................................................................... - // mul v14.4S, v14.4S, v3.4S // ............................................*........................................................................................ - // trn2 v3.4S, v24.4S, v28.4S // ..........................................*.......................................................................................... 
- // mls v17.4S, v6.4S, v8.S[0] // ...........................................*......................................................................................... - // sub v24.4S, v19.4S, v20.4S // .............*....................................................................................................................... - // mls v14.4S, v2.4S, v8.S[0] // .............................................*....................................................................................... - // add v2.4S, v19.4S, v20.4S // .......................*............................................................................................................. - // sub v28.4S, v21.4S, v22.4S // ..........................*.......................................................................................................... - // mul v6.4S, v24.4S, v29.4S // .................................................*................................................................................... - // add v21.4S, v21.4S, v22.4S // ......*.............................................................................................................................. - // sqrdmulh v24.4S, v24.4S, v9.4S // ..............................................*...................................................................................... - // sub v15.4S, v2.4S, v21.4S // ...............................*..................................................................................................... - // sqrdmulh v29.4S, v28.4S, v7.4S // ...............................................*..................................................................................... - // trn1 v13.4S, v17.4S, v14.4S // ................................................*.................................................................................... 
- // trn2 v17.4S, v17.4S, v14.4S // ..................................................*.................................................................................. - // mul v14.4S, v28.4S, v30.4S // .......................................*............................................................................................. - // add v21.4S, v2.4S, v21.4S // ................................*.................................................................................................... - // trn1 v2.2D, v31.2D, v13.2D // ........................................................*............................................................................ - // mls v6.4S, v24.4S, v8.S[0] // ...................................................*................................................................................. - // trn1 v24.2D, v3.2D, v17.2D // .......................................................*............................................................................. - // trn2 v28.2D, v31.2D, v13.2D // ....................................................*................................................................................ - // trn2 v17.2D, v3.2D, v17.2D // .....................................................*............................................................................... - // mls v14.4S, v29.4S, v8.S[0] // ......................................................*.............................................................................. - // sub v3.4S, v2.4S, v24.4S // ...............................................................*..................................................................... - // add v2.4S, v2.4S, v24.4S // ............................................................*........................................................................ 
- // mul v24.4S, v15.4S, v1.4S // ..........................................................*.......................................................................... - // sub v31.4S, v28.4S, v17.4S // ...........................................................*......................................................................... - // add v17.4S, v28.4S, v17.4S // .........................................................*........................................................................... - // mul v28.4S, v3.4S, v25.S[2] // ...........................................................................*......................................................... - // sub v29.4S, v6.4S, v14.4S // ..............................................................*...................................................................... - // sqrdmulh v15.4S, v15.4S, v16.4S // .............................................................*....................................................................... - // add v14.4S, v6.4S, v14.4S // .................................................................*................................................................... - // sub v6.4S, v2.4S, v17.4S // .........................................................................*........................................................... - // mul v1.4S, v29.4S, v1.4S // ...................................................................*................................................................. - // add v17.4S, v2.4S, v17.4S // .......................................................................*............................................................. - // trn1 v2.4S, v21.4S, v14.4S // ......................................................................*.............................................................. 
- // sqrdmulh v29.4S, v29.4S, v16.4S // ..................................................................*.................................................................. - // trn2 v21.4S, v21.4S, v14.4S // ....................................................................*................................................................ - // mls v24.4S, v15.4S, v8.S[0] // .....................................................................*............................................................... - // sqrdmulh v14.4S, v3.4S, v25.S[3] // ............................................................................*........................................................ - // mls v1.4S, v29.4S, v8.S[0] // ..........................................................................*.......................................................... - // mul v3.4S, v31.4S, v12.S[0] // ................................................................*.................................................................... - // mls v28.4S, v14.4S, v8.S[0] // .....................................................................................*............................................... - // trn1 v14.4S, v24.4S, v1.4S // .............................................................................*....................................................... - // sqrdmulh v31.4S, v31.4S, v12.S[1] // ........................................................................*............................................................ - // trn2 v24.4S, v24.4S, v1.4S // ...............................................................................*..................................................... - // sqrdmulh v1.4S, v6.4S, v18.S[3] // .................................................................................*................................................... 
- // trn2 v15.2D, v2.2D, v14.2D // ....................................................................................*................................................ - // trn1 v14.2D, v2.2D, v14.2D // ..................................................................................*.................................................. - // mul v2.4S, v6.4S, v18.S[2] // .......................................................................................*............................................. - // trn1 v6.2D, v21.2D, v24.2D // ...................................................................................*................................................. - // trn2 v21.2D, v21.2D, v24.2D // ......................................................................................*.............................................. - // mls v3.4S, v31.4S, v8.S[0] // ..............................................................................*...................................................... - // sub v24.4S, v14.4S, v6.4S // .........................................................................................*........................................... - // mls v2.4S, v1.4S, v8.S[0] // ..........................................................................................*.......................................... - // add v14.4S, v14.4S, v6.4S // ........................................................................................*............................................ - // sub v6.4S, v15.4S, v21.4S // ...........................................................................................*......................................... - // add v21.4S, v15.4S, v21.4S // ............................................................................................*........................................ 
- // mul v31.4S, v24.4S, v12.S[2] // .............................................................................................*....................................... - // sub v1.4S, v28.4S, v3.4S // ...............................................................................................................*..................... - // sqrdmulh v24.4S, v24.4S, v12.S[3] // ..............................................................................................*...................................... - // add v3.4S, v28.4S, v3.4S // ...................................................................................................*................................. - // sub v28.4S, v14.4S, v21.4S // ...............................................................................................*..................................... - // add v21.4S, v14.4S, v21.4S // ................................................................................................*.................................... - // sqrdmulh v14.4S, v6.4S, v26.S[1] // .................................................................................................*................................... - // mul v6.4S, v6.4S, v26.S[0] // ..................................................................................................*.................................. - // sub v26.4S, v17.4S, v21.4S // .....................................................................................................*............................... - // add v17.4S, v17.4S, v21.4S // ....................................................................................................*................................ - // mls v31.4S, v24.4S, v8.S[0] // ......................................................................................................*.............................. 
- // mls v6.4S, v14.4S, v8.S[0] // .......................................................................................................*............................. - // str q17, [x1], #(16*4) // ........................................................................................................*............................ - // mul v17.4S, v1.4S, v18.S[2] // ..................................................................................................................*.................. - // mul v21.4S, v28.4S, v25.S[0] // .............................................................................................................*....................... - // sub v14.4S, v31.4S, v6.4S // ...........................................................................................................*......................... - // sqrdmulh v24.4S, v28.4S, v25.S[1] // ..........................................................................................................*.......................... - // add v28.4S, v31.4S, v6.4S // ............................................................................................................*........................ - // mul v6.4S, v14.4S, v25.S[0] // ..............................................................................................................*...................... - // add v31.4S, v3.4S, v28.4S // ................................................................................................................................*.... - // sqrdmulh v14.4S, v14.4S, v25.S[1] // ................................................................................................................*.................... - // sqrdmulh v1.4S, v1.4S, v18.S[3] // ...................................................................................................................*................. 
- // str q31, [x1, #-48] // ....................................................................................................................................* - // mls v21.4S, v24.4S, v8.S[0] // .................................................................................................................*................... - // mls v6.4S, v14.4S, v8.S[0] // .......................................................................................................................*............. - // mls v17.4S, v1.4S, v8.S[0] // .........................................................................................................................*........... - // sub v14.4S, v2.4S, v21.4S // ....................................................................................................................*................ - // add v21.4S, v2.4S, v21.4S // .....................................................................................................................*............... - // sqrdmulh v2.4S, v26.4S, v18.S[1] // .........................................................................................................*........................... - // mul v24.4S, v26.4S, v18.S[0] // ......................................................................................................................*.............. - // str q21, [x1, #-32] // ........................................................................................................................*............ - // sub v21.4S, v17.4S, v6.4S // ............................................................................................................................*........ - // add v17.4S, v17.4S, v6.4S // ..............................................................................................................................*...... 
- // sqrdmulh v6.4S, v14.4S, v18.S[1] // ...........................................................................................................................*......... - // mls v24.4S, v2.4S, v8.S[0] // ..........................................................................................................................*.......... - // str q17, [x1, #-16] // ..................................................................................................................................*.. - // mul v31.4S, v14.4S, v18.S[0] // .............................................................................................................................*....... - // mls v31.4S, v6.4S, v8.S[0] // ...................................................................................................................................*. - // str q24, [x2], #(16*4) // ...............................................................................................................................*..... - // sqrdmulh v9.4S, v21.4S, v18.S[1] // .................................................................................................................................*... + // ldr q21, [x4, #48] // .......................................................*.................................................................................... + // ldr q24, [x5, #64] // ......*..................................................................................................................................... + // ldr q3, [x5, #96] // ...*........................................................................................................................................ + // ldr q10, [x5, #176] // .................*.......................................................................................................................... 
+ // ldr q7, [x5, #32] // .......*.................................................................................................................................... + // ldr q12, [x5, #144] // ........*................................................................................................................................... + // ldr q22, [x5, #128] // ...........*................................................................................................................................ + // ld4 {v25.4S, v26.4S, v27.4S, v28.4S}, [x1] // ..*......................................................................................................................................... + // ldr q29, [x5, #48] // .....*...................................................................................................................................... + // ldr q20, [x5, #80] // ......................*..................................................................................................................... + // ldr q18, [x5, #16] // *........................................................................................................................................... + // sub v9.4S, v27.4S, v28.4S // .........*.................................................................................................................................. + // add v27.4S, v27.4S, v28.4S // ................*........................................................................................................................... + // add v19.4S, v25.4S, v26.4S // ...................*........................................................................................................................ + // sub v26.4S, v25.4S, v26.4S // .......................*.................................................................................................................... 
+ // mul v6.4S, v9.4S, v24.4S // ............................................*............................................................................................... + // sqrdmulh v5.4S, v9.4S, v20.4S // ...............................................*............................................................................................ + // add v31.4S, v19.4S, v27.4S // ...........................*................................................................................................................ + // sub v25.4S, v19.4S, v27.4S // .........................*.................................................................................................................. + // sqrdmulh v15.4S, v26.4S, v29.4S // ....................................*....................................................................................................... + // mul v2.4S, v26.4S, v7.4S // ..................................................*......................................................................................... + // ldr q14, [x5], #(12*16) // ....................*....................................................................................................................... + // mls v6.4S, v5.4S, v8.S[0] // ......................................................*..................................................................................... + // mls v2.4S, v15.4S, v8.S[0] // ....................................................*....................................................................................... + // sqrdmulh v15.4S, v25.4S, v18.4S // .........................................*.................................................................................................. + // sub v16.4S, v2.4S, v6.4S // ..............................................................*............................................................................. 
+ // mul v13.4S, v25.4S, v14.4S // ..............................................*............................................................................................. + // add v28.4S, v2.4S, v6.4S // .................................................................*.......................................................................... + // sqrdmulh v11.4S, v16.4S, v18.4S // .......................................................................*.................................................................... + // trn2 v23.4S, v31.4S, v28.4S // ........................................................................*................................................................... + // ld4 {v4.4S, v5.4S, v6.4S, v7.4S}, [x2] // .*.......................................................................................................................................... + // mls v13.4S, v15.4S, v8.S[0] // .........................................................*.................................................................................. + // trn1 v17.4S, v31.4S, v28.4S // .........................................................................*.................................................................. + // mul v0.4S, v16.4S, v14.4S // ....................................................................*....................................................................... + // sub v26.4S, v4.4S, v5.4S // ..........*................................................................................................................................. + // mls v0.4S, v11.4S, v8.S[0] // .............................................................................*.............................................................. + // add v31.4S, v6.4S, v7.4S // ............*............................................................................................................................... 
+ // add v24.4S, v4.4S, v5.4S // .............*.............................................................................................................................. + // ldr q11, [x5, #-32] // ..............*............................................................................................................................. + // ldr q18, [x5, #-80] // ....*....................................................................................................................................... + // mul v9.4S, v26.4S, v22.4S // ........................*................................................................................................................... + // ldr q5, [x4, #32] // .............................................*.............................................................................................. + // sub v16.4S, v6.4S, v7.4S // ...............*............................................................................................................................ + // sqrdmulh v30.4S, v26.4S, v12.4S // ..................*......................................................................................................................... + // sub v15.4S, v24.4S, v31.4S // .....................*...................................................................................................................... + // trn1 v26.4S, v13.4S, v0.4S // ................................................................................*........................................................... + // add v24.4S, v24.4S, v31.4S // ...............................*............................................................................................................ + // sqrdmulh v27.4S, v16.4S, v10.4S // ..........................*................................................................................................................. 
+ // trn2 v6.4S, v13.4S, v0.4S // .................................................................................*.......................................................... + // sqrdmulh v25.4S, v15.4S, v18.4S // ............................*............................................................................................................... + // trn1 v19.2D, v17.2D, v26.2D // ....................................................................................*....................................................... + // mls v9.4S, v30.4S, v8.S[0] // ..............................*............................................................................................................. + // trn1 v28.2D, v23.2D, v6.2D // ...........................................................................................*................................................ + // ldr q7, [x4, #16] // ..................................................................*......................................................................... + // mul v0.4S, v15.4S, v3.4S // .................................*.......................................................................................................... + // trn2 v15.2D, v23.2D, v6.2D // ................................................................................................*........................................... + // sub v4.4S, v19.4S, v28.4S // .............................................................................................*.............................................. + // trn2 v23.2D, v17.2D, v26.2D // ..............................................................................................*............................................. + // mls v0.4S, v25.4S, v8.S[0] // ..................................*......................................................................................................... 
+ // add v13.4S, v19.4S, v28.4S // ..................................................................................................*......................................... + // sqrdmulh v26.4S, v4.4S, v7.S[3] // ...................................................................................................*........................................ + // add v1.4S, v23.4S, v15.4S // ....................................................................................................*....................................... + // mul v31.4S, v16.4S, v11.4S // .............................*.............................................................................................................. + // sub v29.4S, v23.4S, v15.4S // .....................................................................................................*...................................... + // mls v31.4S, v27.4S, v8.S[0] // ................................*........................................................................................................... + // sub v27.4S, v13.4S, v1.4S // ........................................................................................................*................................... + // ldr q20, [x4], #64 // ......................................................................................................*..................................... + // add v25.4S, v13.4S, v1.4S // .........................................................................................................*.................................. + // sqrdmulh v6.4S, v29.4S, v5.S[1] // ..........................................................................................................*................................. + // mul v28.4S, v4.4S, v7.S[2] // .................................................................................................*.......................................... 
+ // srshr v15.4S, v25.4S, #23 // ............................................................................................................*............................... + // sub v17.4S, v9.4S, v31.4S // ...................................*........................................................................................................ + // mul v13.4S, v29.4S, v5.S[0] // ...........................................................................................................*................................ + // mls v13.4S, v6.4S, v8.S[0] // ..............................................................................................................*............................. + // sqrdmulh v16.4S, v17.4S, v18.4S // ......................................*..................................................................................................... + // add v10.4S, v9.4S, v31.4S // .....................................*...................................................................................................... + // sqrdmulh v23.4S, v27.4S, v20.S[3] // .............................................................................................................*.............................. + // mls v28.4S, v26.4S, v8.S[0] // ...............................................................................................................*............................ + // trn1 v30.4S, v24.4S, v10.4S // .......................................*.................................................................................................... + // mul v12.4S, v17.4S, v3.4S // ..........................................*................................................................................................. + // mls v12.4S, v16.4S, v8.S[0] // ...........................................*................................................................................................ 
+ // sub v22.4S, v28.4S, v13.4S // ..................................................................................................................*......................... + // mls v25.4S, v15.4S, v8.4S // ................................................................................................................*........................... + // trn2 v15.4S, v24.4S, v10.4S // ........................................*................................................................................................... + // add v31.4S, v28.4S, v13.4S // ....................................................................................................................*....................... + // mul v1.4S, v27.4S, v20.S[2] // .................................................................................................................*.......................... + // trn1 v2.4S, v0.4S, v12.4S // ................................................*........................................................................................... + // mls v1.4S, v23.4S, v8.S[0] // ...................................................................................................................*........................ + // trn2 v10.4S, v0.4S, v12.4S // .................................................*.......................................................................................... + // srshr v28.4S, v31.4S, #23 // ........................................................................................................................*................... + // trn2 v11.2D, v30.2D, v2.2D // ...................................................*........................................................................................ + // sqrdmulh v19.4S, v22.4S, v20.S[3] // .......................................................................................................................*.................... 
+ // trn2 v12.2D, v15.2D, v10.2D // .....................................................*...................................................................................... + // trn1 v14.2D, v30.2D, v2.2D // ...........................................................*................................................................................ + // mul v6.4S, v22.4S, v20.S[2] // .........................................................................................................................*.................. + // trn1 v2.2D, v15.2D, v10.2D // ..........................................................*................................................................................. + // mls v31.4S, v28.4S, v8.4S // .............................................................................................................................*.............. + // sub v13.4S, v11.4S, v12.4S // ........................................................*................................................................................... + // add v23.4S, v11.4S, v12.4S // ............................................................*............................................................................... + // mls v6.4S, v19.4S, v8.S[0] // ...............................................................................................................................*............ + // sub v17.4S, v14.4S, v2.4S // ...................................................................*........................................................................ + // add v26.4S, v14.4S, v2.4S // ................................................................*........................................................................... + // mul v24.4S, v13.4S, v21.S[0] // ...............................................................*............................................................................ 
+ // sqrdmulh v19.4S, v17.4S, v5.S[3] // ............................................................................*............................................................... + // add v16.4S, v26.4S, v23.4S // ......................................................................*..................................................................... + // sub v18.4S, v26.4S, v23.4S // .....................................................................*...................................................................... + // sqrdmulh v23.4S, v13.4S, v21.S[1] // .............................................................*.............................................................................. + // srshr v27.4S, v16.4S, #23 // ...........................................................................*................................................................ + // mul v17.4S, v17.4S, v5.S[2] // ..............................................................................*............................................................. + // mls v17.4S, v19.4S, v8.S[0] // ...............................................................................*............................................................ + // sqrdmulh v2.4S, v18.4S, v7.S[1] // ..................................................................................*......................................................... + // mls v24.4S, v23.4S, v8.S[0] // ..........................................................................*................................................................. + // mul v10.4S, v18.4S, v7.S[0] // ...................................................................................*........................................................ + // mls v16.4S, v27.4S, v8.4S // ......................................................................................*..................................................... 
+ // sub v19.4S, v17.4S, v24.4S // .....................................................................................*...................................................... + // mls v10.4S, v2.4S, v8.S[0] // ..........................................................................................*................................................. + // add v2.4S, v17.4S, v24.4S // .......................................................................................*.................................................... + // mul v14.4S, v19.4S, v7.S[0] // ........................................................................................*................................................... + // sub v12.4S, v25.4S, v16.4S // .....................................................................................................................*...................... + // sqrdmulh v24.4S, v19.4S, v7.S[1] // ............................................................................................*............................................... + // add v9.4S, v25.4S, v16.4S // ......................................................................................................................*..................... + // srshr v16.4S, v2.4S, #23 // .........................................................................................*.................................................. + // sqrdmulh v0.4S, v12.4S, v20.S[1] // ..............................................................................................................................*............. + // sub v19.4S, v1.4S, v10.4S // ...........................................................................................................................*................ + // str q9, [x1], #(16*4) // ..........................................................................................................................*................. 
+ // add v9.4S, v1.4S, v10.4S // ............................................................................................................................*............... + // mls v2.4S, v16.4S, v8.4S // ...............................................................................................*............................................ + // mls v14.4S, v24.4S, v8.S[0] // .......................................................................................................*.................................... + // mul v13.4S, v12.4S, v20.S[0] // ..................................................................................................................................*......... + // sub v15.4S, v31.4S, v2.4S // ................................................................................................................................*........... + // add v2.4S, v31.4S, v2.4S // .................................................................................................................................*.......... + // mls v13.4S, v0.4S, v8.S[0] // ...................................................................................................................................*........ + // sub v31.4S, v6.4S, v14.4S // .....................................................................................................................................*...... + // add v27.4S, v6.4S, v14.4S // ......................................................................................................................................*..... + // mul v23.4S, v19.4S, v20.S[0] // .......................................................................................................................................*.... + // str q2, [x1, #-48] // ....................................................................................................................................*....... 
+ // sqrdmulh v11.4S, v15.4S, v20.S[1] // ........................................................................................................................................*... + // str q13, [x2], #(16*4) // ..........................................................................................................................................*. + // str q27, [x1, #-16] // .........................................................................................................................................*.. + // sqrdmulh v4.4S, v31.4S, v20.S[1] // ...........................................................................................................................................* sub count, count, #1 layer45678_start: - ldr q14, [x5, #80] // .......e........................................................................................................................................ - sub v17.4S, v3.4S, v28.4S // .......................................................................................................................*........................ - add x1, x1, #64 // ..............................................................................................................................................*. - mul v21.4S, v21.4S, v18.S[0] // ...................................................................................................................................*............ - ldr q2, [x5, #16] // ...e............................................................................................................................................ - ldr q3, [x5], #(12*16) // ..e............................................................................................................................................. - ld4 {v24.4S, v25.4S, v26.4S, v27.4S}, [x1] // e............................................................................................................................................... 
- ldr q28, [x5, #-160] // ....e........................................................................................................................................... - str q31, [x2, #-32] // ............................................................................................................................................*... - ldr q6, [x5, #-144] // .....e.......................................................................................................................................... - mul v31.4S, v17.4S, v18.S[0] // .........................................................................................................................*...................... - // gap // ................................................................................................................................................ - ldr q15, [x5, #-128] // ......e......................................................................................................................................... - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - sqrdmulh v17.4S, v17.4S, v18.S[1] // ..........................................................................................................................*..................... - ldr q29, [x5, #-64] // ..............................e................................................................................................................. - // gap // ................................................................................................................................................ 
- ldr q1, [x5, #-96] // ............................e................................................................................................................... - sub v13.4S, v26.4S, v27.4S // .............e.................................................................................................................................. - ldr q16, [x5, #-80] // .............................e.................................................................................................................. - mls v21.4S, v9.4S, v8.S[0] // .....................................................................................................................................*.......... - add v26.4S, v26.4S, v27.4S // ..............e................................................................................................................................. - ldr q9, [x5, #-48] // ...............................e................................................................................................................ - sub v0.4S, v24.4S, v25.4S // ........e....................................................................................................................................... - ldr q30, [x5, #-32] // ................................e............................................................................................................... - // gap // ................................................................................................................................................ - sqrdmulh v14.4S, v13.4S, v14.4S // ................e............................................................................................................................... - add v24.4S, v24.4S, v25.4S // .........e...................................................................................................................................... 
- ldr q7, [x5, #-16] // .................................e.............................................................................................................. - ldr q18, [x4], #64 // ......................................................................e......................................................................... - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - mul v28.4S, v0.4S, v28.4S // ..........e..................................................................................................................................... - ldr q25, [x4, #-48] // .......................................................................e........................................................................ - // gap // ................................................................................................................................................ - str q21, [x2, #-16] // .............................................................................................................................................*.. - sub v21.4S, v24.4S, v26.4S // ..................e............................................................................................................................. - ldr q12, [x4, #-32] // ........................................................................e....................................................................... - sqrdmulh v6.4S, v0.4S, v6.4S // ...........e.................................................................................................................................... 
- add v24.4S, v24.4S, v26.4S // ...................e............................................................................................................................ - ldr q26, [x4, #-16] // .........................................................................e...................................................................... - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - mul v15.4S, v13.4S, v15.4S // ...............e................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - mls v15.4S, v14.4S, v8.S[0] // .................e.............................................................................................................................. 
- // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - mls v28.4S, v6.4S, v8.S[0] // ............e................................................................................................................................... - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - mls v31.4S, v17.4S, v8.S[0] // ...........................................................................................................................*.................... 
- // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - mul v17.4S, v21.4S, v3.4S // ....................e........................................................................................................................... - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - sub v14.4S, v28.4S, v15.4S // .......................e........................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - sqrdmulh v6.4S, v21.4S, v2.4S // .....................e.......................................................................................................................... 
- add v28.4S, v28.4S, v15.4S // ........................e....................................................................................................................... - // gap // ................................................................................................................................................ - str q31, [x2, #-48] // ...........................................................................................................................................*.... - add x2, x2, #64 // ...............................................................................................................................................* - // gap // ................................................................................................................................................ - sqrdmulh v2.4S, v14.4S, v2.4S // ..........................e..................................................................................................................... - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - ld4 {v19.4S, v20.4S, v21.4S, v22.4S}, [x2] // .e.............................................................................................................................................. - trn1 v31.4S, v24.4S, v28.4S // ......................................................e......................................................................................... - // gap // ................................................................................................................................................ 
- mul v14.4S, v14.4S, v3.4S // .........................e...................................................................................................................... - trn2 v3.4S, v24.4S, v28.4S // .......................................................e........................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - mls v17.4S, v6.4S, v8.S[0] // ......................e......................................................................................................................... - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - sub v24.4S, v19.4S, v20.4S // ..................................e............................................................................................................. - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ 
- mls v14.4S, v2.4S, v8.S[0] // ...........................e.................................................................................................................... - add v2.4S, v19.4S, v20.4S // ...................................e............................................................................................................ - // gap // ................................................................................................................................................ - sub v28.4S, v21.4S, v22.4S // .......................................e........................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - mul v6.4S, v24.4S, v29.4S // ....................................e........................................................................................................... - add v21.4S, v21.4S, v22.4S // ........................................e....................................................................................................... - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ 
- sqrdmulh v24.4S, v24.4S, v9.4S // .....................................e.......................................................................................................... - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - sub v15.4S, v2.4S, v21.4S // ............................................e................................................................................................... - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - sqrdmulh v29.4S, v28.4S, v7.4S // ..........................................e..................................................................................................... - trn1 v13.4S, v17.4S, v14.4S // ........................................................e....................................................................................... - // gap // ................................................................................................................................................ - trn2 v17.4S, v17.4S, v14.4S // .........................................................e...................................................................................... - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ 
- mul v14.4S, v28.4S, v30.4S // .........................................e...................................................................................................... - add v21.4S, v2.4S, v21.4S // .............................................e.................................................................................................. - // gap // ................................................................................................................................................ - trn1 v2.2D, v31.2D, v13.2D // ............................................................e................................................................................... - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - mls v6.4S, v24.4S, v8.S[0] // ......................................e......................................................................................................... - trn1 v24.2D, v3.2D, v17.2D // .............................................................e.................................................................................. - // gap // ................................................................................................................................................ - trn2 v28.2D, v31.2D, v13.2D // ..........................................................e..................................................................................... - // gap // ................................................................................................................................................ 
- // gap // ................................................................................................................................................ - trn2 v17.2D, v3.2D, v17.2D // ...........................................................e.................................................................................... - mls v14.4S, v29.4S, v8.S[0] // ...........................................e.................................................................................................... - // gap // ................................................................................................................................................ - sub v3.4S, v2.4S, v24.4S // ..........................................................................e..................................................................... - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - add v2.4S, v2.4S, v24.4S // ...........................................................................e.................................................................... - mul v24.4S, v15.4S, v1.4S // ..............................................e................................................................................................. - // gap // ................................................................................................................................................ - sub v31.4S, v28.4S, v17.4S // ...............................................................................e................................................................ - // gap // ................................................................................................................................................ 
- // gap // ................................................................................................................................................ - add v17.4S, v28.4S, v17.4S // ................................................................................e............................................................... - mul v28.4S, v3.4S, v25.S[2] // ............................................................................e................................................................... - // gap // ................................................................................................................................................ - sub v29.4S, v6.4S, v14.4S // .................................................e.............................................................................................. - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - sqrdmulh v15.4S, v15.4S, v16.4S // ...............................................e................................................................................................ - add v14.4S, v6.4S, v14.4S // ..................................................e............................................................................................. - // gap // ................................................................................................................................................ - sub v6.4S, v2.4S, v17.4S // ..............................................................................................e................................................. 
- // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - mul v1.4S, v29.4S, v1.4S // ...................................................e............................................................................................ - add v17.4S, v2.4S, v17.4S // ...............................................................................................e................................................ - // gap // ................................................................................................................................................ - trn1 v2.4S, v21.4S, v14.4S // ..............................................................e................................................................................. - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - sqrdmulh v29.4S, v29.4S, v16.4S // ....................................................e........................................................................................... - trn2 v21.4S, v21.4S, v14.4S // ...............................................................e................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ 
- // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - mls v24.4S, v15.4S, v8.S[0] // ................................................e............................................................................................... - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - sqrdmulh v14.4S, v3.4S, v25.S[3] // .............................................................................e.................................................................. - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ 
- // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - mls v1.4S, v29.4S, v8.S[0] // .....................................................e.......................................................................................... - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - mul v3.4S, v31.4S, v12.S[0] // .................................................................................e.............................................................. - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ 
- // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - mls v28.4S, v14.4S, v8.S[0] // ..............................................................................e................................................................. - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - trn1 v14.4S, v24.4S, v1.4S // ................................................................e............................................................................... - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - sqrdmulh v31.4S, v31.4S, v12.S[1] // ..................................................................................e............................................................. - trn2 v24.4S, v24.4S, v1.4S // .................................................................e.............................................................................. - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ 
- // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - sqrdmulh v1.4S, v6.4S, v18.S[3] // .................................................................................................e.............................................. - trn2 v15.2D, v2.2D, v14.2D // ..................................................................e............................................................................. - // gap // ................................................................................................................................................ - trn1 v14.2D, v2.2D, v14.2D // ....................................................................e........................................................................... - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - mul v2.4S, v6.4S, v18.S[2] // ................................................................................................e............................................... - trn1 v6.2D, v21.2D, v24.2D // .....................................................................e.......................................................................... - // gap // ................................................................................................................................................ 
- trn2 v21.2D, v21.2D, v24.2D // ...................................................................e............................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - mls v3.4S, v31.4S, v8.S[0] // ...................................................................................e............................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - sub v24.4S, v14.4S, v6.4S // ....................................................................................e........................................................... - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - mls v2.4S, v1.4S, v8.S[0] // ..................................................................................................e............................................. - add v14.4S, v14.4S, v6.4S // .....................................................................................e.......................................................... - // gap // ................................................................................................................................................ 
- sub v6.4S, v15.4S, v21.4S // .........................................................................................e...................................................... - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - add v21.4S, v15.4S, v21.4S // ..........................................................................................e..................................................... - mul v31.4S, v24.4S, v12.S[2] // ......................................................................................e......................................................... - // gap // ................................................................................................................................................ - sub v1.4S, v28.4S, v3.4S // ...................................................................................................e............................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - sqrdmulh v24.4S, v24.4S, v12.S[3] // .......................................................................................e........................................................ - add v3.4S, v28.4S, v3.4S // ....................................................................................................e........................................... 
- // gap // ................................................................................................................................................ - sub v28.4S, v14.4S, v21.4S // ........................................................................................................e....................................... - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - add v21.4S, v14.4S, v21.4S // .........................................................................................................e...................................... - sqrdmulh v14.4S, v6.4S, v26.S[1] // ............................................................................................e................................................... - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - mul v6.4S, v6.4S, v26.S[0] // ...........................................................................................e.................................................... - // gap // ................................................................................................................................................ 
- // gap // ................................................................................................................................................ - sub v26.4S, v17.4S, v21.4S // ..................................................................................................................e............................. - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - add v17.4S, v17.4S, v21.4S // ...................................................................................................................e............................ - mls v31.4S, v24.4S, v8.S[0] // ........................................................................................e....................................................... - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - mls v6.4S, v14.4S, v8.S[0] // .............................................................................................e.................................................. - // gap // ................................................................................................................................................ 
- // gap // ................................................................................................................................................ - str q17, [x1], #(16*4) // ......................................................................................................................................e......... - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - mul v17.4S, v1.4S, v18.S[2] // .....................................................................................................e.......................................... - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - mul v21.4S, v28.4S, v25.S[0] // ..........................................................................................................e..................................... - // gap // ................................................................................................................................................ 
- // gap // ................................................................................................................................................ - sub v14.4S, v31.4S, v6.4S // .............................................................................................................e.................................. - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - sqrdmulh v24.4S, v28.4S, v25.S[1] // ...........................................................................................................e.................................... - add v28.4S, v31.4S, v6.4S // ..............................................................................................................e................................. - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - mul v6.4S, v14.4S, v25.S[0] // ...............................................................................................................e................................ - // gap // ................................................................................................................................................ 
- // gap // ................................................................................................................................................ - add v31.4S, v3.4S, v28.4S // ........................................................................................................................e....................... - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - sqrdmulh v14.4S, v14.4S, v25.S[1] // ................................................................................................................e............................... - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - sqrdmulh v1.4S, v1.4S, v18.S[3] // ......................................................................................................e......................................... - str q31, [x1, #-48] // .......................................................................................................................................e........ 
- // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - mls v21.4S, v24.4S, v8.S[0] // ............................................................................................................e................................... - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - mls v6.4S, v14.4S, v8.S[0] // .................................................................................................................e.............................. - // gap // ................................................................................................................................................ 
- // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - mls v17.4S, v1.4S, v8.S[0] // .......................................................................................................e........................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - sub v14.4S, v2.4S, v21.4S // ............................................................................................................................e................... - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - add v21.4S, v2.4S, v21.4S // .............................................................................................................................e.................. - sqrdmulh v2.4S, v26.4S, v18.S[1] // .....................................................................................................................e.......................... 
- // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - mul v24.4S, v26.4S, v18.S[0] // ....................................................................................................................e........................... - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - str q21, [x1, #-32] // ........................................................................................................................................e....... - sub v21.4S, v17.4S, v6.4S // .................................................................................................................................e.............. - // gap // ................................................................................................................................................ - add v17.4S, v17.4S, v6.4S // ..................................................................................................................................e............. - sqrdmulh v6.4S, v14.4S, v18.S[1] // ...............................................................................................................................e................ 
- // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - mls v24.4S, v2.4S, v8.S[0] // ......................................................................................................................e......................... - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - str q17, [x1, #-16] // .........................................................................................................................................e...... - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - mul v31.4S, v14.4S, v18.S[0] // ..............................................................................................................................e................. - // gap // ................................................................................................................................................ 
- // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - mls v31.4S, v6.4S, v8.S[0] // ................................................................................................................................e............... - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - str q24, [x2], #(16*4) // ..........................................................................................................................................e..... - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - sqrdmulh v9.4S, v21.4S, v18.S[1] // ....................................................................................................................................e........... - // gap // ................................................................................................................................................ 
- // gap // ................................................................................................................................................ + ldr q21, [x4, #48] // .........................................................................e.............................................................................. + ldr q24, [x5, #64] // ......e................................................................................................................................................. + // gap // ........................................................................................................................................................ + ldr q3, [x5, #96] // ............................e........................................................................................................................... + ldr q10, [x5, #176] // .................................e...................................................................................................................... + mul v30.4S, v31.4S, v20.S[0] // ...........................................................................................................................................*............ + ldr q7, [x5, #32] // ....e................................................................................................................................................... + str q9, [x1, #-32] // ................................................................................................................................................*....... + add x1, x1, #64 // ......................................................................................................................................................*. + ldr q12, [x5, #144] // ...............................e........................................................................................................................ 
+ sqrdmulh v0.4S, v19.4S, v20.S[1] // .......................................................................................................................................*................ + // gap // ........................................................................................................................................................ + ldr q22, [x5, #128] // ..............................e......................................................................................................................... + ld4 {v25.4S, v26.4S, v27.4S, v28.4S}, [x1] // e....................................................................................................................................................... + // gap // ........................................................................................................................................................ + ldr q29, [x5, #48] // .....e.................................................................................................................................................. + mul v1.4S, v15.4S, v20.S[0] // .................................................................................................................................*...................... + ldr q20, [x5, #80] // .......e................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ 
+ ldr q18, [x5, #16] // ...e.................................................................................................................................................... + mls v30.4S, v4.4S, v8.S[0] // .............................................................................................................................................*.......... + // gap // ........................................................................................................................................................ + sub v9.4S, v27.4S, v28.4S // .............e.......................................................................................................................................... + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + add v27.4S, v27.4S, v28.4S // ..............e......................................................................................................................................... + mls v1.4S, v11.4S, v8.S[0] // ...................................................................................................................................*.................... + // gap // ........................................................................................................................................................ + add v19.4S, v25.4S, v26.4S // .........e.............................................................................................................................................. + // gap // ........................................................................................................................................................ 
+ // gap // ........................................................................................................................................................ + sub v26.4S, v25.4S, v26.4S // ........e............................................................................................................................................... + mul v6.4S, v9.4S, v24.4S // ...............e........................................................................................................................................ + // gap // ........................................................................................................................................................ + str q30, [x2, #-16] // .....................................................................................................................................................*.. + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + sqrdmulh v5.4S, v9.4S, v20.4S // ................e....................................................................................................................................... + add v31.4S, v19.4S, v27.4S // ...................e.................................................................................................................................... + // gap // ........................................................................................................................................................ + str q1, [x2, #-48] // ...................................................................................................................................................*.... 
+ sub v25.4S, v19.4S, v27.4S // ..................e..................................................................................................................................... + // gap // ........................................................................................................................................................ + sqrdmulh v15.4S, v26.4S, v29.4S // ...........e............................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + mul v2.4S, v26.4S, v7.4S // ..........e............................................................................................................................................. + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ 
+ ldr q14, [x5], #(12*16) // ..e..................................................................................................................................................... + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + mls v6.4S, v5.4S, v8.S[0] // .................e...................................................................................................................................... + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + mls v2.4S, v15.4S, v8.S[0] // ............e........................................................................................................................................... + // gap // ........................................................................................................................................................ 
+ // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + mls v23.4S, v0.4S, v8.S[0] // ........................................................................................................................................*............... + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + sqrdmulh v15.4S, v25.4S, v18.4S // .....................e.................................................................................................................................. 
+ // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + sub v16.4S, v2.4S, v6.4S // .......................e................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + mul v13.4S, v25.4S, v14.4S // ....................e................................................................................................................................... + add v28.4S, v2.4S, v6.4S // ........................e............................................................................................................................... + // gap // ........................................................................................................................................................ + str q23, [x2, #-32] // ....................................................................................................................................................*... + add x2, x2, #64 // .......................................................................................................................................................* + // gap // ........................................................................................................................................................ 
+ sqrdmulh v11.4S, v16.4S, v18.4S // ..........................e............................................................................................................................. + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + trn2 v23.4S, v31.4S, v28.4S // .......................................................e................................................................................................ + ld4 {v4.4S, v5.4S, v6.4S, v7.4S}, [x2] // .e...................................................................................................................................................... + // gap // ........................................................................................................................................................ + mls v13.4S, v15.4S, v8.S[0] // ......................e................................................................................................................................. + trn1 v17.4S, v31.4S, v28.4S // ......................................................e................................................................................................. + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ 
+ // gap // ........................................................................................................................................................ + mul v0.4S, v16.4S, v14.4S // .........................e.............................................................................................................................. + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + sub v26.4S, v4.4S, v5.4S // ..................................e..................................................................................................................... + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + mls v0.4S, v11.4S, v8.S[0] // ...........................e............................................................................................................................ + add v31.4S, v6.4S, v7.4S // ........................................e............................................................................................................... + // gap // ........................................................................................................................................................ + add v24.4S, v4.4S, v5.4S // ...................................e.................................................................................................................... 
+ ldr q11, [x5, #-32] // ................................e....................................................................................................................... + ldr q18, [x5, #-80] // .............................e.......................................................................................................................... + mul v9.4S, v26.4S, v22.4S // ....................................e................................................................................................................... + ldr q5, [x4, #32] // ........................................................................e............................................................................... + // gap // ........................................................................................................................................................ + sub v16.4S, v6.4S, v7.4S // .......................................e................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + sqrdmulh v30.4S, v26.4S, v12.4S // .....................................e.................................................................................................................. + sub v15.4S, v24.4S, v31.4S // ............................................e........................................................................................................... + // gap // ........................................................................................................................................................ 
+ trn1 v26.4S, v13.4S, v0.4S // ........................................................e............................................................................................... + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + add v24.4S, v24.4S, v31.4S // .............................................e.......................................................................................................... + sqrdmulh v27.4S, v16.4S, v10.4S // ..........................................e............................................................................................................. + // gap // ........................................................................................................................................................ + trn2 v6.4S, v13.4S, v0.4S // .........................................................e.............................................................................................. + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + sqrdmulh v25.4S, v15.4S, v18.4S // ...............................................e........................................................................................................ + trn1 v19.2D, v17.2D, v26.2D // ............................................................e........................................................................................... 
+ // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + mls v9.4S, v30.4S, v8.S[0] // ......................................e................................................................................................................. + trn1 v28.2D, v23.2D, v6.2D // .............................................................e.......................................................................................... + ldr q7, [x4, #16] // .......................................................................e................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + mul v0.4S, v15.4S, v3.4S // ..............................................e......................................................................................................... 
+ trn2 v15.2D, v23.2D, v6.2D // ...........................................................e............................................................................................ + // gap // ........................................................................................................................................................ + sub v4.4S, v19.4S, v28.4S // ..........................................................................e............................................................................. + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + trn2 v23.2D, v17.2D, v26.2D // ..........................................................e............................................................................................. + mls v0.4S, v25.4S, v8.S[0] // ................................................e....................................................................................................... + // gap // ........................................................................................................................................................ + add v13.4S, v19.4S, v28.4S // ...........................................................................e............................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ 
+ sqrdmulh v26.4S, v4.4S, v7.S[3] // .............................................................................e.......................................................................... + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + add v1.4S, v23.4S, v15.4S // ................................................................................e....................................................................... + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + mul v31.4S, v16.4S, v11.4S // .........................................e.............................................................................................................. + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + sub v29.4S, v23.4S, v15.4S // ...............................................................................e........................................................................ + // gap // ........................................................................................................................................................ 
+ // gap // ........................................................................................................................................................ + mls v31.4S, v27.4S, v8.S[0] // ...........................................e............................................................................................................ + sub v27.4S, v13.4S, v1.4S // ..............................................................................................e......................................................... + ldr q20, [x4], #64 // ......................................................................e................................................................................. + add v25.4S, v13.4S, v1.4S // ...............................................................................................e........................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + sqrdmulh v6.4S, v29.4S, v5.S[1] // ..................................................................................e..................................................................... + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ 
+ // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + mul v28.4S, v4.4S, v7.S[2] // ............................................................................e........................................................................... + srshr v15.4S, v25.4S, #23 // ..................................................................................................................e..................................... + // gap // ........................................................................................................................................................ + sub v17.4S, v9.4S, v31.4S // .................................................e...................................................................................................... + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + mul v13.4S, v29.4S, v5.S[0] // .................................................................................e...................................................................... + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ 
+ // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + mls v13.4S, v6.4S, v8.S[0] // ...................................................................................e.................................................................... + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + sqrdmulh v16.4S, v17.4S, v18.4S // ....................................................e................................................................................................... + // gap // ........................................................................................................................................................ 
+ // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + add v10.4S, v9.4S, v31.4S // ..................................................e..................................................................................................... + sqrdmulh v23.4S, v27.4S, v20.S[3] // .................................................................................................e...................................................... + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + mls v28.4S, v26.4S, v8.S[0] // ..............................................................................e......................................................................... 
+ // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + trn1 v30.4S, v24.4S, v10.4S // ..............................................................e......................................................................................... + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + mul v12.4S, v17.4S, v3.4S // ...................................................e.................................................................................................... + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ 
+ mls v12.4S, v16.4S, v8.S[0] // .....................................................e.................................................................................................. + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + sub v22.4S, v28.4S, v13.4S // ...................................................................................................e.................................................... + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + mls v25.4S, v15.4S, v8.4S // ...................................................................................................................e.................................... + trn2 v15.4S, v24.4S, v10.4S // ...............................................................e........................................................................................ + // gap // ........................................................................................................................................................ + add v31.4S, v28.4S, v13.4S // ....................................................................................................e................................................... + // gap // ........................................................................................................................................................ 
+ // gap // ........................................................................................................................................................ + mul v1.4S, v27.4S, v20.S[2] // ................................................................................................e....................................................... + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + trn1 v2.4S, v0.4S, v12.4S // ................................................................e....................................................................................... + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + mls v1.4S, v23.4S, v8.S[0] // ..................................................................................................e..................................................... + trn2 v10.4S, v0.4S, v12.4S // .................................................................e...................................................................................... + // gap // ........................................................................................................................................................ + srshr v28.4S, v31.4S, #23 // ....................................................................................................................e................................... 
+ // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + trn2 v11.2D, v30.2D, v2.2D // ..................................................................e..................................................................................... + sqrdmulh v19.4S, v22.4S, v20.S[3] // ......................................................................................................e................................................. + // gap // ........................................................................................................................................................ + trn2 v12.2D, v15.2D, v10.2D // ...................................................................e.................................................................................... + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + trn1 v14.2D, v30.2D, v2.2D // ....................................................................e................................................................................... + mul v6.4S, v22.4S, v20.S[2] // .....................................................................................................e.................................................. + // gap // ........................................................................................................................................................ 
+ trn1 v2.2D, v15.2D, v10.2D // .....................................................................e.................................................................................. + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + mls v31.4S, v28.4S, v8.4S // .....................................................................................................................e.................................. + sub v13.4S, v11.4S, v12.4S // .........................................................................................e.............................................................. + // gap // ........................................................................................................................................................ + add v23.4S, v11.4S, v12.4S // ..........................................................................................e............................................................. + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + mls v6.4S, v19.4S, v8.S[0] // .......................................................................................................e................................................ + sub v17.4S, v14.4S, v2.4S // ....................................................................................e................................................................... 
+ // gap // ........................................................................................................................................................ + add v26.4S, v14.4S, v2.4S // .....................................................................................e.................................................................. + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + mul v24.4S, v13.4S, v21.S[0] // ...........................................................................................e............................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + sqrdmulh v19.4S, v17.4S, v5.S[3] // .......................................................................................e................................................................ 
+ add v16.4S, v26.4S, v23.4S // .........................................................................................................e.............................................. + // gap // ........................................................................................................................................................ + sub v18.4S, v26.4S, v23.4S // ........................................................................................................e............................................... + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + sqrdmulh v23.4S, v13.4S, v21.S[1] // ............................................................................................e........................................................... + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + srshr v27.4S, v16.4S, #23 // ......................................................................................................................e................................. + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ 
+ mul v17.4S, v17.4S, v5.S[2] // ......................................................................................e................................................................. + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + mls v17.4S, v19.4S, v8.S[0] // ........................................................................................e............................................................... + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ 
+ // gap // ........................................................................................................................................................ + sqrdmulh v2.4S, v18.4S, v7.S[1] // ...........................................................................................................e............................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + mls v24.4S, v23.4S, v8.S[0] // .............................................................................................e.......................................................... + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ 
+ // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + mul v10.4S, v18.4S, v7.S[0] // ..........................................................................................................e............................................. + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + mls v16.4S, v27.4S, v8.4S // .......................................................................................................................e................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ 
+ sub v19.4S, v17.4S, v24.4S // .............................................................................................................e.......................................... + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + mls v10.4S, v2.4S, v8.S[0] // ............................................................................................................e........................................... + add v2.4S, v17.4S, v24.4S // ..............................................................................................................e......................................... + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + mul v14.4S, v19.4S, v7.S[0] // ...............................................................................................................e........................................ + // gap // ........................................................................................................................................................ 
+ // gap // ........................................................................................................................................................ + sub v12.4S, v25.4S, v16.4S // ..........................................................................................................................e............................. + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + sqrdmulh v24.4S, v19.4S, v7.S[1] // ................................................................................................................e....................................... + add v9.4S, v25.4S, v16.4S // ...........................................................................................................................e............................ + // gap // ........................................................................................................................................................ + srshr v16.4S, v2.4S, #23 // ........................................................................................................................e............................... + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + sqrdmulh v0.4S, v12.4S, v20.S[1] // .............................................................................................................................e.......................... 
+ sub v19.4S, v1.4S, v10.4S // ....................................................................................................................................e................... + // gap // ........................................................................................................................................................ + str q9, [x1], #(16*4) // ..............................................................................................................................................e......... + add v9.4S, v1.4S, v10.4S // .....................................................................................................................................e.................. + // gap // ........................................................................................................................................................ + mls v2.4S, v16.4S, v8.4S // .........................................................................................................................e.............................. + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ 
+ mls v14.4S, v24.4S, v8.S[0] // .................................................................................................................e...................................... + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + mul v13.4S, v12.4S, v20.S[0] // ............................................................................................................................e........................... + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + sub v15.4S, v31.4S, v2.4S // ...............................................................................................................................e........................ + // gap // ........................................................................................................................................................ 
+ // gap // ........................................................................................................................................................ + add v2.4S, v31.4S, v2.4S // ................................................................................................................................e....................... + mls v13.4S, v0.4S, v8.S[0] // ..............................................................................................................................e......................... + // gap // ........................................................................................................................................................ + sub v31.4S, v6.4S, v14.4S // .........................................................................................................................................e.............. + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + add v27.4S, v6.4S, v14.4S // ..........................................................................................................................................e............. + mul v23.4S, v19.4S, v20.S[0] // ......................................................................................................................................e................. + // gap // ........................................................................................................................................................ + str q2, [x1, #-48] // ...............................................................................................................................................e........ 
+ // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + sqrdmulh v11.4S, v15.4S, v20.S[1] // ..................................................................................................................................e..................... + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + str q13, [x2], #(16*4) // ..................................................................................................................................................e..... + str q27, [x1, #-16] // .................................................................................................................................................e...... + // gap // ........................................................................................................................................................ + sqrdmulh v4.4S, v31.4S, v20.S[1] // ............................................................................................................................................e........... + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ 
// original source code - // ld4 {v9.4S, v10.4S, v11.4S, v12.4S}, [x1] // ......e.........................................................................................................................................|.....e..................................... - // ld4 {v13.4S, v14.4S, v15.4S, v16.4S}, [x2] // .............................................e..................................................................................................|........................................... - // ldr q0, [x5], #(12*16) // .....e..........................................................................................................................................|....e...................................... - // ldr q4, [x5, #(-12*16 + 1*16)] // ....e...........................................................................................................................................|...e....................................... - // ldr q1, [x5, #(-12*16 + 2*16)] // .......e........................................................................................................................................|......e.................................... - // ldr q5, [x5, #(-12*16 + 3*16)] // .........e......................................................................................................................................|........e.................................. - // ldr q2, [x5, #(-12*16 + 4*16)] // ...........e....................................................................................................................................|..........e................................ - // ldr q6, [x5, #(-12*16 + 5*16)] // e...............................................................................................................................................e........................................... 
- // sub v24.4s, v9.4s, v10.4s // ....................e...........................................................................................................................|...................e....................... - // add v9.4s, v9.4s, v10.4s // .......................e........................................................................................................................|......................e.................... - // mul v10.4s, v24.4s, v1.4s // ..........................e.....................................................................................................................|.........................e................. - // sqrdmulh v24.4s, v24.4s, v5.4s // ...............................e................................................................................................................|..............................e............ - // mls v10.4s, v24.4s, v8.s[0] // ....................................e...........................................................................................................|...................................e....... - // sub v24.4s, v11.4s, v12.4s // ...............e................................................................................................................................|..............e............................ - // add v11.4s, v11.4s, v12.4s // ..................e.............................................................................................................................|.................e......................... - // mul v12.4s, v24.4s, v2.4s // ..................................e.............................................................................................................|.................................e......... 
- // sqrdmulh v24.4s, v24.4s, v6.4s // ......................e.........................................................................................................................|.....................e..................... - // mls v12.4s, v24.4s, v8.s[0] // ...................................e............................................................................................................|..................................e........ - // sub v24.4s, v9.4s, v11.4s // .............................e..................................................................................................................|............................e.............. - // add v9.4s, v9.4s, v11.4s // ................................e...............................................................................................................|...............................e........... - // mul v11.4s, v24.4s, v0.4s // ......................................e.........................................................................................................|.....................................e..... - // sqrdmulh v24.4s, v24.4s, v4.4s // ........................................e.......................................................................................................|.......................................e... - // mls v11.4s, v24.4s, v8.s[0] // .................................................e..............................................................................................|........................................... - // sub v24.4s, v10.4s, v12.4s // .......................................e........................................................................................................|......................................e.... 
- // add v10.4s, v10.4s, v12.4s // .........................................e......................................................................................................|........................................e.. - // mul v12.4s, v24.4s, v0.4s // ...............................................e................................................................................................|........................................... - // sqrdmulh v24.4s, v24.4s, v4.4s // ............................................e...................................................................................................|........................................... - // mls v12.4s, v24.4s, v8.s[0] // ...................................................e............................................................................................|........................................... - // ldr q0, [x5, #(-12*16 + 6*16)] // ..............e.................................................................................................................................|.............e............................. - // ldr q4, [x5, #(-12*16 + 7*16)] // ................e...............................................................................................................................|...............e........................... - // ldr q1, [x5, #(-12*16 + 8*16)] // .............e..................................................................................................................................|............e.............................. - // ldr q5, [x5, #(-12*16 + 9*16)] // ...................e............................................................................................................................|..................e........................ 
- // ldr q2, [x5, #(-12*16 + 10*16)] // .....................e..........................................................................................................................|....................e...................... - // ldr q6, [x5, #(-12*16 + 11*16)] // ........................e.......................................................................................................................|.......................e................... - // sub v24.4s, v13.4s, v14.4s // ..................................................e.............................................................................................|........................................... - // add v13.4s, v13.4s, v14.4s // ....................................................e...........................................................................................|........................................... - // mul v14.4s, v24.4s, v1.4s // ......................................................e.........................................................................................|........................................... - // sqrdmulh v24.4s, v24.4s, v5.4s // ........................................................e.......................................................................................|........................................... - // mls v14.4s, v24.4s, v8.s[0] // ................................................................e...............................................................................|........................................... - // sub v24.4s, v15.4s, v16.4s // .....................................................e..........................................................................................|........................................... 
- // add v15.4s, v15.4s, v16.4s // .......................................................e........................................................................................|........................................... - // mul v16.4s, v24.4s, v2.4s // .............................................................e..................................................................................|........................................... - // sqrdmulh v24.4s, v24.4s, v6.4s // ..........................................................e.....................................................................................|........................................... - // mls v16.4s, v24.4s, v8.s[0] // ....................................................................e...........................................................................|........................................... - // sub v24.4s, v13.4s, v15.4s // .........................................................e......................................................................................|........................................... - // add v13.4s, v13.4s, v15.4s // ..............................................................e.................................................................................|........................................... - // mul v15.4s, v24.4s, v0.4s // .......................................................................e........................................................................|........................................... - // sqrdmulh v24.4s, v24.4s, v4.4s // ............................................................................e...................................................................|........................................... 
- // mls v15.4s, v24.4s, v8.s[0] // ....................................................................................e...........................................................|........................................... - // sub v24.4s, v14.4s, v16.4s // ...........................................................................e....................................................................|........................................... - // add v14.4s, v14.4s, v16.4s // .............................................................................e..................................................................|........................................... - // mul v16.4s, v24.4s, v0.4s // ...............................................................................e................................................................|........................................... - // sqrdmulh v24.4s, v24.4s, v4.4s // ..................................................................................e.............................................................|........................................... - // mls v16.4s, v24.4s, v8.s[0] // ......................................................................................e.........................................................|........................................... - // trn1 v25.4s, v9.4s, v10.4s // ..............................................e.................................................................................................|........................................... - // trn2 v26.4s, v9.4s, v10.4s // ................................................e...............................................................................................|........................................... 
- // trn1 v27.4s, v11.4s, v12.4s // ...........................................................e....................................................................................|........................................... - // trn2 v28.4s, v11.4s, v12.4s // ............................................................e...................................................................................|........................................... - // trn2 v11.2d, v25.2d, v27.2d // ..................................................................e.............................................................................|........................................... - // trn2 v12.2d, v26.2d, v28.2d // ...................................................................e............................................................................|........................................... - // trn1 v9.2d, v25.2d, v27.2d // ...............................................................e................................................................................|........................................... - // trn1 v10.2d, v26.2d, v28.2d // .................................................................e..............................................................................|........................................... - // trn1 v25.4s, v13.4s, v14.4s // .................................................................................e..............................................................|........................................... - // trn2 v26.4s, v13.4s, v14.4s // ...................................................................................e............................................................|........................................... 
- // trn1 v27.4s, v15.4s, v16.4s // .........................................................................................e......................................................|........................................... - // trn2 v28.4s, v15.4s, v16.4s // ...........................................................................................e....................................................|........................................... - // trn2 v15.2d, v25.2d, v27.2d // .............................................................................................e..................................................|........................................... - // trn2 v16.2d, v26.2d, v28.2d // .................................................................................................e..............................................|........................................... - // trn1 v13.2d, v25.2d, v27.2d // ..............................................................................................e.................................................|........................................... - // trn1 v14.2d, v26.2d, v28.2d // ................................................................................................e...............................................|........................................... - // ldr q0, [x4], #64 // .........................e......................................................................................................................|........................e.................. - // ldr q1, [x4, #(-64 + 16)] // ...........................e....................................................................................................................|..........................e................ 
- // ldr q2, [x4, #(-64 + 32)] // ..............................e.................................................................................................................|.............................e............. - // ldr q3, [x4, #(-64 + 48)] // .................................e..............................................................................................................|................................e.......... - // sub v24.4s, v9.4s, v10.4s // .....................................................................e..........................................................................|........................................... - // add v9.4s, v9.4s, v10.4s // ......................................................................e.........................................................................|........................................... - // mul v10.4s, v24.4s, v1.s[2] // ..........................................................................e.....................................................................|........................................... - // sqrdmulh v24.4s, v24.4s, v1.s[3] // .....................................................................................e..........................................................|........................................... - // mls v10.4s, v24.4s, v8.s[0] // ........................................................................................e.......................................................|........................................... - // sub v24.4s, v11.4s, v12.4s // ........................................................................e.......................................................................|........................................... 
- // add v11.4s, v11.4s, v12.4s // .........................................................................e......................................................................|........................................... - // mul v12.4s, v24.4s, v2.s[0] // .......................................................................................e........................................................|........................................... - // sqrdmulh v24.4s, v24.4s, v2.s[1] // ..........................................................................................e.....................................................|........................................... - // mls v12.4s, v24.4s, v8.s[0] // ..................................................................................................e.............................................|........................................... - // sub v24.4s, v13.4s, v14.4s // ...................................................................................................e............................................|........................................... - // add v13.4s, v13.4s, v14.4s // .....................................................................................................e..........................................|........................................... - // mul v14.4s, v24.4s, v2.s[2] // ........................................................................................................e.......................................|........................................... - // sqrdmulh v24.4s, v24.4s, v2.s[3] // ..........................................................................................................e.....................................|........................................... 
- // mls v14.4s, v24.4s, v8.s[0] // ..................................................................................................................e.............................|........................................... - // sub v24.4s, v15.4s, v16.4s // ......................................................................................................e.........................................|........................................... - // add v15.4s, v15.4s, v16.4s // .......................................................................................................e........................................|........................................... - // mul v16.4s, v24.4s, v3.s[0] // ...............................................................................................................e................................|........................................... - // sqrdmulh v24.4s, v24.4s, v3.s[1] // ..............................................................................................................e.................................|........................................... - // mls v16.4s, v24.4s, v8.s[0] // ...................................................................................................................e............................|........................................... - // sub v24.4s, v9.4s, v11.4s // ..............................................................................e.................................................................|........................................... - // add v9.4s, v9.4s, v11.4s // ................................................................................e...............................................................|........................................... 
- // mul v11.4s, v24.4s, v0.s[2] // ...............................................................................................e................................................|........................................... - // sqrdmulh v24.4s, v24.4s, v0.s[3] // ............................................................................................e...................................................|........................................... - // mls v11.4s, v24.4s, v8.s[0] // ....................................................................................................e...........................................|........................................... - // sub v24.4s, v10.4s, v12.4s // .........................................................................................................e......................................|........................................... - // add v10.4s, v10.4s, v12.4s // ...........................................................................................................e....................................|........................................... - // mul v12.4s, v24.4s, v0.s[2] // .....................................................................................................................e..........................|........................................... - // sqrdmulh v24.4s, v24.4s, v0.s[3] // .............................................................................................................................e..................|........................................... - // mls v12.4s, v24.4s, v8.s[0] // .................................................................................................................................e..............|........................................... 
- // sub v24.4s, v13.4s, v15.4s // ............................................................................................................e...................................|........................................... - // add v13.4s, v13.4s, v15.4s // .............................................................................................................e..................................|........................................... - // mul v15.4s, v24.4s, v1.s[0] // ......................................................................................................................e.........................|........................................... - // sqrdmulh v24.4s, v24.4s, v1.s[1] // ........................................................................................................................e.......................|........................................... - // mls v15.4s, v24.4s, v8.s[0] // ...............................................................................................................................e................|........................................... - // sub v24.4s, v14.4s, v16.4s // .......................................................................................................................e........................|........................................... - // add v14.4s, v14.4s, v16.4s // .........................................................................................................................e......................|........................................... - // mul v16.4s, v24.4s, v1.s[0] // ..........................................................................................................................e.....................|........................................... 
- // sqrdmulh v24.4s, v24.4s, v1.s[1] // ............................................................................................................................e...................|........................................... - // mls v16.4s, v24.4s, v8.s[0] // ................................................................................................................................e...............|........................................... - // sub v24.4s, v9.4s, v13.4s // ................................................................................................................e...............................|........................................... - // add v9.4s, v9.4s, v13.4s // .................................................................................................................e..............................|........................................... - // mul v13.4s, v24.4s, v0.s[0] // .....................................................................................................................................e..........|........................................... - // sqrdmulh v24.4s, v24.4s, v0.s[1] // ....................................................................................................................................e...........|........................................... - // mls v13.4s, v24.4s, v8.s[0] // ..........................................................................................................................................e.....|........................................... - // sub v24.4s, v10.4s, v14.4s // .*..............................................................................................................................................|*.......................................... 
- // add v10.4s, v10.4s, v14.4s // ...........................................................................................................................e....................|........................................... - // mul v14.4s, v24.4s, v0.s[0] // ..........*.....................................................................................................................................|.........*................................. - // sqrdmulh v24.4s, v24.4s, v0.s[1] // ............*...................................................................................................................................|...........*............................... - // mls v14.4s, v24.4s, v8.s[0] // .....................................*..........................................................................................................|....................................*...... - // sub v24.4s, v11.4s, v15.4s // ..................................................................................................................................e.............|........................................... - // add v11.4s, v11.4s, v15.4s // ...................................................................................................................................e............|........................................... - // mul v15.4s, v24.4s, v0.s[0] // ............................................................................................................................................e...|........................................... - // sqrdmulh v24.4s, v24.4s, v0.s[1] // .........................................................................................................................................e......|........................................... 
- // mls v15.4s, v24.4s, v8.s[0] // .............................................................................................................................................e..|........................................... - // sub v24.4s, v12.4s, v16.4s // .......................................................................................................................................e........|........................................... - // add v12.4s, v12.4s, v16.4s // ........................................................................................................................................e.......|........................................... - // mul v16.4s, v24.4s, v0.s[0] // ...*............................................................................................................................................|..*........................................ - // sqrdmulh v24.4s, v24.4s, v0.s[1] // ...............................................................................................................................................e|........................................... - // mls v16.4s, v24.4s, v8.s[0] // .................*..............................................................................................................................|................*.......................... - // str q9, [x1], #(16*4) // ....................................................................................................................e...........................|........................................... - // str q10, [x1, #(-16*4 + 1*16)] // ..............................................................................................................................e.................|........................................... 
- // str q11, [x1, #(-16*4 + 2*16)] // ......................................................................................................................................e.........|........................................... - // str q12, [x1, #(-16*4 + 3*16)] // ...........................................................................................................................................e....|........................................... - // str q13, [x2], #(16*4) // ..............................................................................................................................................e.|........................................... - // str q14, [x2, #(-16*4 + 1*16)] // ..........................................*.....................................................................................................|.........................................*. - // str q15, [x2, #(-16*4 + 2*16)] // ........*.......................................................................................................................................|.......*................................... - // str q16, [x2, #(-16*4 + 3*16)] // ............................*...................................................................................................................|...........................*............... - // add x1, x1, #64 // ..*.............................................................................................................................................|.*......................................... 
- // add x2, x2, #64 // ...........................................*....................................................................................................|..........................................* + // ld4 {v9.4S, v10.4S, v11.4S, v12.4S}, [x1] // ...........e............................................................................................................................................|..........e............................ + // ld4 {v13.4S, v14.4S, v15.4S, v16.4S}, [x2] // ..........................................e.............................................................................................................|....................................... + // ldr q0, [x5], #(12*16) // ..............................e.........................................................................................................................|.............................e......... + // ldr q4, [x5, #(-12*16 + 1*16)] // ...............e........................................................................................................................................|..............e........................ + // ldr q1, [x5, #(-12*16 + 2*16)] // .....e..................................................................................................................................................|....e.................................. + // ldr q5, [x5, #(-12*16 + 3*16)] // ............e...........................................................................................................................................|...........e........................... + // ldr q2, [x5, #(-12*16 + 4*16)] // .e......................................................................................................................................................|e...................................... 
+ // ldr q6, [x5, #(-12*16 + 5*16)] // ..............e.........................................................................................................................................|.............e......................... + // sub v24.4s, v9.4s, v10.4s // .....................e..................................................................................................................................|....................e.................. + // add v9.4s, v9.4s, v10.4s // ....................e...................................................................................................................................|...................e................... + // mul v10.4s, v24.4s, v1.4s // .............................e..........................................................................................................................|............................e.......... + // sqrdmulh v24.4s, v24.4s, v5.4s // ............................e...........................................................................................................................|...........................e........... + // mls v10.4s, v24.4s, v8.s[0] // ................................e.......................................................................................................................|...............................e....... + // sub v24.4s, v11.4s, v12.4s // .................e......................................................................................................................................|................e...................... + // add v11.4s, v11.4s, v12.4s // ..................e.....................................................................................................................................|.................e..................... 
+ // mul v12.4s, v24.4s, v2.4s // ......................e.................................................................................................................................|.....................e................. + // sqrdmulh v24.4s, v24.4s, v6.4s // ........................e...............................................................................................................................|.......................e............... + // mls v12.4s, v24.4s, v8.s[0] // ...............................e........................................................................................................................|..............................e........ + // sub v24.4s, v9.4s, v11.4s // ...........................e............................................................................................................................|..........................e............ + // add v9.4s, v9.4s, v11.4s // .........................e..............................................................................................................................|........................e.............. + // mul v11.4s, v24.4s, v0.4s // ....................................e...................................................................................................................|...................................e... + // sqrdmulh v24.4s, v24.4s, v4.4s // ..................................e.....................................................................................................................|.................................e..... + // mls v11.4s, v24.4s, v8.s[0] // ...........................................e............................................................................................................|....................................... 
+ // sub v24.4s, v10.4s, v12.4s // ...................................e....................................................................................................................|..................................e.... + // add v10.4s, v10.4s, v12.4s // .....................................e..................................................................................................................|....................................e.. + // mul v12.4s, v24.4s, v0.4s // .............................................e..........................................................................................................|....................................... + // sqrdmulh v24.4s, v24.4s, v4.4s // ........................................e...............................................................................................................|....................................... + // mls v12.4s, v24.4s, v8.s[0] // ...............................................e........................................................................................................|....................................... + // ldr q0, [x5, #(-12*16 + 6*16)] // ..e.....................................................................................................................................................|.e..................................... + // ldr q4, [x5, #(-12*16 + 7*16)] // ...................................................e....................................................................................................|....................................... + // ldr q1, [x5, #(-12*16 + 8*16)] // ..........e.............................................................................................................................................|.........e............................. 
+ // ldr q5, [x5, #(-12*16 + 9*16)] // ........e...............................................................................................................................................|.......e............................... + // ldr q2, [x5, #(-12*16 + 10*16)] // ..................................................e.....................................................................................................|....................................... + // ldr q6, [x5, #(-12*16 + 11*16)] // ...e....................................................................................................................................................|..e.................................... + // sub v24.4s, v13.4s, v14.4s // ..............................................e.........................................................................................................|....................................... + // add v13.4s, v13.4s, v14.4s // .................................................e......................................................................................................|....................................... + // mul v14.4s, v24.4s, v1.4s // ....................................................e...................................................................................................|....................................... + // sqrdmulh v24.4s, v24.4s, v5.4s // .......................................................e................................................................................................|....................................... + // mls v14.4s, v24.4s, v8.s[0] // ...............................................................e........................................................................................|....................................... 
+ // sub v24.4s, v15.4s, v16.4s // ......................................................e.................................................................................................|....................................... + // add v15.4s, v15.4s, v16.4s // ................................................e.......................................................................................................|....................................... + // mul v16.4s, v24.4s, v2.4s // ..........................................................................e.............................................................................|....................................... + // sqrdmulh v24.4s, v24.4s, v6.4s // ...........................................................e............................................................................................|....................................... + // mls v16.4s, v24.4s, v8.s[0] // ............................................................................e...........................................................................|....................................... + // sub v24.4s, v13.4s, v15.4s // ........................................................e...............................................................................................|....................................... + // add v13.4s, v13.4s, v15.4s // ..........................................................e.............................................................................................|....................................... + // mul v15.4s, v24.4s, v0.4s // ..................................................................e.....................................................................................|....................................... 
+ // sqrdmulh v24.4s, v24.4s, v4.4s // .............................................................e..........................................................................................|....................................... + // mls v15.4s, v24.4s, v8.s[0] // ......................................................................e.................................................................................|....................................... + // sub v24.4s, v14.4s, v16.4s // ...................................................................................e....................................................................|....................................... + // add v14.4s, v14.4s, v16.4s // .......................................................................................e................................................................|....................................... + // mul v16.4s, v24.4s, v0.4s // ...........................................................................................e............................................................|....................................... + // sqrdmulh v24.4s, v24.4s, v4.4s // ......................................................................................e.................................................................|....................................... + // mls v16.4s, v24.4s, v8.s[0] // ............................................................................................e...........................................................|....................................... + // trn1 v25.4s, v9.4s, v10.4s // ............................................e...........................................................................................................|....................................... 
+ // trn2 v26.4s, v9.4s, v10.4s // .........................................e..............................................................................................................|....................................... + // trn1 v27.4s, v11.4s, v12.4s // .........................................................e..............................................................................................|....................................... + // trn2 v28.4s, v11.4s, v12.4s // ............................................................e...........................................................................................|....................................... + // trn2 v11.2d, v25.2d, v27.2d // .....................................................................e..................................................................................|....................................... + // trn2 v12.2d, v26.2d, v28.2d // ...................................................................e....................................................................................|....................................... + // trn1 v9.2d, v25.2d, v27.2d // ..............................................................e.........................................................................................|....................................... + // trn1 v10.2d, v26.2d, v28.2d // ................................................................e.......................................................................................|....................................... + // trn1 v25.4s, v13.4s, v14.4s // ..........................................................................................e.............................................................|....................................... 
+ // trn2 v26.4s, v13.4s, v14.4s // ...............................................................................................e........................................................|....................................... + // trn1 v27.4s, v15.4s, v16.4s // ..................................................................................................e.....................................................|....................................... + // trn2 v28.4s, v15.4s, v16.4s // ....................................................................................................e...................................................|....................................... + // trn2 v15.2d, v25.2d, v27.2d // ......................................................................................................e.................................................|....................................... + // trn2 v16.2d, v26.2d, v28.2d // ........................................................................................................e...............................................|....................................... + // trn1 v13.2d, v25.2d, v27.2d // .........................................................................................................e..............................................|....................................... + // trn1 v14.2d, v26.2d, v28.2d // ...........................................................................................................e............................................|....................................... + // ldr q0, [x4], #64 // ..............................................................................e.........................................................................|....................................... 
+ // ldr q1, [x4, #(-64 + 16)] // .................................................................e......................................................................................|....................................... + // ldr q2, [x4, #(-64 + 32)] // .....................................................e..................................................................................................|....................................... + // ldr q3, [x4, #(-64 + 48)] // e.......................................................................................................................................................e....................................... + // sub v24.4s, v9.4s, v10.4s // ....................................................................e...................................................................................|....................................... + // add v9.4s, v9.4s, v10.4s // .......................................................................e................................................................................|....................................... + // mul v10.4s, v24.4s, v1.s[2] // .................................................................................e......................................................................|....................................... + // sqrdmulh v24.4s, v24.4s, v1.s[3] // ........................................................................e...............................................................................|....................................... + // mls v10.4s, v24.4s, v8.s[0] // .........................................................................................e..............................................................|....................................... 
+ // sub v24.4s, v11.4s, v12.4s // ...........................................................................e............................................................................|....................................... + // add v11.4s, v11.4s, v12.4s // .........................................................................e..............................................................................|....................................... + // mul v12.4s, v24.4s, v2.s[0] // ....................................................................................e...................................................................|....................................... + // sqrdmulh v24.4s, v24.4s, v2.s[1] // ................................................................................e.......................................................................|....................................... + // mls v12.4s, v24.4s, v8.s[0] // .....................................................................................e..................................................................|....................................... + // sub v24.4s, v13.4s, v14.4s // ................................................................................................................e.......................................|....................................... + // add v13.4s, v13.4s, v14.4s // .................................................................................................................e......................................|....................................... + // mul v14.4s, v24.4s, v2.s[2] // ........................................................................................................................e...............................|....................................... 
+ // sqrdmulh v24.4s, v24.4s, v2.s[3] // ...................................................................................................................e....................................|....................................... + // mls v14.4s, v24.4s, v8.s[0] // .........................................................................................................................e..............................|....................................... + // sub v24.4s, v15.4s, v16.4s // .............................................................................................................e..........................................|....................................... + // add v15.4s, v15.4s, v16.4s // ..............................................................................................................e.........................................|....................................... + // mul v16.4s, v24.4s, v3.s[0] // ..................................................................................................................e.....................................|....................................... + // sqrdmulh v24.4s, v24.4s, v3.s[1] // ......................................................................................................................e.................................|....................................... + // mls v16.4s, v24.4s, v8.s[0] // ...........................................................................................................................e............................|....................................... + // sub v24.4s, v9.4s, v11.4s // .............................................................................e..........................................................................|....................................... 
+ // add v9.4s, v9.4s, v11.4s // ...............................................................................e........................................................................|....................................... + // mul v11.4s, v24.4s, v0.s[2] // .................................................................................................e......................................................|....................................... + // sqrdmulh v24.4s, v24.4s, v0.s[3] // ........................................................................................e...............................................................|....................................... + // mls v11.4s, v24.4s, v8.s[0] // ...................................................................................................e....................................................|....................................... + // sub v24.4s, v10.4s, v12.4s // .............................................................................................e..........................................................|....................................... + // add v10.4s, v10.4s, v12.4s // ................................................................................................e.......................................................|....................................... + // mul v12.4s, v24.4s, v0.s[2] // ..........................................................................................................e.............................................|....................................... + // sqrdmulh v24.4s, v24.4s, v0.s[3] // .......................................................................................................e................................................|....................................... 
+ // mls v12.4s, v24.4s, v8.s[0] // ...............................................................................................................e........................................|....................................... + // sub v24.4s, v13.4s, v15.4s // .....................................................................................................................e..................................|....................................... + // add v13.4s, v13.4s, v15.4s // ....................................................................................................................e...................................|....................................... + // mul v15.4s, v24.4s, v1.s[0] // ............................................................................................................................e...........................|....................................... + // sqrdmulh v24.4s, v24.4s, v1.s[1] // ..........................................................................................................................e.............................|....................................... + // mls v15.4s, v24.4s, v8.s[0] // ...............................................................................................................................e........................|....................................... + // sub v24.4s, v14.4s, v16.4s // ..............................................................................................................................e.........................|....................................... + // add v14.4s, v14.4s, v16.4s // ................................................................................................................................e.......................|....................................... 
+ // mul v16.4s, v24.4s, v1.s[0] // .................................................................................................................................e......................|....................................... + // sqrdmulh v24.4s, v24.4s, v1.s[1] // ...................................................................................................................................e....................|....................................... + // mls v16.4s, v24.4s, v8.s[0] // ...........................................................................................................................................e............|....................................... + // srshr v24.4S, v9.4S, #23 // ..................................................................................e.....................................................................|....................................... + // mls v9.4s, v24.4s, v8.4s // ..............................................................................................e.........................................................|....................................... + // srshr v24.4S, v10.4S, #23 // .....................................................................................................e..................................................|....................................... + // mls v10.4s, v24.4s, v8.4s // ............................................................................................................e...........................................|....................................... + // srshr v24.4S, v13.4S, #23 // .......................................................................................................................e................................|....................................... 
+ // mls v13.4s, v24.4s, v8.4s // .............................................................................................................................e..........................|....................................... + // srshr v24.4S, v14.4S, #23 // .....................................................................................................................................e..................|....................................... + // mls v14.4s, v24.4s, v8.4s // ..........................................................................................................................................e.............|....................................... + // sub v24.4s, v9.4s, v13.4s // ..................................................................................................................................e.....................|....................................... + // add v9.4s, v9.4s, v13.4s // ....................................................................................................................................e...................|....................................... + // mul v13.4s, v24.4s, v0.s[0] // ............................................................................................................................................e...........|....................................... + // sqrdmulh v24.4s, v24.4s, v0.s[1] // ......................................................................................................................................e.................|....................................... + // mls v13.4s, v24.4s, v8.s[0] // ...............................................................................................................................................e........|....................................... 
+ // sub v24.4s, v10.4s, v14.4s // .............................................................................................................................................e..........|....................................... + // add v10.4s, v10.4s, v14.4s // ..............................................................................................................................................e.........|....................................... + // mul v14.4s, v24.4s, v0.s[0] // .............*..........................................................................................................................................|............*.......................... + // sqrdmulh v24.4s, v24.4s, v0.s[1] // ....................................................................................................................................................e...|....................................... + // mls v14.4s, v24.4s, v8.s[0] // ...................*....................................................................................................................................|..................*.................... + // sub v24.4s, v11.4s, v15.4s // .......................................................................................................................................e................|....................................... + // add v11.4s, v11.4s, v15.4s // .........................................................................................................................................e..............|....................................... + // mul v15.4s, v24.4s, v0.s[0] // ..................................................................................................................................................e.....|....................................... 
+ // sqrdmulh v24.4s, v24.4s, v0.s[1] // .........*..............................................................................................................................................|........*.............................. + // mls v15.4s, v24.4s, v8.s[0] // .................................*......................................................................................................................|................................*...... + // sub v24.4s, v12.4s, v16.4s // ................................................................................................................................................e.......|....................................... + // add v12.4s, v12.4s, v16.4s // .................................................................................................................................................e......|....................................... + // mul v16.4s, v24.4s, v0.s[0] // ....*...................................................................................................................................................|...*................................... + // sqrdmulh v24.4s, v24.4s, v0.s[1] // .......................................................................................................................................................e|....................................... + // mls v16.4s, v24.4s, v8.s[0] // ................*.......................................................................................................................................|...............*....................... + // str q9, [x1], #(16*4) // ........................................................................................................................................e...............|....................................... 
+ // str q10, [x1, #(-16*4 + 1*16)] // ...................................................................................................................................................e....|....................................... + // str q11, [x1, #(-16*4 + 2*16)] // ......*.................................................................................................................................................|.....*................................. + // str q12, [x1, #(-16*4 + 3*16)] // ......................................................................................................................................................e.|....................................... + // str q13, [x2], #(16*4) // .....................................................................................................................................................e..|....................................... + // str q14, [x2, #(-16*4 + 1*16)] // ..........................*.............................................................................................................................|.........................*............. + // str q15, [x2, #(-16*4 + 2*16)] // ......................................*.................................................................................................................|.....................................*. + // str q16, [x2, #(-16*4 + 3*16)] // .......................*................................................................................................................................|......................*................ + // add x1, x1, #64 // .......*................................................................................................................................................|......*................................ 
+ // add x2, x2, #64 // .......................................*................................................................................................................|......................................* sub count, count, #1 cbnz count, layer45678_start - sub v30.4S, v3.4S, v28.4S // *.......... - add x1, x1, #64 // .*......... - mul v2.4S, v21.4S, v18.S[0] // ..*........ - str q31, [x2, #-32] // ...*....... - // gap // ........... - // gap // ........... - // gap // ........... - // gap // ........... - // gap // ........... - sqrdmulh v29.4S, v30.4S, v18.S[1] // .....*..... - // gap // ........... - // gap // ........... - // gap // ........... - // gap // ........... - // gap // ........... - mul v30.4S, v30.4S, v18.S[0] // ....*...... - // gap // ........... - // gap // ........... - // gap // ........... - // gap // ........... - // gap // ........... - mls v2.4S, v9.4S, v8.S[0] // ......*.... - // gap // ........... - // gap // ........... - // gap // ........... - // gap // ........... - // gap // ........... - mls v30.4S, v29.4S, v8.S[0] // ........*.. - // gap // ........... - // gap // ........... - // gap // ........... - // gap // ........... - // gap // ........... - // gap // ........... - // gap // ........... - // gap // ........... - str q2, [x2, #-16] // .......*... - // gap // ........... - // gap // ........... - // gap // ........... - // gap // ........... - // gap // ........... - str q30, [x2, #-48] // .........*. - add x2, x2, #64 // ..........* - // gap // ........... + mul v16.4S, v31.4S, v20.S[0] // *........... + str q9, [x1, #-32] // .*.......... + add x1, x1, #64 // ..*......... + // gap // ............ + // gap // ............ + // gap // ............ + sqrdmulh v2.4S, v19.4S, v20.S[1] // ...*........ + // gap // ............ + // gap // ............ + // gap // ............ + // gap // ............ + // gap // ............ + mul v22.4S, v15.4S, v20.S[0] // ....*....... + // gap // ............ + // gap // ............ 
+ // gap // ............ + // gap // ............ + // gap // ............ + mls v16.4S, v4.4S, v8.S[0] // .....*...... + // gap // ............ + // gap // ............ + // gap // ............ + // gap // ............ + // gap // ............ + mls v22.4S, v11.4S, v8.S[0] // ......*..... + // gap // ............ + // gap // ............ + // gap // ............ + // gap // ............ + // gap // ............ + mls v23.4S, v2.4S, v8.S[0] // .........*.. + // gap // ............ + // gap // ............ + str q16, [x2, #-16] // .......*.... + // gap // ............ + // gap // ............ + // gap // ............ + // gap // ............ + // gap // ............ + str q22, [x2, #-48] // ........*... + // gap // ............ + // gap // ............ + // gap // ............ + // gap // ............ + // gap // ............ + str q23, [x2, #-32] // ..........*. + add x2, x2, #64 // ...........* + // gap // ............ // original source code - // sub v17.4S, v3.4S, v28.4S // *.......... - // add x1, x1, #64 // .*......... - // mul v21.4S, v21.4S, v18.S[0] // ..*........ - // str q31, [x2, #-32] // ...*....... - // mul v31.4S, v17.4S, v18.S[0] // .....*..... - // sqrdmulh v17.4S, v17.4S, v18.S[1] // ....*...... - // mls v21.4S, v9.4S, v8.S[0] // ......*.... - // str q21, [x2, #-16] // ........*.. - // mls v31.4S, v17.4S, v8.S[0] // .......*... - // str q31, [x2, #-48] // .........*. - // add x2, x2, #64 // ..........* + // mul v30.4S, v31.4S, v20.S[0] // *........... + // str q9, [x1, #-32] // .*.......... + // add x1, x1, #64 // ..*......... + // sqrdmulh v0.4S, v19.4S, v20.S[1] // ...*........ + // mul v1.4S, v15.4S, v20.S[0] // ....*....... + // mls v30.4S, v4.4S, v8.S[0] // .....*...... + // mls v1.4S, v11.4S, v8.S[0] // ......*..... + // str q30, [x2, #-16] // ........*... + // str q1, [x2, #-48] // .........*.. + // mls v23.4S, v0.4S, v8.S[0] // .......*.... + // str q23, [x2, #-32] // ..........*. 
+ // add x2, x2, #64 // ...........* // ----------------------------------------------------------------------------- ninv .req v25 ninv_tw .req v26 + modulus_half .req v30 + neg_modulus_half .req v31 ASM_LOAD(xtmp, ninv_addr) ld1r {ninv.4s}, [xtmp] ASM_LOAD(xtmp, ninv_tw_addr) ld1r {ninv_tw.4s}, [xtmp] + ushr modulus_half.4S, consts.4S, #1 + neg neg_modulus_half.4S, modulus_half.4S + mov count, #8 ASM_LOAD(r_ptr0, roots_l012) load_roots_123 .p2align 2 - // gap // ........ - ldr q13, [x0, #768] // ..*..... - ldr q30, [x0, #896] // ...*.... - ldr q15, [x0, #512] // *....... - ldr q29, [x0, #640] // .*...... - // gap // ........ - // gap // ........ - // gap // ........ - // gap // ........ - // gap // ........ - // gap // ........ - // gap // ........ - // gap // ........ - // gap // ........ - sub v12.4S, v13.4S, v30.4S // .....*.. - sub v24.4S, v15.4S, v29.4S // ....*... - // gap // ........ - // gap // ........ - // gap // ........ - // gap // ........ - // gap // ........ - sqrdmulh v7.4S, v12.4S, v3.S[1] // .......* - // gap // ........ - // gap // ........ - // gap // ........ - // gap // ........ - // gap // ........ - sqrdmulh v31.4S, v24.4S, v2.S[3] // ......*. - // gap // ........ - // gap // ........ + ldr q4, [x0, #768] // *............ + ldr q20, [x0, #896] // .....*....... + // gap // ............. + ldr q12, [x0, #256] // .*........... + // gap // ............. + // gap // ............. + ldr q11, [x0, #384] // ..*.......... + // gap // ............. + // gap // ............. + ldr q28, [x0, #512] // ...*......... + // gap // ............. + // gap // ............. + sub v15.4S, v4.4S, v20.4S // ......*...... + ldr q24, [x0, #640] // ....*........ + // gap // ............. + add v18.4S, v4.4S, v20.4S // .......*..... + // gap // ............. + // gap // ............. + // gap // ............. + // gap // ............. + // gap // ............. + sqrdmulh v9.4S, v15.4S, v3.S[1] // .........*... + // gap // ............. + // gap // ............. 
+ sub v29.4S, v28.4S, v24.4S // ........*.... + // gap // ............. + // gap // ............. + mul v5.4S, v15.4S, v3.S[0] // ..........*.. + // gap // ............. + // gap // ............. + // gap // ............. + // gap // ............. + // gap // ............. + sqrdmulh v20.4S, v29.4S, v2.S[3] // ...........*. + // gap // ............. + // gap // ............. + // gap // ............. + // gap // ............. + // gap // ............. + mls v5.4S, v9.4S, v8.S[0] // ............* + // gap // ............. + // gap // ............. // original source code - // ldr q15, [x0, #512] // ..*..... - // ldr q29, [x0, #640] // ...*.... - // ldr q13, [x0, #768] // *....... - // ldr q30, [x0, #896] // .*...... - // sub v24.4S, v15.4S, v29.4S // .....*.. - // sub v12.4S, v13.4S, v30.4S // ....*... - // sqrdmulh v31.4S, v24.4S, v2.S[3] // .......* - // sqrdmulh v7.4S, v12.4S, v3.S[1] // ......*. + // ldr q13, [x0, #768] // *............ + // ldr q12, [x0, #256] // ..*.......... + // ldr q11, [x0, #384] // ...*......... + // ldr q28, [x0, #512] // ....*........ + // ldr q24, [x0, #640] // ......*...... + // ldr q14, [x0, #896] // .*........... + // sub v6.4S, v13.4S, v14.4S // .....*....... + // add v18.4S, v13.4S, v14.4S // .......*..... + // sub v29.4S, v28.4S, v24.4S // .........*... + // sqrdmulh v19.4S, v6.4S, v3.S[1] // ........*.... + // mul v5.4S, v6.4S, v3.S[0] // ..........*.. + // sqrdmulh v20.4S, v29.4S, v2.S[3] // ...........*. + // mls v5.4S, v19.4S, v8.S[0] // ............* sub count, count, #1 layer123_start: - mul v17.4S, v24.4S, v2.S[2] // ....................*........................................................................... - ldr q21, [x0, #0] // *............................................................................................... - ldr q14, [x0, #128] // .*.............................................................................................. 
- ldr q24, [x0, #256] // ..*............................................................................................. - ldr q28, [x0, #384] // ...*............................................................................................ - add v6.4S, v15.4S, v29.4S // ...................*............................................................................ - mls v17.4S, v31.4S, v8.S[0] // ......................*......................................................................... - add v31.4S, v13.4S, v30.4S // ........................*....................................................................... - ldr q15, [x0, #528] // ....e........................................................................................... - ldr q29, [x0, #656] // .....e.......................................................................................... - ldr q13, [x0, #784] // ......e......................................................................................... - // gap // ................................................................................................ - mul v16.4S, v12.4S, v3.S[0] // .........................*...................................................................... - sub v9.4S, v21.4S, v14.4S // ........*....................................................................................... - ldr q30, [x0, #912] // .......e........................................................................................ - add v21.4S, v21.4S, v14.4S // .........*...................................................................................... - // gap // ................................................................................................ - // gap // ................................................................................................ - mls v16.4S, v7.4S, v8.S[0] // ...........................*.................................................................... 
- sub v14.4S, v24.4S, v28.4S // .............*.................................................................................. - // gap // ................................................................................................ - add v24.4S, v24.4S, v28.4S // ..............*................................................................................. - // gap // ................................................................................................ - // gap // ................................................................................................ - mul v28.4S, v9.4S, v1.S[2] // ..........*..................................................................................... - sub v7.4S, v6.4S, v31.4S // ......................................*......................................................... - // gap // ................................................................................................ - add v6.4S, v6.4S, v31.4S // .......................................*........................................................ - // gap // ................................................................................................ - // gap // ................................................................................................ - sqrdmulh v31.4S, v9.4S, v1.S[3] // ...........*.................................................................................... - sub v9.4S, v21.4S, v24.4S // ............................*................................................................... - // gap // ................................................................................................ - sub v18.4S, v17.4S, v16.4S // ...........................................*.................................................... - // gap // ................................................................................................ 
- // gap // ................................................................................................ - add v17.4S, v17.4S, v16.4S // ............................................*................................................... - mul v16.4S, v14.4S, v2.S[0] // ...............*................................................................................ - // gap // ................................................................................................ - add v21.4S, v21.4S, v24.4S // .............................*.................................................................. - // gap // ................................................................................................ - // gap // ................................................................................................ - sqrdmulh v14.4S, v14.4S, v2.S[1] // ................*............................................................................... - sub v24.4S, v15.4S, v29.4S // ..................e............................................................................. - // gap // ................................................................................................ - sub v12.4S, v13.4S, v30.4S // .......................e........................................................................ - // gap // ................................................................................................ - // gap // ................................................................................................ - mls v28.4S, v31.4S, v8.S[0] // ............*................................................................................... - sub v31.4S, v21.4S, v6.4S // ................................................*............................................... - // gap // ................................................................................................ 
- add v21.4S, v21.4S, v6.4S // .................................................*.............................................. - // gap // ................................................................................................ - // gap // ................................................................................................ - mul v6.4S, v9.4S, v0.S[2] // ..............................*................................................................. - // gap // ................................................................................................ - // gap // ................................................................................................ - // gap // ................................................................................................ - // gap // ................................................................................................ - // gap // ................................................................................................ - mls v16.4S, v14.4S, v8.S[0] // .................*.............................................................................. - // gap // ................................................................................................ - // gap // ................................................................................................ - // gap // ................................................................................................ - // gap // ................................................................................................ - // gap // ................................................................................................ - sqrdmulh v14.4S, v9.4S, v0.S[3] // ...............................*................................................................ - // gap // ................................................................................................ 
- // gap // ................................................................................................ - // gap // ................................................................................................ - // gap // ................................................................................................ - // gap // ................................................................................................ - mul v9.4S, v7.4S, v1.S[0] // ........................................*....................................................... - // gap // ................................................................................................ - // gap // ................................................................................................ - sub v23.4S, v28.4S, v16.4S // .................................*.............................................................. - // gap // ................................................................................................ - // gap // ................................................................................................ - sqrdmulh v7.4S, v7.4S, v1.S[1] // .........................................*...................................................... - add v28.4S, v28.4S, v16.4S // ..................................*............................................................. - // gap // ................................................................................................ - // gap // ................................................................................................ - // gap // ................................................................................................ - // gap // ................................................................................................ - mls v6.4S, v14.4S, v8.S[0] // ................................*............................................................... 
- // gap // ................................................................................................ - // gap // ................................................................................................ - sub v14.4S, v28.4S, v17.4S // .....................................................*.......................................... - // gap // ................................................................................................ - // gap // ................................................................................................ - add v17.4S, v28.4S, v17.4S // ......................................................*......................................... - mul v28.4S, v23.4S, v0.S[2] // ...................................*............................................................ - // gap // ................................................................................................ - // gap // ................................................................................................ - // gap // ................................................................................................ - // gap // ................................................................................................ - sqrdmulh v16.4S, v23.4S, v0.S[3] // ....................................*........................................................... - // gap // ................................................................................................ - // gap // ................................................................................................ - // gap // ................................................................................................ - // gap // ................................................................................................ - // gap // ................................................................................................ 
- mul v23.4S, v18.4S, v1.S[0] // .............................................*.................................................. - // gap // ................................................................................................ - // gap // ................................................................................................ - // gap // ................................................................................................ - // gap // ................................................................................................ - // gap // ................................................................................................ - sqrdmulh v18.4S, v18.4S, v1.S[1] // ..............................................*................................................. - // gap // ................................................................................................ - // gap // ................................................................................................ - // gap // ................................................................................................ - // gap // ................................................................................................ - // gap // ................................................................................................ - mls v28.4S, v16.4S, v8.S[0] // .....................................*.......................................................... - // gap // ................................................................................................ - // gap // ................................................................................................ - // gap // ................................................................................................ - // gap // ................................................................................................ 
- // gap // ................................................................................................ - mls v9.4S, v7.4S, v8.S[0] // ..........................................*..................................................... - // gap // ................................................................................................ - // gap // ................................................................................................ - // gap // ................................................................................................ - // gap // ................................................................................................ - // gap // ................................................................................................ - mls v23.4S, v18.4S, v8.S[0] // ...............................................*................................................ - // gap // ................................................................................................ - // gap // ................................................................................................ - // gap // ................................................................................................ - // gap // ................................................................................................ - // gap // ................................................................................................ - mul v16.4S, v31.4S, v0.S[0] // ..................................................*............................................. - // gap // ................................................................................................ - // gap // ................................................................................................ - sub v7.4S, v6.4S, v9.4S // ..........................................................*..................................... 
- // gap // ................................................................................................ - // gap // ................................................................................................ - sqrdmulh v31.4S, v31.4S, v0.S[1] // ...................................................*............................................ - add v6.4S, v6.4S, v9.4S // ...........................................................*.................................... - // gap // ................................................................................................ - sub v9.4S, v28.4S, v23.4S // ...............................................................*................................ - // gap // ................................................................................................ - // gap // ................................................................................................ - add v28.4S, v28.4S, v23.4S // ................................................................*............................... - mul v18.4S, v14.4S, v0.S[0] // .......................................................*........................................ - // gap // ................................................................................................ - // gap // ................................................................................................ - // gap // ................................................................................................ - // gap // ................................................................................................ - sqrdmulh v14.4S, v14.4S, v0.S[1] // ........................................................*....................................... - // gap // ................................................................................................ - // gap // ................................................................................................ 
- // gap // ................................................................................................ - // gap // ................................................................................................ - // gap // ................................................................................................ - mul v23.4S, v21.4S, v25.4S // ................................................................................*............... - // gap // ................................................................................................ - // gap // ................................................................................................ - // gap // ................................................................................................ - // gap // ................................................................................................ - // gap // ................................................................................................ - sqrdmulh v21.4S, v21.4S, v26.4S // .................................................................................*.............. - // gap // ................................................................................................ - // gap // ................................................................................................ - // gap // ................................................................................................ - // gap // ................................................................................................ - // gap // ................................................................................................ - mls v16.4S, v31.4S, v8.S[0] // ....................................................*........................................... - // gap // ................................................................................................ 
- // gap // ................................................................................................ - // gap // ................................................................................................ - // gap // ................................................................................................ - // gap // ................................................................................................ - mls v18.4S, v14.4S, v8.S[0] // .........................................................*...................................... - // gap // ................................................................................................ - // gap // ................................................................................................ - // gap // ................................................................................................ - // gap // ................................................................................................ - // gap // ................................................................................................ - mul v14.4S, v7.4S, v0.S[0] // ............................................................*................................... - // gap // ................................................................................................ - // gap // ................................................................................................ - srshr v31.4S, v16.4S, #23 // ....................................................................*........................... - // gap // ................................................................................................ - // gap // ................................................................................................ - sqrdmulh v7.4S, v7.4S, v0.S[1] // .............................................................*.................................. 
- // gap // ................................................................................................ - // gap // ................................................................................................ - srshr v10.4S, v18.4S, #23 // ......................................................................*......................... - // gap // ................................................................................................ - // gap // ................................................................................................ - mul v4.4S, v9.4S, v0.S[0] // .................................................................*.............................. - // gap // ................................................................................................ - // gap // ................................................................................................ - // gap // ................................................................................................ - // gap // ................................................................................................ - // gap // ................................................................................................ - sqrdmulh v9.4S, v9.4S, v0.S[1] // ..................................................................*............................. - // gap // ................................................................................................ - // gap // ................................................................................................ - // gap // ................................................................................................ - // gap // ................................................................................................ - // gap // ................................................................................................ 
- mls v14.4S, v7.4S, v8.S[0] // ..............................................................*................................. - // gap // ................................................................................................ - // gap // ................................................................................................ - // gap // ................................................................................................ - // gap // ................................................................................................ - // gap // ................................................................................................ - mls v16.4S, v31.4S, v8.4S // .....................................................................*.......................... - // gap // ................................................................................................ - // gap // ................................................................................................ - // gap // ................................................................................................ - // gap // ................................................................................................ - // gap // ................................................................................................ - mls v4.4S, v9.4S, v8.S[0] // ...................................................................*............................ - // gap // ................................................................................................ - // gap // ................................................................................................ - srshr v31.4S, v14.4S, #23 // ........................................................................*....................... - // gap // ................................................................................................ 
- // gap // ................................................................................................ - mls v18.4S, v10.4S, v8.4S // .......................................................................*........................ - // gap // ................................................................................................ - // gap // ................................................................................................ - str q16, [x0, #512] // ............................................................................*................... - // gap // ................................................................................................ - // gap // ................................................................................................ - mls v14.4S, v31.4S, v8.4S // .........................................................................*...................... - // gap // ................................................................................................ - // gap // ................................................................................................ - srshr v31.4S, v4.4S, #23 // ..........................................................................*..................... - // gap // ................................................................................................ - // gap // ................................................................................................ - mls v23.4S, v21.4S, v8.S[0] // ..................................................................................*............. - // gap // ................................................................................................ - // gap // ................................................................................................ - str q18, [x0, #640] // .............................................................................*.................. 
- // gap // ................................................................................................ - // gap // ................................................................................................ - mls v4.4S, v31.4S, v8.4S // ...........................................................................*.................... - // gap // ................................................................................................ - // gap // ................................................................................................ - str q14, [x0, #768] // ..............................................................................*................. - // gap // ................................................................................................ - // gap // ................................................................................................ - mul v21.4S, v17.4S, v25.4S // ...................................................................................*............ - // gap // ................................................................................................ - // gap // ................................................................................................ - str q23, [x0], #(16) // ............................................................................................*... - // gap // ................................................................................................ - // gap // ................................................................................................ - sqrdmulh v17.4S, v17.4S, v26.4S // ....................................................................................*........... - // gap // ................................................................................................ - // gap // ................................................................................................ 
- str q4, [x0, #880] // ...............................................................................*................ - // gap // ................................................................................................ - // gap // ................................................................................................ - mul v14.4S, v6.4S, v25.4S // ......................................................................................*......... - // gap // ................................................................................................ - // gap // ................................................................................................ - // gap // ................................................................................................ - // gap // ................................................................................................ - // gap // ................................................................................................ - sqrdmulh v6.4S, v6.4S, v26.4S // .......................................................................................*........ - // gap // ................................................................................................ - // gap // ................................................................................................ - // gap // ................................................................................................ - // gap // ................................................................................................ - // gap // ................................................................................................ - mls v21.4S, v17.4S, v8.S[0] // .....................................................................................*.......... - // gap // ................................................................................................ 
- // gap // ................................................................................................ - // gap // ................................................................................................ - // gap // ................................................................................................ - // gap // ................................................................................................ - sqrdmulh v17.4S, v28.4S, v26.4S // ..........................................................................................*..... - // gap // ................................................................................................ - // gap // ................................................................................................ - // gap // ................................................................................................ - // gap // ................................................................................................ - // gap // ................................................................................................ - mls v14.4S, v6.4S, v8.S[0] // ........................................................................................*....... - // gap // ................................................................................................ - // gap // ................................................................................................ - str q21, [x0, #112] // .............................................................................................*.. - // gap // ................................................................................................ - // gap // ................................................................................................ - mul v21.4S, v28.4S, v25.4S // .........................................................................................*...... 
- // gap // ................................................................................................ - // gap // ................................................................................................ - // gap // ................................................................................................ - // gap // ................................................................................................ - // gap // ................................................................................................ - mls v21.4S, v17.4S, v8.S[0] // ...........................................................................................*.... - // gap // ................................................................................................ - // gap // ................................................................................................ - str q14, [x0, #240] // ..............................................................................................*. - // gap // ................................................................................................ - // gap // ................................................................................................ - sqrdmulh v31.4S, v24.4S, v2.S[3] // .....................e.......................................................................... - // gap // ................................................................................................ - // gap // ................................................................................................ - // gap // ................................................................................................ - // gap // ................................................................................................ - // gap // ................................................................................................ 
- sqrdmulh v7.4S, v12.4S, v3.S[1] // ..........................e..................................................................... - // gap // ................................................................................................ - // gap // ................................................................................................ - str q21, [x0, #368] // ...............................................................................................* - // gap // ................................................................................................ - // gap // ................................................................................................ + ldr q16, [x0, #0] // *....................................................................................................................... + ldr q4, [x0, #128] // .*...................................................................................................................... + sub v7.4S, v12.4S, v11.4S // .............*.......................................................................................................... + mul v19.4S, v29.4S, v2.S[2] // ....................*................................................................................................... + add v22.4S, v12.4S, v11.4S // ..............*......................................................................................................... + ldr q13, [x0, #784] // ......e................................................................................................................. + add v21.4S, v28.4S, v24.4S // ...................*.................................................................................................... + ldr q12, [x0, #272] // ..e..................................................................................................................... 
+ ldr q11, [x0, #400] // ...e.................................................................................................................... + mul v17.4S, v7.4S, v2.S[0] // ...............*........................................................................................................ + ldr q28, [x0, #528] // ....e................................................................................................................... + ldr q24, [x0, #656] // .....e.................................................................................................................. + ldr q14, [x0, #912] // .......e................................................................................................................ + sub v29.4S, v16.4S, v4.4S // ........*............................................................................................................... + // gap // ........................................................................................................................ + add v16.4S, v16.4S, v4.4S // .........*.............................................................................................................. + sqrdmulh v4.4S, v7.4S, v2.S[1] // ................*....................................................................................................... + // gap // ........................................................................................................................ + sub v7.4S, v21.4S, v18.4S // ......................................*................................................................................. + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ 
+ mls v19.4S, v20.4S, v8.S[0] // ......................*................................................................................................. + add v21.4S, v21.4S, v18.4S // .......................................*................................................................................ + // gap // ........................................................................................................................ + sub v20.4S, v16.4S, v22.4S // ............................*........................................................................................... + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + add v16.4S, v16.4S, v22.4S // .............................*.......................................................................................... + mul v22.4S, v29.4S, v1.S[2] // ..........*............................................................................................................. + // gap // ........................................................................................................................ + sub v6.4S, v13.4S, v14.4S // .......................e................................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + add v18.4S, v13.4S, v14.4S // ........................e............................................................................................... 
+ mls v17.4S, v4.4S, v8.S[0] // .................*...................................................................................................... + // gap // ........................................................................................................................ + sub v4.4S, v19.4S, v5.4S // ...........................................*............................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + sqrdmulh v13.4S, v29.4S, v1.S[3] // ...........*............................................................................................................ + add v19.4S, v19.4S, v5.4S // ............................................*........................................................................... + // gap // ........................................................................................................................ + sub v14.4S, v16.4S, v21.4S // ................................................*....................................................................... + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + add v16.4S, v16.4S, v21.4S // .................................................*...................................................................... + mul v21.4S, v20.4S, v0.S[2] // ..............................*......................................................................................... 
+ // gap // ........................................................................................................................ + sub v29.4S, v28.4S, v24.4S // ..................e..................................................................................................... + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + sqrdmulh v20.4S, v20.4S, v0.S[3] // ...............................*........................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + mul v5.4S, v7.4S, v1.S[0] // ........................................*............................................................................... + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ 
+ // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + sqrdmulh v7.4S, v7.4S, v1.S[1] // .........................................*.............................................................................. + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + mls v22.4S, v13.4S, v8.S[0] // ............*........................................................................................................... + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ 
+ mls v21.4S, v20.4S, v8.S[0] // ................................*....................................................................................... + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + mls v5.4S, v7.4S, v8.S[0] // ..........................................*............................................................................. + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + sub v7.4S, v22.4S, v17.4S // .................................*...................................................................................... + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + add v22.4S, v22.4S, v17.4S // ..................................*..................................................................................... + mul v13.4S, v4.4S, v1.S[0] // .............................................*.......................................................................... 
+ // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + mul v17.4S, v7.4S, v0.S[2] // ...................................*.................................................................................... + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + sub v20.4S, v22.4S, v19.4S // .....................................................*.................................................................. + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + add v22.4S, v22.4S, v19.4S // ......................................................*................................................................. + sqrdmulh v7.4S, v7.4S, v0.S[3] // ....................................*................................................................................... + // gap // ........................................................................................................................ + sub v19.4S, v21.4S, v5.4S // ..........................................................*............................................................. 
+ // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + sqrdmulh v4.4S, v4.4S, v1.S[1] // ..............................................*......................................................................... + add v21.4S, v21.4S, v5.4S // ...........................................................*............................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + mul v5.4S, v14.4S, v0.S[0] // ..................................................*..................................................................... + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ 
+ mls v17.4S, v7.4S, v8.S[0] // .....................................*.................................................................................. + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + mls v13.4S, v4.4S, v8.S[0] // ...............................................*........................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + sqrdmulh v4.4S, v14.4S, v0.S[1] // ...................................................*.................................................................... + // gap // ........................................................................................................................ 
+ // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + mul v7.4S, v20.4S, v0.S[0] // .......................................................*................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + sub v14.4S, v17.4S, v13.4S // ...............................................................*........................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + add v13.4S, v17.4S, v13.4S // ................................................................*....................................................... + mul v17.4S, v16.4S, v25.4S // ........................................................................................*............................... + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ 
+ // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + sqrdmulh v16.4S, v16.4S, v26.4S // .........................................................................................*.............................. + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + mls v5.4S, v4.4S, v8.S[0] // ....................................................*................................................................... + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ 
+ sqrdmulh v4.4S, v20.4S, v0.S[1] // ........................................................*............................................................... + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + mul v20.4S, v19.4S, v0.S[0] // ............................................................*........................................................... + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + cmge v10.4S, v31.4S, v5.4S // ....................................................................*................................................... + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + sqrdmulh v19.4S, v19.4S, v0.S[1] // .............................................................*.......................................................... + cmge v9.4S, v5.4S, v30.4S // .....................................................................*.................................................. 
+ // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + mls v7.4S, v4.4S, v8.S[0] // .........................................................*.............................................................. + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + sub v4.4S, v10.4S, v9.4S // ......................................................................*................................................. + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + mul v10.4S, v14.4S, v0.S[0] // .................................................................*...................................................... + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ 
+ // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + mls v20.4S, v19.4S, v8.S[0] // ..............................................................*......................................................... + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + cmge v19.4S, v31.4S, v7.4S // ........................................................................*............................................... + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + sqrdmulh v14.4S, v14.4S, v0.S[1] // ..................................................................*..................................................... + cmge v9.4S, v7.4S, v30.4S // .........................................................................*.............................................. + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ 
+ mls v5.4S, v4.4S, v8.4S // .......................................................................*................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + sub v4.4S, v19.4S, v9.4S // ..........................................................................*............................................. + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + mls v17.4S, v16.4S, v8.S[0] // ..........................................................................................*............................. + cmge v16.4S, v31.4S, v20.4S // ............................................................................*........................................... + // gap // ........................................................................................................................ + cmge v19.4S, v20.4S, v30.4S // .............................................................................*.......................................... + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + mls v10.4S, v14.4S, v8.S[0] // ...................................................................*.................................................... + // gap // ........................................................................................................................ 
+ // gap // ........................................................................................................................ + str q5, [x0, #512] // ....................................................................................*................................... + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + mls v7.4S, v4.4S, v8.4S // ...........................................................................*............................................ + sub v16.4S, v16.4S, v19.4S // ..............................................................................*......................................... + // gap // ........................................................................................................................ + cmge v4.4S, v31.4S, v17.4S // ....................................................................................................*................... + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + cmge v19.4S, v17.4S, v30.4S // .....................................................................................................*.................. + mul v14.4S, v22.4S, v25.4S // ...........................................................................................*............................ + // gap // ........................................................................................................................ + cmge v5.4S, v31.4S, v10.4S // ................................................................................*....................................... 
+ // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + mls v20.4S, v16.4S, v8.4S // ...............................................................................*........................................ + cmge v16.4S, v10.4S, v30.4S // .................................................................................*...................................... + // gap // ........................................................................................................................ + str q7, [x0, #640] // .....................................................................................*.................................. + sub v4.4S, v4.4S, v19.4S // ......................................................................................................*................. + // gap // ........................................................................................................................ + sqrdmulh v7.4S, v22.4S, v26.4S // ............................................................................................*........................... + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + sub v16.4S, v5.4S, v16.4S // ..................................................................................*..................................... + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ 
+ mul v22.4S, v21.4S, v25.4S // ..............................................................................................*......................... + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + str q20, [x0, #768] // ......................................................................................*................................. + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + sqrdmulh v19.4S, v21.4S, v26.4S // ...............................................................................................*........................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + mls v14.4S, v7.4S, v8.S[0] // .............................................................................................*.......................... + // gap // ........................................................................................................................ 
+ // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + sqrdmulh v7.4S, v13.4S, v26.4S // ..................................................................................................*..................... + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + mls v22.4S, v19.4S, v8.S[0] // ................................................................................................*....................... + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + cmge v19.4S, v31.4S, v14.4S // ........................................................................................................*............... 
+ // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + mul v21.4S, v13.4S, v25.4S // .................................................................................................*...................... + cmge v13.4S, v14.4S, v30.4S // .........................................................................................................*.............. + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + mls v21.4S, v7.4S, v8.S[0] // ...................................................................................................*.................... + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + sub v7.4S, v19.4S, v13.4S // ..........................................................................................................*............. + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ 
+ mls v10.4S, v16.4S, v8.4S // ...................................................................................*.................................... + cmge v16.4S, v31.4S, v22.4S // ............................................................................................................*........... + // gap // ........................................................................................................................ + cmge v19.4S, v22.4S, v30.4S // .............................................................................................................*.......... + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + mls v17.4S, v4.4S, v8.4S // .......................................................................................................*................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + cmge v4.4S, v31.4S, v21.4S // ................................................................................................................*....... + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + mls v14.4S, v7.4S, v8.4S // ...........................................................................................................*............ + cmge v7.4S, v21.4S, v30.4S // .................................................................................................................*...... 
+ // gap // ........................................................................................................................ + str q10, [x0, #896] // .......................................................................................*................................ + sub v16.4S, v16.4S, v19.4S // ..............................................................................................................*......... + // gap // ........................................................................................................................ + sqrdmulh v19.4S, v6.4S, v3.S[1] // ..........................e............................................................................................. + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + str q17, [x0], #(16) // ....................................................................................................................*... + sub v4.4S, v4.4S, v7.4S // ..................................................................................................................*..... + // gap // ........................................................................................................................ + mls v22.4S, v16.4S, v8.4S // ...............................................................................................................*........ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + str q14, [x0, #112] // .....................................................................................................................*.. 
+ // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + mls v21.4S, v4.4S, v8.4S // ...................................................................................................................*.... + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + mul v5.4S, v6.4S, v3.S[0] // .........................e.............................................................................................. + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + str q22, [x0, #240] // ......................................................................................................................*. + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ 
+ sqrdmulh v20.4S, v29.4S, v2.S[3] // .....................e.................................................................................................. + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + str q21, [x0, #368] // .......................................................................................................................* + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + mls v5.4S, v19.4S, v8.S[0] // ...........................e............................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ // original source code - // ldr q9, [x0, #0] // ........................................................................................|*.............................................................................................. - // ldr q10, [x0, #(1*(1024/8))] // ........................................................................................|.*............................................................................................. - // ldr q11, [x0, #(2*(1024/8))] // ........................................................................................|..*............................................................................................ 
- // ldr q12, [x0, #(3*(1024/8))] // ........................................................................................|...*........................................................................................... - // ldr q13, [x0, #(4*(1024/8))] // e.......................................................................................|.......e....................................................................................... - // ldr q14, [x0, #(5*(1024/8))] // .e......................................................................................|........e...................................................................................... - // ldr q15, [x0, #(6*(1024/8))] // ..e.....................................................................................|.........e..................................................................................... - // ldr q16, [x0, #(7*(1024/8))] // .....e..................................................................................|............e.................................................................................. - // sub v24.4s, v9.4s, v10.4s // ....*...................................................................................|...........*................................................................................... - // add v9.4s, v9.4s, v10.4s // ......*.................................................................................|.............*................................................................................. - // mul v10.4s, v24.4s, v1.s[2] // ..........*.............................................................................|.................*............................................................................. - // sqrdmulh v24.4s, v24.4s, v1.s[3] // .............*..........................................................................|....................*.......................................................................... 
- // mls v10.4s, v24.4s, v8.s[0] // ......................*.................................................................|.............................*................................................................. - // sub v24.4s, v11.4s, v12.4s // ........*...............................................................................|...............*............................................................................... - // add v11.4s, v11.4s, v12.4s // .........*..............................................................................|................*.............................................................................. - // mul v12.4s, v24.4s, v2.s[0] // .................*......................................................................|........................*...................................................................... - // sqrdmulh v24.4s, v24.4s, v2.s[1] // ...................*....................................................................|..........................*.................................................................... - // mls v12.4s, v24.4s, v8.s[0] // ..........................*.............................................................|.................................*............................................................. - // sub v24.4s, v13.4s, v14.4s // ....................e...................................................................|...........................e................................................................... - // add v13.4s, v13.4s, v14.4s // ........................................................................................|....*.......................................................................................... - // mul v14.4s, v24.4s, v2.s[2] // ........................................................................................*............................................................................................... 
- // sqrdmulh v24.4s, v24.4s, v2.s[3] // .....................................................................................e..|............................................................................................e.. - // mls v14.4s, v24.4s, v8.s[0] // ........................................................................................|.....*......................................................................................... - // sub v24.4s, v15.4s, v16.4s // .....................e..................................................................|............................e.................................................................. - // add v15.4s, v15.4s, v16.4s // ........................................................................................|......*........................................................................................ - // mul v16.4s, v24.4s, v3.s[0] // ...*....................................................................................|..........*.................................................................................... - // sqrdmulh v24.4s, v24.4s, v3.s[1] // ......................................................................................e.|.............................................................................................e. - // mls v16.4s, v24.4s, v8.s[0] // .......*................................................................................|..............*................................................................................ - // sub v24.4s, v9.4s, v11.4s // ..............*.........................................................................|.....................*......................................................................... - // add v9.4s, v9.4s, v11.4s // ..................*.....................................................................|.........................*..................................................................... 
- // mul v11.4s, v24.4s, v0.s[2] // .........................*..............................................................|................................*.............................................................. - // sqrdmulh v24.4s, v24.4s, v0.s[3] // ...........................*............................................................|..................................*............................................................ - // mls v11.4s, v24.4s, v8.s[0] // ................................*.......................................................|.......................................*....................................................... - // sub v24.4s, v10.4s, v12.4s // .............................*..........................................................|....................................*.......................................................... - // add v10.4s, v10.4s, v12.4s // ...............................*........................................................|......................................*........................................................ - // mul v12.4s, v24.4s, v0.s[2] // ...................................*....................................................|..........................................*.................................................... - // sqrdmulh v24.4s, v24.4s, v0.s[3] // ....................................*...................................................|...........................................*................................................... - // mls v12.4s, v24.4s, v8.s[0] // .......................................*................................................|..............................................*................................................ - // sub v24.4s, v13.4s, v15.4s // ...........*............................................................................|..................*............................................................................ 
- // add v13.4s, v13.4s, v15.4s // ............*...........................................................................|...................*........................................................................... - // mul v15.4s, v24.4s, v1.s[0] // ............................*...........................................................|...................................*........................................................... - // sqrdmulh v24.4s, v24.4s, v1.s[1] // ..............................*.........................................................|.....................................*......................................................... - // mls v15.4s, v24.4s, v8.s[0] // ........................................*...............................................|...............................................*............................................... - // sub v24.4s, v14.4s, v16.4s // ...............*........................................................................|......................*........................................................................ - // add v14.4s, v14.4s, v16.4s // ................*.......................................................................|.......................*....................................................................... - // mul v16.4s, v24.4s, v1.s[0] // .....................................*..................................................|............................................*.................................................. - // sqrdmulh v24.4s, v24.4s, v1.s[1] // ......................................*.................................................|.............................................*................................................. - // mls v16.4s, v24.4s, v8.s[0] // .........................................*..............................................|................................................*.............................................. 
- // sub v24.4s, v9.4s, v13.4s // .......................*................................................................|..............................*................................................................ - // add v9.4s, v9.4s, v13.4s // ........................*...............................................................|...............................*............................................................... - // mul v13.4s, v24.4s, v0.s[0] // ..........................................*.............................................|.................................................*............................................. - // sqrdmulh v24.4s, v24.4s, v0.s[1] // ............................................*...........................................|...................................................*........................................... - // mls v13.4s, v24.4s, v8.s[0] // ....................................................*...................................|...........................................................*................................... - // sub v24.4s, v10.4s, v14.4s // .................................*......................................................|........................................*...................................................... - // add v10.4s, v10.4s, v14.4s // ..................................*.....................................................|.........................................*..................................................... - // mul v14.4s, v24.4s, v0.s[0] // ................................................*.......................................|.......................................................*....................................... - // sqrdmulh v24.4s, v24.4s, v0.s[1] // .................................................*......................................|........................................................*...................................... 
- // mls v14.4s, v24.4s, v8.s[0] // .....................................................*..................................|............................................................*.................................. - // sub v24.4s, v11.4s, v15.4s // ...........................................*............................................|..................................................*............................................ - // add v11.4s, v11.4s, v15.4s // .............................................*..........................................|....................................................*.......................................... - // mul v15.4s, v24.4s, v0.s[0] // ......................................................*.................................|.............................................................*................................. - // sqrdmulh v24.4s, v24.4s, v0.s[1] // ........................................................*...............................|...............................................................*............................... - // mls v15.4s, v24.4s, v8.s[0] // ............................................................*...........................|...................................................................*........................... - // sub v24.4s, v12.4s, v16.4s // ..............................................*.........................................|.....................................................*......................................... - // add v12.4s, v12.4s, v16.4s // ...............................................*........................................|......................................................*........................................ - // mul v16.4s, v24.4s, v0.s[0] // ..........................................................*.............................|.................................................................*............................. 
- // sqrdmulh v24.4s, v24.4s, v0.s[1] // ...........................................................*............................|..................................................................*............................ - // mls v16.4s, v24.4s, v8.s[0] // ..............................................................*.........................|.....................................................................*......................... - // srshr v24.4S, v13.4S, #23 // .......................................................*................................|..............................................................*................................ - // mls v13.4s, v24.4s, v8.4s // .............................................................*..........................|....................................................................*.......................... - // srshr v24.4S, v14.4S, #23 // .........................................................*..............................|................................................................*.............................. - // mls v14.4s, v24.4s, v8.4s // ................................................................*.......................|.......................................................................*....................... - // srshr v24.4S, v15.4S, #23 // ...............................................................*........................|......................................................................*........................ - // mls v15.4s, v24.4s, v8.4s // ..................................................................*.....................|.........................................................................*..................... - // srshr v24.4S, v16.4S, #23 // ...................................................................*....................|..........................................................................*.................... 
- // mls v16.4s, v24.4s, v8.4s // ......................................................................*.................|.............................................................................*................. - // str q13, [x0, #(4*(1024/8))] // .................................................................*......................|........................................................................*...................... - // str q14, [x0, #(5*(1024/8))] // .....................................................................*..................|............................................................................*.................. - // str q15, [x0, #(6*(1024/8))] // .......................................................................*................|..............................................................................*................ - // str q16, [x0, #(7*(1024/8))] // ...........................................................................*............|..................................................................................*............ - // mul v13.4s, v9.4s, v25.4s // ..................................................*.....................................|.........................................................*..................................... - // sqrdmulh v9.4s, v9.4s, v26.4s // ...................................................*....................................|..........................................................*.................................... - // mls v13.4s, v9.4s, v8.s[0] // ....................................................................*...................|...........................................................................*................... - // mul v14.4s, v10.4s, v25.4s // ........................................................................*...............|...............................................................................*............... 
- // sqrdmulh v10.4s, v10.4s, v26.4s // ..........................................................................*.............|.................................................................................*............. - // mls v14.4s, v10.4s, v8.s[0] // ..............................................................................*.........|.....................................................................................*......... - // mul v15.4s, v11.4s, v25.4s // ............................................................................*...........|...................................................................................*........... - // sqrdmulh v11.4s, v11.4s, v26.4s // .............................................................................*..........|....................................................................................*.......... - // mls v15.4s, v11.4s, v8.s[0] // ................................................................................*.......|.......................................................................................*....... - // mul v16.4s, v12.4s, v25.4s // ..................................................................................*.....|.........................................................................................*..... - // sqrdmulh v12.4s, v12.4s, v26.4s // ...............................................................................*........|......................................................................................*........ - // mls v16.4s, v12.4s, v8.s[0] // ...................................................................................*....|..........................................................................................*.... - // str q13, [x0], #(16) // .........................................................................*..............|................................................................................*.............. 
- // str q14, [x0, #(-16 + 1*(1024/8))] // .................................................................................*......|........................................................................................*...... - // str q15, [x0, #(-16 + 2*(1024/8))] // ....................................................................................*...|...........................................................................................*... - // str q16, [x0, #(-16 + 3*(1024/8))] // .......................................................................................*|..............................................................................................* + // ldr q9, [x0, #0] // ...................................................................................................................*...................................................................................................................... + // ldr q10, [x0, #(1*(1024/8))] // ...................................................................................................................|*..................................................................................................................... + // ldr q11, [x0, #(2*(1024/8))] // ..e................................................................................................................|......e............................................................................................................... + // ldr q12, [x0, #(3*(1024/8))] // ...e...............................................................................................................|.......e.............................................................................................................. 
+ // ldr q13, [x0, #(4*(1024/8))] // .....e.............................................................................................................|.........e............................................................................................................ + // ldr q14, [x0, #(5*(1024/8))] // ......e............................................................................................................|..........e........................................................................................................... + // ldr q15, [x0, #(6*(1024/8))] // e..................................................................................................................|....e................................................................................................................. + // ldr q16, [x0, #(7*(1024/8))] // .......e...........................................................................................................|...........e.......................................................................................................... + // sub v24.4s, v9.4s, v10.4s // ........*..........................................................................................................|............*......................................................................................................... + // add v9.4s, v9.4s, v10.4s // .........*.........................................................................................................|.............*........................................................................................................ + // mul v10.4s, v24.4s, v1.s[2] // ................*..................................................................................................|....................*................................................................................................. 
+ // sqrdmulh v24.4s, v24.4s, v1.s[3] // .....................*.............................................................................................|.........................*............................................................................................ + // mls v10.4s, v24.4s, v8.s[0] // ..............................*....................................................................................|..................................*................................................................................... + // sub v24.4s, v11.4s, v12.4s // ...................................................................................................................|.*.................................................................................................................... + // add v11.4s, v11.4s, v12.4s // ...................................................................................................................|...*.................................................................................................................. + // mul v12.4s, v24.4s, v2.s[0] // ....*..............................................................................................................|........*............................................................................................................. + // sqrdmulh v24.4s, v24.4s, v2.s[1] // ..........*........................................................................................................|..............*....................................................................................................... + // mls v12.4s, v24.4s, v8.s[0] // ...................*...............................................................................................|.......................*.............................................................................................. 
+ // sub v24.4s, v13.4s, v14.4s // ..........................e........................................................................................|..............................e....................................................................................... + // add v13.4s, v13.4s, v14.4s // .*.................................................................................................................|.....*................................................................................................................ + // mul v14.4s, v24.4s, v2.s[2] // ...................................................................................................................|..*................................................................................................................... + // sqrdmulh v24.4s, v24.4s, v2.s[3] // ................................................................................................................e..|....................................................................................................................e. + // mls v14.4s, v24.4s, v8.s[0] // ............*......................................................................................................|................*..................................................................................................... + // sub v24.4s, v15.4s, v16.4s // .................e.................................................................................................|.....................e................................................................................................ + // add v15.4s, v15.4s, v16.4s // ..................e................................................................................................|......................e............................................................................................... 
+ // mul v16.4s, v24.4s, v3.s[0] // ..............................................................................................................e....|..................................................................................................................e... + // sqrdmulh v24.4s, v24.4s, v3.s[1] // ........................................................................................................e..........|............................................................................................................e......... + // mls v16.4s, v24.4s, v8.s[0] // ..................................................................................................................e|...................................................................................................................... + // sub v24.4s, v9.4s, v11.4s // ..............*....................................................................................................|..................*................................................................................................... + // add v9.4s, v9.4s, v11.4s // ...............*...................................................................................................|...................*.................................................................................................. + // mul v11.4s, v24.4s, v0.s[2] // .........................*.........................................................................................|.............................*........................................................................................ + // sqrdmulh v24.4s, v24.4s, v0.s[3] // ...........................*.......................................................................................|...............................*...................................................................................... 
+ // mls v11.4s, v24.4s, v8.s[0] // ...............................*...................................................................................|...................................*.................................................................................. + // sub v24.4s, v10.4s, v12.4s // .................................*.................................................................................|.....................................*................................................................................ + // add v10.4s, v10.4s, v12.4s // ..................................*................................................................................|......................................*............................................................................... + // mul v12.4s, v24.4s, v0.s[2] // ....................................*..............................................................................|........................................*............................................................................. + // sqrdmulh v24.4s, v24.4s, v0.s[3] // .......................................*...........................................................................|...........................................*.......................................................................... + // mls v12.4s, v24.4s, v8.s[0] // ............................................*......................................................................|................................................*..................................................................... + // sub v24.4s, v13.4s, v15.4s // ...........*.......................................................................................................|...............*...................................................................................................... 
+ // add v13.4s, v13.4s, v15.4s // .............*.....................................................................................................|.................*.................................................................................................... + // mul v15.4s, v24.4s, v1.s[0] // ............................*......................................................................................|................................*..................................................................................... + // sqrdmulh v24.4s, v24.4s, v1.s[1] // .............................*.....................................................................................|.................................*.................................................................................... + // mls v15.4s, v24.4s, v8.s[0] // ................................*..................................................................................|....................................*................................................................................. + // sub v24.4s, v14.4s, v16.4s // ....................*..............................................................................................|........................*............................................................................................. + // add v14.4s, v14.4s, v16.4s // ......................*............................................................................................|..........................*........................................................................................... + // mul v16.4s, v24.4s, v1.s[0] // ...................................*...............................................................................|.......................................*.............................................................................. 
+ // sqrdmulh v24.4s, v24.4s, v1.s[1] // .........................................*.........................................................................|.............................................*........................................................................ + // mls v16.4s, v24.4s, v8.s[0] // .............................................*.....................................................................|.................................................*.................................................................... + // sub v24.4s, v9.4s, v13.4s // .......................*...........................................................................................|...........................*.......................................................................................... + // add v9.4s, v9.4s, v13.4s // ........................*..........................................................................................|............................*......................................................................................... + // mul v13.4s, v24.4s, v0.s[0] // ...........................................*.......................................................................|...............................................*...................................................................... + // sqrdmulh v24.4s, v24.4s, v0.s[1] // ..............................................*....................................................................|..................................................*................................................................... + // mls v13.4s, v24.4s, v8.s[0] // ....................................................*..............................................................|........................................................*............................................................. 
+ // sub v24.4s, v10.4s, v14.4s // .....................................*.............................................................................|.........................................*............................................................................ + // add v10.4s, v10.4s, v14.4s // ......................................*............................................................................|..........................................*........................................................................... + // mul v14.4s, v24.4s, v0.s[0] // ...............................................*...................................................................|...................................................*.................................................................. + // sqrdmulh v24.4s, v24.4s, v0.s[1] // .....................................................*.............................................................|.........................................................*............................................................ + // mls v14.4s, v24.4s, v8.s[0] // ..........................................................*........................................................|..............................................................*....................................................... + // sub v24.4s, v11.4s, v15.4s // ........................................*..........................................................................|............................................*......................................................................... + // add v11.4s, v11.4s, v15.4s // ..........................................*........................................................................|..............................................*....................................................................... 
+ // mul v15.4s, v24.4s, v0.s[0] // ......................................................*............................................................|..........................................................*........................................................... + // sqrdmulh v24.4s, v24.4s, v0.s[1] // ........................................................*..........................................................|............................................................*......................................................... + // mls v15.4s, v24.4s, v8.s[0] // .............................................................*.....................................................|.................................................................*.................................................... + // sub v24.4s, v12.4s, v16.4s // ................................................*..................................................................|....................................................*................................................................. + // add v12.4s, v12.4s, v16.4s // .................................................*.................................................................|.....................................................*................................................................ + // mul v16.4s, v24.4s, v0.s[0] // ............................................................*......................................................|................................................................*..................................................... + // sqrdmulh v24.4s, v24.4s, v0.s[1] // ...............................................................*...................................................|...................................................................*.................................................. 
+ // mls v16.4s, v24.4s, v8.s[0] // ......................................................................*............................................|..........................................................................*........................................... + // cmge v27.4s, v31.4s, v13.4s // .......................................................*...........................................................|...........................................................*.......................................................... + // cmge v28.4s, v13.4s, v30.4s // .........................................................*.........................................................|.............................................................*........................................................ + // sub v28.4s, v27.4s, v28.4s // ...........................................................*.......................................................|...............................................................*...................................................... + // mls v13.4s, v28.4s, v8.4s // .................................................................*.................................................|.....................................................................*................................................ + // cmge v27.4s, v31.4s, v14.4s // ..............................................................*....................................................|..................................................................*................................................... + // cmge v28.4s, v14.4s, v30.4s // ................................................................*..................................................|....................................................................*................................................. 
+ // sub v28.4s, v27.4s, v28.4s // ..................................................................*................................................|......................................................................*............................................... + // mls v14.4s, v28.4s, v8.4s // ........................................................................*..........................................|............................................................................*......................................... + // cmge v27.4s, v31.4s, v15.4s // ....................................................................*..............................................|........................................................................*............................................. + // cmge v28.4s, v15.4s, v30.4s // .....................................................................*.............................................|.........................................................................*............................................ + // sub v28.4s, v27.4s, v28.4s // .........................................................................*.........................................|.............................................................................*........................................ + // mls v15.4s, v28.4s, v8.4s // ..............................................................................*....................................|..................................................................................*................................... + // cmge v27.4s, v31.4s, v16.4s // .............................................................................*.....................................|.................................................................................*.................................... 
+ // cmge v28.4s, v16.4s, v30.4s // ...............................................................................*...................................|...................................................................................*.................................. + // sub v28.4s, v27.4s, v28.4s // ...................................................................................*...............................|.......................................................................................*.............................. + // mls v16.4s, v28.4s, v8.4s // ...............................................................................................*...................|...................................................................................................*.................. + // str q13, [x0, #(4*(1024/8))] // .......................................................................*...........................................|...........................................................................*.......................................... + // str q14, [x0, #(5*(1024/8))] // ................................................................................*..................................|....................................................................................*................................. + // str q15, [x0, #(6*(1024/8))] // .....................................................................................*.............................|.........................................................................................*............................ + // str q16, [x0, #(7*(1024/8))] // ......................................................................................................*............|..........................................................................................................*........... 
+ // mul v13.4s, v9.4s, v25.4s // ..................................................*................................................................|......................................................*............................................................... + // sqrdmulh v9.4s, v9.4s, v26.4s // ...................................................*...............................................................|.......................................................*.............................................................. + // mls v13.4s, v9.4s, v8.s[0] // ...................................................................*...............................................|.......................................................................*.............................................. + // mul v14.4s, v10.4s, v25.4s // ............................................................................*......................................|................................................................................*..................................... + // sqrdmulh v10.4s, v10.4s, v26.4s // ..................................................................................*................................|......................................................................................*............................... + // mls v14.4s, v10.4s, v8.s[0] // .......................................................................................*...........................|...........................................................................................*.......................... + // mul v15.4s, v11.4s, v25.4s // ....................................................................................*..............................|........................................................................................*............................. 
+ // sqrdmulh v11.4s, v11.4s, v26.4s // ......................................................................................*............................|..........................................................................................*........................... + // mls v15.4s, v11.4s, v8.s[0] // .........................................................................................*.........................|.............................................................................................*........................ + // mul v16.4s, v12.4s, v25.4s // ...........................................................................................*.......................|...............................................................................................*...................... + // sqrdmulh v12.4s, v12.4s, v26.4s // ........................................................................................*..........................|............................................................................................*......................... + // mls v16.4s, v12.4s, v8.s[0] // .............................................................................................*.....................|.................................................................................................*.................... + // cmge v27.4s, v31.4s, v13.4s // ..........................................................................*........................................|..............................................................................*....................................... + // cmge v28.4s, v13.4s, v30.4s // ...........................................................................*.......................................|...............................................................................*...................................... 
+ // sub v28.4s, v27.4s, v28.4s // .................................................................................*.................................|.....................................................................................*................................ + // mls v13.4s, v28.4s, v8.4s // ..................................................................................................*................|......................................................................................................*............... + // cmge v27.4s, v31.4s, v14.4s // ..........................................................................................*........................|..............................................................................................*....................... + // cmge v28.4s, v14.4s, v30.4s // ............................................................................................*......................|................................................................................................*..................... + // sub v28.4s, v27.4s, v28.4s // ..............................................................................................*....................|..................................................................................................*................... + // mls v14.4s, v28.4s, v8.4s // ....................................................................................................*..............|........................................................................................................*............. + // cmge v27.4s, v31.4s, v15.4s // ................................................................................................*..................|....................................................................................................*................. 
+ // cmge v28.4s, v15.4s, v30.4s // .................................................................................................*.................|.....................................................................................................*................ + // sub v28.4s, v27.4s, v28.4s // .......................................................................................................*...........|...........................................................................................................*.......... + // mls v15.4s, v28.4s, v8.4s // ...........................................................................................................*.......|...............................................................................................................*...... + // cmge v27.4s, v31.4s, v16.4s // ...................................................................................................*...............|.......................................................................................................*.............. + // cmge v28.4s, v16.4s, v30.4s // .....................................................................................................*.............|.........................................................................................................*............ + // sub v28.4s, v27.4s, v28.4s // ..........................................................................................................*........|..............................................................................................................*....... + // mls v16.4s, v28.4s, v8.4s // .............................................................................................................*.....|.................................................................................................................*.... 
+ // str q13, [x0], #(16) // .........................................................................................................*.........|.............................................................................................................*........ + // str q14, [x0, #(-16 + 1*(1024/8))] // ............................................................................................................*......|................................................................................................................*..... + // str q15, [x0, #(-16 + 2*(1024/8))] // ...............................................................................................................*...|...................................................................................................................*.. + // str q16, [x0, #(-16 + 3*(1024/8))] // .................................................................................................................*.|.....................................................................................................................* sub count, count, #1 cbnz count, layer123_start - mul v17.4S, v24.4S, v2.S[2] // *....................................................................................... - add v21.4S, v15.4S, v29.4S // .....*.................................................................................. - ldr q14, [x0, #0] // .*...................................................................................... - add v24.4S, v13.4S, v30.4S // .......*................................................................................ - ldr q28, [x0, #128] // ..*..................................................................................... - ldr q6, [x0, #256] // ...*.................................................................................... - mls v17.4S, v31.4S, v8.S[0] // ......*................................................................................. 
- ldr q31, [x0, #384] // ....*................................................................................... - // gap // ........................................................................................ - // gap // ........................................................................................ - // gap // ........................................................................................ - // gap // ........................................................................................ - mul v15.4S, v12.4S, v3.S[0] // ........*............................................................................... - sub v29.4S, v21.4S, v24.4S // ...............*........................................................................ - // gap // ........................................................................................ - add v21.4S, v21.4S, v24.4S // ................*....................................................................... - // gap // ........................................................................................ - // gap // ........................................................................................ - mls v15.4S, v7.4S, v8.S[0] // ...........*............................................................................ - sub v24.4S, v14.4S, v28.4S // .........*.............................................................................. - // gap // ........................................................................................ - add v14.4S, v14.4S, v28.4S // ..........*............................................................................. - // gap // ........................................................................................ - // gap // ........................................................................................ - sub v28.4S, v6.4S, v31.4S // ............*........................................................................... 
- mul v13.4S, v29.4S, v1.S[0] // ..............................*......................................................... - // gap // ........................................................................................ - add v6.4S, v6.4S, v31.4S // .............*.......................................................................... - // gap // ........................................................................................ - // gap // ........................................................................................ - mul v31.4S, v24.4S, v1.S[2] // ..............*......................................................................... - // gap // ........................................................................................ - // gap // ........................................................................................ - sub v16.4S, v17.4S, v15.4S // ...................*.................................................................... - // gap // ........................................................................................ - // gap // ........................................................................................ - add v17.4S, v17.4S, v15.4S // ....................*................................................................... - sqrdmulh v24.4S, v24.4S, v1.S[3] // .................*...................................................................... - // gap // ........................................................................................ - sub v15.4S, v14.4S, v6.4S // ..................*..................................................................... - // gap // ........................................................................................ - // gap // ........................................................................................ - sqrdmulh v29.4S, v29.4S, v1.S[1] // ................................*....................................................... 
- add v14.4S, v14.4S, v6.4S // ......................*................................................................. - // gap // ........................................................................................ - // gap // ........................................................................................ - // gap // ........................................................................................ - // gap // ........................................................................................ - mul v6.4S, v28.4S, v2.S[0] // .....................*.................................................................. - // gap // ........................................................................................ - // gap // ........................................................................................ - sub v9.4S, v14.4S, v21.4S // .........................*.............................................................. - // gap // ........................................................................................ - // gap // ........................................................................................ - add v21.4S, v14.4S, v21.4S // ..........................*............................................................. - sqrdmulh v14.4S, v28.4S, v2.S[1] // .......................*................................................................ - // gap // ........................................................................................ - // gap // ........................................................................................ - // gap // ........................................................................................ - // gap // ........................................................................................ - mls v31.4S, v24.4S, v8.S[0] // ........................*............................................................... 
- // gap // ........................................................................................ - // gap // ........................................................................................ - // gap // ........................................................................................ - // gap // ........................................................................................ - // gap // ........................................................................................ - mul v24.4S, v15.4S, v0.S[2] // ...........................*............................................................ - // gap // ........................................................................................ - // gap // ........................................................................................ - // gap // ........................................................................................ - // gap // ........................................................................................ - // gap // ........................................................................................ - mls v6.4S, v14.4S, v8.S[0] // ............................*........................................................... - // gap // ........................................................................................ - // gap // ........................................................................................ - // gap // ........................................................................................ - // gap // ........................................................................................ - // gap // ........................................................................................ - mls v13.4S, v29.4S, v8.S[0] // ..........................................*............................................. - // gap // ........................................................................................ 
- // gap // ........................................................................................ - // gap // ........................................................................................ - // gap // ........................................................................................ - // gap // ........................................................................................ - sqrdmulh v14.4S, v15.4S, v0.S[3] // .............................*.......................................................... - // gap // ........................................................................................ - // gap // ........................................................................................ - sub v28.4S, v31.4S, v6.4S // ...............................*........................................................ - // gap // ........................................................................................ - // gap // ........................................................................................ - add v6.4S, v31.4S, v6.4S // .................................*...................................................... - mul v31.4S, v16.4S, v1.S[0] // .......................................*................................................ - // gap // ........................................................................................ - // gap // ........................................................................................ - // gap // ........................................................................................ - // gap // ........................................................................................ - mul v15.4S, v28.4S, v0.S[2] // .....................................*.................................................. - // gap // ........................................................................................ 
- // gap // ........................................................................................ - sub v29.4S, v6.4S, v17.4S // ...................................*.................................................... - // gap // ........................................................................................ - // gap // ........................................................................................ - sqrdmulh v16.4S, v16.4S, v1.S[1] // ........................................*............................................... - add v17.4S, v6.4S, v17.4S // ....................................*................................................... - // gap // ........................................................................................ - // gap // ........................................................................................ - // gap // ........................................................................................ - // gap // ........................................................................................ - mls v24.4S, v14.4S, v8.S[0] // ..................................*..................................................... - // gap // ........................................................................................ - // gap // ........................................................................................ - // gap // ........................................................................................ - // gap // ........................................................................................ - // gap // ........................................................................................ - sqrdmulh v14.4S, v28.4S, v0.S[3] // ......................................*................................................. - // gap // ........................................................................................ 
- // gap // ........................................................................................ - // gap // ........................................................................................ - // gap // ........................................................................................ - // gap // ........................................................................................ - mls v31.4S, v16.4S, v8.S[0] // ...........................................*............................................ - // gap // ........................................................................................ - // gap // ........................................................................................ - sub v28.4S, v24.4S, v13.4S // .............................................*.......................................... - // gap // ........................................................................................ - // gap // ........................................................................................ - add v24.4S, v24.4S, v13.4S // ...............................................*........................................ - mul v6.4S, v9.4S, v0.S[0] // ............................................*........................................... - // gap // ........................................................................................ - // gap // ........................................................................................ - // gap // ........................................................................................ - // gap // ........................................................................................ - mls v15.4S, v14.4S, v8.S[0] // .........................................*.............................................. - // gap // ........................................................................................ 
- // gap // ........................................................................................ - // gap // ........................................................................................ - // gap // ........................................................................................ - // gap // ........................................................................................ - sqrdmulh v14.4S, v9.4S, v0.S[1] // ..............................................*......................................... - // gap // ........................................................................................ - // gap // ........................................................................................ - // gap // ........................................................................................ - // gap // ........................................................................................ - // gap // ........................................................................................ - mul v13.4S, v29.4S, v0.S[0] // ..................................................*..................................... - // gap // ........................................................................................ - // gap // ........................................................................................ - sub v16.4S, v15.4S, v31.4S // ................................................*....................................... - // gap // ........................................................................................ - // gap // ........................................................................................ - sqrdmulh v29.4S, v29.4S, v0.S[1] // ...................................................*.................................... - add v31.4S, v15.4S, v31.4S // .................................................*...................................... 
- // gap // ........................................................................................ - // gap // ........................................................................................ - // gap // ........................................................................................ - // gap // ........................................................................................ - mul v15.4S, v21.4S, v25.4S // ....................................................*................................... - // gap // ........................................................................................ - // gap // ........................................................................................ - // gap // ........................................................................................ - // gap // ........................................................................................ - // gap // ........................................................................................ - sqrdmulh v21.4S, v21.4S, v26.4S // .....................................................*.................................. - // gap // ........................................................................................ - // gap // ........................................................................................ - // gap // ........................................................................................ - // gap // ........................................................................................ - // gap // ........................................................................................ - mls v6.4S, v14.4S, v8.S[0] // ......................................................*................................. - // gap // ........................................................................................ - // gap // ........................................................................................ 
- // gap // ........................................................................................ - // gap // ........................................................................................ - // gap // ........................................................................................ - mls v13.4S, v29.4S, v8.S[0] // .......................................................*................................ - // gap // ........................................................................................ - // gap // ........................................................................................ - // gap // ........................................................................................ - // gap // ........................................................................................ - // gap // ........................................................................................ - mul v14.4S, v28.4S, v0.S[0] // ........................................................*............................... - // gap // ........................................................................................ - // gap // ........................................................................................ - srshr v29.4S, v6.4S, #23 // .........................................................*.............................. - // gap // ........................................................................................ - // gap // ........................................................................................ - sqrdmulh v28.4S, v28.4S, v0.S[1] // ..........................................................*............................. - // gap // ........................................................................................ - // gap // ........................................................................................ 
- srshr v9.4S, v13.4S, #23 // ...........................................................*............................ - // gap // ........................................................................................ - // gap // ........................................................................................ - mul v30.4S, v16.4S, v0.S[0] // ............................................................*........................... - // gap // ........................................................................................ - // gap // ........................................................................................ - // gap // ........................................................................................ - // gap // ........................................................................................ - // gap // ........................................................................................ - sqrdmulh v16.4S, v16.4S, v0.S[1] // .............................................................*.......................... - // gap // ........................................................................................ - // gap // ........................................................................................ - // gap // ........................................................................................ - // gap // ........................................................................................ - // gap // ........................................................................................ - mls v14.4S, v28.4S, v8.S[0] // ..............................................................*......................... - // gap // ........................................................................................ - // gap // ........................................................................................ 
- // gap // ........................................................................................ - // gap // ........................................................................................ - // gap // ........................................................................................ - mls v6.4S, v29.4S, v8.4S // ...............................................................*........................ - // gap // ........................................................................................ - // gap // ........................................................................................ - // gap // ........................................................................................ - // gap // ........................................................................................ - // gap // ........................................................................................ - mls v30.4S, v16.4S, v8.S[0] // ................................................................*....................... - // gap // ........................................................................................ - // gap // ........................................................................................ - srshr v28.4S, v14.4S, #23 // .................................................................*...................... - // gap // ........................................................................................ - // gap // ........................................................................................ - mls v13.4S, v9.4S, v8.4S // ..................................................................*..................... - // gap // ........................................................................................ - // gap // ........................................................................................ 
- str q6, [x0, #512] // ...................................................................*.................... - // gap // ........................................................................................ - // gap // ........................................................................................ - mls v14.4S, v28.4S, v8.4S // ....................................................................*................... - // gap // ........................................................................................ - // gap // ........................................................................................ - srshr v28.4S, v30.4S, #23 // .....................................................................*.................. - // gap // ........................................................................................ - // gap // ........................................................................................ - mls v15.4S, v21.4S, v8.S[0] // ......................................................................*................. - // gap // ........................................................................................ - // gap // ........................................................................................ - str q13, [x0, #640] // .......................................................................*................ - // gap // ........................................................................................ - // gap // ........................................................................................ - mls v30.4S, v28.4S, v8.4S // ........................................................................*............... - // gap // ........................................................................................ - // gap // ........................................................................................ 
- str q14, [x0, #768] // .........................................................................*.............. - // gap // ........................................................................................ - // gap // ........................................................................................ - mul v21.4S, v17.4S, v25.4S // ..........................................................................*............. - // gap // ........................................................................................ - // gap // ........................................................................................ - str q15, [x0], #(16) // ...........................................................................*............ - // gap // ........................................................................................ - // gap // ........................................................................................ - sqrdmulh v17.4S, v17.4S, v26.4S // ............................................................................*........... - // gap // ........................................................................................ - // gap // ........................................................................................ - str q30, [x0, #880] // .............................................................................*.......... - // gap // ........................................................................................ - // gap // ........................................................................................ - mul v14.4S, v24.4S, v25.4S // ..............................................................................*......... - // gap // ........................................................................................ - // gap // ........................................................................................ 
- // gap // ........................................................................................ - // gap // ........................................................................................ - // gap // ........................................................................................ - sqrdmulh v24.4S, v24.4S, v26.4S // ...............................................................................*........ - // gap // ........................................................................................ - // gap // ........................................................................................ - // gap // ........................................................................................ - // gap // ........................................................................................ - // gap // ........................................................................................ - mls v21.4S, v17.4S, v8.S[0] // ................................................................................*....... - // gap // ........................................................................................ - // gap // ........................................................................................ - // gap // ........................................................................................ - // gap // ........................................................................................ - // gap // ........................................................................................ - sqrdmulh v17.4S, v31.4S, v26.4S // .................................................................................*...... - // gap // ........................................................................................ - // gap // ........................................................................................ - // gap // ........................................................................................ 
- // gap // ........................................................................................ - // gap // ........................................................................................ - mls v14.4S, v24.4S, v8.S[0] // ..................................................................................*..... - // gap // ........................................................................................ - // gap // ........................................................................................ - str q21, [x0, #112] // ...................................................................................*.... - // gap // ........................................................................................ - // gap // ........................................................................................ - mul v21.4S, v31.4S, v25.4S // ....................................................................................*... - // gap // ........................................................................................ - // gap // ........................................................................................ - // gap // ........................................................................................ - // gap // ........................................................................................ - // gap // ........................................................................................ - mls v21.4S, v17.4S, v8.S[0] // .....................................................................................*.. - // gap // ........................................................................................ - // gap // ........................................................................................ - str q14, [x0, #240] // ......................................................................................*. 
- // gap // ........................................................................................ - // gap // ........................................................................................ - // gap // ........................................................................................ - // gap // ........................................................................................ - // gap // ........................................................................................ - // gap // ........................................................................................ - // gap // ........................................................................................ - // gap // ........................................................................................ - // gap // ........................................................................................ - // gap // ........................................................................................ - // gap // ........................................................................................ - str q21, [x0, #368] // .......................................................................................* - // gap // ........................................................................................ - // gap // ........................................................................................ + sub v22.4S, v12.4S, v11.4S // ..*........................................................................................................ + mul v16.4S, v29.4S, v2.S[2] // ...*....................................................................................................... + ldr q4, [x0, #0] // *.......................................................................................................... + add v7.4S, v12.4S, v11.4S // ....*...................................................................................................... 
+ ldr q19, [x0, #128] // .*......................................................................................................... + // gap // ........................................................................................................... + add v21.4S, v28.4S, v24.4S // .....*..................................................................................................... + mls v16.4S, v20.4S, v8.S[0] // ...........*............................................................................................... + // gap // ........................................................................................................... + // gap // ........................................................................................................... + // gap // ........................................................................................................... + // gap // ........................................................................................................... + mul v12.4S, v22.4S, v2.S[0] // ......*.................................................................................................... + // gap // ........................................................................................................... + // gap // ........................................................................................................... + sub v11.4S, v21.4S, v18.4S // ..........*................................................................................................ + // gap // ........................................................................................................... + // gap // ........................................................................................................... + add v21.4S, v21.4S, v18.4S // ............*.............................................................................................. 
+ sqrdmulh v22.4S, v22.4S, v2.S[1] // .........*................................................................................................. + // gap // ........................................................................................................... + sub v13.4S, v16.4S, v5.4S // .................*......................................................................................... + // gap // ........................................................................................................... + // gap // ........................................................................................................... + add v16.4S, v16.4S, v5.4S // ...................*....................................................................................... + mul v17.4S, v11.4S, v1.S[0] // ........................*.................................................................................. + // gap // ........................................................................................................... + add v28.4S, v4.4S, v19.4S // ........*.................................................................................................. + // gap // ........................................................................................................... + // gap // ........................................................................................................... + sub v4.4S, v4.4S, v19.4S // .......*................................................................................................... + sqrdmulh v19.4S, v11.4S, v1.S[1] // .........................*................................................................................. + // gap // ........................................................................................................... + // gap // ........................................................................................................... 
+ // gap // ........................................................................................................... + // gap // ........................................................................................................... + mls v12.4S, v22.4S, v8.S[0] // ................*.......................................................................................... + sub v22.4S, v28.4S, v7.4S // .............*............................................................................................. + // gap // ........................................................................................................... + add v7.4S, v28.4S, v7.4S // ..............*............................................................................................ + // gap // ........................................................................................................... + // gap // ........................................................................................................... + mul v11.4S, v4.4S, v1.S[2] // ...............*........................................................................................... + // gap // ........................................................................................................... + // gap // ........................................................................................................... + // gap // ........................................................................................................... + // gap // ........................................................................................................... + // gap // ........................................................................................................... + sqrdmulh v4.4S, v4.4S, v1.S[3] // ..................*........................................................................................ 
+ sub v28.4S, v7.4S, v21.4S // ....................*...................................................................................... + // gap // ........................................................................................................... + add v7.4S, v7.4S, v21.4S // .....................*..................................................................................... + // gap // ........................................................................................................... + // gap // ........................................................................................................... + mul v21.4S, v22.4S, v0.S[2] // ......................*.................................................................................... + // gap // ........................................................................................................... + // gap // ........................................................................................................... + // gap // ........................................................................................................... + // gap // ........................................................................................................... + // gap // ........................................................................................................... + sqrdmulh v22.4S, v22.4S, v0.S[3] // .......................*................................................................................... + // gap // ........................................................................................................... + // gap // ........................................................................................................... + // gap // ........................................................................................................... 
+ // gap // ........................................................................................................... + // gap // ........................................................................................................... + mul v24.4S, v13.4S, v1.S[0] // ...............................*........................................................................... + // gap // ........................................................................................................... + // gap // ........................................................................................................... + // gap // ........................................................................................................... + // gap // ........................................................................................................... + // gap // ........................................................................................................... + sqrdmulh v13.4S, v13.4S, v1.S[1] // .....................................*..................................................................... + // gap // ........................................................................................................... + // gap // ........................................................................................................... + // gap // ........................................................................................................... + // gap // ........................................................................................................... + // gap // ........................................................................................................... + mls v11.4S, v4.4S, v8.S[0] // ..........................*................................................................................ + // gap // ........................................................................................................... 
+ // gap // ........................................................................................................... + // gap // ........................................................................................................... + // gap // ........................................................................................................... + // gap // ........................................................................................................... + mls v17.4S, v19.4S, v8.S[0] // ............................*.............................................................................. + // gap // ........................................................................................................... + // gap // ........................................................................................................... + // gap // ........................................................................................................... + // gap // ........................................................................................................... + // gap // ........................................................................................................... + mls v21.4S, v22.4S, v8.S[0] // ...........................*............................................................................... + // gap // ........................................................................................................... + // gap // ........................................................................................................... + add v4.4S, v11.4S, v12.4S // ..............................*............................................................................ + // gap // ........................................................................................................... + // gap // ........................................................................................................... 
+ sub v22.4S, v11.4S, v12.4S // .............................*............................................................................. + mul v19.4S, v28.4S, v0.S[0] // .......................................*................................................................... + // gap // ........................................................................................................... + // gap // ........................................................................................................... + // gap // ........................................................................................................... + // gap // ........................................................................................................... + mls v24.4S, v13.4S, v8.S[0] // .........................................*................................................................. + sub v12.4S, v4.4S, v16.4S // .................................*......................................................................... + // gap // ........................................................................................................... + add v16.4S, v4.4S, v16.4S // ..................................*........................................................................ + // gap // ........................................................................................................... + // gap // ........................................................................................................... + mul v4.4S, v22.4S, v0.S[2] // ................................*.......................................................................... + sub v11.4S, v21.4S, v17.4S // ....................................*...................................................................... + // gap // ........................................................................................................... 
+ add v21.4S, v21.4S, v17.4S // ......................................*.................................................................... + // gap // ........................................................................................................... + // gap // ........................................................................................................... + sqrdmulh v22.4S, v22.4S, v0.S[3] // ...................................*....................................................................... + // gap // ........................................................................................................... + // gap // ........................................................................................................... + // gap // ........................................................................................................... + // gap // ........................................................................................................... + // gap // ........................................................................................................... + sqrdmulh v13.4S, v28.4S, v0.S[1] // ..........................................*................................................................ + // gap // ........................................................................................................... + // gap // ........................................................................................................... + // gap // ........................................................................................................... + // gap // ........................................................................................................... + // gap // ........................................................................................................... 
+ mul v17.4S, v12.4S, v0.S[0] // ...........................................*............................................................... + // gap // ........................................................................................................... + // gap // ........................................................................................................... + // gap // ........................................................................................................... + // gap // ........................................................................................................... + // gap // ........................................................................................................... + mls v4.4S, v22.4S, v8.S[0] // ........................................*.................................................................. + // gap // ........................................................................................................... + // gap // ........................................................................................................... + // gap // ........................................................................................................... + // gap // ........................................................................................................... + // gap // ........................................................................................................... + mul v22.4S, v7.4S, v25.4S // ..............................................*............................................................ + // gap // ........................................................................................................... + // gap // ........................................................................................................... + // gap // ........................................................................................................... 
+ // gap // ........................................................................................................... + // gap // ........................................................................................................... + sqrdmulh v7.4S, v7.4S, v26.4S // ...............................................*........................................................... + // gap // ........................................................................................................... + // gap // ........................................................................................................... + sub v28.4S, v4.4S, v24.4S // ............................................*.............................................................. + // gap // ........................................................................................................... + // gap // ........................................................................................................... + mls v19.4S, v13.4S, v8.S[0] // ................................................*.......................................................... + add v4.4S, v4.4S, v24.4S // .............................................*............................................................. + // gap // ........................................................................................................... + // gap // ........................................................................................................... + // gap // ........................................................................................................... + // gap // ........................................................................................................... + sqrdmulh v12.4S, v12.4S, v0.S[1] // .................................................*......................................................... 
+ // gap // ........................................................................................................... + // gap // ........................................................................................................... + // gap // ........................................................................................................... + // gap // ........................................................................................................... + // gap // ........................................................................................................... + mul v13.4S, v11.4S, v0.S[0] // ..................................................*........................................................ + // gap // ........................................................................................................... + // gap // ........................................................................................................... + cmge v24.4S, v31.4S, v19.4S // ...................................................*....................................................... + // gap // ........................................................................................................... + // gap // ........................................................................................................... + sqrdmulh v11.4S, v11.4S, v0.S[1] // ....................................................*...................................................... + cmge v14.4S, v19.4S, v30.4S // .....................................................*..................................................... + // gap // ........................................................................................................... + // gap // ........................................................................................................... 
+ // gap // ........................................................................................................... + // gap // ........................................................................................................... + mls v17.4S, v12.4S, v8.S[0] // ......................................................*.................................................... + // gap // ........................................................................................................... + // gap // ........................................................................................................... + sub v12.4S, v24.4S, v14.4S // .......................................................*................................................... + // gap // ........................................................................................................... + // gap // ........................................................................................................... + mul v24.4S, v28.4S, v0.S[0] // ........................................................*.................................................. + // gap // ........................................................................................................... + // gap // ........................................................................................................... + // gap // ........................................................................................................... + // gap // ........................................................................................................... + // gap // ........................................................................................................... + mls v13.4S, v11.4S, v8.S[0] // .........................................................*................................................. 
+ // gap // ........................................................................................................... + // gap // ........................................................................................................... + cmge v11.4S, v31.4S, v17.4S // ..........................................................*................................................ + // gap // ........................................................................................................... + // gap // ........................................................................................................... + sqrdmulh v28.4S, v28.4S, v0.S[1] // ...........................................................*............................................... + cmge v14.4S, v17.4S, v30.4S // ............................................................*.............................................. + // gap // ........................................................................................................... + // gap // ........................................................................................................... + // gap // ........................................................................................................... + // gap // ........................................................................................................... + mls v19.4S, v12.4S, v8.4S // .............................................................*............................................. + // gap // ........................................................................................................... + // gap // ........................................................................................................... + sub v12.4S, v11.4S, v14.4S // ..............................................................*............................................ 
+ // gap // ........................................................................................................... + // gap // ........................................................................................................... + mls v22.4S, v7.4S, v8.S[0] // ...............................................................*........................................... + cmge v7.4S, v31.4S, v13.4S // ................................................................*.......................................... + // gap // ........................................................................................................... + cmge v11.4S, v13.4S, v30.4S // .................................................................*......................................... + // gap // ........................................................................................................... + // gap // ........................................................................................................... + mls v24.4S, v28.4S, v8.S[0] // ..................................................................*........................................ + // gap // ........................................................................................................... + // gap // ........................................................................................................... + str q19, [x0, #512] // ...................................................................*....................................... + // gap // ........................................................................................................... + // gap // ........................................................................................................... + mls v17.4S, v12.4S, v8.4S // ....................................................................*...................................... 
+ sub v7.4S, v7.4S, v11.4S // .....................................................................*..................................... + // gap // ........................................................................................................... + cmge v19.4S, v31.4S, v22.4S // ......................................................................*.................................... + // gap // ........................................................................................................... + // gap // ........................................................................................................... + cmge v12.4S, v22.4S, v30.4S // .......................................................................*................................... + mul v11.4S, v16.4S, v25.4S // ........................................................................*.................................. + // gap // ........................................................................................................... + cmge v28.4S, v31.4S, v24.4S // .........................................................................*................................. + // gap // ........................................................................................................... + // gap // ........................................................................................................... + sqrdmulh v16.4S, v16.4S, v26.4S // ..............................................................................*............................ + cmge v14.4S, v24.4S, v30.4S // ...........................................................................*............................... + // gap // ........................................................................................................... + str q17, [x0, #640] // ............................................................................*.............................. 
+ sub v19.4S, v19.4S, v12.4S // .............................................................................*............................. + // gap // ........................................................................................................... + mul v12.4S, v21.4S, v25.4S // ................................................................................*.......................... + // gap // ........................................................................................................... + // gap // ........................................................................................................... + sub v17.4S, v28.4S, v14.4S // ...............................................................................*........................... + // gap // ........................................................................................................... + // gap // ........................................................................................................... + sqrdmulh v21.4S, v21.4S, v26.4S // ..................................................................................*........................ + // gap // ........................................................................................................... + // gap // ........................................................................................................... + // gap // ........................................................................................................... + // gap // ........................................................................................................... + // gap // ........................................................................................................... + mls v11.4S, v16.4S, v8.S[0] // ...................................................................................*....................... 
+ // gap // ........................................................................................................... + // gap // ........................................................................................................... + // gap // ........................................................................................................... + // gap // ........................................................................................................... + // gap // ........................................................................................................... + sqrdmulh v16.4S, v4.4S, v26.4S // ....................................................................................*...................... + // gap // ........................................................................................................... + // gap // ........................................................................................................... + // gap // ........................................................................................................... + // gap // ........................................................................................................... + // gap // ........................................................................................................... + mls v12.4S, v21.4S, v8.S[0] // .....................................................................................*..................... + // gap // ........................................................................................................... + // gap // ........................................................................................................... + cmge v21.4S, v31.4S, v11.4S // ......................................................................................*.................... + // gap // ........................................................................................................... 
+ // gap // ........................................................................................................... + mul v4.4S, v4.4S, v25.4S // .......................................................................................*................... + cmge v28.4S, v11.4S, v30.4S // ........................................................................................*.................. + // gap // ........................................................................................................... + // gap // ........................................................................................................... + // gap // ........................................................................................................... + // gap // ........................................................................................................... + mls v4.4S, v16.4S, v8.S[0] // .........................................................................................*................. + // gap // ........................................................................................................... + // gap // ........................................................................................................... + sub v16.4S, v21.4S, v28.4S // ..........................................................................................*................ + // gap // ........................................................................................................... + // gap // ........................................................................................................... + mls v13.4S, v7.4S, v8.4S // ..........................................................................*................................ + cmge v7.4S, v31.4S, v12.4S // ............................................................................................*.............. 
+ // gap // ........................................................................................................... + cmge v21.4S, v12.4S, v30.4S // .............................................................................................*............. + // gap // ........................................................................................................... + // gap // ........................................................................................................... + mls v24.4S, v17.4S, v8.4S // ...........................................................................................*............... + // gap // ........................................................................................................... + // gap // ........................................................................................................... + cmge v17.4S, v31.4S, v4.4S // ...............................................................................................*........... + // gap // ........................................................................................................... + // gap // ........................................................................................................... + mls v22.4S, v19.4S, v8.4S // ..............................................................................................*............ + cmge v19.4S, v4.4S, v30.4S // .................................................................................................*......... + // gap // ........................................................................................................... + str q13, [x0, #768] // .................................................................................*......................... + sub v7.4S, v7.4S, v21.4S // ...................................................................................................*....... 
+ // gap // ........................................................................................................... + mls v11.4S, v16.4S, v8.4S // ................................................................................................*.......... + // gap // ........................................................................................................... + // gap // ........................................................................................................... + str q24, [x0, #896] // ..................................................................................................*........ + sub v16.4S, v17.4S, v19.4S // .....................................................................................................*..... + // gap // ........................................................................................................... + mls v12.4S, v7.4S, v8.4S // ......................................................................................................*.... + // gap // ........................................................................................................... + // gap // ........................................................................................................... + str q22, [x0], #(16) // ....................................................................................................*...... + // gap // ........................................................................................................... + // gap // ........................................................................................................... + mls v4.4S, v16.4S, v8.4S // ........................................................................................................*.. + // gap // ........................................................................................................... 
+ // gap // ........................................................................................................... + str q11, [x0, #112] // .......................................................................................................*... + // gap // ........................................................................................................... + // gap // ........................................................................................................... + // gap // ........................................................................................................... + // gap // ........................................................................................................... + // gap // ........................................................................................................... + str q12, [x0, #240] // .........................................................................................................*. + // gap // ........................................................................................................... + // gap // ........................................................................................................... + // gap // ........................................................................................................... + // gap // ........................................................................................................... + // gap // ........................................................................................................... + str q4, [x0, #368] // ..........................................................................................................* + // gap // ........................................................................................................... + // gap // ........................................................................................................... 
// original source code - // mul v17.4S, v24.4S, v2.S[2] // *....................................................................................... - // ldr q21, [x0, #0] // ..*..................................................................................... - // ldr q14, [x0, #128] // ....*................................................................................... - // ldr q24, [x0, #256] // .....*.................................................................................. - // ldr q28, [x0, #384] // .......*................................................................................ - // add v6.4S, v15.4S, v29.4S // .*...................................................................................... - // mls v17.4S, v31.4S, v8.S[0] // ......*................................................................................. - // add v31.4S, v13.4S, v30.4S // ...*.................................................................................... - // mul v16.4S, v12.4S, v3.S[0] // ........*............................................................................... - // sub v9.4S, v21.4S, v14.4S // ............*........................................................................... - // add v21.4S, v21.4S, v14.4S // .............*.......................................................................... - // mls v16.4S, v7.4S, v8.S[0] // ...........*............................................................................ - // sub v14.4S, v24.4S, v28.4S // ..............*......................................................................... - // add v24.4S, v24.4S, v28.4S // ................*....................................................................... - // mul v28.4S, v9.4S, v1.S[2] // .................*...................................................................... - // sub v7.4S, v6.4S, v31.4S // .........*.............................................................................. 
- // add v6.4S, v6.4S, v31.4S // ..........*............................................................................. - // sqrdmulh v31.4S, v9.4S, v1.S[3] // ....................*................................................................... - // sub v9.4S, v21.4S, v24.4S // .....................*.................................................................. - // sub v18.4S, v17.4S, v16.4S // ..................*..................................................................... - // add v17.4S, v17.4S, v16.4S // ...................*.................................................................... - // mul v16.4S, v14.4S, v2.S[0] // ........................*............................................................... - // add v21.4S, v21.4S, v24.4S // .......................*................................................................ - // sqrdmulh v14.4S, v14.4S, v2.S[1] // ...........................*............................................................ - // mls v28.4S, v31.4S, v8.S[0] // ............................*........................................................... - // sub v31.4S, v21.4S, v6.4S // .........................*.............................................................. - // add v21.4S, v21.4S, v6.4S // ..........................*............................................................. - // mul v6.4S, v9.4S, v0.S[2] // .............................*.......................................................... - // mls v16.4S, v14.4S, v8.S[0] // ..............................*......................................................... - // sqrdmulh v14.4S, v9.4S, v0.S[3] // ................................*....................................................... - // mul v9.4S, v7.4S, v1.S[0] // ...............*........................................................................ - // sub v23.4S, v28.4S, v16.4S // .................................*...................................................... 
- // sqrdmulh v7.4S, v7.4S, v1.S[1] // ......................*................................................................. - // add v28.4S, v28.4S, v16.4S // ..................................*..................................................... - // mls v6.4S, v14.4S, v8.S[0] // ........................................*............................................... - // sub v14.4S, v28.4S, v17.4S // .....................................*.................................................. - // add v17.4S, v28.4S, v17.4S // .......................................*................................................ - // mul v28.4S, v23.4S, v0.S[2] // ....................................*................................................... - // sqrdmulh v16.4S, v23.4S, v0.S[3] // .........................................*.............................................. - // mul v23.4S, v18.4S, v1.S[0] // ...................................*.................................................... - // sqrdmulh v18.4S, v18.4S, v1.S[1] // ......................................*................................................. - // mls v28.4S, v16.4S, v8.S[0] // ..............................................*......................................... - // mls v9.4S, v7.4S, v8.S[0] // ...............................*........................................................ - // mls v23.4S, v18.4S, v8.S[0] // ..........................................*............................................. - // mul v16.4S, v31.4S, v0.S[0] // .............................................*.......................................... - // sub v7.4S, v6.4S, v9.4S // ...........................................*............................................ - // sqrdmulh v31.4S, v31.4S, v0.S[1] // ...............................................*........................................ 
- // add v6.4S, v6.4S, v9.4S // ............................................*........................................... - // sub v9.4S, v28.4S, v23.4S // .................................................*...................................... - // add v28.4S, v28.4S, v23.4S // ...................................................*.................................... - // mul v18.4S, v14.4S, v0.S[0] // ................................................*....................................... - // sqrdmulh v14.4S, v14.4S, v0.S[1] // ..................................................*..................................... - // mul v23.4S, v21.4S, v25.4S // ....................................................*................................... - // sqrdmulh v21.4S, v21.4S, v26.4S // .....................................................*.................................. - // mls v16.4S, v31.4S, v8.S[0] // ......................................................*................................. - // mls v18.4S, v14.4S, v8.S[0] // .......................................................*................................ - // mul v14.4S, v7.4S, v0.S[0] // ........................................................*............................... - // srshr v31.4S, v16.4S, #23 // .........................................................*.............................. - // sqrdmulh v7.4S, v7.4S, v0.S[1] // ..........................................................*............................. - // srshr v10.4S, v18.4S, #23 // ...........................................................*............................ - // mul v4.4S, v9.4S, v0.S[0] // ............................................................*........................... - // sqrdmulh v9.4S, v9.4S, v0.S[1] // .............................................................*.......................... - // mls v14.4S, v7.4S, v8.S[0] // ..............................................................*......................... 
- // mls v16.4S, v31.4S, v8.4S // ...............................................................*........................ - // mls v4.4S, v9.4S, v8.S[0] // ................................................................*....................... - // srshr v31.4S, v14.4S, #23 // .................................................................*...................... - // mls v18.4S, v10.4S, v8.4S // ..................................................................*..................... - // str q16, [x0, #512] // ...................................................................*.................... - // mls v14.4S, v31.4S, v8.4S // ....................................................................*................... - // srshr v31.4S, v4.4S, #23 // .....................................................................*.................. - // mls v23.4S, v21.4S, v8.S[0] // ......................................................................*................. - // str q18, [x0, #640] // .......................................................................*................ - // mls v4.4S, v31.4S, v8.4S // ........................................................................*............... - // str q14, [x0, #768] // .........................................................................*.............. - // mul v21.4S, v17.4S, v25.4S // ..........................................................................*............. - // str q23, [x0], #(16) // ...........................................................................*............ - // sqrdmulh v17.4S, v17.4S, v26.4S // ............................................................................*........... - // str q4, [x0, #880] // .............................................................................*.......... - // mul v14.4S, v6.4S, v25.4S // ..............................................................................*......... 
- // sqrdmulh v6.4S, v6.4S, v26.4S // ...............................................................................*........ - // mls v21.4S, v17.4S, v8.S[0] // ................................................................................*....... - // sqrdmulh v17.4S, v28.4S, v26.4S // .................................................................................*...... - // mls v14.4S, v6.4S, v8.S[0] // ..................................................................................*..... - // str q21, [x0, #112] // ...................................................................................*.... - // mul v21.4S, v28.4S, v25.4S // ....................................................................................*... - // mls v21.4S, v17.4S, v8.S[0] // .....................................................................................*.. - // str q14, [x0, #240] // ......................................................................................*. - // str q21, [x0, #368] // .......................................................................................* + // ldr q16, [x0, #0] // ..*........................................................................................................ + // ldr q4, [x0, #128] // ....*...................................................................................................... + // sub v7.4S, v12.4S, v11.4S // *.......................................................................................................... + // mul v19.4S, v29.4S, v2.S[2] // .*......................................................................................................... + // add v22.4S, v12.4S, v11.4S // ...*....................................................................................................... + // add v21.4S, v28.4S, v24.4S // .....*..................................................................................................... 
+ // mul v17.4S, v7.4S, v2.S[0] // .......*................................................................................................... + // sub v29.4S, v16.4S, v4.4S // ...............*........................................................................................... + // add v16.4S, v16.4S, v4.4S // ..............*............................................................................................ + // sqrdmulh v4.4S, v7.4S, v2.S[1] // ..........*................................................................................................ + // sub v7.4S, v21.4S, v18.4S // ........*.................................................................................................. + // mls v19.4S, v20.4S, v8.S[0] // ......*.................................................................................................... + // add v21.4S, v21.4S, v18.4S // .........*................................................................................................. + // sub v20.4S, v16.4S, v22.4S // ..................*........................................................................................ + // add v16.4S, v16.4S, v22.4S // ...................*....................................................................................... + // mul v22.4S, v29.4S, v1.S[2] // ....................*...................................................................................... + // mls v17.4S, v4.4S, v8.S[0] // .................*......................................................................................... + // sub v4.4S, v19.4S, v5.4S // ...........*............................................................................................... + // sqrdmulh v13.4S, v29.4S, v1.S[3] // .....................*..................................................................................... 
+ // add v19.4S, v19.4S, v5.4S // ............*.............................................................................................. + // sub v14.4S, v16.4S, v21.4S // ......................*.................................................................................... + // add v16.4S, v16.4S, v21.4S // .......................*................................................................................... + // mul v21.4S, v20.4S, v0.S[2] // ........................*.................................................................................. + // sqrdmulh v20.4S, v20.4S, v0.S[3] // .........................*................................................................................. + // mul v5.4S, v7.4S, v1.S[0] // .............*............................................................................................. + // sqrdmulh v7.4S, v7.4S, v1.S[1] // ................*.......................................................................................... + // mls v22.4S, v13.4S, v8.S[0] // ............................*.............................................................................. + // mls v21.4S, v20.4S, v8.S[0] // ..............................*............................................................................ + // mls v5.4S, v7.4S, v8.S[0] // .............................*............................................................................. + // sub v7.4S, v22.4S, v17.4S // ................................*.......................................................................... + // add v22.4S, v22.4S, v17.4S // ...............................*........................................................................... + // mul v13.4S, v4.4S, v1.S[0] // ..........................*................................................................................ 
+ // mul v17.4S, v7.4S, v0.S[2] // .....................................*..................................................................... + // sub v20.4S, v22.4S, v19.4S // ...................................*....................................................................... + // add v22.4S, v22.4S, v19.4S // ....................................*...................................................................... + // sqrdmulh v7.4S, v7.4S, v0.S[3] // ........................................*.................................................................. + // sub v19.4S, v21.4S, v5.4S // ......................................*.................................................................... + // sqrdmulh v4.4S, v4.4S, v1.S[1] // ...........................*............................................................................... + // add v21.4S, v21.4S, v5.4S // .......................................*................................................................... + // mul v5.4S, v14.4S, v0.S[0] // .................................*......................................................................... + // mls v17.4S, v7.4S, v8.S[0] // ...........................................*............................................................... + // mls v13.4S, v4.4S, v8.S[0] // ..................................*........................................................................ + // sqrdmulh v4.4S, v14.4S, v0.S[1] // .........................................*................................................................. + // mul v7.4S, v20.4S, v0.S[0] // ..........................................*................................................................ + // sub v14.4S, v17.4S, v13.4S // ..............................................*............................................................ 
+ // add v13.4S, v17.4S, v13.4S // ................................................*.......................................................... + // mul v17.4S, v16.4S, v25.4S // ............................................*.............................................................. + // sqrdmulh v16.4S, v16.4S, v26.4S // .............................................*............................................................. + // mls v5.4S, v4.4S, v8.S[0] // ...............................................*........................................................... + // sqrdmulh v4.4S, v20.4S, v0.S[1] // .................................................*......................................................... + // mul v20.4S, v19.4S, v0.S[0] // ..................................................*........................................................ + // cmge v10.4S, v31.4S, v5.4S // ...................................................*....................................................... + // sqrdmulh v19.4S, v19.4S, v0.S[1] // ....................................................*...................................................... + // cmge v9.4S, v5.4S, v30.4S // .....................................................*..................................................... + // mls v7.4S, v4.4S, v8.S[0] // ......................................................*.................................................... + // sub v4.4S, v10.4S, v9.4S // .......................................................*................................................... + // mul v10.4S, v14.4S, v0.S[0] // ........................................................*.................................................. + // mls v20.4S, v19.4S, v8.S[0] // .........................................................*................................................. 
+ // cmge v19.4S, v31.4S, v7.4S // ..........................................................*................................................ + // sqrdmulh v14.4S, v14.4S, v0.S[1] // ...........................................................*............................................... + // cmge v9.4S, v7.4S, v30.4S // ............................................................*.............................................. + // mls v5.4S, v4.4S, v8.4S // .............................................................*............................................. + // sub v4.4S, v19.4S, v9.4S // ..............................................................*............................................ + // mls v17.4S, v16.4S, v8.S[0] // ...............................................................*........................................... + // cmge v16.4S, v31.4S, v20.4S // ................................................................*.......................................... + // cmge v19.4S, v20.4S, v30.4S // .................................................................*......................................... + // mls v10.4S, v14.4S, v8.S[0] // ..................................................................*........................................ + // str q5, [x0, #512] // ...................................................................*....................................... + // mls v7.4S, v4.4S, v8.4S // ....................................................................*...................................... + // sub v16.4S, v16.4S, v19.4S // .....................................................................*..................................... + // cmge v4.4S, v31.4S, v17.4S // ......................................................................*.................................... + // cmge v19.4S, v17.4S, v30.4S // .......................................................................*................................... 
+ // mul v14.4S, v22.4S, v25.4S // ........................................................................*.................................. + // cmge v5.4S, v31.4S, v10.4S // .........................................................................*................................. + // mls v20.4S, v16.4S, v8.4S // .........................................................................................*................. + // cmge v16.4S, v10.4S, v30.4S // ...........................................................................*............................... + // str q7, [x0, #640] // ............................................................................*.............................. + // sub v4.4S, v4.4S, v19.4S // .............................................................................*............................. + // sqrdmulh v7.4S, v22.4S, v26.4S // ..........................................................................*................................ + // sub v16.4S, v5.4S, v16.4S // ...............................................................................*........................... + // mul v22.4S, v21.4S, v25.4S // ..............................................................................*............................ + // str q20, [x0, #768] // ................................................................................................*.......... + // sqrdmulh v19.4S, v21.4S, v26.4S // ................................................................................*.......................... + // mls v14.4S, v7.4S, v8.S[0] // .................................................................................*......................... + // sqrdmulh v7.4S, v13.4S, v26.4S // ..................................................................................*........................ + // mls v22.4S, v19.4S, v8.S[0] // ...................................................................................*....................... 
+ // cmge v19.4S, v31.4S, v14.4S // ....................................................................................*...................... + // mul v21.4S, v13.4S, v25.4S // .....................................................................................*..................... + // cmge v13.4S, v14.4S, v30.4S // ......................................................................................*.................... + // mls v21.4S, v7.4S, v8.S[0] // .......................................................................................*................... + // sub v7.4S, v19.4S, v13.4S // ........................................................................................*.................. + // mls v10.4S, v16.4S, v8.4S // ............................................................................................*.............. + // cmge v16.4S, v31.4S, v22.4S // ..........................................................................................*................ + // cmge v19.4S, v22.4S, v30.4S // ...........................................................................................*............... + // mls v17.4S, v4.4S, v8.4S // ..............................................................................................*............ + // cmge v4.4S, v31.4S, v21.4S // .............................................................................................*............. + // mls v14.4S, v7.4S, v8.4S // ..................................................................................................*........ + // cmge v7.4S, v21.4S, v30.4S // ...............................................................................................*........... + // str q10, [x0, #896] // ...................................................................................................*....... + // sub v16.4S, v16.4S, v19.4S // .................................................................................................*......... 
+ // str q17, [x0], #(16) // ......................................................................................................*.... + // sub v4.4S, v4.4S, v7.4S // ....................................................................................................*...... + // mls v22.4S, v16.4S, v8.4S // .....................................................................................................*..... + // str q14, [x0, #112] // ........................................................................................................*.. + // mls v21.4S, v4.4S, v8.4S // .......................................................................................................*... + // str q22, [x0, #240] // .........................................................................................................*. + // str q21, [x0, #368] // ..........................................................................................................* pop_stack diff --git a/tests/ntt_dilithium/manual/intt_dilithium_123_45678_opt_m1_firestorm.s b/tests/ntt_dilithium/manual/intt_dilithium_123_45678_opt_m1_firestorm.s index 43d7438..78aa47d 100644 --- a/tests/ntt_dilithium/manual/intt_dilithium_123_45678_opt_m1_firestorm.s +++ b/tests/ntt_dilithium/manual/intt_dilithium_123_45678_opt_m1_firestorm.s @@ -371,6 +371,8 @@ _intt_dilithium_123_45678_opt_m1_firestorm: consts .req v8 qform_consts .req q8 + modulus .req v29 + ASM_LOAD(r_ptr0, roots_l345) ASM_LOAD(r_ptr1, roots_l67) @@ -393,1823 +395,1940 @@ _intt_dilithium_123_45678_opt_m1_firestorm: qform_root3_tw .req q7 .p2align 2 - ldr q7, [x5, #80] // ..............*............................................................................................................... - ldr q26, [x5, #48] // .........*.................................................................................................................... 
- ld4 {v18.4S, v19.4S, v20.4S, v21.4S}, [x1] // .......*...................................................................................................................... - ldr q25, [x5, #16] // ..........*................................................................................................................... - ld4 {v10.4S, v11.4S, v12.4S, v13.4S}, [x2] // ................*............................................................................................................. - // gap // .............................................................................................................................. - // gap // .............................................................................................................................. - // gap // .............................................................................................................................. - ldr q9, [x5, #32] // ......*....................................................................................................................... - ldr q4, [x5, #144] // ....*......................................................................................................................... - ldr q28, [x5, #96] // ..*........................................................................................................................... - // gap // .............................................................................................................................. - // gap // .............................................................................................................................. - // gap // .............................................................................................................................. - // gap // .............................................................................................................................. 
- // gap // .............................................................................................................................. - ldr q5, [x5, #64] // ........*..................................................................................................................... - ldr q6, [x5, #160] // ............*................................................................................................................. - // gap // .............................................................................................................................. - // gap // .............................................................................................................................. - // gap // .............................................................................................................................. - // gap // .............................................................................................................................. - // gap // .............................................................................................................................. - // gap // .............................................................................................................................. - ldr q3, [x5, #176] // .....*........................................................................................................................ - ldr q29, [x5, #112] // ...*.......................................................................................................................... - // gap // .............................................................................................................................. - // gap // .............................................................................................................................. - // gap // .............................................................................................................................. 
- // gap // .............................................................................................................................. - // gap // .............................................................................................................................. - // gap // .............................................................................................................................. - ldr q22, [x5, #128] // .....................*........................................................................................................ - ldr q31, [x5], #(12*16) // .............*................................................................................................................ - // gap // .............................................................................................................................. - // gap // .............................................................................................................................. - // gap // .............................................................................................................................. - // gap // .............................................................................................................................. - // gap // .............................................................................................................................. - // gap // .............................................................................................................................. - // gap // .............................................................................................................................. - // gap // .............................................................................................................................. - // gap // .............................................................................................................................. 
- // gap // .............................................................................................................................. - // gap // .............................................................................................................................. - // gap // .............................................................................................................................. - // gap // .............................................................................................................................. - // gap // .............................................................................................................................. - // gap // .............................................................................................................................. - sub v2.4S, v12.4S, v13.4S // ...............................*.............................................................................................. - sub v1.4S, v20.4S, v21.4S // ...................*.......................................................................................................... - add v14.4S, v18.4S, v19.4S // ..................*........................................................................................................... - // gap // .............................................................................................................................. - sub v17.4S, v10.4S, v11.4S // ................................*............................................................................................. - // gap // .............................................................................................................................. - // gap // .............................................................................................................................. 
- add v23.4S, v12.4S, v13.4S // ..................................*........................................................................................... - sub v27.4S, v18.4S, v19.4S // .................*............................................................................................................ - add v0.4S, v10.4S, v11.4S // ....................................*......................................................................................... - add v11.4S, v20.4S, v21.4S // ....................*......................................................................................................... - // gap // .............................................................................................................................. - // gap // .............................................................................................................................. - // gap // .............................................................................................................................. - // gap // .............................................................................................................................. - mul v24.4S, v2.4S, v6.4S // .....................................*........................................................................................ - sqrdmulh v30.4S, v2.4S, v3.4S // ......................................*....................................................................................... - sqrdmulh v19.4S, v17.4S, v4.4S // .......................................*...................................................................................... - mul v16.4S, v17.4S, v22.4S // ........................................*..................................................................................... - ldr q3, [x4, #16] // .*............................................................................................................................ 
- // gap // .............................................................................................................................. - // gap // .............................................................................................................................. - // gap // .............................................................................................................................. - mul v15.4S, v27.4S, v9.4S // ..........................*................................................................................................... - sqrdmulh v27.4S, v27.4S, v26.4S // ...........................*.................................................................................................. - mul v9.4S, v1.4S, v5.4S // ........................*..................................................................................................... - sqrdmulh v1.4S, v1.4S, v7.4S // .........................*.................................................................................................... - // gap // .............................................................................................................................. - // gap // .............................................................................................................................. - // gap // .............................................................................................................................. - // gap // .............................................................................................................................. - // gap // .............................................................................................................................. - sub v10.4S, v0.4S, v23.4S // .........................................*.................................................................................... 
- // gap // .............................................................................................................................. - // gap // .............................................................................................................................. - // gap // .............................................................................................................................. - // gap // .............................................................................................................................. - // gap // .............................................................................................................................. - // gap // .............................................................................................................................. - sub v6.4S, v14.4S, v11.4S // ......................*....................................................................................................... - // gap // .............................................................................................................................. - mls v24.4S, v30.4S, v8.S[0] // ................................................*............................................................................. - mls v16.4S, v19.4S, v8.S[0] // ...............................................*.............................................................................. - // gap // .............................................................................................................................. - // gap // .............................................................................................................................. - // gap // .............................................................................................................................. 
- // gap // .............................................................................................................................. - mls v15.4S, v27.4S, v8.S[0] // .................................*............................................................................................ - mls v9.4S, v1.4S, v8.S[0] // ..............................*............................................................................................... - mul v19.4S, v10.4S, v28.4S // ..................................................*........................................................................... - sqrdmulh v10.4S, v10.4S, v29.4S // ......................................................*....................................................................... - // gap // .............................................................................................................................. - // gap // .............................................................................................................................. - // gap // .............................................................................................................................. - // gap // .............................................................................................................................. - sqrdmulh v27.4S, v6.4S, v25.4S // .............................*................................................................................................ - // gap // .............................................................................................................................. - mul v21.4S, v6.4S, v31.4S // ............................*................................................................................................. - // gap // .............................................................................................................................. 
- // gap // .............................................................................................................................. - // gap // .............................................................................................................................. - // gap // .............................................................................................................................. - // gap // .............................................................................................................................. - add v26.4S, v0.4S, v23.4S // ..........................................*................................................................................... - // gap // .............................................................................................................................. - sub v2.4S, v16.4S, v24.4S // ....................................................*......................................................................... - // gap // .............................................................................................................................. - // gap // .............................................................................................................................. - // gap // .............................................................................................................................. - // gap // .............................................................................................................................. - // gap // .............................................................................................................................. - mls v19.4S, v10.4S, v8.S[0] // ..............................................................*............................................................... 
- sub v7.4S, v15.4S, v9.4S // ...........................................*.................................................................................. - add v13.4S, v15.4S, v9.4S // ............................................*................................................................................. - add v10.4S, v14.4S, v11.4S // .......................*...................................................................................................... - // gap // .............................................................................................................................. - // gap // .............................................................................................................................. - // gap // .............................................................................................................................. - // gap // .............................................................................................................................. - mls v21.4S, v27.4S, v8.S[0] // ...................................*.......................................................................................... - sqrdmulh v9.4S, v2.4S, v29.4S // ...........................................................*.................................................................. - mul v27.4S, v2.4S, v28.4S // ..........................................................*................................................................... - // gap // .............................................................................................................................. - // gap // .............................................................................................................................. - // gap // .............................................................................................................................. 
- // gap // .............................................................................................................................. - // gap // .............................................................................................................................. - trn2 v12.4S, v10.4S, v13.4S // .......................................................*...................................................................... - trn1 v22.4S, v10.4S, v13.4S // .................................................*............................................................................ - sqrdmulh v4.4S, v7.4S, v25.4S // ..............................................*............................................................................... - ldr q29, [x4, #32] // ...............*.............................................................................................................. - mul v20.4S, v7.4S, v31.4S // .............................................*................................................................................ - // gap // .............................................................................................................................. - // gap // .............................................................................................................................. - // gap // .............................................................................................................................. - add v1.4S, v16.4S, v24.4S // ...................................................*.......................................................................... - ldr q14, [x4, #48] // *............................................................................................................................. - // gap // .............................................................................................................................. 
- // gap // .............................................................................................................................. - // gap // .............................................................................................................................. - // gap // .............................................................................................................................. - // gap // .............................................................................................................................. - // gap // .............................................................................................................................. - mls v27.4S, v9.4S, v8.S[0] // ...............................................................*.............................................................. - ldr q9, [x4], #64 // ...........*.................................................................................................................. - // gap // .............................................................................................................................. - // gap // .............................................................................................................................. - // gap // .............................................................................................................................. - // gap // .............................................................................................................................. - // gap // .............................................................................................................................. - // gap // .............................................................................................................................. - mls v20.4S, v4.4S, v8.S[0] // .....................................................*........................................................................ 
- // gap // .............................................................................................................................. - // gap // .............................................................................................................................. - // gap // .............................................................................................................................. - // gap // .............................................................................................................................. - // gap // .............................................................................................................................. - // gap // .............................................................................................................................. - // gap // .............................................................................................................................. - trn2 v7.4S, v26.4S, v1.4S // .........................................................*.................................................................... - // gap // .............................................................................................................................. - // gap // .............................................................................................................................. - // gap // .............................................................................................................................. - // gap // .............................................................................................................................. - // gap // .............................................................................................................................. - // gap // .............................................................................................................................. 
- // gap // .............................................................................................................................. - trn2 v18.4S, v19.4S, v27.4S // ......................................................................*....................................................... - trn1 v16.4S, v26.4S, v1.4S // ........................................................*..................................................................... - trn1 v10.4S, v19.4S, v27.4S // .........................................................................*.................................................... - // gap // .............................................................................................................................. - // gap // .............................................................................................................................. - // gap // .............................................................................................................................. - // gap // .............................................................................................................................. - // gap // .............................................................................................................................. - trn2 v27.4S, v21.4S, v20.4S // ............................................................*................................................................. - trn1 v11.4S, v21.4S, v20.4S // .............................................................*................................................................ - // gap // .............................................................................................................................. - // gap // .............................................................................................................................. 
- // gap // .............................................................................................................................. - // gap // .............................................................................................................................. - // gap // .............................................................................................................................. - // gap // .............................................................................................................................. - trn2 v20.2D, v7.2D, v18.2D // ............................................................................*................................................. - trn1 v30.2D, v7.2D, v18.2D // .............................................................................*................................................ - trn2 v7.2D, v16.2D, v10.2D // ..............................................................................*............................................... - trn1 v2.2D, v16.2D, v10.2D // ...............................................................................*.............................................. - // gap // .............................................................................................................................. - // gap // .............................................................................................................................. - // gap // .............................................................................................................................. - // gap // .............................................................................................................................. - trn2 v13.2D, v12.2D, v27.2D // ..................................................................*........................................................... 
- trn1 v31.2D, v12.2D, v27.2D // ...................................................................*.......................................................... - trn2 v19.2D, v22.2D, v11.2D // ................................................................*............................................................. - trn1 v0.2D, v22.2D, v11.2D // .................................................................*............................................................ - // gap // .............................................................................................................................. - // gap // .............................................................................................................................. - // gap // .............................................................................................................................. - // gap // .............................................................................................................................. - sub v22.4S, v2.4S, v30.4S // ..................................................................................*........................................... - sub v18.4S, v7.4S, v20.4S // .....................................................................................*........................................ - // gap // .............................................................................................................................. - // gap // .............................................................................................................................. - // gap // .............................................................................................................................. - // gap // .............................................................................................................................. 
- // gap // .............................................................................................................................. - // gap // .............................................................................................................................. - sub v11.4S, v19.4S, v13.4S // ....................................................................*......................................................... - sub v6.4S, v0.4S, v31.4S // ........................................................................*..................................................... - add v10.4S, v7.4S, v20.4S // ......................................................................................*....................................... - add v1.4S, v2.4S, v30.4S // ....................................................................................*......................................... - // gap // .............................................................................................................................. - // gap // .............................................................................................................................. - // gap // .............................................................................................................................. - // gap // .............................................................................................................................. - mul v24.4S, v22.4S, v29.S[2] // ........................................................................................*..................................... - sqrdmulh v26.4S, v22.4S, v29.S[3] // .........................................................................................*.................................... - sqrdmulh v22.4S, v18.4S, v14.S[1] // ..........................................................................................*................................... 
- mul v15.4S, v18.4S, v14.S[0] // ...........................................................................................*.................................. - // gap // .............................................................................................................................. - // gap // .............................................................................................................................. - // gap // .............................................................................................................................. - // gap // .............................................................................................................................. - mul v12.4S, v11.4S, v29.S[0] // ..........................................................................*................................................... - sqrdmulh v23.4S, v11.4S, v29.S[1] // ...........................................................................*.................................................. - sqrdmulh v11.4S, v6.4S, v3.S[3] // ................................................................................*............................................. - mul v6.4S, v6.4S, v3.S[2] // .................................................................................*............................................ - // gap // .............................................................................................................................. - // gap // .............................................................................................................................. - // gap // .............................................................................................................................. - // gap // .............................................................................................................................. 
- add v5.4S, v19.4S, v13.4S // .....................................................................*........................................................ - add v31.4S, v0.4S, v31.4S // .......................................................................*...................................................... - add v13.4S, v1.4S, v10.4S // ..............................................................................................*............................... - // gap // .............................................................................................................................. - // gap // .............................................................................................................................. - // gap // .............................................................................................................................. - // gap // .............................................................................................................................. - // gap // .............................................................................................................................. - mls v24.4S, v26.4S, v8.S[0] // .................................................................................................*............................ - // gap // .............................................................................................................................. - // gap // .............................................................................................................................. - // gap // .............................................................................................................................. - // gap // .............................................................................................................................. 
- // gap // .............................................................................................................................. - // gap // .............................................................................................................................. - // gap // .............................................................................................................................. - mls v6.4S, v11.4S, v8.S[0] // ............................................................................................*................................. - mls v12.4S, v23.4S, v8.S[0] // .......................................................................................*...................................... - mls v15.4S, v22.4S, v8.S[0] // ....................................................................................................*......................... - add v27.4S, v31.4S, v5.4S // ...............................................................................................*.............................. - // gap // .............................................................................................................................. - // gap // .............................................................................................................................. - // gap // .............................................................................................................................. - // gap // .............................................................................................................................. - sub v2.4S, v31.4S, v5.4S // ...................................................................................*.......................................... - sub v10.4S, v1.4S, v10.4S // ........................................................................................................*..................... 
- // gap // .............................................................................................................................. - // gap // .............................................................................................................................. - // gap // .............................................................................................................................. - // gap // .............................................................................................................................. - // gap // .............................................................................................................................. - // gap // .............................................................................................................................. - sub v18.4S, v27.4S, v13.4S // ..................................................................................................*........................... - // gap // .............................................................................................................................. - // gap // .............................................................................................................................. - // gap // .............................................................................................................................. - // gap // .............................................................................................................................. - // gap // .............................................................................................................................. - // gap // .............................................................................................................................. - // gap // .............................................................................................................................. 
- add v7.4S, v6.4S, v12.4S // ......................................................................................................*....................... - sub v30.4S, v6.4S, v12.4S // .....................................................................................................*........................ - add v28.4S, v24.4S, v15.4S // .............................................................................................................*................ - sqrdmulh v16.4S, v10.4S, v3.S[1] // .................................................................................................................*............ - // gap // .............................................................................................................................. - // gap // .............................................................................................................................. - // gap // .............................................................................................................................. - // gap // .............................................................................................................................. - sqrdmulh v26.4S, v18.4S, v9.S[1] // .........................................................................................................*.................... - mul v31.4S, v18.4S, v9.S[0] // ..........................................................................................................*................... - sub v18.4S, v24.4S, v15.4S // ............................................................................................................*................. - mul v4.4S, v10.4S, v3.S[0] // ................................................................................................................*............. - // gap // .............................................................................................................................. 
- // gap // .............................................................................................................................. - // gap // .............................................................................................................................. - // gap // .............................................................................................................................. - sub v10.4S, v7.4S, v28.4S // ....................................................................................................................*......... - mul v22.4S, v30.4S, v9.S[2] // ..............................................................................................................*............... - mul v19.4S, v2.4S, v9.S[2] // ................................................................................................*............................. - // gap // .............................................................................................................................. - // gap // .............................................................................................................................. - // gap // .............................................................................................................................. - // gap // .............................................................................................................................. - // gap // .............................................................................................................................. - sqrdmulh v0.4S, v18.4S, v3.S[1] // ..................................................................................................................*........... - mul v3.4S, v18.4S, v3.S[0] // ...................................................................................................................*.......... 
- sqrdmulh v18.4S, v30.4S, v9.S[3] // ...............................................................................................................*.............. - // gap // .............................................................................................................................. - // gap // .............................................................................................................................. - // gap // .............................................................................................................................. - // gap // .............................................................................................................................. - // gap // .............................................................................................................................. - mls v31.4S, v26.4S, v8.S[0] // .......................................................................................................................*...... - sqrdmulh v26.4S, v2.4S, v9.S[3] // .............................................................................................*................................ - mul v24.4S, v10.4S, v9.S[0] // ........................................................................................................................*..... - sqrdmulh v10.4S, v10.4S, v9.S[1] // .........................................................................................................................*.... - // gap // .............................................................................................................................. - // gap // .............................................................................................................................. - // gap // .............................................................................................................................. 
- // gap // .............................................................................................................................. - add v2.4S, v27.4S, v13.4S // ...................................................................................................*.......................... - mls v4.4S, v16.4S, v8.S[0] // ...........................................................................................................................*.. - // gap // .............................................................................................................................. - // gap // .............................................................................................................................. - // gap // .............................................................................................................................. - // gap // .............................................................................................................................. - // gap // .............................................................................................................................. - // gap // .............................................................................................................................. - mls v22.4S, v18.4S, v8.S[0] // ......................................................................................................................*....... - mls v3.4S, v0.4S, v8.S[0] // ..........................................................................................................................*... - // gap // .............................................................................................................................. - // gap // .............................................................................................................................. 
- // gap // .............................................................................................................................. - // gap // .............................................................................................................................. - // gap // .............................................................................................................................. - // gap // .............................................................................................................................. - mls v19.4S, v26.4S, v8.S[0] // .......................................................................................................*...................... - add v7.4S, v7.4S, v28.4S // .....................................................................................................................*........ - str q31, [x2], #(16*4) // ............................................................................................................................*. - mls v24.4S, v10.4S, v8.S[0] // .............................................................................................................................* - str q2, [x1], #(16*4) // ...........................................................................................................*.................. - // gap // .............................................................................................................................. - // gap // .............................................................................................................................. - // gap // .............................................................................................................................. + ld4 {v19.4S, v20.4S, v21.4S, v22.4S}, [x1] // ..*........................................................................................................................................... 
+ ld4 {v9.4S, v10.4S, v11.4S, v12.4S}, [x2] // ...........*.................................................................................................................................. + ldr q29, [x5, #32] // .*............................................................................................................................................ + ldr q5, [x5, #80] // ...*.......................................................................................................................................... + ldr q0, [x5], #(12*16) // *............................................................................................................................................. + // gap // .............................................................................................................................................. + // gap // .............................................................................................................................................. + // gap // .............................................................................................................................................. + ldr q2, [x5, #-128] // ....*......................................................................................................................................... + ldr q15, [x5, #-48] // ......*....................................................................................................................................... + ldr q13, [x4], #64 // .....*........................................................................................................................................ + // gap // .............................................................................................................................................. + // gap // .............................................................................................................................................. 
+ // gap // .............................................................................................................................................. + // gap // .............................................................................................................................................. + // gap // .............................................................................................................................................. + ldr q18, [x5, #-16] // .......*...................................................................................................................................... + ldr q27, [x5, #-64] // ........*..................................................................................................................................... + ldr q17, [x5, #-176] // .........*.................................................................................................................................... + // gap // .............................................................................................................................................. + // gap // .............................................................................................................................................. + // gap // .............................................................................................................................................. + // gap // .............................................................................................................................................. + // gap // .............................................................................................................................................. + ldr q31, [x5, #-144] // ..........*................................................................................................................................... 
+ ldr q25, [x5, #-32] // ....................*......................................................................................................................... + ldr q28, [x5, #-96] // ......................*....................................................................................................................... + // gap // .............................................................................................................................................. + // gap // .............................................................................................................................................. + // gap // .............................................................................................................................................. + // gap // .............................................................................................................................................. + // gap // .............................................................................................................................................. + ldr q3, [x5, #-80] // ..............................*............................................................................................................... + ldr q1, [x4, #-16] // ...................................................*.......................................................................................... + ldr q4, [x4, #-48] // ...................................*.......................................................................................................... + // gap // .............................................................................................................................................. + // gap // .............................................................................................................................................. 
+ // gap // .............................................................................................................................................. + // gap // .............................................................................................................................................. + // gap // .............................................................................................................................................. + ldr q16, [x4, #-32] // ...................................................................*.......................................................................... + // gap // .............................................................................................................................................. + // gap // .............................................................................................................................................. + // gap // .............................................................................................................................................. + // gap // .............................................................................................................................................. + // gap // .............................................................................................................................................. + // gap // .............................................................................................................................................. + // gap // .............................................................................................................................................. + sub v6.4S, v19.4S, v20.4S // ..............*............................................................................................................................... 
+ add v19.4S, v19.4S, v20.4S // ............*................................................................................................................................. + sub v20.4S, v21.4S, v22.4S // .............*................................................................................................................................ + add v22.4S, v21.4S, v22.4S // ...............*.............................................................................................................................. + // gap // .............................................................................................................................................. + // gap // .............................................................................................................................................. + // gap // .............................................................................................................................................. + // gap // .............................................................................................................................................. + sub v23.4S, v9.4S, v10.4S // ........................*..................................................................................................................... + add v9.4S, v9.4S, v10.4S // .......................*...................................................................................................................... + sub v21.4S, v11.4S, v12.4S // .........................*.................................................................................................................... + add v12.4S, v11.4S, v12.4S // ..........................*................................................................................................................... + // gap // .............................................................................................................................................. 
+ // gap // .............................................................................................................................................. + // gap // .............................................................................................................................................. + // gap // .............................................................................................................................................. + mul v29.4S, v6.4S, v29.4S // ................*............................................................................................................................. + sqrdmulh v5.4S, v20.4S, v5.4S // ..................*........................................................................................................................... + mul v2.4S, v20.4S, v2.4S // .................*............................................................................................................................ + sqrdmulh v31.4S, v6.4S, v31.4S // ...................*.......................................................................................................................... + // gap // .............................................................................................................................................. + // gap // .............................................................................................................................................. + // gap // .............................................................................................................................................. + // gap // .............................................................................................................................................. + sqrdmulh v15.4S, v23.4S, v15.4S // ...............................*.............................................................................................................. 
+ sqrdmulh v18.4S, v21.4S, v18.4S // .................................*............................................................................................................ + mul v27.4S, v23.4S, v27.4S // ..................................*........................................................................................................... + mul v25.4S, v21.4S, v25.4S // ................................*............................................................................................................. + // gap // .............................................................................................................................................. + // gap // .............................................................................................................................................. + // gap // .............................................................................................................................................. + // gap // .............................................................................................................................................. + sub v20.4S, v19.4S, v22.4S // .....................*........................................................................................................................ + add v19.4S, v19.4S, v22.4S // ....................................*......................................................................................................... + sub v22.4S, v9.4S, v12.4S // ......................................*....................................................................................................... + add v9.4S, v9.4S, v12.4S // .................................................*............................................................................................ 
+ // gap // .............................................................................................................................................. + // gap // .............................................................................................................................................. + // gap // .............................................................................................................................................. + // gap // .............................................................................................................................................. + mls v29.4S, v31.4S, v8.S[0] // ...........................*.................................................................................................................. + mls v2.4S, v5.4S, v8.S[0] // ............................*................................................................................................................. + // gap // .............................................................................................................................................. + // gap // .............................................................................................................................................. + // gap // .............................................................................................................................................. + // gap // .............................................................................................................................................. + // gap // .............................................................................................................................................. + // gap // .............................................................................................................................................. 
+ mls v27.4S, v15.4S, v8.S[0] // ..........................................*................................................................................................... + mls v25.4S, v18.4S, v8.S[0] // ...........................................*.................................................................................................. + mul v5.4S, v20.4S, v0.4S // .............................*................................................................................................................ + sqrdmulh v12.4S, v20.4S, v17.4S // .....................................*........................................................................................................ + // gap // .............................................................................................................................................. + // gap // .............................................................................................................................................. + // gap // .............................................................................................................................................. + // gap // .............................................................................................................................................. + sqrdmulh v15.4S, v22.4S, v3.4S // .........................................*.................................................................................................... + mul v18.4S, v22.4S, v28.4S // ............................................*................................................................................................. + // gap // .............................................................................................................................................. + // gap // .............................................................................................................................................. 
+ // gap // .............................................................................................................................................. + // gap // .............................................................................................................................................. + // gap // .............................................................................................................................................. + // gap // .............................................................................................................................................. + sub v31.4S, v29.4S, v2.4S // ........................................*..................................................................................................... + add v29.4S, v29.4S, v2.4S // .......................................*...................................................................................................... + // gap // .............................................................................................................................................. + // gap // .............................................................................................................................................. + // gap // .............................................................................................................................................. + // gap // .............................................................................................................................................. + // gap // .............................................................................................................................................. + // gap // .............................................................................................................................................. 
+ sub v2.4S, v27.4S, v25.4S // ....................................................*......................................................................................... + add v27.4S, v27.4S, v25.4S // ......................................................*....................................................................................... + mls v5.4S, v12.4S, v8.S[0] // ..................................................*........................................................................................... + // gap // .............................................................................................................................................. + // gap // .............................................................................................................................................. + // gap // .............................................................................................................................................. + // gap // .............................................................................................................................................. + // gap // .............................................................................................................................................. + mul v0.4S, v31.4S, v0.4S // ...............................................*.............................................................................................. + sqrdmulh v12.4S, v31.4S, v17.4S // ................................................*............................................................................................. + trn1 v17.4S, v19.4S, v29.4S // .............................................*................................................................................................ 
+ trn2 v29.4S, v19.4S, v29.4S // ..............................................*............................................................................................... + // gap // .............................................................................................................................................. + // gap // .............................................................................................................................................. + // gap // .............................................................................................................................................. + // gap // .............................................................................................................................................. + mul v19.4S, v2.4S, v28.4S // ..........................................................*................................................................................... + sqrdmulh v2.4S, v2.4S, v3.4S // ...........................................................*.................................................................................. + mls v18.4S, v15.4S, v8.S[0] // .....................................................*........................................................................................ + trn1 v15.4S, v9.4S, v27.4S // ........................................................*..................................................................................... + // gap // .............................................................................................................................................. + // gap // .............................................................................................................................................. + // gap // .............................................................................................................................................. 
+ // gap // .............................................................................................................................................. + trn2 v9.4S, v9.4S, v27.4S // .........................................................*.................................................................................... + // gap // .............................................................................................................................................. + // gap // .............................................................................................................................................. + // gap // .............................................................................................................................................. + // gap // .............................................................................................................................................. + // gap // .............................................................................................................................................. + // gap // .............................................................................................................................................. + // gap // .............................................................................................................................................. + mls v0.4S, v12.4S, v8.S[0] // .......................................................*...................................................................................... + // gap // .............................................................................................................................................. + // gap // .............................................................................................................................................. 
+ // gap // .............................................................................................................................................. + // gap // .............................................................................................................................................. + // gap // .............................................................................................................................................. + // gap // .............................................................................................................................................. + // gap // .............................................................................................................................................. + mls v19.4S, v2.4S, v8.S[0] // ..............................................................*............................................................................... + // gap // .............................................................................................................................................. + // gap // .............................................................................................................................................. + // gap // .............................................................................................................................................. + // gap // .............................................................................................................................................. + // gap // .............................................................................................................................................. + // gap // .............................................................................................................................................. 
+ // gap // .............................................................................................................................................. + // gap // .............................................................................................................................................. + // gap // .............................................................................................................................................. + // gap // .............................................................................................................................................. + // gap // .............................................................................................................................................. + // gap // .............................................................................................................................................. + // gap // .............................................................................................................................................. + // gap // .............................................................................................................................................. + // gap // .............................................................................................................................................. + trn1 v2.4S, v5.4S, v0.4S // ............................................................*................................................................................. + trn2 v5.4S, v5.4S, v0.4S // .............................................................*................................................................................ + // gap // .............................................................................................................................................. 
+ // gap // .............................................................................................................................................. + // gap // .............................................................................................................................................. + // gap // .............................................................................................................................................. + // gap // .............................................................................................................................................. + // gap // .............................................................................................................................................. + trn1 v0.4S, v18.4S, v19.4S // .....................................................................*........................................................................ + trn2 v19.4S, v18.4S, v19.4S // ......................................................................*....................................................................... + // gap // .............................................................................................................................................. + // gap // .............................................................................................................................................. + // gap // .............................................................................................................................................. + // gap // .............................................................................................................................................. + // gap // .............................................................................................................................................. 
+ // gap // .............................................................................................................................................. + trn1 v12.2D, v17.2D, v2.2D // ...............................................................*.............................................................................. + trn2 v2.2D, v17.2D, v2.2D // .................................................................*............................................................................ + trn1 v18.2D, v29.2D, v5.2D // ................................................................*............................................................................. + trn2 v29.2D, v29.2D, v5.2D // ..................................................................*........................................................................... + // gap // .............................................................................................................................................. + // gap // .............................................................................................................................................. + // gap // .............................................................................................................................................. + // gap // .............................................................................................................................................. + trn2 v5.2D, v15.2D, v0.2D // ........................................................................*..................................................................... + trn1 v0.2D, v15.2D, v0.2D // .........................................................................*.................................................................... + trn2 v15.2D, v9.2D, v19.2D // ...........................................................................*.................................................................. 
+ trn1 v19.2D, v9.2D, v19.2D // ..........................................................................*................................................................... + // gap // .............................................................................................................................................. + // gap // .............................................................................................................................................. + // gap // .............................................................................................................................................. + // gap // .............................................................................................................................................. + sub v9.4S, v12.4S, v18.4S // ....................................................................*......................................................................... + add v12.4S, v12.4S, v18.4S // .............................................................................*................................................................ + sub v18.4S, v2.4S, v29.4S // .......................................................................*...................................................................... + add v29.4S, v2.4S, v29.4S // ...............................................................................*.............................................................. + // gap // .............................................................................................................................................. + // gap // .............................................................................................................................................. + // gap // .............................................................................................................................................. 
+ // gap // .............................................................................................................................................. + sub v2.4S, v5.4S, v15.4S // .................................................................................*............................................................ + add v5.4S, v5.4S, v15.4S // ....................................................................................*......................................................... + sub v15.4S, v0.4S, v19.4S // ................................................................................*............................................................. + add v19.4S, v0.4S, v19.4S // ..................................................................................*........................................................... + // gap // .............................................................................................................................................. + // gap // .............................................................................................................................................. + // gap // .............................................................................................................................................. + // gap // .............................................................................................................................................. + sqrdmulh v0.4S, v9.4S, v4.S[3] // ............................................................................*................................................................. + mul v9.4S, v9.4S, v4.S[2] // ...................................................................................*.......................................................... + mul v27.4S, v18.4S, v16.S[0] // ..............................................................................*............................................................... 
+ sqrdmulh v18.4S, v18.4S, v16.S[1] // .....................................................................................*........................................................ + // gap // .............................................................................................................................................. + // gap // .............................................................................................................................................. + // gap // .............................................................................................................................................. + // gap // .............................................................................................................................................. + mul v17.4S, v2.4S, v1.S[0] // ........................................................................................*..................................................... + sqrdmulh v2.4S, v2.4S, v1.S[1] // .........................................................................................*.................................................... + sqrdmulh v31.4S, v15.4S, v16.S[3] // ......................................................................................*....................................................... + mul v15.4S, v15.4S, v16.S[2] // .......................................................................................*...................................................... + // gap // .............................................................................................................................................. + // gap // .............................................................................................................................................. + // gap // .............................................................................................................................................. 
+ // gap // .............................................................................................................................................. + sub v25.4S, v12.4S, v29.4S // ...............................................................................................*.............................................. + add v29.4S, v12.4S, v29.4S // ..........................................................................................*................................................... + sub v12.4S, v19.4S, v5.4S // ............................................................................................*................................................. + add v19.4S, v19.4S, v5.4S // ...........................................................................................*.................................................. + // gap // .............................................................................................................................................. + // gap // .............................................................................................................................................. + // gap // .............................................................................................................................................. + // gap // .............................................................................................................................................. + mls v9.4S, v0.4S, v8.S[0] // .............................................................................................*................................................ + mls v27.4S, v18.4S, v8.S[0] // ..............................................................................................*............................................... + // gap // .............................................................................................................................................. 
+ // gap // .............................................................................................................................................. + // gap // .............................................................................................................................................. + // gap // .............................................................................................................................................. + // gap // .............................................................................................................................................. + // gap // .............................................................................................................................................. + mls v17.4S, v2.4S, v8.S[0] // ...................................................................................................*.......................................... + mls v15.4S, v31.4S, v8.S[0] // ..................................................................................................*........................................... + srshr v5.4S, v29.4S, #23 // .................................................................................................*............................................ + srshr v0.4S, v19.4S, #23 // ................................................................................................*............................................. + // gap // .............................................................................................................................................. + // gap // .............................................................................................................................................. + // gap // .............................................................................................................................................. 
+ // gap // .............................................................................................................................................. + mul v2.4S, v25.4S, v13.S[2] // .....................................................................................................*........................................ + sqrdmulh v18.4S, v25.4S, v13.S[3] // .............................................................................................................*................................ + sqrdmulh v31.4S, v12.4S, v4.S[1] // ....................................................................................................*......................................... + mul v12.4S, v12.4S, v4.S[0] // ........................................................................................................*..................................... + // gap // .............................................................................................................................................. + // gap // .............................................................................................................................................. + // gap // .............................................................................................................................................. + // gap // .............................................................................................................................................. + sub v25.4S, v9.4S, v27.4S // ......................................................................................................*....................................... + add v9.4S, v9.4S, v27.4S // .......................................................................................................*...................................... + // gap // .............................................................................................................................................. 
+ // gap // .............................................................................................................................................. + // gap // .............................................................................................................................................. + // gap // .............................................................................................................................................. + // gap // .............................................................................................................................................. + // gap // .............................................................................................................................................. + mls v29.4S, v5.4S, v8.4S // ..........................................................................................................*................................... + mls v19.4S, v0.4S, v8.4S // .........................................................................................................*.................................... + sub v5.4S, v15.4S, v17.4S // ............................................................................................................*................................. + add v0.4S, v15.4S, v17.4S // ...........................................................................................................*.................................. + // gap // .............................................................................................................................................. + // gap // .............................................................................................................................................. + // gap // .............................................................................................................................................. 
+ // gap // .............................................................................................................................................. + mls v12.4S, v31.4S, v8.S[0] // .....................................................................................................................*........................ + mul v15.4S, v25.4S, v13.S[2] // ..............................................................................................................*............................... + sqrdmulh v27.4S, v25.4S, v13.S[3] // ...............................................................................................................*.............................. + srshr v17.4S, v9.4S, #23 // ................................................................................................................*............................. + // gap // .............................................................................................................................................. + // gap // .............................................................................................................................................. + // gap // .............................................................................................................................................. + // gap // .............................................................................................................................................. + sqrdmulh v31.4S, v5.4S, v4.S[1] // .................................................................................................................*............................ + mul v5.4S, v5.4S, v4.S[0] // ..................................................................................................................*........................... 
+ mls v2.4S, v18.4S, v8.S[0] // ......................................................................................................................*....................... + srshr v18.4S, v0.4S, #23 // ...................................................................................................................*.......................... + // gap // .............................................................................................................................................. + // gap // .............................................................................................................................................. + // gap // .............................................................................................................................................. + // gap // .............................................................................................................................................. + add v25.4S, v29.4S, v19.4S // ........................................................................................................................*..................... + sub v29.4S, v29.4S, v19.4S // ....................................................................................................................*......................... + // gap // .............................................................................................................................................. + // gap // .............................................................................................................................................. + // gap // .............................................................................................................................................. + // gap // .............................................................................................................................................. 
+ // gap // .............................................................................................................................................. + // gap // .............................................................................................................................................. + mls v9.4S, v17.4S, v8.4S // .......................................................................................................................*...................... + mls v15.4S, v27.4S, v8.S[0] // .........................................................................................................................*.................... + // gap // .............................................................................................................................................. + // gap // .............................................................................................................................................. + // gap // .............................................................................................................................................. + // gap // .............................................................................................................................................. + // gap // .............................................................................................................................................. + // gap // .............................................................................................................................................. + mls v0.4S, v18.4S, v8.4S // ..........................................................................................................................*................... + mls v5.4S, v31.4S, v8.S[0] // ............................................................................................................................*................. 
+ str q25, [x1], #(16*4) // ..............................................................................................................................*............... + mul v19.4S, v29.4S, v13.S[0] // ...........................................................................................................................*.................. + sqrdmulh v29.4S, v29.4S, v13.S[1] // .............................................................................................................................*................ + // gap // .............................................................................................................................................. + // gap // .............................................................................................................................................. + // gap // .............................................................................................................................................. + sub v18.4S, v2.4S, v12.4S // ...............................................................................................................................*.............. + add v26.4S, v2.4S, v12.4S // .......................................................................................................................................*...... + // gap // .............................................................................................................................................. + // gap // .............................................................................................................................................. + // gap // .............................................................................................................................................. + // gap // .............................................................................................................................................. 
+ // gap // .............................................................................................................................................. + // gap // .............................................................................................................................................. + // gap // .............................................................................................................................................. + // gap // .............................................................................................................................................. + // gap // .............................................................................................................................................. + // gap // .............................................................................................................................................. + // gap // .............................................................................................................................................. + // gap // .............................................................................................................................................. + // gap // .............................................................................................................................................. + // gap // .............................................................................................................................................. + sub v2.4S, v9.4S, v0.4S // ................................................................................................................................*............. + add v24.4S, v9.4S, v0.4S // ..................................................................................................................................*........... 
+ mls v19.4S, v29.4S, v8.S[0] // ...................................................................................................................................*.......... + sub v29.4S, v15.4S, v5.4S // .................................................................................................................................*............ + // gap // .............................................................................................................................................. + // gap // .............................................................................................................................................. + // gap // .............................................................................................................................................. + // gap // .............................................................................................................................................. + add v5.4S, v15.4S, v5.4S // ......................................................................................................................................*....... + mul v11.4S, v18.4S, v13.S[0] // ....................................................................................................................................*......... + sqrdmulh v10.4S, v18.4S, v13.S[1] // .....................................................................................................................................*........ + // gap // .............................................................................................................................................. + // gap // .............................................................................................................................................. + // gap // .............................................................................................................................................. 
+ // gap // .............................................................................................................................................. + // gap // .............................................................................................................................................. + mul v6.4S, v2.4S, v13.S[0] // ........................................................................................................................................*..... + sqrdmulh v23.4S, v2.4S, v13.S[1] // .........................................................................................................................................*.... + mul v12.4S, v29.4S, v13.S[0] // ..........................................................................................................................................*... + sqrdmulh v14.4S, v29.4S, v13.S[1] // ...........................................................................................................................................*.. + // gap // .............................................................................................................................................. + // gap // .............................................................................................................................................. + // gap // .............................................................................................................................................. + // gap // .............................................................................................................................................. + str q19, [x2], #(16*4) // ............................................................................................................................................*. 
+ str q5, [x1, #-16] // .............................................................................................................................................* + // gap // .............................................................................................................................................. + // gap // .............................................................................................................................................. + // gap // .............................................................................................................................................. + // gap // .............................................................................................................................................. + // gap // .............................................................................................................................................. + // gap // .............................................................................................................................................. // original source code - // ldr q14, [x4, #48] // ........................................................*..................................................................... - // ldr q31, [x4, #16] // ..........................*................................................................................................... - // ldr q13, [x5, #96] // .......*...................................................................................................................... - // ldr q11, [x5, #112] // ...........*.................................................................................................................. - // ldr q22, [x5, #144] // ......*....................................................................................................................... 
- // ldr q25, [x5, #176] // ..........*................................................................................................................... - // ldr q29, [x5, #32] // .....*........................................................................................................................ - // ld4 {v17.4S, v18.4S, v19.4S, v20.4S}, [x1] // ..*........................................................................................................................... - // ldr q5, [x5, #64] // ........*..................................................................................................................... - // ldr q0, [x5, #48] // .*............................................................................................................................ - // ldr q28, [x5, #16] // ...*.......................................................................................................................... - // ldr q9, [x4], #64 // ..........................................................*................................................................... - // ldr q26, [x5, #160] // .........*.................................................................................................................... - // ldr q6, [x5], #(12*16) // .............*................................................................................................................ - // ldr q16, [x5, #-112] // *............................................................................................................................. - // ldr q21, [x4, #-32] // .....................................................*........................................................................ - // ld4 {v1.4S, v2.4S, v3.4S, v4.4S}, [x2] // ....*......................................................................................................................... 
- // sub v24.4S, v17.4S, v18.4S // ...................*.......................................................................................................... - // add v15.4S, v17.4S, v18.4S // ................*............................................................................................................. - // sub v10.4S, v19.4S, v20.4S // ...............*.............................................................................................................. - // add v19.4S, v19.4S, v20.4S // .....................*........................................................................................................ - // ldr q27, [x5, #-64] // ............*................................................................................................................. - // sub v17.4S, v15.4S, v19.4S // ................................*............................................................................................. - // add v30.4S, v15.4S, v19.4S // ..............................................*............................................................................... - // mul v5.4S, v10.4S, v5.4S // .............................*................................................................................................ - // sqrdmulh v7.4S, v10.4S, v16.4S // ..............................*............................................................................................... - // mul v16.4S, v24.4S, v29.4S // ...........................*.................................................................................................. - // sqrdmulh v29.4S, v24.4S, v0.4S // ............................*................................................................................................. - // mul v23.4S, v17.4S, v6.4S // ........................................*..................................................................................... 
- // sqrdmulh v19.4S, v17.4S, v28.4S // .......................................*...................................................................................... - // mls v5.4S, v7.4S, v8.S[0] // ....................................*......................................................................................... - // sub v0.4S, v3.4S, v4.4S // ..............*............................................................................................................... - // sub v10.4S, v1.4S, v2.4S // .................*............................................................................................................ - // mls v16.4S, v29.4S, v8.S[0] // ...................................*.......................................................................................... - // add v3.4S, v3.4S, v4.4S // ..................*........................................................................................................... - // mls v23.4S, v19.4S, v8.S[0] // ...............................................*.............................................................................. - // add v20.4S, v1.4S, v2.4S // ....................*......................................................................................................... - // mul v1.4S, v0.4S, v26.4S // ......................*....................................................................................................... - // sqrdmulh v24.4S, v0.4S, v25.4S // .......................*...................................................................................................... - // sqrdmulh v0.4S, v10.4S, v22.4S // ........................*..................................................................................................... - // mul v4.4S, v10.4S, v27.4S // .........................*.................................................................................................... 
- // sub v18.4S, v20.4S, v3.4S // ...............................*.............................................................................................. - // add v12.4S, v20.4S, v3.4S // .........................................*.................................................................................... - // sub v17.4S, v16.4S, v5.4S // ............................................*................................................................................. - // add v7.4S, v16.4S, v5.4S // .............................................*................................................................................ - // mul v6.4S, v17.4S, v6.4S // ......................................................*....................................................................... - // sqrdmulh v27.4S, v17.4S, v28.4S // ....................................................*......................................................................... - // mls v4.4S, v0.4S, v8.S[0] // ..................................*........................................................................................... - // mls v1.4S, v24.4S, v8.S[0] // .................................*............................................................................................ - // trn1 v25.4S, v30.4S, v7.4S // ...................................................*.......................................................................... - // mul v5.4S, v18.4S, v13.4S // .....................................*........................................................................................ - // add v10.4S, v4.4S, v1.4S // .......................................................*...................................................................... - // sub v29.4S, v4.4S, v1.4S // ..........................................*................................................................................... 
- // mls v6.4S, v27.4S, v8.S[0] // ...........................................................*.................................................................. - // sqrdmulh v16.4S, v18.4S, v11.4S // ......................................*....................................................................................... - // trn2 v19.4S, v30.4S, v7.4S // ..................................................*........................................................................... - // trn1 v15.4S, v12.4S, v10.4S // ..............................................................*............................................................... - // trn2 v4.4S, v12.4S, v10.4S // ............................................................*................................................................. - // mul v27.4S, v29.4S, v13.4S // .................................................*............................................................................ - // sqrdmulh v11.4S, v29.4S, v11.4S // ................................................*............................................................................. - // trn2 v2.4S, v23.4S, v6.4S // ................................................................*............................................................. - // trn1 v24.4S, v23.4S, v6.4S // .................................................................*............................................................ - // mls v5.4S, v16.4S, v8.S[0] // ...........................................*.................................................................................. - // mls v27.4S, v11.4S, v8.S[0] // .........................................................*.................................................................... - // trn2 v20.2D, v25.2D, v24.2D // ........................................................................*..................................................... 
- // trn1 v18.2D, v25.2D, v24.2D // .........................................................................*.................................................... - // trn2 v0.2D, v19.2D, v2.2D // ......................................................................*....................................................... - // trn1 v13.2D, v19.2D, v2.2D // .......................................................................*...................................................... - // sub v24.4S, v20.4S, v0.4S // ............................................................................*................................................. - // add v29.4S, v20.4S, v0.4S // ........................................................................................*..................................... - // trn2 v2.4S, v5.4S, v27.4S // .............................................................*................................................................ - // add v1.4S, v18.4S, v13.4S // .........................................................................................*.................................... - // sub v18.4S, v18.4S, v13.4S // .............................................................................*................................................ - // trn1 v3.4S, v5.4S, v27.4S // ...............................................................*.............................................................. - // mul v17.4S, v24.4S, v21.S[0] // ....................................................................................*......................................... - // sqrdmulh v22.4S, v24.4S, v21.S[1] // .....................................................................................*........................................ - // trn2 v26.2D, v4.2D, v2.2D // ..................................................................*........................................................... 
- // trn1 v19.2D, v4.2D, v2.2D // ...................................................................*.......................................................... - // trn2 v25.2D, v15.2D, v3.2D // ....................................................................*......................................................... - // trn1 v30.2D, v15.2D, v3.2D // .....................................................................*........................................................ - // sqrdmulh v24.4S, v18.4S, v31.S[3] // ......................................................................................*....................................... - // mul v16.4S, v18.4S, v31.S[2] // .......................................................................................*...................................... - // sub v10.4S, v30.4S, v19.4S // ..........................................................................*................................................... - // sub v3.4S, v1.4S, v29.4S // ................................................................................................*............................. - // add v27.4S, v30.4S, v19.4S // ...............................................................................*.............................................. - // sub v2.4S, v25.4S, v26.4S // ...........................................................................*.................................................. - // add v0.4S, v25.4S, v26.4S // ..............................................................................*............................................... - // mls v17.4S, v22.4S, v8.S[0] // .............................................................................................*................................ - // mul v20.4S, v10.4S, v21.S[2] // ................................................................................*............................................. 
- // sqrdmulh v28.4S, v10.4S, v21.S[3] // .................................................................................*............................................ - // sqrdmulh v11.4S, v2.4S, v14.S[1] // ..................................................................................*........................................... - // mul v7.4S, v2.4S, v14.S[0] // ...................................................................................*.......................................... - // mls v16.4S, v24.4S, v8.S[0] // ............................................................................................*................................. - // sqrdmulh v10.4S, v3.4S, v9.S[3] // ..................................................................................................................*........... - // add v2.4S, v27.4S, v0.4S // ..........................................................................................*................................... - // add v14.4S, v1.4S, v29.4S // ...............................................................................................*.............................. - // mul v19.4S, v3.4S, v9.S[2] // .............................................................................................................*................ - // mls v20.4S, v28.4S, v8.S[0] // ...........................................................................................*.................................. - // sub v29.4S, v14.4S, v2.4S // ..................................................................................................*........................... - // add v3.4S, v14.4S, v2.4S // .....................................................................................................................*........ - // mls v7.4S, v11.4S, v8.S[0] // ..............................................................................................*............................... 
- // sub v14.4S, v16.4S, v17.4S // ....................................................................................................*......................... - // add v21.4S, v16.4S, v17.4S // ...................................................................................................*.......................... - // mls v19.4S, v10.4S, v8.S[0] // .........................................................................................................................*.... - // sub v10.4S, v27.4S, v0.4S // .................................................................................................*............................ - // sqrdmulh v2.4S, v29.4S, v9.S[1] // .......................................................................................................*...................... - // mul v5.4S, v29.4S, v9.S[0] // ........................................................................................................*..................... - // str q3, [x1], #(16*4) // .............................................................................................................................* - // sub v1.4S, v20.4S, v7.4S // .........................................................................................................*.................... - // add v18.4S, v20.4S, v7.4S // .....................................................................................................*........................ - // mul v22.4S, v14.4S, v9.S[2] // ............................................................................................................*................. - // sqrdmulh v25.4S, v14.4S, v9.S[3] // ................................................................................................................*............. - // mul v4.4S, v10.4S, v31.S[0] // ..........................................................................................................*................... 
- // sqrdmulh v29.4S, v10.4S, v31.S[1] // ......................................................................................................*....................... - // sqrdmulh v28.4S, v1.4S, v31.S[1] // ..............................................................................................................*............... - // mul v3.4S, v1.4S, v31.S[0] // ...............................................................................................................*.............. - // sub v0.4S, v21.4S, v18.4S // ...........................................................................................................*.................. - // add v7.4S, v21.4S, v18.4S // ..........................................................................................................................*... - // mls v22.4S, v25.4S, v8.S[0] // .......................................................................................................................*...... - // mls v5.4S, v2.4S, v8.S[0] // .................................................................................................................*............ - // mul v24.4S, v0.4S, v9.S[0] // ...................................................................................................................*.......... - // sqrdmulh v10.4S, v0.4S, v9.S[1] // ....................................................................................................................*......... - // mls v3.4S, v28.4S, v8.S[0] // ........................................................................................................................*..... - // mls v4.4S, v29.4S, v8.S[0] // ......................................................................................................................*....... - // str q5, [x2], #(16*4) // ...........................................................................................................................*.. 
- // mls v24.4S, v10.4S, v8.S[0] // ............................................................................................................................*. + // ldr q9, [x5], #(12*16) // ....*......................................................................................................................................... + // ldr q25, [x5, #-160] // ..*........................................................................................................................................... + // ld4 {v18.4S, v19.4S, v20.4S, v21.4S}, [x1] // *............................................................................................................................................. + // ldr q26, [x5, #-112] // ...*.......................................................................................................................................... + // ldr q0, [x5, #-128] // .....*........................................................................................................................................ + // ldr q31, [x4], #64 // .......*...................................................................................................................................... + // ldr q23, [x5, #-48] // ......*....................................................................................................................................... + // ldr q7, [x5, #-16] // ........*..................................................................................................................................... + // ldr q2, [x5, #-64] // .........*.................................................................................................................................... + // ldr q6, [x5, #-176] // ..........*................................................................................................................................... 
+ // ldr q1, [x5, #-144] // ...........*.................................................................................................................................. + // ld4 {v14.4S, v15.4S, v16.4S, v17.4S}, [x2] // .*............................................................................................................................................ + // add v4.4S, v18.4S, v19.4S // ...................*.......................................................................................................................... + // sub v24.4S, v20.4S, v21.4S // ....................*......................................................................................................................... + // sub v11.4S, v18.4S, v19.4S // ..................*........................................................................................................................... + // add v12.4S, v20.4S, v21.4S // .....................*........................................................................................................................ + // mul v27.4S, v11.4S, v25.4S // ..........................*................................................................................................................... + // mul v22.4S, v24.4S, v0.4S // ............................*................................................................................................................. + // sqrdmulh v5.4S, v24.4S, v26.4S // ...........................*.................................................................................................................. + // sqrdmulh v13.4S, v11.4S, v1.4S // .............................*................................................................................................................ + // ldr q26, [x5, #-32] // ............*................................................................................................................................. 
+ // sub v0.4S, v4.4S, v12.4S // ..................................*........................................................................................................... + // ldr q25, [x5, #-96] // .............*................................................................................................................................ + // add v20.4S, v14.4S, v15.4S // .......................*...................................................................................................................... + // sub v3.4S, v14.4S, v15.4S // ......................*....................................................................................................................... + // sub v10.4S, v16.4S, v17.4S // ........................*..................................................................................................................... + // add v24.4S, v16.4S, v17.4S // .........................*.................................................................................................................... + // mls v27.4S, v13.4S, v8.S[0] // ......................................*....................................................................................................... + // mls v22.4S, v5.4S, v8.S[0] // .......................................*...................................................................................................... + // mul v21.4S, v0.4S, v9.4S // ..........................................*................................................................................................... + // ldr q16, [x5, #-80] // ..............*............................................................................................................................... + // sqrdmulh v30.4S, v3.4S, v23.4S // ..............................*............................................................................................................... 
+ // mul v1.4S, v10.4S, v26.4S // .................................*............................................................................................................ + // sqrdmulh v23.4S, v10.4S, v7.4S // ...............................*.............................................................................................................. + // mul v26.4S, v3.4S, v2.4S // ................................*............................................................................................................. + // ldr q3, [x4, #-48] // ................*............................................................................................................................. + // add v2.4S, v4.4S, v12.4S // ...................................*.......................................................................................................... + // sqrdmulh v19.4S, v0.4S, v6.4S // ...........................................*.................................................................................................. + // sub v14.4S, v20.4S, v24.4S // ....................................*......................................................................................................... + // add v10.4S, v27.4S, v22.4S // ...............................................*.............................................................................................. + // sub v7.4S, v27.4S, v22.4S // ..............................................*............................................................................................... + // sqrdmulh v13.4S, v14.4S, v16.4S // ............................................*................................................................................................. + // mls v26.4S, v30.4S, v8.S[0] // ........................................*..................................................................................................... 
+ // mls v1.4S, v23.4S, v8.S[0] // .........................................*.................................................................................................... + // mul v5.4S, v14.4S, v25.4S // .............................................*................................................................................................ + // trn1 v11.4S, v2.4S, v10.4S // .....................................................*........................................................................................ + // trn2 v23.4S, v2.4S, v10.4S // ......................................................*....................................................................................... + // mul v29.4S, v7.4S, v9.4S // ...................................................*.......................................................................................... + // sqrdmulh v10.4S, v7.4S, v6.4S // ....................................................*......................................................................................... + // add v30.4S, v20.4S, v24.4S // .....................................*........................................................................................................ + // mls v21.4S, v19.4S, v8.S[0] // ..................................................*........................................................................................... + // ldr q24, [x4, #-16] // ...............*.............................................................................................................................. + // sub v28.4S, v26.4S, v1.4S // ................................................*............................................................................................. + // mls v5.4S, v13.4S, v8.S[0] // .........................................................*.................................................................................... 
+ // add v14.4S, v26.4S, v1.4S // .................................................*............................................................................................ + // mls v29.4S, v10.4S, v8.S[0] // ............................................................*................................................................................. + // trn1 v6.4S, v30.4S, v14.4S // ..........................................................*................................................................................... + // trn2 v30.4S, v30.4S, v14.4S // ...........................................................*.................................................................................. + // mul v14.4S, v28.4S, v25.4S // .......................................................*...................................................................................... + // sqrdmulh v1.4S, v28.4S, v16.4S // ........................................................*..................................................................................... + // trn1 v19.4S, v21.4S, v29.4S // ..............................................................*............................................................................... + // trn2 v26.4S, v21.4S, v29.4S // ...............................................................*.............................................................................. + // mls v14.4S, v1.4S, v8.S[0] // .............................................................*................................................................................ + // trn1 v16.2D, v11.2D, v19.2D // ..................................................................*........................................................................... + // trn1 v20.2D, v23.2D, v26.2D // ....................................................................*......................................................................... 
+ // trn2 v4.2D, v11.2D, v19.2D // ...................................................................*.......................................................................... + // trn2 v0.2D, v23.2D, v26.2D // .....................................................................*........................................................................ + // ldr q29, [x4, #-32] // .................*............................................................................................................................ + // sub v21.4S, v16.4S, v20.4S // ..........................................................................*................................................................... + // trn1 v10.4S, v5.4S, v14.4S // ................................................................*............................................................................. + // trn2 v14.4S, v5.4S, v14.4S // .................................................................*............................................................................ + // sub v7.4S, v4.4S, v0.4S // ............................................................................*................................................................. + // trn2 v23.2D, v6.2D, v10.2D // ......................................................................*....................................................................... + // trn1 v17.2D, v6.2D, v10.2D // .......................................................................*...................................................................... + // trn1 v10.2D, v30.2D, v14.2D // .........................................................................*.................................................................... + // trn2 v11.2D, v30.2D, v14.2D // ........................................................................*..................................................................... 
+ // sqrdmulh v1.4S, v21.4S, v3.S[3] // ..................................................................................*........................................................... + // add v30.4S, v16.4S, v20.4S // ...........................................................................*.................................................................. + // mul v20.4S, v7.4S, v29.S[0] // ....................................................................................*......................................................... + // add v16.4S, v4.4S, v0.4S // .............................................................................*................................................................ + // sub v26.4S, v17.4S, v10.4S // ................................................................................*............................................................. + // sub v19.4S, v23.4S, v11.4S // ..............................................................................*............................................................... + // add v28.4S, v17.4S, v10.4S // .................................................................................*............................................................ + // mul v4.4S, v21.4S, v3.S[2] // ...................................................................................*.......................................................... + // add v14.4S, v23.4S, v11.4S // ...............................................................................*.............................................................. + // sqrdmulh v6.4S, v7.4S, v29.S[1] // .....................................................................................*........................................................ + // sqrdmulh v21.4S, v26.4S, v29.S[3] // ........................................................................................*..................................................... 
+ // mul v11.4S, v26.4S, v29.S[2] // .........................................................................................*.................................................... + // mul v26.4S, v19.4S, v24.S[0] // ......................................................................................*....................................................... + // sqrdmulh v24.4S, v19.4S, v24.S[1] // .......................................................................................*...................................................... + // add v7.4S, v30.4S, v16.4S // ...........................................................................................*.................................................. + // add v23.4S, v28.4S, v14.4S // .............................................................................................*................................................ + // sub v10.4S, v28.4S, v14.4S // ............................................................................................*................................................. + // mls v4.4S, v1.4S, v8.S[0] // ..............................................................................................*............................................... + // mls v20.4S, v6.4S, v8.S[0] // ...............................................................................................*.............................................. + // sub v19.4S, v30.4S, v16.4S // ..........................................................................................*................................................... + // srshr v14.4S, v23.4S, #23 // ...................................................................................................*.......................................... + // srshr v30.4S, v7.4S, #23 // ..................................................................................................*........................................... 
+ // mls v11.4S, v21.4S, v8.S[0] // .................................................................................................*............................................ + // mls v26.4S, v24.4S, v8.S[0] // ................................................................................................*............................................. + // sqrdmulh v5.4S, v10.4S, v3.S[1] // ......................................................................................................*....................................... + // mul v16.4S, v19.4S, v31.S[2] // ....................................................................................................*......................................... + // sub v9.4S, v4.4S, v20.4S // ........................................................................................................*..................................... + // add v1.4S, v4.4S, v20.4S // .........................................................................................................*.................................... + // mul v20.4S, v10.4S, v3.S[0] // .......................................................................................................*...................................... + // mls v23.4S, v14.4S, v8.4S // ...........................................................................................................*.................................. + // mls v7.4S, v30.4S, v8.4S // ..........................................................................................................*................................... + // add v21.4S, v11.4S, v26.4S // .............................................................................................................*................................ + // sub v22.4S, v11.4S, v26.4S // ............................................................................................................*................................. 
+ // sqrdmulh v14.4S, v19.4S, v31.S[3] // .....................................................................................................*........................................ + // mul v4.4S, v9.4S, v31.S[2] // ...............................................................................................................*.............................. + // sqrdmulh v26.4S, v9.4S, v31.S[3] // ................................................................................................................*............................. + // srshr v24.4S, v1.4S, #23 // .................................................................................................................*............................ + // sqrdmulh v30.4S, v22.4S, v3.S[1] // ..................................................................................................................*........................... + // mul v15.4S, v22.4S, v3.S[0] // ...................................................................................................................*.......................... + // srshr v9.4S, v21.4S, #23 // .....................................................................................................................*........................ + // sub v10.4S, v7.4S, v23.4S // .......................................................................................................................*...................... + // mls v20.4S, v5.4S, v8.S[0] // ..............................................................................................................*............................... + // mls v16.4S, v14.4S, v8.S[0] // ....................................................................................................................*......................... + // mls v1.4S, v24.4S, v8.4S // ........................................................................................................................*..................... 
+ // add v14.4S, v7.4S, v23.4S // ......................................................................................................................*....................... + // mls v4.4S, v26.4S, v8.S[0] // .........................................................................................................................*.................... + // mls v21.4S, v9.4S, v8.4S // ..........................................................................................................................*................... + // mul v7.4S, v10.4S, v31.S[0] // .............................................................................................................................*................ + // mls v15.4S, v30.4S, v8.S[0] // ...........................................................................................................................*.................. + // sqrdmulh v26.4S, v10.4S, v31.S[1] // ..............................................................................................................................*............... + // str q14, [x1], #(16*4) // ............................................................................................................................*................. + // sub v10.4S, v16.4S, v20.4S // ...............................................................................................................................*.............. + // sub v30.4S, v1.4S, v21.4S // .................................................................................................................................*............ + // sub v14.4S, v4.4S, v15.4S // ....................................................................................................................................*......... + // add v24.4S, v1.4S, v21.4S // ..................................................................................................................................*........... 
+ // mls v7.4S, v26.4S, v8.S[0] // ...................................................................................................................................*.......... + // mul v11.4S, v10.4S, v31.S[0] // ......................................................................................................................................*....... + // sqrdmulh v10.4S, v10.4S, v31.S[1] // .......................................................................................................................................*...... + // add v2.4S, v4.4S, v15.4S // .....................................................................................................................................*........ + // add v26.4S, v16.4S, v20.4S // ................................................................................................................................*............. + // mul v6.4S, v30.4S, v31.S[0] // ........................................................................................................................................*..... + // sqrdmulh v23.4S, v30.4S, v31.S[1] // .........................................................................................................................................*.... + // mul v12.4S, v14.4S, v31.S[0] // ..........................................................................................................................................*... + // sqrdmulh v14.4S, v14.4S, v31.S[1] // ...........................................................................................................................................*.. + // str q7, [x2], #(16*4) // ............................................................................................................................................*. 
+ // str q2, [x1, #-16] // .............................................................................................................................................* sub count, count, #1 layer45678_start: - ldr q14, [x4, #48] // .........................................................................e...................................................................... - add v18.4S, v22.4S, v3.4S // ..................................................................................................................................*............. - add v16.4S, v19.4S, v4.4S // .............................................................................................................................*.................. - sub v2.4S, v19.4S, v4.4S // ............................................................................................................................*................... - sub v12.4S, v22.4S, v3.4S // .................................................................................................................................*.............. - ldr q31, [x4, #16] // .......................................................................e........................................................................ - // gap // ................................................................................................................................................ - ldr q13, [x5, #96] // ............................e................................................................................................................... - ldr q11, [x5, #112] // .............................e.................................................................................................................. - ldr q22, [x5, #144] // ...............................e................................................................................................................ 
- ldr q25, [x5, #176] // .................................e.............................................................................................................. - str q7, [x1, #-48] // .......................................................................................................................................*........ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - str q16, [x1, #-32] // ........................................................................................................................................*....... - mul v10.4S, v12.4S, v9.S[0] // ...................................................................................................................................*............ - mul v3.4S, v2.4S, v9.S[0] // ..............................................................................................................................*................. - sqrdmulh v2.4S, v2.4S, v9.S[1] // ...............................................................................................................................*................ - ldr q29, [x5, #32] // ....e........................................................................................................................................... 
- str q18, [x1, #-16] // .........................................................................................................................................*...... - sqrdmulh v15.4S, v12.4S, v9.S[1] // ....................................................................................................................................*........... - add x1, x1, #64 // ..............................................................................................................................................*. - ld4 {v17.4S, v18.4S, v19.4S, v20.4S}, [x1] // e............................................................................................................................................... - ldr q5, [x5, #64] // ......e......................................................................................................................................... - ldr q0, [x5, #48] // .....e.......................................................................................................................................... - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ 
- ldr q28, [x5, #16] // ...e............................................................................................................................................ - ldr q9, [x4], #64 // ......................................................................e......................................................................... - str q24, [x2, #-48] // ...........................................................................................................................................*.... - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - ldr q26, [x5, #160] // ................................e............................................................................................................... - mls v3.4S, v2.4S, v8.S[0] // ................................................................................................................................*............... - mls v10.4S, v15.4S, v8.S[0] // .....................................................................................................................................*.......... - // gap // ................................................................................................................................................ 
- // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - ldr q6, [x5], #(12*16) // ..e............................................................................................................................................. - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ 
- // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - ldr q16, [x5, #-112] // .......e........................................................................................................................................ - str q10, [x2, #-16] // .............................................................................................................................................*.. - str q3, [x2, #-32] // ............................................................................................................................................*... - add x2, x2, #64 // ...............................................................................................................................................* - ldr q21, [x4, #-32] // ........................................................................e....................................................................... 
- // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - ld4 {v1.4S, v2.4S, v3.4S, v4.4S}, [x2] // .e.............................................................................................................................................. - sub v24.4S, v17.4S, v18.4S // ........e....................................................................................................................................... - add v15.4S, v17.4S, v18.4S // .........e...................................................................................................................................... - sub v10.4S, v19.4S, v20.4S // .............e.................................................................................................................................. - add v19.4S, v19.4S, v20.4S // ..............e................................................................................................................................. - ldr q27, [x5, #-64] // ..............................e................................................................................................................. - // gap // ................................................................................................................................................ 
- // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - sub v17.4S, v15.4S, v19.4S // ..................e............................................................................................................................. - add v30.4S, v15.4S, v19.4S // ...................e............................................................................................................................ - mul v5.4S, v10.4S, v5.4S // ...............e................................................................................................................................ 
- sqrdmulh v7.4S, v10.4S, v16.4S // ................e............................................................................................................................... - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - mul v16.4S, v24.4S, v29.4S // ..........e..................................................................................................................................... - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ 
- sqrdmulh v29.4S, v24.4S, v0.4S // ...........e.................................................................................................................................... - mul v23.4S, v17.4S, v6.4S // ....................e........................................................................................................................... - sqrdmulh v19.4S, v17.4S, v28.4S // .....................e.......................................................................................................................... - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - mls v5.4S, v7.4S, v8.S[0] // .................e.............................................................................................................................. - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ 
- // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - sub v0.4S, v3.4S, v4.4S // .......................................e........................................................................................................ - sub v10.4S, v1.4S, v2.4S // ..................................e............................................................................................................. - mls v16.4S, v29.4S, v8.S[0] // ............e................................................................................................................................... - add v3.4S, v3.4S, v4.4S // ........................................e....................................................................................................... - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ 
- // gap // ................................................................................................................................................ - mls v23.4S, v19.4S, v8.S[0] // ......................e......................................................................................................................... - add v20.4S, v1.4S, v2.4S // ...................................e............................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - mul v1.4S, v0.4S, v26.4S // .........................................e...................................................................................................... - sqrdmulh v24.4S, v0.4S, v25.4S // ..........................................e..................................................................................................... - sqrdmulh v0.4S, v10.4S, v22.4S // .....................................e.......................................................................................................... 
- mul v4.4S, v10.4S, v27.4S // ....................................e........................................................................................................... - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - sub v18.4S, v20.4S, v3.4S // ............................................e................................................................................................... - add v12.4S, v20.4S, v3.4S // .............................................e.................................................................................................. - sub v17.4S, v16.4S, v5.4S // .......................e........................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ 
- // gap // ................................................................................................................................................ - add v7.4S, v16.4S, v5.4S // ........................e....................................................................................................................... - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - mul v6.4S, v17.4S, v6.4S // .........................e...................................................................................................................... - sqrdmulh v27.4S, v17.4S, v28.4S // ..........................e..................................................................................................................... - mls v4.4S, v0.4S, v8.S[0] // ......................................e......................................................................................................... 
- // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - mls v1.4S, v24.4S, v8.S[0] // ...........................................e.................................................................................................... - trn1 v25.4S, v30.4S, v7.4S // ......................................................e......................................................................................... - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ 
- // gap // ................................................................................................................................................ - mul v5.4S, v18.4S, v13.4S // ..............................................e................................................................................................. - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - add v10.4S, v4.4S, v1.4S // ..................................................e............................................................................................. - sub v29.4S, v4.4S, v1.4S // .................................................e.............................................................................................. - mls v6.4S, v27.4S, v8.S[0] // ...........................e.................................................................................................................... 
- sqrdmulh v16.4S, v18.4S, v11.4S // ...............................................e................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - trn2 v19.4S, v30.4S, v7.4S // .......................................................e........................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ 
- // gap // ................................................................................................................................................ - trn1 v15.4S, v12.4S, v10.4S // ..............................................................e................................................................................. - trn2 v4.4S, v12.4S, v10.4S // ...............................................................e................................................................................ - mul v27.4S, v29.4S, v13.4S // ...................................................e............................................................................................ - sqrdmulh v11.4S, v29.4S, v11.4S // ....................................................e........................................................................................... - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - trn2 v2.4S, v23.4S, v6.4S // .........................................................e...................................................................................... - trn1 v24.4S, v23.4S, v6.4S // ........................................................e....................................................................................... 
- mls v5.4S, v16.4S, v8.S[0] // ................................................e............................................................................................... - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ 
- // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - mls v27.4S, v11.4S, v8.S[0] // .....................................................e.......................................................................................... - trn2 v20.2D, v25.2D, v24.2D // ..........................................................e..................................................................................... - trn1 v18.2D, v25.2D, v24.2D // ............................................................e................................................................................... - trn2 v0.2D, v19.2D, v2.2D // ...........................................................e.................................................................................... - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - trn1 v13.2D, v19.2D, v2.2D // .............................................................e.................................................................................. - // gap // ................................................................................................................................................ 
- // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - sub v24.4S, v20.4S, v0.4S // ...............................................................................e................................................................ - add v29.4S, v20.4S, v0.4S // ................................................................................e............................................................... - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ 
- // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - trn2 v2.4S, v5.4S, v27.4S // .................................................................e.............................................................................. - add v1.4S, v18.4S, v13.4S // ...........................................................................e.................................................................... - sub v18.4S, v18.4S, v13.4S // ..........................................................................e..................................................................... - trn1 v3.4S, v5.4S, v27.4S // ................................................................e............................................................................... - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - mul v17.4S, v24.4S, v21.S[0] // .................................................................................e.............................................................. 
- sqrdmulh v22.4S, v24.4S, v21.S[1] // ..................................................................................e............................................................. - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - trn2 v26.2D, v4.2D, v2.2D // ...................................................................e............................................................................ - trn1 v19.2D, v4.2D, v2.2D // .....................................................................e.......................................................................... - trn2 v25.2D, v15.2D, v3.2D // ..................................................................e............................................................................. - trn1 v30.2D, v15.2D, v3.2D // ....................................................................e........................................................................... - // gap // ................................................................................................................................................ 
- // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - sqrdmulh v24.4S, v18.4S, v31.S[3] // .............................................................................e.................................................................. - mul v16.4S, v18.4S, v31.S[2] // ............................................................................e................................................................... - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - sub v10.4S, v30.4S, v19.4S // ....................................................................................e........................................................... 
- sub v3.4S, v1.4S, v29.4S // ..............................................................................................e................................................. - add v27.4S, v30.4S, v19.4S // .....................................................................................e.......................................................... - sub v2.4S, v25.4S, v26.4S // .........................................................................................e...................................................... - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - add v0.4S, v25.4S, v26.4S // ..........................................................................................e..................................................... - mls v17.4S, v22.4S, v8.S[0] // ...................................................................................e............................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ 
- // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - mul v20.4S, v10.4S, v21.S[2] // ......................................................................................e......................................................... - sqrdmulh v28.4S, v10.4S, v21.S[3] // .......................................................................................e........................................................ - sqrdmulh v11.4S, v2.4S, v14.S[1] // ............................................................................................e................................................... - mul v7.4S, v2.4S, v14.S[0] // ...........................................................................................e.................................................... - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - mls v16.4S, v24.4S, v8.S[0] // ..............................................................................e................................................................. 
- sqrdmulh v10.4S, v3.4S, v9.S[3] // .................................................................................................e.............................................. - add v2.4S, v27.4S, v0.4S // .........................................................................................................e...................................... - add v14.4S, v1.4S, v29.4S // ...............................................................................................e................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - mul v19.4S, v3.4S, v9.S[2] // ................................................................................................e............................................... - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ 
- // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - mls v20.4S, v28.4S, v8.S[0] // ........................................................................................e....................................................... - sub v29.4S, v14.4S, v2.4S // ..................................................................................................................e............................. - add v3.4S, v14.4S, v2.4S // ...................................................................................................................e............................ - mls v7.4S, v11.4S, v8.S[0] // .............................................................................................e.................................................. - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - sub v14.4S, v16.4S, v17.4S // ...................................................................................................e............................................ 
- add v21.4S, v16.4S, v17.4S // ....................................................................................................e........................................... - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - mls v19.4S, v10.4S, v8.S[0] // ..................................................................................................e............................................. - sub v10.4S, v27.4S, v0.4S // ........................................................................................................e....................................... - sqrdmulh v2.4S, v29.4S, v9.S[1] // .....................................................................................................................e.......................... - mul v5.4S, v29.4S, v9.S[0] // ....................................................................................................................e........................... 
- str q3, [x1], #(16*4) // ......................................................................................................................................e......... - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - sub v1.4S, v20.4S, v7.4S // .............................................................................................................e.................................. - add v18.4S, v20.4S, v7.4S // ..............................................................................................................e................................. - mul v22.4S, v14.4S, v9.S[2] // .....................................................................................................e.......................................... - sqrdmulh v25.4S, v14.4S, v9.S[3] // ......................................................................................................e......................................... - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ 
- mul v4.4S, v10.4S, v31.S[0] // ..........................................................................................................e..................................... - sqrdmulh v29.4S, v10.4S, v31.S[1] // ...........................................................................................................e.................................... - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - sqrdmulh v28.4S, v1.4S, v31.S[1] // ................................................................................................................e............................... - mul v3.4S, v1.4S, v31.S[0] // ...............................................................................................................e................................ - sub v0.4S, v21.4S, v18.4S // .......................................................................................................................e........................ 
- add v7.4S, v21.4S, v18.4S // ........................................................................................................................e....................... - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - mls v22.4S, v25.4S, v8.S[0] // .......................................................................................................e........................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ 
- // gap // ................................................................................................................................................ - mls v5.4S, v2.4S, v8.S[0] // ......................................................................................................................e......................... - mul v24.4S, v0.4S, v9.S[0] // .........................................................................................................................e...................... - sqrdmulh v10.4S, v0.4S, v9.S[1] // ..........................................................................................................................e..................... - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - mls v3.4S, v28.4S, v8.S[0] // .................................................................................................................e.............................. - mls v4.4S, v29.4S, v8.S[0] // ............................................................................................................e................................... - // gap // ................................................................................................................................................ 
- // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ 
- // gap // ................................................................................................................................................ - str q5, [x2], #(16*4) // ..........................................................................................................................................e..... - mls v24.4S, v10.4S, v8.S[0] // ...........................................................................................................................e.................... - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ + ldr q9, [x5], #(12*16) // ..e..................................................................................................................................................... + ldr q25, [x5, #-160] // ....e................................................................................................................................................... + mls v11.4S, v10.4S, v8.S[0] // ........................................................................................................................................*............... 
+ str q26, [x1, #-32] // ................................................................................................................................................*....... + str q24, [x1, #-48] // ...............................................................................................................................................*........ + add x1, x1, #64 // ......................................................................................................................................................*. + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + ld4 {v18.4S, v19.4S, v20.4S, v21.4S}, [x1] // e....................................................................................................................................................... + ldr q26, [x5, #-112] // .......e................................................................................................................................................ + ldr q0, [x5, #-128] // ......e................................................................................................................................................. + mls v12.4S, v14.4S, v8.S[0] // .............................................................................................................................................*.......... + mls v6.4S, v23.4S, v8.S[0] // ...................................................................................................................................*.................... + ldr q31, [x4], #64 // ......................................................................e................................................................................. 
+ // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + ldr q23, [x5, #-48] // ...............................e........................................................................................................................ + ldr q7, [x5, #-16] // .................................e...................................................................................................................... + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + str q11, [x2, #-32] // ....................................................................................................................................................*... 
+ ldr q2, [x5, #-64] // ..............................e......................................................................................................................... + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + str q12, [x2, #-16] // .....................................................................................................................................................*.. + str q6, [x2, #-48] // ...................................................................................................................................................*.... + ldr q6, [x5, #-176] // ...e.................................................................................................................................................... + // gap // ........................................................................................................................................................ 
+ // gap // ........................................................................................................................................................ + add x2, x2, #64 // .......................................................................................................................................................* + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + ldr q1, [x5, #-144] // .....e.................................................................................................................................................. + // gap // ........................................................................................................................................................ + ld4 {v14.4S, v15.4S, v16.4S, v17.4S}, [x2] // .e...................................................................................................................................................... + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ 
+ // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + add v4.4S, v18.4S, v19.4S // .........e.............................................................................................................................................. + sub v24.4S, v20.4S, v21.4S // .............e.......................................................................................................................................... 
+ sub v11.4S, v18.4S, v19.4S // ........e............................................................................................................................................... + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + add v12.4S, v20.4S, v21.4S // ..............e......................................................................................................................................... + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ 
+ // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + mul v27.4S, v11.4S, v25.4S // ..........e............................................................................................................................................. + mul v22.4S, v24.4S, v0.4S // ...............e........................................................................................................................................ + sqrdmulh v5.4S, v24.4S, v26.4S // ................e....................................................................................................................................... + sqrdmulh v13.4S, v11.4S, v1.4S // ...........e............................................................................................................................................ + ldr q26, [x5, #-32] // ................................e....................................................................................................................... + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ 
+ sub v0.4S, v4.4S, v12.4S // ..................e..................................................................................................................................... + ldr q25, [x5, #-96] // ............................e........................................................................................................................... + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + add v20.4S, v14.4S, v15.4S // ...................................e.................................................................................................................... + sub v3.4S, v14.4S, v15.4S // ..................................e..................................................................................................................... 
+ // gap // ........................................................................................................................................................ + sub v10.4S, v16.4S, v17.4S // .......................................e................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + add v24.4S, v16.4S, v17.4S // ........................................e............................................................................................................... + mls v27.4S, v13.4S, v8.S[0] // ............e........................................................................................................................................... + mls v22.4S, v5.4S, v8.S[0] // .................e...................................................................................................................................... + mul v21.4S, v0.4S, v9.4S // ....................e................................................................................................................................... + ldr q16, [x5, #-80] // .............................e.......................................................................................................................... + // gap // ........................................................................................................................................................ 
+ // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + sqrdmulh v30.4S, v3.4S, v23.4S // .....................................e.................................................................................................................. + mul v1.4S, v10.4S, v26.4S // .........................................e.............................................................................................................. + sqrdmulh v23.4S, v10.4S, v7.4S // ..........................................e............................................................................................................. + // gap // ........................................................................................................................................................ + mul v26.4S, v3.4S, v2.4S // ....................................e................................................................................................................... + ldr q3, [x4, #-48] // .......................................................................e................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + add v2.4S, v4.4S, v12.4S // ...................e.................................................................................................................................... 
+ sqrdmulh v19.4S, v0.4S, v6.4S // .....................e.................................................................................................................................. + sub v14.4S, v20.4S, v24.4S // ............................................e........................................................................................................... + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + add v10.4S, v27.4S, v22.4S // ........................e............................................................................................................................... + sub v7.4S, v27.4S, v22.4S // .......................e................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ 
+ // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + sqrdmulh v13.4S, v14.4S, v16.4S // ...............................................e........................................................................................................ + mls v26.4S, v30.4S, v8.S[0] // ......................................e................................................................................................................. + mls v1.4S, v23.4S, v8.S[0] // ...........................................e............................................................................................................ + mul v5.4S, v14.4S, v25.4S // ..............................................e......................................................................................................... + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ 
+ // gap // ........................................................................................................................................................ + trn1 v11.4S, v2.4S, v10.4S // ......................................................e................................................................................................. + trn2 v23.4S, v2.4S, v10.4S // .......................................................e................................................................................................ + mul v29.4S, v7.4S, v9.4S // .........................e.............................................................................................................................. + sqrdmulh v10.4S, v7.4S, v6.4S // ..........................e............................................................................................................................. + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + add v30.4S, v20.4S, v24.4S // .............................................e.......................................................................................................... + mls v21.4S, v19.4S, v8.S[0] // ......................e................................................................................................................................. 
+ ldr q24, [x4, #-16] // .........................................................................e.............................................................................. + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + sub v28.4S, v26.4S, v1.4S // .................................................e...................................................................................................... + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + mls v5.4S, v13.4S, v8.S[0] // ................................................e....................................................................................................... + add v14.4S, v26.4S, v1.4S // ..................................................e..................................................................................................... 
+ // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + mls v29.4S, v10.4S, v8.S[0] // ...........................e............................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ 
+ // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + trn1 v6.4S, v30.4S, v14.4S // ..............................................................e......................................................................................... + trn2 v30.4S, v30.4S, v14.4S // ...............................................................e........................................................................................ + mul v14.4S, v28.4S, v25.4S // ...................................................e.................................................................................................... + sqrdmulh v1.4S, v28.4S, v16.4S // ....................................................e................................................................................................... + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ 
+ // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + trn1 v19.4S, v21.4S, v29.4S // ........................................................e............................................................................................... + trn2 v26.4S, v21.4S, v29.4S // .........................................................e.............................................................................................. + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ 
+ // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + mls v14.4S, v1.4S, v8.S[0] // .....................................................e.................................................................................................. + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + trn1 v16.2D, v11.2D, v19.2D // ............................................................e........................................................................................... 
+ trn1 v20.2D, v23.2D, v26.2D // .............................................................e.......................................................................................... + trn2 v4.2D, v11.2D, v19.2D // ..........................................................e............................................................................................. + trn2 v0.2D, v23.2D, v26.2D // ...........................................................e............................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + ldr q29, [x4, #-32] // ........................................................................e............................................................................... + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ 
+ // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + sub v21.4S, v16.4S, v20.4S // ..........................................................................e............................................................................. + trn1 v10.4S, v5.4S, v14.4S // ................................................................e....................................................................................... + trn2 v14.4S, v5.4S, v14.4S // .................................................................e...................................................................................... + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ 
+ // gap // ........................................................................................................................................................ + sub v7.4S, v4.4S, v0.4S // ...............................................................................e........................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + trn2 v23.2D, v6.2D, v10.2D // ..................................................................e..................................................................................... 
+ trn1 v17.2D, v6.2D, v10.2D // ....................................................................e................................................................................... + trn1 v10.2D, v30.2D, v14.2D // .....................................................................e.................................................................................. + trn2 v11.2D, v30.2D, v14.2D // ...................................................................e.................................................................................... + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + sqrdmulh v1.4S, v21.4S, v3.S[3] // .............................................................................e.......................................................................... + add v30.4S, v16.4S, v20.4S // ...........................................................................e............................................................................ + mul v20.4S, v7.4S, v29.S[0] // .................................................................................e...................................................................... + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ 
+ // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + add v16.4S, v4.4S, v0.4S // ................................................................................e....................................................................... + sub v26.4S, v17.4S, v10.4S // ....................................................................................e................................................................... + sub v19.4S, v23.4S, v11.4S // .........................................................................................e.............................................................. + add v28.4S, v17.4S, v10.4S // .....................................................................................e.................................................................. + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ 
+ mul v4.4S, v21.4S, v3.S[2] // ............................................................................e........................................................................... + add v14.4S, v23.4S, v11.4S // ..........................................................................................e............................................................. + sqrdmulh v6.4S, v7.4S, v29.S[1] // ..................................................................................e..................................................................... + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + sqrdmulh v21.4S, v26.4S, v29.S[3] // .......................................................................................e................................................................ + mul v11.4S, v26.4S, v29.S[2] // ......................................................................................e................................................................. + mul v26.4S, v19.4S, v24.S[0] // ...........................................................................................e............................................................ 
+ // gap // ........................................................................................................................................................ + sqrdmulh v24.4S, v19.4S, v24.S[1] // ............................................................................................e........................................................... + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + add v7.4S, v30.4S, v16.4S // ...............................................................................................e........................................................ + add v23.4S, v28.4S, v14.4S // .........................................................................................................e.............................................. + sub v10.4S, v28.4S, v14.4S // ........................................................................................................e............................................... + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ 
+ // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + mls v4.4S, v1.4S, v8.S[0] // ..............................................................................e......................................................................... + mls v20.4S, v6.4S, v8.S[0] // ...................................................................................e.................................................................... + sub v19.4S, v30.4S, v16.4S // ..............................................................................................e......................................................... + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + srshr v14.4S, v23.4S, #23 // ......................................................................................................................e................................. 
+ srshr v30.4S, v7.4S, #23 // ..................................................................................................................e..................................... + mls v11.4S, v21.4S, v8.S[0] // ........................................................................................e............................................................... + mls v26.4S, v24.4S, v8.S[0] // .............................................................................................e.......................................................... + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + sqrdmulh v5.4S, v10.4S, v3.S[1] // ...........................................................................................................e............................................ + mul v16.4S, v19.4S, v31.S[2] // ................................................................................................e....................................................... + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ 
+ // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + sub v9.4S, v4.4S, v20.4S // ...................................................................................................e.................................................... + add v1.4S, v4.4S, v20.4S // ....................................................................................................e................................................... + mul v20.4S, v10.4S, v3.S[0] // ..........................................................................................................e............................................. + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ 
+ // gap // ........................................................................................................................................................ + mls v23.4S, v14.4S, v8.4S // .......................................................................................................................e................................ + mls v7.4S, v30.4S, v8.4S // ...................................................................................................................e.................................... + add v21.4S, v11.4S, v26.4S // ..............................................................................................................e......................................... + sub v22.4S, v11.4S, v26.4S // .............................................................................................................e.......................................... + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + sqrdmulh v14.4S, v19.4S, v31.S[3] // .................................................................................................e...................................................... + mul v4.4S, v9.4S, v31.S[2] // .....................................................................................................e.................................................. 
+ sqrdmulh v26.4S, v9.4S, v31.S[3] // ......................................................................................................e................................................. + srshr v24.4S, v1.4S, #23 // ....................................................................................................................e................................... + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + sqrdmulh v30.4S, v22.4S, v3.S[1] // ................................................................................................................e....................................... + mul v15.4S, v22.4S, v3.S[0] // ...............................................................................................................e........................................ + // gap // ........................................................................................................................................................ + srshr v9.4S, v21.4S, #23 // ........................................................................................................................e............................... + // gap // ........................................................................................................................................................ 
+ // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + sub v10.4S, v7.4S, v23.4S // ..........................................................................................................................e............................. + mls v20.4S, v5.4S, v8.S[0] // ............................................................................................................e........................................... + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ 
+ mls v16.4S, v14.4S, v8.S[0] // ..................................................................................................e..................................................... + mls v1.4S, v24.4S, v8.4S // .....................................................................................................................e.................................. + add v14.4S, v7.4S, v23.4S // ...........................................................................................................................e............................ + mls v4.4S, v26.4S, v8.S[0] // .......................................................................................................e................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + mls v21.4S, v9.4S, v8.4S // .........................................................................................................................e.............................. + mul v7.4S, v10.4S, v31.S[0] // ............................................................................................................................e........................... + mls v15.4S, v30.4S, v8.S[0] // .................................................................................................................e...................................... 
+ sqrdmulh v26.4S, v10.4S, v31.S[1] // .............................................................................................................................e.......................... + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + str q14, [x1], #(16*4) // ..............................................................................................................................................e......... + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ 
+ // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + sub v10.4S, v16.4S, v20.4S // ....................................................................................................................................e................... + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + sub v30.4S, v1.4S, v21.4S // ...............................................................................................................................e........................ 
+ sub v14.4S, v4.4S, v15.4S // .........................................................................................................................................e.............. + add v24.4S, v1.4S, v21.4S // ................................................................................................................................e....................... + mls v7.4S, v26.4S, v8.S[0] // ..............................................................................................................................e......................... + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + mul v11.4S, v10.4S, v31.S[0] // ......................................................................................................................................e................. + sqrdmulh v10.4S, v10.4S, v31.S[1] // .......................................................................................................................................e................ + add v2.4S, v4.4S, v15.4S // ..........................................................................................................................................e............. + add v26.4S, v16.4S, v20.4S // .....................................................................................................................................e.................. 
+ // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + mul v6.4S, v30.4S, v31.S[0] // .................................................................................................................................e...................... + sqrdmulh v23.4S, v30.4S, v31.S[1] // ..................................................................................................................................e..................... + mul v12.4S, v14.4S, v31.S[0] // ...........................................................................................................................................e............ + sqrdmulh v14.4S, v14.4S, v31.S[1] // ............................................................................................................................................e........... + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ 
+ // gap // ........................................................................................................................................................ + str q7, [x2], #(16*4) // ..................................................................................................................................................e..... + str q2, [x1, #-16] // .................................................................................................................................................e...... + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ // original source code - // ld4 {v9.4S, v10.4S, v11.4S, v12.4S}, [x1] // ...................e............................................................................................................................|..................e............. - // ld4 {v13.4S, v14.4S, v15.4S, v16.4S}, [x2] // ..................................e.............................................................................................................|................................ 
- // ldr q0, [x5], #(12*16) // ............................e...................................................................................................................|...........................e.... - // ldr q4, [x5, #(-12*16 + 1*16)] // ......................e.........................................................................................................................|.....................e.......... - // ldr q1, [x5, #(-12*16 + 2*16)] // ...............e................................................................................................................................|..............e................. - // ldr q5, [x5, #(-12*16 + 3*16)] // .....................e..........................................................................................................................|....................e........... - // ldr q2, [x5, #(-12*16 + 4*16)] // ....................e...........................................................................................................................|...................e............ - // ldr q6, [x5, #(-12*16 + 5*16)] // .............................e..................................................................................................................|............................e... - // sub v24.4s, v9.4s, v10.4s // ...................................e............................................................................................................|................................ - // add v9.4s, v9.4s, v10.4s // ....................................e...........................................................................................................|................................ - // mul v10.4s, v24.4s, v1.4s // ............................................e...................................................................................................|................................ 
- // sqrdmulh v24.4s, v24.4s, v5.4s // .............................................e..................................................................................................|................................ - // mls v10.4s, v24.4s, v8.s[0] // ...................................................e............................................................................................|................................ - // sub v24.4s, v11.4s, v12.4s // .....................................e..........................................................................................................|................................ - // add v11.4s, v11.4s, v12.4s // ......................................e.........................................................................................................|................................ - // mul v12.4s, v24.4s, v2.4s // ..........................................e.....................................................................................................|................................ - // sqrdmulh v24.4s, v24.4s, v6.4s // ...........................................e....................................................................................................|................................ - // mls v12.4s, v24.4s, v8.s[0] // ................................................e...............................................................................................|................................ - // sub v24.4s, v9.4s, v11.4s // ........................................e.......................................................................................................|................................ - // add v9.4s, v9.4s, v11.4s // .........................................e......................................................................................................|................................ 
- // mul v11.4s, v24.4s, v0.4s // ..............................................e.................................................................................................|................................ - // sqrdmulh v24.4s, v24.4s, v4.4s // ...............................................e................................................................................................|................................ - // mls v11.4s, v24.4s, v8.s[0] // .....................................................e..........................................................................................|................................ - // sub v24.4s, v10.4s, v12.4s // .............................................................e..................................................................................|................................ - // add v10.4s, v10.4s, v12.4s // ..............................................................e.................................................................................|................................ - // mul v12.4s, v24.4s, v0.4s // ...............................................................e................................................................................|................................ - // sqrdmulh v24.4s, v24.4s, v4.4s // ................................................................e...............................................................................|................................ - // mls v12.4s, v24.4s, v8.s[0] // .......................................................................e........................................................................|................................ - // ldr q0, [x5, #(-12*16 + 6*16)] // ......e.........................................................................................................................................|.....e.......................... 
- // ldr q4, [x5, #(-12*16 + 7*16)] // .......e........................................................................................................................................|......e......................... - // ldr q1, [x5, #(-12*16 + 8*16)] // .......................................e........................................................................................................|................................ - // ldr q5, [x5, #(-12*16 + 9*16)] // ........e.......................................................................................................................................|.......e........................ - // ldr q2, [x5, #(-12*16 + 10*16)] // .........................e......................................................................................................................|........................e....... - // ldr q6, [x5, #(-12*16 + 11*16)] // .........e......................................................................................................................................|........e....................... - // sub v24.4s, v13.4s, v14.4s // ..................................................e.............................................................................................|................................ - // add v13.4s, v13.4s, v14.4s // ......................................................e.........................................................................................|................................ - // mul v14.4s, v24.4s, v1.4s // ..........................................................e.....................................................................................|................................ - // sqrdmulh v24.4s, v24.4s, v5.4s // .........................................................e......................................................................................|................................ 
- // mls v14.4s, v24.4s, v8.s[0] // .................................................................e..............................................................................|................................ - // sub v24.4s, v15.4s, v16.4s // .................................................e..............................................................................................|................................ - // add v15.4s, v15.4s, v16.4s // ....................................................e...........................................................................................|................................ - // mul v16.4s, v24.4s, v2.4s // .......................................................e........................................................................................|................................ - // sqrdmulh v24.4s, v24.4s, v6.4s // ........................................................e.......................................................................................|................................ - // mls v16.4s, v24.4s, v8.s[0] // ..................................................................e.............................................................................|................................ - // sub v24.4s, v13.4s, v15.4s // ...........................................................e....................................................................................|................................ - // add v13.4s, v13.4s, v15.4s // ............................................................e...................................................................................|................................ - // mul v15.4s, v24.4s, v0.4s // ....................................................................e...........................................................................|................................ 
- // sqrdmulh v24.4s, v24.4s, v4.4s // ........................................................................e.......................................................................|................................ - // mls v15.4s, v24.4s, v8.s[0] // ................................................................................e...............................................................|................................ - // sub v24.4s, v14.4s, v16.4s // ......................................................................e.........................................................................|................................ - // add v14.4s, v14.4s, v16.4s // .....................................................................e..........................................................................|................................ - // mul v16.4s, v24.4s, v0.4s // ............................................................................e...................................................................|................................ - // sqrdmulh v24.4s, v24.4s, v4.4s // .............................................................................e..................................................................|................................ - // mls v16.4s, v24.4s, v8.s[0] // .................................................................................e..............................................................|................................ - // trn1 v25.4s, v9.4s, v10.4s // ...................................................................e............................................................................|................................ - // trn2 v26.4s, v9.4s, v10.4s // .........................................................................e......................................................................|................................ 
- // trn1 v27.4s, v11.4s, v12.4s // ...............................................................................e................................................................|................................ - // trn2 v28.4s, v11.4s, v12.4s // ..............................................................................e.................................................................|................................ - // trn2 v11.2d, v25.2d, v27.2d // ..................................................................................e.............................................................|................................ - // trn2 v12.2d, v26.2d, v28.2d // ....................................................................................e...........................................................|................................ - // trn1 v9.2d, v25.2d, v27.2d // ...................................................................................e............................................................|................................ - // trn1 v10.2d, v26.2d, v28.2d // .....................................................................................e..........................................................|................................ - // trn1 v25.4s, v13.4s, v14.4s // ..........................................................................e.....................................................................|................................ - // trn2 v26.4s, v13.4s, v14.4s // ...........................................................................e....................................................................|................................ - // trn1 v27.4s, v15.4s, v16.4s // ...........................................................................................e....................................................|................................ 
- // trn2 v28.4s, v15.4s, v16.4s // ........................................................................................e.......................................................|................................ - // trn2 v15.2d, v25.2d, v27.2d // ................................................................................................e...............................................|................................ - // trn2 v16.2d, v26.2d, v28.2d // ..............................................................................................e.................................................|................................ - // trn1 v13.2d, v25.2d, v27.2d // .................................................................................................e..............................................|................................ - // trn1 v14.2d, v26.2d, v28.2d // ...............................................................................................e................................................|................................ - // ldr q0, [x4], #64 // .......................e........................................................................................................................|......................e......... - // ldr q1, [x4, #(-64 + 16)] // .....e..........................................................................................................................................|....e........................... - // ldr q2, [x4, #(-64 + 32)] // .................................e..............................................................................................................|................................ - // ldr q3, [x4, #(-64 + 48)] // e...............................................................................................................................................e................................ 
- // sub v24.4s, v9.4s, v10.4s // ..........................................................................................e.....................................................|................................ - // add v9.4s, v9.4s, v10.4s // .........................................................................................e......................................................|................................ - // mul v10.4s, v24.4s, v1.s[2] // ...................................................................................................e............................................|................................ - // sqrdmulh v24.4s, v24.4s, v1.s[3] // ..................................................................................................e.............................................|................................ - // mls v10.4s, v24.4s, v8.s[0] // ..............................................................................................................e.................................|................................ - // sub v24.4s, v11.4s, v12.4s // ......................................................................................e.........................................................|................................ - // add v11.4s, v11.4s, v12.4s // .......................................................................................e........................................................|................................ - // mul v12.4s, v24.4s, v2.s[0] // ............................................................................................e...................................................|................................ - // sqrdmulh v24.4s, v24.4s, v2.s[1] // .............................................................................................e..................................................|................................ 
- // mls v12.4s, v24.4s, v8.s[0] // .........................................................................................................e......................................|................................ - // sub v24.4s, v13.4s, v14.4s // ....................................................................................................e...........................................|................................ - // add v13.4s, v13.4s, v14.4s // ......................................................................................................e.........................................|................................ - // mul v14.4s, v24.4s, v2.s[2] // ..........................................................................................................e.....................................|................................ - // sqrdmulh v24.4s, v24.4s, v2.s[3] // ...........................................................................................................e....................................|................................ - // mls v14.4s, v24.4s, v8.s[0] // ...................................................................................................................e............................|................................ - // sub v24.4s, v15.4s, v16.4s // .......................................................................................................e........................................|................................ - // add v15.4s, v15.4s, v16.4s // ........................................................................................................e.......................................|................................ - // mul v16.4s, v24.4s, v3.s[0] // .............................................................................................................e..................................|................................ 
- // sqrdmulh v24.4s, v24.4s, v3.s[1] // ............................................................................................................e...................................|................................ - // mls v16.4s, v24.4s, v8.s[0] // ......................................................................................................................e.........................|................................ - // sub v24.4s, v9.4s, v11.4s // .....................................................................................................e..........................................|................................ - // add v9.4s, v9.4s, v11.4s // .................................................................................................................e..............................|................................ - // mul v11.4s, v24.4s, v0.s[2] // ..................................................................................................................e.............................|................................ - // sqrdmulh v24.4s, v24.4s, v0.s[3] // ...............................................................................................................e................................|................................ - // mls v11.4s, v24.4s, v8.s[0] // .........................................................................................................................e......................|................................ - // sub v24.4s, v10.4s, v12.4s // .......................................................................................................................e........................|................................ - // add v10.4s, v10.4s, v12.4s // ........................................................................................................................e.......................|................................ 
- // mul v12.4s, v24.4s, v0.s[2] // ................................................................................................................................e...............|................................ - // sqrdmulh v24.4s, v24.4s, v0.s[3] // .................................................................................................................................e..............|................................ - // mls v12.4s, v24.4s, v8.s[0] // ........................................................................................................................................e.......|................................ - // sub v24.4s, v13.4s, v15.4s // ..........................................................................................................................e.....................|................................ - // add v13.4s, v13.4s, v15.4s // ................................................................................................................e...............................|................................ - // mul v15.4s, v24.4s, v1.s[0] // ..................................................................................................................................e.............|................................ - // sqrdmulh v24.4s, v24.4s, v1.s[1] // ...................................................................................................................................e............|................................ - // mls v15.4s, v24.4s, v8.s[0] // .............................................................................................................................................e..|................................ - // sub v24.4s, v14.4s, v16.4s // ..............................................................................................................................e.................|................................ 
- // add v14.4s, v14.4s, v16.4s // ...............................................................................................................................e................|................................ - // mul v16.4s, v24.4s, v1.s[0] // .....................................................................................................................................e..........|................................ - // sqrdmulh v24.4s, v24.4s, v1.s[1] // ....................................................................................................................................e...........|................................ - // mls v16.4s, v24.4s, v8.s[0] // ............................................................................................................................................e...|................................ - // sub v24.4s, v9.4s, v13.4s // ....................................................................................................................e...........................|................................ - // add v9.4s, v9.4s, v13.4s // .....................................................................................................................e..........................|................................ - // mul v13.4s, v24.4s, v0.s[0] // ............................................................................................................................e...................|................................ - // sqrdmulh v24.4s, v24.4s, v0.s[1] // ...........................................................................................................................e....................|................................ - // mls v13.4s, v24.4s, v8.s[0] // .........................................................................................................................................e......|................................ 
- // sub v24.4s, v10.4s, v14.4s // ......................................................................................................................................e.........|................................ - // add v10.4s, v10.4s, v14.4s // .......................................................................................................................................e........|................................ - // mul v14.4s, v24.4s, v0.s[0] // ..........................................................................................................................................e.....|................................ - // sqrdmulh v24.4s, v24.4s, v0.s[1] // ...........................................................................................................................................e....|................................ - // mls v14.4s, v24.4s, v8.s[0] // ...............................................................................................................................................e|................................ - // sub v24.4s, v11.4s, v15.4s // ...*............................................................................................................................................|..*............................. - // add v11.4s, v11.4s, v15.4s // ..*.............................................................................................................................................|.*.............................. - // mul v15.4s, v24.4s, v0.s[0] // .............*..................................................................................................................................|............*................... - // sqrdmulh v24.4s, v24.4s, v0.s[1] // ..............*.................................................................................................................................|.............*.................. 
- // mls v15.4s, v24.4s, v8.s[0] // ..........................*.....................................................................................................................|.........................*...... - // sub v24.4s, v12.4s, v16.4s // ....*...........................................................................................................................................|...*............................ - // add v12.4s, v12.4s, v16.4s // .*..............................................................................................................................................|*............................... - // mul v16.4s, v24.4s, v0.s[0] // ............*...................................................................................................................................|...........*.................... - // sqrdmulh v24.4s, v24.4s, v0.s[1] // .................*..............................................................................................................................|................*............... - // mls v16.4s, v24.4s, v8.s[0] // ...........................*....................................................................................................................|..........................*..... - // str q9, [x1], #(16*4) // .............................................................................................................................e..................|................................ - // str q10, [x1, #(-16*4 + 1*16)] // ..........*.....................................................................................................................................|.........*...................... - // str q11, [x1, #(-16*4 + 2*16)] // ...........*....................................................................................................................................|..........*..................... 
- // str q12, [x1, #(-16*4 + 3*16)] // ................*...............................................................................................................................|...............*................ - // str q13, [x2], #(16*4) // ..............................................................................................................................................e.|................................ - // str q14, [x2, #(-16*4 + 1*16)] // ........................*.......................................................................................................................|.......................*........ - // str q15, [x2, #(-16*4 + 2*16)] // ...............................*................................................................................................................|..............................*. - // str q16, [x2, #(-16*4 + 3*16)] // ..............................*.................................................................................................................|.............................*.. - // add x1, x1, #64 // ..................*.............................................................................................................................|.................*.............. - // add x2, x2, #64 // ................................*...............................................................................................................|...............................* + // ld4 {v9.4S, v10.4S, v11.4S, v12.4S}, [x1] // ......e.................................................................................................................................................|.....e............. + // ld4 {v13.4S, v14.4S, v15.4S, v16.4S}, [x2] // .....................e..................................................................................................................................|................... 
+ // ldr q0, [x5], #(12*16) // e.......................................................................................................................................................e................... + // ldr q4, [x5, #(-12*16 + 1*16)] // ..................e.....................................................................................................................................|.................e. + // ldr q1, [x5, #(-12*16 + 2*16)] // .e......................................................................................................................................................|e.................. + // ldr q5, [x5, #(-12*16 + 3*16)] // ....................e...................................................................................................................................|................... + // ldr q2, [x5, #(-12*16 + 4*16)] // ........e...............................................................................................................................................|.......e........... + // ldr q6, [x5, #(-12*16 + 5*16)] // .......e................................................................................................................................................|......e............ + // sub v24.4s, v9.4s, v10.4s // ........................e...............................................................................................................................|................... + // add v9.4s, v9.4s, v10.4s // ......................e.................................................................................................................................|................... + // mul v10.4s, v24.4s, v1.4s // ..........................e.............................................................................................................................|................... 
+ // sqrdmulh v24.4s, v24.4s, v5.4s // .............................e..........................................................................................................................|................... + // mls v10.4s, v24.4s, v8.s[0] // .....................................e..................................................................................................................|................... + // sub v24.4s, v11.4s, v12.4s // .......................e................................................................................................................................|................... + // add v11.4s, v11.4s, v12.4s // .........................e..............................................................................................................................|................... + // mul v12.4s, v24.4s, v2.4s // ...........................e............................................................................................................................|................... + // sqrdmulh v24.4s, v24.4s, v6.4s // ............................e...........................................................................................................................|................... + // mls v12.4s, v24.4s, v8.s[0] // ......................................e.................................................................................................................|................... + // sub v24.4s, v9.4s, v11.4s // ...............................e........................................................................................................................|................... + // add v9.4s, v9.4s, v11.4s // ..............................................e.........................................................................................................|................... 
+ // mul v11.4s, v24.4s, v0.4s // .......................................e................................................................................................................|................... + // sqrdmulh v24.4s, v24.4s, v4.4s // ...............................................e........................................................................................................|................... + // mls v11.4s, v24.4s, v8.s[0] // ............................................................e...........................................................................................|................... + // sub v24.4s, v10.4s, v12.4s // ..................................................e.....................................................................................................|................... + // add v10.4s, v10.4s, v12.4s // .................................................e......................................................................................................|................... + // mul v12.4s, v24.4s, v0.4s // .........................................................e..............................................................................................|................... + // sqrdmulh v24.4s, v24.4s, v4.4s // ..........................................................e.............................................................................................|................... + // mls v12.4s, v24.4s, v8.s[0] // .................................................................e......................................................................................|................... + // ldr q0, [x5, #(-12*16 + 6*16)] // ................................e.......................................................................................................................|................... 
+ // ldr q4, [x5, #(-12*16 + 7*16)] // ........................................e...............................................................................................................|................... + // ldr q1, [x5, #(-12*16 + 8*16)] // ...............e........................................................................................................................................|..............e.... + // ldr q5, [x5, #(-12*16 + 9*16)] // ............e...........................................................................................................................................|...........e....... + // ldr q2, [x5, #(-12*16 + 10*16)] // ..............................e.........................................................................................................................|................... + // ldr q6, [x5, #(-12*16 + 11*16)] // .............e..........................................................................................................................................|............e...... + // sub v24.4s, v13.4s, v14.4s // ..................................e.....................................................................................................................|................... + // add v13.4s, v13.4s, v14.4s // .................................e......................................................................................................................|................... + // mul v14.4s, v24.4s, v1.4s // ............................................e...........................................................................................................|................... + // sqrdmulh v24.4s, v24.4s, v5.4s // .........................................e..............................................................................................................|................... 
+ // mls v14.4s, v24.4s, v8.s[0] // ....................................................e...................................................................................................|................... + // sub v24.4s, v15.4s, v16.4s // ...................................e....................................................................................................................|................... + // add v15.4s, v15.4s, v16.4s // ....................................e...................................................................................................................|................... + // mul v16.4s, v24.4s, v2.4s // ..........................................e.............................................................................................................|................... + // sqrdmulh v24.4s, v24.4s, v6.4s // ...........................................e............................................................................................................|................... + // mls v16.4s, v24.4s, v8.s[0] // .....................................................e..................................................................................................|................... + // sub v24.4s, v13.4s, v15.4s // ................................................e.......................................................................................................|................... + // add v13.4s, v13.4s, v15.4s // ...........................................................e............................................................................................|................... + // mul v15.4s, v24.4s, v0.4s // ......................................................e.................................................................................................|................... 
+ // sqrdmulh v24.4s, v24.4s, v4.4s // ...................................................e....................................................................................................|................... + // mls v15.4s, v24.4s, v8.s[0] // ...............................................................e........................................................................................|................... + // sub v24.4s, v14.4s, v16.4s // ..............................................................e.........................................................................................|................... + // add v14.4s, v14.4s, v16.4s // ................................................................e.......................................................................................|................... + // mul v16.4s, v24.4s, v0.4s // ....................................................................e...................................................................................|................... + // sqrdmulh v24.4s, v24.4s, v4.4s // .....................................................................e..................................................................................|................... + // mls v16.4s, v24.4s, v8.s[0] // ........................................................................e...............................................................................|................... + // trn1 v25.4s, v9.4s, v10.4s // .......................................................e................................................................................................|................... + // trn2 v26.4s, v9.4s, v10.4s // ........................................................e...............................................................................................|................... 
+ // trn1 v27.4s, v11.4s, v12.4s // ......................................................................e.................................................................................|................... + // trn2 v28.4s, v11.4s, v12.4s // .......................................................................e................................................................................|................... + // trn2 v11.2d, v25.2d, v27.2d // ...........................................................................e............................................................................|................... + // trn2 v12.2d, v26.2d, v28.2d // ............................................................................e...........................................................................|................... + // trn1 v9.2d, v25.2d, v27.2d // .........................................................................e..............................................................................|................... + // trn1 v10.2d, v26.2d, v28.2d // ..........................................................................e.............................................................................|................... + // trn1 v25.4s, v13.4s, v14.4s // ..................................................................e.....................................................................................|................... + // trn2 v26.4s, v13.4s, v14.4s // ...................................................................e....................................................................................|................... + // trn1 v27.4s, v15.4s, v16.4s // ...............................................................................e........................................................................|................... 
+ // trn2 v28.4s, v15.4s, v16.4s // ................................................................................e.......................................................................|................... + // trn2 v15.2d, v25.2d, v27.2d // ..................................................................................e.....................................................................|................... + // trn2 v16.2d, v26.2d, v28.2d // .....................................................................................e..................................................................|................... + // trn1 v13.2d, v25.2d, v27.2d // ...................................................................................e....................................................................|................... + // trn1 v14.2d, v26.2d, v28.2d // ....................................................................................e...................................................................|................... + // ldr q0, [x4], #64 // ...........e............................................................................................................................................|..........e........ + // ldr q1, [x4, #(-64 + 16)] // .............................................e..........................................................................................................|................... + // ldr q2, [x4, #(-64 + 32)] // .............................................................................e..........................................................................|................... + // ldr q3, [x4, #(-64 + 48)] // .............................................................e..........................................................................................|................... 
+ // sub v24.4s, v9.4s, v10.4s // ..............................................................................e.........................................................................|................... + // add v9.4s, v9.4s, v10.4s // .......................................................................................e................................................................|................... + // mul v10.4s, v24.4s, v1.s[2] // .............................................................................................e..........................................................|................... + // sqrdmulh v24.4s, v24.4s, v1.s[3] // ......................................................................................e.................................................................|................... + // mls v10.4s, v24.4s, v8.s[0] // .......................................................................................................e................................................|................... + // sub v24.4s, v11.4s, v12.4s // .................................................................................e......................................................................|................... + // add v11.4s, v11.4s, v12.4s // .........................................................................................e..............................................................|................... + // mul v12.4s, v24.4s, v2.s[0] // ........................................................................................e...............................................................|................... + // sqrdmulh v24.4s, v24.4s, v2.s[1] // ...............................................................................................e........................................................|................... 
+ // mls v12.4s, v24.4s, v8.s[0] // ........................................................................................................e...............................................|................... + // sub v24.4s, v13.4s, v14.4s // ..........................................................................................e.............................................................|................... + // add v13.4s, v13.4s, v14.4s // ............................................................................................e...........................................................|................... + // mul v14.4s, v24.4s, v2.s[2] // .................................................................................................e......................................................|................... + // sqrdmulh v24.4s, v24.4s, v2.s[3] // ................................................................................................e.......................................................|................... + // mls v14.4s, v24.4s, v8.s[0] // ............................................................................................................e...........................................|................... + // sub v24.4s, v15.4s, v16.4s // ...........................................................................................e............................................................|................... + // add v15.4s, v15.4s, v16.4s // ..............................................................................................e.........................................................|................... + // mul v16.4s, v24.4s, v3.s[0] // ..................................................................................................e.....................................................|................... 
+ // sqrdmulh v24.4s, v24.4s, v3.s[1] // ...................................................................................................e....................................................|................... + // mls v16.4s, v24.4s, v8.s[0] // .............................................................................................................e..........................................|................... + // sub v24.4s, v9.4s, v11.4s // .........................................................................................................e..............................................|................... + // add v9.4s, v9.4s, v11.4s // ....................................................................................................e...................................................|................... + // mul v11.4s, v24.4s, v0.s[2] // ...............................................................................................................e........................................|................... + // sqrdmulh v24.4s, v24.4s, v0.s[3] // .......................................................................................................................e................................|................... + // mls v11.4s, v24.4s, v8.s[0] // ................................................................................................................................e.......................|................... + // sub v24.4s, v10.4s, v12.4s // ................................................................................................................e.......................................|................... + // add v10.4s, v10.4s, v12.4s // .................................................................................................................e......................................|................... 
+ // mul v12.4s, v24.4s, v0.s[2] // ........................................................................................................................e...............................|................... + // sqrdmulh v24.4s, v24.4s, v0.s[3] // .........................................................................................................................e..............................|................... + // mls v12.4s, v24.4s, v8.s[0] // ...................................................................................................................................e....................|................... + // sub v24.4s, v13.4s, v15.4s // ......................................................................................................e.................................................|................... + // add v13.4s, v13.4s, v15.4s // .....................................................................................................e..................................................|................... + // mul v15.4s, v24.4s, v1.s[0] // ..................................................................................................................e.....................................|................... + // sqrdmulh v24.4s, v24.4s, v1.s[1] // ..............................................................................................................e.........................................|................... + // mls v15.4s, v24.4s, v8.s[0] // ...............................................................................................................................e........................|................... + // sub v24.4s, v14.4s, v16.4s // ......................................................................................................................e.................................|................... 
+ // add v14.4s, v14.4s, v16.4s // .....................................................................................................................e..................................|................... + // mul v16.4s, v24.4s, v1.s[0] // ............................................................................................................................e...........................|................... + // sqrdmulh v24.4s, v24.4s, v1.s[1] // ...........................................................................................................................e............................|................... + // mls v16.4s, v24.4s, v8.s[0] // ......................................................................................................................................e.................|................... + // srshr v24.4S, v9.4S, #23 // ...........................................................................................................e............................................|................... + // mls v9.4s, v24.4s, v8.4s // ....................................................................................................................e...................................|................... + // srshr v24.4S, v10.4S, #23 // ..........................................................................................................................e.............................|................... + // mls v10.4s, v24.4s, v8.4s // .................................................................................................................................e......................|................... + // srshr v24.4S, v13.4S, #23 // ..........................................................................................................e.............................................|................... 
+ // mls v13.4s, v24.4s, v8.4s // ...................................................................................................................e....................................|................... + // srshr v24.4S, v14.4S, #23 // .............................................................................................................................e..........................|................... + // mls v14.4s, v24.4s, v8.4s // ....................................................................................................................................e...................|................... + // sub v24.4s, v9.4s, v13.4s // ..............................................................................................................................e.........................|................... + // add v9.4s, v9.4s, v13.4s // ..................................................................................................................................e.....................|................... + // mul v13.4s, v24.4s, v0.s[0] // .....................................................................................................................................e..................|................... + // sqrdmulh v24.4s, v24.4s, v0.s[1] // .......................................................................................................................................e................|................... + // mls v13.4s, v24.4s, v8.s[0] // .............................................................................................................................................e..........|................... + // sub v24.4s, v10.4s, v14.4s // ..........................................................................................................................................e.............|................... 
+ // add v10.4s, v10.4s, v14.4s // ............................................................................................................................................e...........|................... + // mul v14.4s, v24.4s, v0.s[0] // ..................................................................................................................................................e.....|................... + // sqrdmulh v24.4s, v24.4s, v0.s[1] // ...................................................................................................................................................e....|................... + // mls v14.4s, v24.4s, v8.s[0] // ..........*.............................................................................................................................................|.........*......... + // sub v24.4s, v11.4s, v15.4s // .........................................................................................................................................e..............|................... + // add v11.4s, v11.4s, v15.4s // .................................................................................................................................................e......|................... + // mul v15.4s, v24.4s, v0.s[0] // ..............................................................................................................................................e.........|................... + // sqrdmulh v24.4s, v24.4s, v0.s[1] // ...............................................................................................................................................e........|................... + // mls v15.4s, v24.4s, v8.s[0] // ..*.....................................................................................................................................................|.*................. 
+ // sub v24.4s, v12.4s, v16.4s // ...........................................................................................................................................e............|................... + // add v12.4s, v12.4s, v16.4s // ................................................................................................................................................e.......|................... + // mul v16.4s, v24.4s, v0.s[0] // ....................................................................................................................................................e...|................... + // sqrdmulh v24.4s, v24.4s, v0.s[1] // .....................................................................................................................................................e..|................... + // mls v16.4s, v24.4s, v8.s[0] // .........*..............................................................................................................................................|........*.......... + // str q9, [x1], #(16*4) // ........................................................................................................................................e...............|................... + // str q10, [x1, #(-16*4 + 1*16)] // ....*...................................................................................................................................................|...*............... + // str q11, [x1, #(-16*4 + 2*16)] // ...*....................................................................................................................................................|..*................ + // str q12, [x1, #(-16*4 + 3*16)] // .......................................................................................................................................................e|................... 
+ // str q13, [x2], #(16*4) // ......................................................................................................................................................e.|................... + // str q14, [x2, #(-16*4 + 1*16)] // .................*......................................................................................................................................|................*.. + // str q15, [x2, #(-16*4 + 2*16)] // ..............*.........................................................................................................................................|.............*..... + // str q16, [x2, #(-16*4 + 3*16)] // ................*.......................................................................................................................................|...............*... + // add x1, x1, #64 // .....*..................................................................................................................................................|....*.............. + // add x2, x2, #64 // ...................*....................................................................................................................................|..................* sub count, count, #1 cbnz count, layer45678_start - sub v17.4S, v19.4S, v4.4S // ..*............... - add v31.4S, v22.4S, v3.4S // *................. - sub v6.4S, v22.4S, v3.4S // ...*.............. - str q24, [x2, #-48] // ............*..... - // gap // .................. - // gap // .................. - // gap // .................. - // gap // .................. - str q7, [x1, #-48] // ....*............. - // gap // .................. - // gap // .................. - // gap // .................. - // gap // .................. - // gap // .................. - // gap // .................. - // gap // .................. - mul v26.4S, v17.4S, v9.S[0] // .......*.......... - sqrdmulh v0.4S, v17.4S, v9.S[1] // ........*......... 
- str q31, [x1, #-16] // .........*........ - mul v22.4S, v6.4S, v9.S[0] // ......*........... - sqrdmulh v28.4S, v6.4S, v9.S[1] // ..........*....... - // gap // .................. - // gap // .................. - // gap // .................. - add v17.4S, v19.4S, v4.4S // .*................ - // gap // .................. - // gap // .................. - // gap // .................. - // gap // .................. - // gap // .................. - // gap // .................. - // gap // .................. - // gap // .................. - // gap // .................. - // gap // .................. - // gap // .................. - // gap // .................. - // gap // .................. - // gap // .................. - // gap // .................. - mls v26.4S, v0.4S, v8.S[0] // .............*.... - mls v22.4S, v28.4S, v8.S[0] // ..............*... - str q17, [x1, #-32] // .....*............ - // gap // .................. - // gap // .................. - // gap // .................. - // gap // .................. - // gap // .................. - // gap // .................. - // gap // .................. - // gap // .................. - // gap // .................. - // gap // .................. - // gap // .................. - // gap // .................. - // gap // .................. - // gap // .................. - add x1, x1, #64 // ...........*...... - // gap // .................. - // gap // .................. - // gap // .................. - // gap // .................. - // gap // .................. - // gap // .................. - str q26, [x2, #-32] // ................*. - str q22, [x2, #-16] // ...............*.. - add x2, x2, #64 // .................* - // gap // .................. - // gap // .................. - // gap // .................. - // gap // .................. - // gap // .................. + mls v11.4S, v10.4S, v8.S[0] // *......... + mls v12.4S, v14.4S, v8.S[0] // ....*..... + mls v6.4S, v23.4S, v8.S[0] // .....*.... 
+ str q26, [x1, #-32] // .*........ + str q24, [x1, #-48] // ..*....... + add x1, x1, #64 // ...*...... + // gap // .......... + // gap // .......... + // gap // .......... + // gap // .......... + // gap // .......... + // gap // .......... + // gap // .......... + // gap // .......... + // gap // .......... + // gap // .......... + // gap // .......... + // gap // .......... + // gap // .......... + // gap // .......... + // gap // .......... + // gap // .......... + // gap // .......... + // gap // .......... + str q11, [x2, #-32] // ......*... + str q12, [x2, #-16] // .......*.. + // gap // .......... + // gap // .......... + // gap // .......... + // gap // .......... + // gap // .......... + // gap // .......... + str q6, [x2, #-48] // ........*. + add x2, x2, #64 // .........* + // gap // .......... + // gap // .......... + // gap // .......... + // gap // .......... + // gap // .......... + // gap // .......... // original source code - // add v18.4S, v22.4S, v3.4S // .*................ - // add v16.4S, v19.4S, v4.4S // ..........*....... - // sub v2.4S, v19.4S, v4.4S // *................. - // sub v12.4S, v22.4S, v3.4S // ..*............... - // str q7, [x1, #-48] // ....*............. - // str q16, [x1, #-32] // .............*.... - // mul v10.4S, v12.4S, v9.S[0] // ........*......... - // mul v3.4S, v2.4S, v9.S[0] // .....*............ - // sqrdmulh v2.4S, v2.4S, v9.S[1] // ......*........... - // str q18, [x1, #-16] // .......*.......... - // sqrdmulh v15.4S, v12.4S, v9.S[1] // .........*........ - // add x1, x1, #64 // ..............*... - // str q24, [x2, #-48] // ...*.............. - // mls v3.4S, v2.4S, v8.S[0] // ...........*...... - // mls v10.4S, v15.4S, v8.S[0] // ............*..... - // str q10, [x2, #-16] // ................*. - // str q3, [x2, #-32] // ...............*.. - // add x2, x2, #64 // .................* + // mls v11.4S, v10.4S, v8.S[0] // *......... + // str q26, [x1, #-32] // ...*...... + // str q24, [x1, #-48] // ....*..... 
+ // add x1, x1, #64 // .....*.... + // mls v12.4S, v14.4S, v8.S[0] // .*........ + // mls v6.4S, v23.4S, v8.S[0] // ..*....... + // str q11, [x2, #-32] // ......*... + // str q12, [x2, #-16] // .......*.. + // str q6, [x2, #-48] // ........*. + // add x2, x2, #64 // .........* // ----------------------------------------------------------------------------- ninv .req v25 ninv_tw .req v26 + modulus_half .req v30 + neg_modulus_half .req v31 ASM_LOAD(xtmp, ninv_addr) ld1r {ninv.4s}, [xtmp] ASM_LOAD(xtmp, ninv_tw_addr) ld1r {ninv_tw.4s}, [xtmp] + ushr modulus_half.4S, modulus.4S, #1 + neg neg_modulus_half.4S, modulus_half.4S + mov count, #8 ASM_LOAD(r_ptr0, roots_l012) load_roots_123 .p2align 2 - ldr q19, [x0, #512] // .......*.................................................. - ldr q18, [x0, #384] // ..*....................................................... - // gap // .......................................................... - // gap // .......................................................... - // gap // .......................................................... - // gap // .......................................................... - // gap // .......................................................... - ldr q9, [x0, #256] // *......................................................... - ldr q13, [x0, #0] // ......*................................................... - ldr q27, [x0, #128] // ....*..................................................... - ldr q15, [x0, #640] // ...*...................................................... - // gap // .......................................................... - // gap // .......................................................... - // gap // .......................................................... - // gap // .......................................................... - // gap // .......................................................... - ldr q14, [x0, #768] // .....*.................................................... 
- ldr q12, [x0, #896] // .*........................................................ - // gap // .......................................................... - // gap // .......................................................... - // gap // .......................................................... - // gap // .......................................................... - // gap // .......................................................... - // gap // .......................................................... - // gap // .......................................................... - // gap // .......................................................... - // gap // .......................................................... - // gap // .......................................................... - // gap // .......................................................... - // gap // .......................................................... - // gap // .......................................................... - // gap // .......................................................... - sub v6.4S, v9.4S, v18.4S // .........*................................................ - add v9.4S, v9.4S, v18.4S // ........*................................................. - // gap // .......................................................... - // gap // .......................................................... - // gap // .......................................................... - // gap // .......................................................... - // gap // .......................................................... - // gap // .......................................................... - add v24.4S, v13.4S, v27.4S // .................*........................................ - sub v29.4S, v19.4S, v15.4S // ..............*........................................... - add v10.4S, v19.4S, v15.4S // ................*......................................... 
- sub v31.4S, v13.4S, v27.4S // ...........*.............................................. - // gap // .......................................................... - // gap // .......................................................... - // gap // .......................................................... - // gap // .......................................................... - mul v19.4S, v6.4S, v2.S[0] // ...............*.......................................... - sqrdmulh v18.4S, v6.4S, v2.S[1] // ............*............................................. - add v13.4S, v14.4S, v12.4S // ..........*............................................... - sub v5.4S, v14.4S, v12.4S // .............*............................................ - // gap // .......................................................... - // gap // .......................................................... - // gap // .......................................................... - // gap // .......................................................... - add v4.4S, v24.4S, v9.4S // ........................*................................. - sub v15.4S, v24.4S, v9.4S // ...........................*.............................. - sqrdmulh v12.4S, v29.4S, v2.S[3] // .....................*.................................... - mul v7.4S, v29.4S, v2.S[2] // .........................*................................ - // gap // .......................................................... - // gap // .......................................................... - // gap // .......................................................... - // gap // .......................................................... - add v22.4S, v10.4S, v13.4S // .......................*.................................. - sub v13.4S, v10.4S, v13.4S // ..........................*............................... - sqrdmulh v10.4S, v31.4S, v1.S[3] // ...................*...................................... 
- mul v23.4S, v31.4S, v1.S[2] // ....................*..................................... - // gap // .......................................................... - // gap // .......................................................... - // gap // .......................................................... - // gap // .......................................................... - sqrdmulh v30.4S, v5.4S, v3.S[1] // ..................*....................................... - mul v9.4S, v5.4S, v3.S[0] // ......................*................................... - sqrdmulh v29.4S, v15.4S, v0.S[3] // ......................................*................... - mul v15.4S, v15.4S, v0.S[2] // .....................................*.................... - // gap // .......................................................... - // gap // .......................................................... - // gap // .......................................................... - // gap // .......................................................... - mls v7.4S, v12.4S, v8.S[0] // ..................................*....................... - sub v20.4S, v4.4S, v22.4S // .............................*............................ - add v17.4S, v4.4S, v22.4S // ....................................................*..... - sqrdmulh v27.4S, v13.4S, v1.S[1] // ...............................*.......................... - // gap // .......................................................... - // gap // .......................................................... - // gap // .......................................................... - // gap // .......................................................... - mul v6.4S, v13.4S, v1.S[0] // ................................*......................... - mls v19.4S, v18.4S, v8.S[0] // ..............................*........................... - mls v23.4S, v10.4S, v8.S[0] // ............................*............................. 
- // gap // .......................................................... - // gap // .......................................................... - // gap // .......................................................... - // gap // .......................................................... - // gap // .......................................................... - mul v14.4S, v20.4S, v0.S[0] // ....................................*..................... - sqrdmulh v24.4S, v20.4S, v0.S[1] // ...................................*...................... - mls v9.4S, v30.4S, v8.S[0] // .................................*........................ - // gap // .......................................................... - // gap // .......................................................... - // gap // .......................................................... - // gap // .......................................................... - // gap // .......................................................... - // gap // .......................................................... - // gap // .......................................................... - // gap // .......................................................... - // gap // .......................................................... - // gap // .......................................................... - // gap // .......................................................... - // gap // .......................................................... - // gap // .......................................................... - mls v6.4S, v27.4S, v8.S[0] // .......................................*.................. - sub v16.4S, v23.4S, v19.4S // .........................................*................ - add v28.4S, v23.4S, v19.4S // ........................................*................. - // gap // .......................................................... - // gap // .......................................................... 
- // gap // .......................................................... - // gap // .......................................................... - // gap // .......................................................... - add v11.4S, v7.4S, v9.4S // ...........................................*.............. - sub v21.4S, v7.4S, v9.4S // ..........................................*............... - // gap // .......................................................... - // gap // .......................................................... - // gap // .......................................................... - // gap // .......................................................... - // gap // .......................................................... - mls v14.4S, v24.4S, v8.S[0] // ............................................*............. - mul v24.4S, v16.4S, v0.S[2] // ..............................................*........... - sqrdmulh v9.4S, v16.4S, v0.S[3] // ...............................................*.......... - // gap // .......................................................... - // gap // .......................................................... - // gap // .......................................................... - // gap // .......................................................... - // gap // .......................................................... - mls v15.4S, v29.4S, v8.S[0] // .............................................*............ - sub v12.4S, v28.4S, v11.4S // ..................................................*....... - add v30.4S, v28.4S, v11.4S // ...................................................*...... - mul v13.4S, v21.4S, v1.S[0] // ................................................*......... - sqrdmulh v22.4S, v21.4S, v1.S[1] // .................................................*........ - // gap // .......................................................... - // gap // .......................................................... 
- // gap // .......................................................... - // gap // .......................................................... - // gap // .......................................................... - // gap // .......................................................... - // gap // .......................................................... - // gap // .......................................................... - // gap // .......................................................... - // gap // .......................................................... - // gap // .......................................................... - srshr v4.4S, v14.4S, #23 // .....................................................*.... - mls v24.4S, v9.4S, v8.S[0] // ......................................................*... - mul v10.4S, v12.4S, v0.S[0] // ........................................................*. - sqrdmulh v21.4S, v12.4S, v0.S[1] // .........................................................* - // gap // .......................................................... - // gap // .......................................................... - // gap // .......................................................... - // gap // .......................................................... - sub v28.4S, v15.4S, v6.4S // .......................................................*.. + ldr q12, [x0, #256] // ..*............................................... + ldr q19, [x0, #128] // *................................................. + ldr q5, [x0, #0] // .*................................................ + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + ldr q15, [x0, #512] // ............*..................................... 
+ ldr q9, [x0, #384] // ...*.............................................. + ldr q13, [x0, #896] // ..............*................................... + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + ldr q18, [x0, #640] // .............*.................................... + ldr q27, [x0, #768] // ...............*.................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + sub v17.4S, v5.4S, v19.4S // .....*............................................ + add v19.4S, v5.4S, v19.4S // ....*............................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. 
+ // gap // .................................................. + sub v5.4S, v12.4S, v9.4S // .......*.......................................... + add v9.4S, v12.4S, v9.4S // ......*........................................... + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + sub v12.4S, v15.4S, v18.4S // .................*................................ + add v15.4S, v15.4S, v18.4S // ..................*............................... + add v18.4S, v27.4S, v13.4S // ........................*......................... + sqrdmulh v28.4S, v17.4S, v1.S[3] // ......................*........................... + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + sub v13.4S, v27.4S, v13.4S // .........................*........................ + mul v27.4S, v17.4S, v1.S[2] // ..........................*....................... + sqrdmulh v17.4S, v5.4S, v2.S[1] // ...................*.............................. + mul v5.4S, v5.4S, v2.S[0] // ....................*............................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + sqrdmulh v22.4S, v12.4S, v2.S[3] // .....................*............................ + mul v16.4S, v12.4S, v2.S[2] // .......................*.......................... 
+ sub v12.4S, v15.4S, v18.4S // ............................*..................... + add v20.4S, v19.4S, v9.4S // ........*......................................... + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + sub v19.4S, v19.4S, v9.4S // .........*........................................ + add v9.4S, v15.4S, v18.4S // .............................*.................... + mul v15.4S, v13.4S, v3.S[0] // ..............................*................... + sqrdmulh v13.4S, v13.4S, v3.S[1] // ...............................*.................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + mls v27.4S, v28.4S, v8.S[0] // ................................*................. + mls v5.4S, v17.4S, v8.S[0] // ...........................*...................... + mul v4.4S, v12.4S, v1.S[0] // .................................*................ + sqrdmulh v12.4S, v12.4S, v1.S[1] // ...................................*.............. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + mls v16.4S, v22.4S, v8.S[0] // ....................................*............. + mul v17.4S, v19.4S, v0.S[2] // ..........*....................................... + sqrdmulh v19.4S, v19.4S, v0.S[3] // ...........*...................................... + sub v18.4S, v20.4S, v9.4S // ..................................*............... + // gap // .................................................. 
+ // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + mls v15.4S, v13.4S, v8.S[0] // ......................................*........... + add v9.4S, v20.4S, v9.4S // .....................................*............ + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + sub v13.4S, v27.4S, v5.4S // .......................................*.......... + mls v4.4S, v12.4S, v8.S[0] // .........................................*........ + sqrdmulh v7.4S, v18.4S, v0.S[1] // ........................................*......... + mul v14.4S, v18.4S, v0.S[0] // ..........................................*....... + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + add v5.4S, v27.4S, v5.4S // .............................................*.... + mls v17.4S, v19.4S, v8.S[0] // ................*................................. + sqrdmulh v23.4S, v9.4S, v26.4S // ...........................................*...... + mul v12.4S, v9.4S, v25.4S // ............................................*..... + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. 
+ add v22.4S, v16.4S, v15.4S // .................................................* + sub v15.4S, v16.4S, v15.4S // ................................................*. + mul v28.4S, v13.4S, v0.S[2] // ..............................................*... + sqrdmulh v24.4S, v13.4S, v0.S[3] // ...............................................*.. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. // original source code - // ldr q11, [x0, #256] // ..*....................................................... - // ldr q19, [x0, #896] // .......*.................................................. - // ldr q12, [x0, #384] // .*........................................................ - // ldr q16, [x0, #640] // .....*.................................................... - // ldr q7, [x0, #128] // ....*..................................................... - // ldr q9, [x0, #768] // ......*................................................... - // ldr q6, [x0, #0] // ...*...................................................... - // ldr q15, [x0, #512] // *......................................................... - // add v21.4S, v11.4S, v12.4S // .........*................................................ - // sub v12.4S, v11.4S, v12.4S // ........*................................................. - // add v24.4S, v9.4S, v19.4S // ................*......................................... - // sub v13.4S, v6.4S, v7.4S // .............*............................................ - // sqrdmulh v14.4S, v12.4S, v2.S[1] // ...............*.......................................... - // sub v9.4S, v9.4S, v19.4S // .................*........................................ - // sub v27.4S, v15.4S, v16.4S // ...........*.............................................. 
- // mul v30.4S, v12.4S, v2.S[0] // ..............*........................................... - // add v16.4S, v15.4S, v16.4S // ............*............................................. - // add v6.4S, v6.4S, v7.4S // ..........*............................................... - // sqrdmulh v12.4S, v9.4S, v3.S[1] // ..........................*............................... - // sqrdmulh v28.4S, v13.4S, v1.S[3] // ........................*................................. - // mul v7.4S, v13.4S, v1.S[2] // .........................*................................ - // sqrdmulh v15.4S, v27.4S, v2.S[3] // ....................*..................................... - // mul v13.4S, v9.4S, v3.S[0] // ...........................*.............................. - // add v19.4S, v16.4S, v24.4S // ......................*................................... - // add v9.4S, v6.4S, v21.4S // ..................*....................................... - // mul v27.4S, v27.4S, v2.S[2] // .....................*.................................... - // sub v16.4S, v16.4S, v24.4S // .......................*.................................. - // sub v17.4S, v6.4S, v21.4S // ...................*...................................... - // mls v7.4S, v28.4S, v8.S[0] // ....................................*..................... - // sub v24.4S, v9.4S, v19.4S // ...............................*.......................... - // mls v30.4S, v14.4S, v8.S[0] // ...................................*...................... - // sqrdmulh v21.4S, v16.4S, v1.S[1] // .................................*........................ - // mul v6.4S, v16.4S, v1.S[0] // ..................................*....................... - // mls v13.4S, v12.4S, v8.S[0] // .......................................*.................. - // mls v27.4S, v15.4S, v8.S[0] // ..............................*........................... 
- // sqrdmulh v29.4S, v24.4S, v0.S[1] // ......................................*................... - // mul v14.4S, v24.4S, v0.S[0] // .....................................*.................... - // mul v15.4S, v17.4S, v0.S[2] // .............................*............................ - // sqrdmulh v28.4S, v17.4S, v0.S[3] // ............................*............................. - // mls v6.4S, v21.4S, v8.S[0] // ........................................*................. - // add v5.4S, v7.4S, v30.4S // ..........................................*............... - // sub v20.4S, v7.4S, v30.4S // .........................................*................ - // sub v10.4S, v27.4S, v13.4S // ............................................*............. - // add v16.4S, v27.4S, v13.4S // ...........................................*.............. - // mls v14.4S, v29.4S, v8.S[0] // .............................................*............ - // mls v15.4S, v28.4S, v8.S[0] // ................................................*......... - // mul v24.4S, v20.4S, v0.S[2] // ..............................................*........... - // sqrdmulh v12.4S, v20.4S, v0.S[3] // ...............................................*.......... - // mul v13.4S, v10.4S, v1.S[0] // ...................................................*...... - // sqrdmulh v22.4S, v10.4S, v1.S[1] // ....................................................*..... - // sub v21.4S, v5.4S, v16.4S // .................................................*........ - // add v30.4S, v5.4S, v16.4S // ..................................................*....... - // add v17.4S, v9.4S, v19.4S // ................................*......................... - // srshr v4.4S, v14.4S, #23 // .....................................................*.... - // mls v24.4S, v12.4S, v8.S[0] // ......................................................*... 
- // sub v28.4S, v15.4S, v6.4S // .........................................................* - // mul v10.4S, v21.4S, v0.S[0] // .......................................................*.. - // sqrdmulh v21.4S, v21.4S, v0.S[1] // ........................................................*. + // ldr q16, [x0, #128] // .*................................................ + // ldr q20, [x0, #0] // ..*............................................... + // ldr q6, [x0, #256] // *................................................. + // ldr q23, [x0, #384] // ....*............................................. + // add v27.4S, v20.4S, v16.4S // .........*........................................ + // sub v24.4S, v20.4S, v16.4S // ........*......................................... + // add v22.4S, v6.4S, v23.4S // ...........*...................................... + // sub v16.4S, v6.4S, v23.4S // ..........*....................................... + // add v7.4S, v27.4S, v22.4S // .......................*.......................... + // sub v23.4S, v27.4S, v22.4S // ........................*......................... + // mul v17.4S, v23.4S, v0.S[2] // .................................*................ + // sqrdmulh v23.4S, v23.4S, v0.S[3] // ..................................*............... + // ldr q20, [x0, #512] // ...*.............................................. + // ldr q28, [x0, #640] // ......*........................................... + // ldr q22, [x0, #896] // .....*............................................ + // ldr q27, [x0, #768] // .......*.......................................... + // mls v17.4S, v23.4S, v8.S[0] // ...........................................*...... + // sub v14.4S, v20.4S, v28.4S // ............*..................................... + // add v28.4S, v20.4S, v28.4S // .............*.................................... + // sqrdmulh v11.4S, v16.4S, v2.S[1] // ..................*............................... 
+ // mul v13.4S, v16.4S, v2.S[0] // ...................*.............................. + // sqrdmulh v16.4S, v14.4S, v2.S[3] // ....................*............................. + // sqrdmulh v21.4S, v24.4S, v1.S[3] // ...............*.................................. + // mul v19.4S, v14.4S, v2.S[2] // .....................*............................ + // add v20.4S, v27.4S, v22.4S // ..............*................................... + // sub v10.4S, v27.4S, v22.4S // ................*................................. + // mul v27.4S, v24.4S, v1.S[2] // .................*................................ + // mls v13.4S, v11.4S, v8.S[0] // .............................*.................... + // sub v23.4S, v28.4S, v20.4S // ......................*........................... + // add v20.4S, v28.4S, v20.4S // .........................*........................ + // mul v22.4S, v10.4S, v3.S[0] // ..........................*....................... + // sqrdmulh v10.4S, v10.4S, v3.S[1] // ...........................*...................... + // mls v27.4S, v21.4S, v8.S[0] // ............................*..................... + // mul v4.4S, v23.4S, v1.S[0] // ..............................*................... + // sub v11.4S, v7.4S, v20.4S // ...................................*.............. + // sqrdmulh v14.4S, v23.4S, v1.S[1] // ...............................*.................. + // mls v19.4S, v16.4S, v8.S[0] // ................................*................. + // add v24.4S, v7.4S, v20.4S // .....................................*............ + // mls v22.4S, v10.4S, v8.S[0] // ....................................*............. + // sub v10.4S, v27.4S, v13.4S // ......................................*........... + // sqrdmulh v7.4S, v11.4S, v0.S[1] // ........................................*......... + // mls v4.4S, v14.4S, v8.S[0] // .......................................*.......... 
+ // mul v14.4S, v11.4S, v0.S[0] // .........................................*........ + // sqrdmulh v23.4S, v24.4S, v26.4S // ............................................*..... + // mul v12.4S, v24.4S, v25.4S // .............................................*.... + // add v5.4S, v27.4S, v13.4S // ..........................................*....... + // mul v28.4S, v10.4S, v0.S[2] // ................................................*. + // sqrdmulh v24.4S, v10.4S, v0.S[3] // .................................................* + // sub v15.4S, v19.4S, v22.4S // ...............................................*.. + // add v22.4S, v19.4S, v22.4S // ..............................................*... sub count, count, #1 layer123_start: - ldr q11, [x0, #272] // ..e............................................................................................. - ldr q19, [x0, #912] // .......e........................................................................................ - ldr q12, [x0, #400] // ...e............................................................................................ - mls v13.4S, v22.4S, v8.S[0] // ...............................................*................................................ - sqrdmulh v29.4S, v17.4S, v26.4S // .................................................................................*.............. - mul v23.4S, v17.4S, v25.4S // ................................................................................*............... - // gap // ................................................................................................ - sqrdmulh v22.4S, v30.4S, v26.4S // ....................................................................................*........... - ldr q16, [x0, #656] // .....e.......................................................................................... - ldr q7, [x0, #144] // .e.............................................................................................. 
- mul v18.4S, v28.4S, v0.S[0] // ............................................................*................................... - // gap // ................................................................................................ - // gap // ................................................................................................ - mls v14.4S, v4.4S, v8.4S // .....................................................................*.......................... - add v20.4S, v15.4S, v6.4S // ...........................................................*.................................... - mul v27.4S, v30.4S, v25.4S // ...................................................................................*............ - ldr q9, [x0, #784] // ......e......................................................................................... - ldr q6, [x0, #16] // e............................................................................................... - sqrdmulh v17.4S, v28.4S, v0.S[1] // .............................................................*.................................. - mls v10.4S, v21.4S, v8.S[0] // .........................................................*...................................... - // gap // ................................................................................................ - // gap // ................................................................................................ - // gap // ................................................................................................ - // gap // ................................................................................................ - ldr q15, [x0, #528] // ....e........................................................................................... - mul v31.4S, v20.4S, v25.4S // ......................................................................................*......... 
- add v5.4S, v24.4S, v13.4S // ................................................................*............................... - sqrdmulh v20.4S, v20.4S, v26.4S // .......................................................................................*........ - // gap // ................................................................................................ - // gap // ................................................................................................ - // gap // ................................................................................................ - sub v28.4S, v24.4S, v13.4S // ...............................................................*................................ - str q14, [x0, #512] // ............................................................................*................... - add v21.4S, v11.4S, v12.4S // ..............e................................................................................. - sub v12.4S, v11.4S, v12.4S // .............e.................................................................................. - mls v27.4S, v22.4S, v8.S[0] // .....................................................................................*.......... - mls v23.4S, v29.4S, v8.S[0] // ..................................................................................*............. - // gap // ................................................................................................ - // gap // ................................................................................................ - // gap // ................................................................................................ - mul v22.4S, v28.4S, v0.S[0] // .................................................................*.............................. - // gap // ................................................................................................ 
- sqrdmulh v4.4S, v5.4S, v26.4S // ..........................................................................................*..... - srshr v29.4S, v10.4S, #23 // ......................................................................*......................... - // gap // ................................................................................................ - // gap // ................................................................................................ - sqrdmulh v11.4S, v28.4S, v0.S[1] // ..................................................................*............................. - // gap // ................................................................................................ - add v24.4S, v9.4S, v19.4S // ........................e....................................................................... - // gap // ................................................................................................ - sub v13.4S, v6.4S, v7.4S // ........e....................................................................................... - // gap // ................................................................................................ - // gap // ................................................................................................ - sqrdmulh v14.4S, v12.4S, v2.S[1] // ................e............................................................................... - // gap // ................................................................................................ - sub v9.4S, v9.4S, v19.4S // .......................e........................................................................ - str q27, [x0, #128] // .............................................................................................*.. - sub v27.4S, v15.4S, v16.4S // ..................e............................................................................. 
- mul v30.4S, v12.4S, v2.S[0] // ...............e................................................................................ - // gap // ................................................................................................ - // gap // ................................................................................................ - // gap // ................................................................................................ - add v16.4S, v15.4S, v16.4S // ...................e............................................................................ - add v6.4S, v6.4S, v7.4S // .........e...................................................................................... - sqrdmulh v12.4S, v9.4S, v3.S[1] // ..........................e..................................................................... - mls v22.4S, v11.4S, v8.S[0] // ...................................................................*............................ - sqrdmulh v28.4S, v13.4S, v1.S[3] // ...........e.................................................................................... - mul v7.4S, v13.4S, v1.S[2] // ..........e..................................................................................... - // gap // ................................................................................................ - // gap // ................................................................................................ - // gap // ................................................................................................ - // gap // ................................................................................................ - sqrdmulh v15.4S, v27.4S, v2.S[3] // .....................e.......................................................................... - mul v13.4S, v9.4S, v3.S[0] // .........................e...................................................................... 
- add v19.4S, v16.4S, v24.4S // .......................................e........................................................ - add v9.4S, v6.4S, v21.4S // .............................e.................................................................. - // gap // ................................................................................................ - // gap // ................................................................................................ - // gap // ................................................................................................ - // gap // ................................................................................................ - mul v27.4S, v27.4S, v2.S[2] // ....................e........................................................................... - // gap // ................................................................................................ - // gap // ................................................................................................ - // gap // ................................................................................................ - // gap // ................................................................................................ - mls v18.4S, v17.4S, v8.S[0] // ..............................................................*................................. - sub v16.4S, v16.4S, v24.4S // ......................................e......................................................... - sub v17.4S, v6.4S, v21.4S // ............................e................................................................... - mls v7.4S, v28.4S, v8.S[0] // ............e................................................................................... - sub v24.4S, v9.4S, v19.4S // ................................................e............................................... 
- srshr v11.4S, v22.4S, #23 // ..........................................................................*..................... - mls v30.4S, v14.4S, v8.S[0] // .................e.............................................................................. - // gap // ................................................................................................ - // gap // ................................................................................................ - // gap // ................................................................................................ - // gap // ................................................................................................ - sqrdmulh v21.4S, v16.4S, v1.S[1] // .........................................e...................................................... - mul v6.4S, v16.4S, v1.S[0] // ........................................e....................................................... - mls v13.4S, v12.4S, v8.S[0] // ...........................e.................................................................... - mls v10.4S, v29.4S, v8.4S // .......................................................................*........................ - // gap // ................................................................................................ - // gap // ................................................................................................ - // gap // ................................................................................................ - // gap // ................................................................................................ - mls v27.4S, v15.4S, v8.S[0] // ......................e......................................................................... - sqrdmulh v29.4S, v24.4S, v0.S[1] // ...................................................e............................................ 
- mul v14.4S, v24.4S, v0.S[0] // ..................................................e............................................. - str q23, [x0], #(16) // ............................................................................................*... - // gap // ................................................................................................ - // gap // ................................................................................................ - // gap // ................................................................................................ - srshr v12.4S, v18.4S, #23 // ........................................................................*....................... - mls v22.4S, v11.4S, v8.4S // ...........................................................................*.................... - mul v15.4S, v17.4S, v0.S[2] // ..............................e................................................................. - mul v11.4S, v5.4S, v25.4S // .........................................................................................*...... - // gap // ................................................................................................ - // gap // ................................................................................................ - // gap // ................................................................................................ - sqrdmulh v28.4S, v17.4S, v0.S[3] // ...............................e................................................................ - // gap // ................................................................................................ - str q10, [x0, #624] // .............................................................................*.................. - mls v6.4S, v21.4S, v8.S[0] // ..........................................e..................................................... 
- mls v31.4S, v20.4S, v8.S[0] // ........................................................................................*....... - // gap // ................................................................................................ - // gap // ................................................................................................ - // gap // ................................................................................................ - add v5.4S, v7.4S, v30.4S // ..................................e............................................................. - sub v20.4S, v7.4S, v30.4S // .................................e.............................................................. - sub v10.4S, v27.4S, v13.4S // ...........................................e.................................................... - add v16.4S, v27.4S, v13.4S // ............................................e................................................... - mls v14.4S, v29.4S, v8.S[0] // ....................................................e........................................... - mls v18.4S, v12.4S, v8.4S // .........................................................................*...................... - // gap // ................................................................................................ - // gap // ................................................................................................ - // gap // ................................................................................................ - // gap // ................................................................................................ - mls v15.4S, v28.4S, v8.S[0] // ................................e............................................................... - mul v24.4S, v20.4S, v0.S[2] // ...................................e............................................................ 
- sqrdmulh v12.4S, v20.4S, v0.S[3] // ....................................e........................................................... - str q22, [x0, #880] // ...............................................................................*................ - mls v11.4S, v4.4S, v8.S[0] // ...........................................................................................*.... - // gap // ................................................................................................ - // gap // ................................................................................................ - // gap // ................................................................................................ - mul v13.4S, v10.4S, v1.S[0] // .............................................e.................................................. - sqrdmulh v22.4S, v10.4S, v1.S[1] // ..............................................e................................................. - sub v21.4S, v5.4S, v16.4S // .....................................................e.......................................... - add v30.4S, v5.4S, v16.4S // ......................................................e......................................... - // gap // ................................................................................................ - str q31, [x0, #240] // ..............................................................................................*. - // gap // ................................................................................................ - // gap // ................................................................................................ - str q18, [x0, #752] // ..............................................................................*................. - // gap // ................................................................................................ 
- // gap // ................................................................................................ - // gap // ................................................................................................ - add v17.4S, v9.4S, v19.4S // .................................................e.............................................. - // gap // ................................................................................................ - // gap // ................................................................................................ - srshr v4.4S, v14.4S, #23 // ....................................................................e........................... - mls v24.4S, v12.4S, v8.S[0] // .....................................e.......................................................... - sub v28.4S, v15.4S, v6.4S // ..........................................................e..................................... - // gap // ................................................................................................ - mul v10.4S, v21.4S, v0.S[0] // .......................................................e........................................ - sqrdmulh v21.4S, v21.4S, v0.S[1] // ........................................................e....................................... - // gap // ................................................................................................ - // gap // ................................................................................................ - str q11, [x0, #368] // ...............................................................................................* + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ 
+ ldr q16, [x0, #144] // .e...................................................................................................................... + ldr q20, [x0, #16] // e....................................................................................................................... + ldr q6, [x0, #272] // ..e..................................................................................................................... + add v19.4S, v17.4S, v4.4S // ...........................................................*............................................................ + mls v14.4S, v7.4S, v8.S[0] // ....................................................*................................................................... + mls v12.4S, v23.4S, v8.S[0] // ..........................................................................................*............................. + ldr q23, [x0, #400] // ...e.................................................................................................................... + sub v27.4S, v17.4S, v4.4S // ..........................................................*............................................................. + mul v4.4S, v15.4S, v1.S[0] // .............................................*.......................................................................... + sqrdmulh v18.4S, v15.4S, v1.S[1] // ..............................................*......................................................................... + // gap // ........................................................................................................................ + sub v15.4S, v5.4S, v22.4S // .....................................................*.................................................................. + // gap // ........................................................................................................................ 
+ // gap // ........................................................................................................................ + mls v28.4S, v24.4S, v8.S[0] // .....................................*.................................................................................. + // gap // ........................................................................................................................ + add v11.4S, v5.4S, v22.4S // ......................................................*................................................................. + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + mul v13.4S, v19.4S, v25.4S // ..............................................................................................*......................... + sqrdmulh v10.4S, v19.4S, v26.4S // ...............................................................................................*........................ + mul v5.4S, v27.4S, v0.S[0] // ............................................................*........................................................... + // gap // ........................................................................................................................ + sqrdmulh v21.4S, v27.4S, v0.S[1] // .............................................................*.......................................................... + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ 
+ sqrdmulh v9.4S, v15.4S, v0.S[1] // ........................................................*............................................................... + // gap // ........................................................................................................................ + mul v19.4S, v15.4S, v0.S[0] // .......................................................*................................................................ + sqrdmulh v17.4S, v11.4S, v26.4S // ............................................................................................*........................... + mls v4.4S, v18.4S, v8.S[0] // ...............................................*........................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + add v27.4S, v20.4S, v16.4S // .........e.............................................................................................................. + // gap // ........................................................................................................................ + sub v24.4S, v20.4S, v16.4S // ........e............................................................................................................... + mul v15.4S, v11.4S, v25.4S // ...........................................................................................*............................ + // gap // ........................................................................................................................ 
+ // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + add v22.4S, v6.4S, v23.4S // ..............e......................................................................................................... + // gap // ........................................................................................................................ + sub v16.4S, v6.4S, v23.4S // .............e.......................................................................................................... + mls v13.4S, v10.4S, v8.S[0] // ................................................................................................*....................... + mls v5.4S, v21.4S, v8.S[0] // ..............................................................*......................................................... + cmge v21.4S, v31.4S, v14.4S // ....................................................................*................................................... + cmge v10.4S, v14.4S, v30.4S // .....................................................................*.................................................. + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ 
+ mls v19.4S, v9.4S, v8.S[0] // .........................................................*.............................................................. + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + add v7.4S, v27.4S, v22.4S // .............................e.......................................................................................... + sub v23.4S, v27.4S, v22.4S // ............................e........................................................................................... + add v27.4S, v28.4S, v4.4S // ................................................................*....................................................... + sub v9.4S, v28.4S, v4.4S // ...............................................................*........................................................ + // gap // ........................................................................................................................ + mls v15.4S, v17.4S, v8.S[0] // .............................................................................................*.......................... + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ 
+ cmge v22.4S, v31.4S, v13.4S // ............................................................................................................*........... + sub v10.4S, v21.4S, v10.4S // ......................................................................*................................................. + cmge v17.4S, v13.4S, v30.4S // .............................................................................................................*.......... + cmge v21.4S, v31.4S, v5.4S // ............................................................................*........................................... + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + cmge v4.4S, v5.4S, v30.4S // .............................................................................*.......................................... + // gap // ........................................................................................................................ + cmge v6.4S, v31.4S, v19.4S // ........................................................................*............................................... + // gap // ........................................................................................................................ + cmge v18.4S, v19.4S, v30.4S // .........................................................................*.............................................. + sub v22.4S, v22.4S, v17.4S // ..............................................................................................................*......... + mul v17.4S, v23.4S, v0.S[2] // ..............................e......................................................................................... 
+ sqrdmulh v23.4S, v23.4S, v0.S[3] // ...............................e........................................................................................ + ldr q20, [x0, #528] // ....e................................................................................................................... + ldr q28, [x0, #656] // .....e.................................................................................................................. + // gap // ........................................................................................................................ + sqrdmulh v11.4S, v9.4S, v0.S[1] // ..................................................................*..................................................... + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + mls v14.4S, v10.4S, v29.4S // .......................................................................*................................................ + // gap // ........................................................................................................................ + sub v10.4S, v6.4S, v18.4S // ..........................................................................*............................................. + // gap // ........................................................................................................................ + sub v18.4S, v21.4S, v4.4S // ..............................................................................*......................................... + sqrdmulh v4.4S, v27.4S, v26.4S // ..................................................................................................*..................... 
+ // gap // ........................................................................................................................ + cmge v6.4S, v31.4S, v15.4S // ........................................................................................................*............... + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + cmge v21.4S, v31.4S, v12.4S // ....................................................................................................*................... + // gap // ........................................................................................................................ + mul v9.4S, v9.4S, v0.S[0] // .................................................................*...................................................... + mls v13.4S, v22.4S, v29.4S // ...............................................................................................................*........ + ldr q22, [x0, #912] // .......e................................................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + mls v19.4S, v10.4S, v29.4S // ...........................................................................*............................................ + mls v5.4S, v18.4S, v29.4S // ...............................................................................*........................................ 
+ mul v18.4S, v27.4S, v25.4S // .................................................................................................*...................... + ldr q27, [x0, #784] // ......e................................................................................................................. + cmge v10.4S, v12.4S, v30.4S // .....................................................................................................*.................. + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + mls v17.4S, v23.4S, v8.S[0] // ................................e....................................................................................... + // gap // ........................................................................................................................ + cmge v23.4S, v15.4S, v30.4S // .........................................................................................................*.............. + str q14, [x0, #512] // ....................................................................................*................................... + sub v14.4S, v20.4S, v28.4S // ..................e..................................................................................................... + add v28.4S, v20.4S, v28.4S // ...................e.................................................................................................... + mls v9.4S, v11.4S, v8.S[0] // ...................................................................*.................................................... + sqrdmulh v11.4S, v16.4S, v2.S[1] // ................e....................................................................................................... 
+ // gap // ........................................................................................................................ + sub v10.4S, v21.4S, v10.4S // ......................................................................................................*................. + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + str q13, [x0, #256] // ......................................................................................................................*. + mul v13.4S, v16.4S, v2.S[0] // ...............e........................................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + str q19, [x0, #640] // .....................................................................................*.................................. + sqrdmulh v16.4S, v14.4S, v2.S[3] // .....................e.................................................................................................. + mls v18.4S, v4.4S, v8.S[0] // ...................................................................................................*.................... + sqrdmulh v21.4S, v24.4S, v1.S[3] // ...........e............................................................................................................ 
+ mul v19.4S, v14.4S, v2.S[2] // ....................e................................................................................................... + str q5, [x0, #768] // ......................................................................................*................................. + add v20.4S, v27.4S, v22.4S // ........................e............................................................................................... + // gap // ........................................................................................................................ + mls v12.4S, v10.4S, v29.4S // .......................................................................................................*................ + sub v10.4S, v27.4S, v22.4S // .......................e................................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + mul v27.4S, v24.4S, v1.S[2] // ..........e............................................................................................................. + sub v5.4S, v6.4S, v23.4S // ..........................................................................................................*............. + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ 
+ mls v13.4S, v11.4S, v8.S[0] // .................e...................................................................................................... + cmge v6.4S, v9.4S, v30.4S // .................................................................................*...................................... + cmge v22.4S, v31.4S, v9.4S // ................................................................................*....................................... + cmge v14.4S, v18.4S, v30.4S // .................................................................................................................*...... + cmge v4.4S, v31.4S, v18.4S // ................................................................................................................*....... + // gap // ........................................................................................................................ + sub v23.4S, v28.4S, v20.4S // ......................................e................................................................................. + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + add v20.4S, v28.4S, v20.4S // .......................................e................................................................................ + sub v6.4S, v22.4S, v6.4S // ..................................................................................*..................................... + // gap // ........................................................................................................................ 
+ str q12, [x0], #(16) // ....................................................................................................................*... + mul v22.4S, v10.4S, v3.S[0] // .........................e.............................................................................................. + sqrdmulh v10.4S, v10.4S, v3.S[1] // ..........................e............................................................................................. + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + mls v27.4S, v21.4S, v8.S[0] // ............e........................................................................................................... + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + sub v12.4S, v4.4S, v14.4S // ..................................................................................................................*..... + mul v4.4S, v23.4S, v1.S[0] // ........................................e............................................................................... + sub v11.4S, v7.4S, v20.4S // ................................................e....................................................................... 
+ sqrdmulh v14.4S, v23.4S, v1.S[1] // .........................................e.............................................................................. + // gap // ........................................................................................................................ + mls v15.4S, v5.4S, v29.4S // ...........................................................................................................*............ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + mls v9.4S, v6.4S, v29.4S // ...................................................................................*.................................... + mls v19.4S, v16.4S, v8.S[0] // ......................e................................................................................................. + add v24.4S, v7.4S, v20.4S // .................................................e...................................................................... + // gap // ........................................................................................................................ + mls v18.4S, v12.4S, v29.4S // ...................................................................................................................*.... + mls v22.4S, v10.4S, v8.S[0] // ...........................e............................................................................................ + // gap // ........................................................................................................................ 
+ // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + sub v10.4S, v27.4S, v13.4S // .................................e...................................................................................... + sqrdmulh v7.4S, v11.4S, v0.S[1] // ...................................................e.................................................................... + // gap // ........................................................................................................................ + mls v4.4S, v14.4S, v8.S[0] // ..........................................e............................................................................. + mul v14.4S, v11.4S, v0.S[0] // ..................................................e..................................................................... + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + sqrdmulh v23.4S, v24.4S, v26.4S // .........................................................................................e.............................. + mul v12.4S, v24.4S, v25.4S // ........................................................................................e............................... + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ 
+ add v5.4S, v27.4S, v13.4S // ..................................e..................................................................................... + str q15, [x0, #112] // .....................................................................................................................*.. + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + mul v28.4S, v10.4S, v0.S[2] // ...................................e.................................................................................... + sqrdmulh v24.4S, v10.4S, v0.S[3] // ....................................e................................................................................... + str q18, [x0, #368] // .......................................................................................................................* + // gap // ........................................................................................................................ + str q9, [x0, #880] // .......................................................................................*................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + sub v15.4S, v19.4S, v22.4S // ...........................................e............................................................................ 
+ add v22.4S, v19.4S, v22.4S // ............................................e........................................................................... // original source code - // ldr q9, [x0, #0] // ..............e.................................................................................|.............e................................................................................. - // ldr q10, [x0, #(1*(1024/8))] // ........e.......................................................................................|.......e....................................................................................... - // ldr q11, [x0, #(2*(1024/8))] // e...............................................................................................e............................................................................................... - // ldr q12, [x0, #(3*(1024/8))] // ..e.............................................................................................|.e............................................................................................. - // ldr q13, [x0, #(4*(1024/8))] // .................e..............................................................................|................e.............................................................................. - // ldr q14, [x0, #(5*(1024/8))] // .......e........................................................................................|......e........................................................................................ - // ldr q15, [x0, #(6*(1024/8))] // .............e..................................................................................|............e.................................................................................. 
- // ldr q16, [x0, #(7*(1024/8))] // .e..............................................................................................|e.............................................................................................. - // sub v24.4s, v9.4s, v10.4s // ................................e...............................................................|...............................e............................................................... - // add v9.4s, v9.4s, v10.4s // .......................................e........................................................|......................................e........................................................ - // mul v10.4s, v24.4s, v1.s[2] // ...........................................e....................................................|..........................................e.................................................... - // sqrdmulh v24.4s, v24.4s, v1.s[3] // ..........................................e.....................................................|.........................................e..................................................... - // mls v10.4s, v24.4s, v8.s[0] // ....................................................e...........................................|...................................................e........................................... - // sub v24.4s, v11.4s, v12.4s // ........................e.......................................................................|.......................e....................................................................... - // add v11.4s, v11.4s, v12.4s // .......................e........................................................................|......................e........................................................................ 
- // mul v12.4s, v24.4s, v2.s[0] // .....................................e..........................................................|....................................e.......................................................... - // sqrdmulh v24.4s, v24.4s, v2.s[1] // .................................e..............................................................|................................e.............................................................. - // mls v12.4s, v24.4s, v8.s[0] // .......................................................e........................................|......................................................e........................................ - // sub v24.4s, v13.4s, v14.4s // ....................................e...........................................................|...................................e........................................................... - // add v13.4s, v13.4s, v14.4s // ......................................e.........................................................|.....................................e......................................................... - // mul v14.4s, v24.4s, v2.s[2] // ................................................e...............................................|...............................................e............................................... - // sqrdmulh v24.4s, v24.4s, v2.s[3] // ............................................e...................................................|...........................................e................................................... - // mls v14.4s, v24.4s, v8.s[0] // ............................................................e...................................|...........................................................e................................... 
- // sub v24.4s, v15.4s, v16.4s // ..................................e.............................................................|.................................e............................................................. - // add v15.4s, v15.4s, v16.4s // ...............................e................................................................|..............................e................................................................ - // mul v16.4s, v24.4s, v3.s[0] // .............................................e..................................................|............................................e.................................................. - // sqrdmulh v24.4s, v24.4s, v3.s[1] // ........................................e.......................................................|.......................................e....................................................... - // mls v16.4s, v24.4s, v8.s[0] // ..........................................................e.....................................|.........................................................e..................................... - // sub v24.4s, v9.4s, v11.4s // ...................................................e............................................|..................................................e............................................ - // add v9.4s, v9.4s, v11.4s // ...............................................e................................................|..............................................e................................................ - // mul v11.4s, v24.4s, v0.s[2] // ..................................................................e.............................|.................................................................e............................. 
- // sqrdmulh v24.4s, v24.4s, v0.s[3] // ....................................................................e...........................|...................................................................e........................... - // mls v11.4s, v24.4s, v8.s[0] // ..............................................................................e.................|.............................................................................e................. - // sub v24.4s, v10.4s, v12.4s // .........................................................................e......................|........................................................................e...................... - // add v10.4s, v10.4s, v12.4s // ........................................................................e.......................|.......................................................................e....................... - // mul v12.4s, v24.4s, v0.s[2] // ...............................................................................e................|..............................................................................e................ - // sqrdmulh v24.4s, v24.4s, v0.s[3] // ................................................................................e...............|...............................................................................e............... - // mls v12.4s, v24.4s, v8.s[0] // ...........................................................................................e....|..........................................................................................e.... - // sub v24.4s, v13.4s, v15.4s // ..................................................e.............................................|.................................................e............................................. 
- // add v13.4s, v13.4s, v15.4s // ..............................................e.................................................|.............................................e................................................. - // mul v15.4s, v24.4s, v1.s[0] // .........................................................e......................................|........................................................e...................................... - // sqrdmulh v24.4s, v24.4s, v1.s[1] // ........................................................e.......................................|.......................................................e....................................... - // mls v15.4s, v24.4s, v8.s[0] // ......................................................................e.........................|.....................................................................e......................... - // sub v24.4s, v14.4s, v16.4s // ..........................................................................e.....................|.........................................................................e..................... - // add v14.4s, v14.4s, v16.4s // ...........................................................................e....................|..........................................................................e.................... - // mul v16.4s, v24.4s, v1.s[0] // ...................................................................................e............|..................................................................................e............ - // sqrdmulh v24.4s, v24.4s, v1.s[1] // ....................................................................................e...........|...................................................................................e........... 
- // mls v16.4s, v24.4s, v8.s[0] // ...*............................................................................................|..*............................................................................................ - // sub v24.4s, v9.4s, v13.4s // .....................................................e..........................................|....................................................e.......................................... - // add v9.4s, v9.4s, v13.4s // .........................................................................................e......|........................................................................................e...... - // mul v13.4s, v24.4s, v0.s[0] // ..............................................................e.................................|.............................................................e................................. - // sqrdmulh v24.4s, v24.4s, v0.s[1] // .............................................................e..................................|............................................................e.................................. - // mls v13.4s, v24.4s, v8.s[0] // ............................................................................e...................|...........................................................................e................... - // sub v24.4s, v10.4s, v14.4s // .....................................................................................e..........|....................................................................................e.......... - // add v10.4s, v10.4s, v14.4s // ......................................................................................e.........|.....................................................................................e......... 
- // mul v14.4s, v24.4s, v0.s[0] // .............................................................................................e..|............................................................................................e.. - // sqrdmulh v24.4s, v24.4s, v0.s[1] // ..............................................................................................e.|.............................................................................................e. - // mls v14.4s, v24.4s, v8.s[0] // ................*...............................................................................|...............*............................................................................... - // sub v24.4s, v11.4s, v15.4s // ............................................................................................e...|...........................................................................................e... - // add v11.4s, v11.4s, v15.4s // ...........*....................................................................................|..........*.................................................................................... - // mul v15.4s, v24.4s, v0.s[0] // .........*......................................................................................|........*...................................................................................... - // sqrdmulh v24.4s, v24.4s, v0.s[1] // ...............*................................................................................|..............*................................................................................ - // mls v15.4s, v24.4s, v8.s[0] // .................................................*..............................................|................................................*.............................................. 
- // sub v24.4s, v12.4s, v16.4s // .....................*..........................................................................|....................*.......................................................................... - // add v12.4s, v12.4s, v16.4s // ...................*............................................................................|..................*............................................................................ - // mul v16.4s, v24.4s, v0.s[0] // ...........................*....................................................................|..........................*.................................................................... - // sqrdmulh v24.4s, v24.4s, v0.s[1] // ..............................*.................................................................|.............................*................................................................. - // mls v16.4s, v24.4s, v8.s[0] // .........................................*......................................................|........................................*...................................................... - // srshr v24.4S, v13.4S, #23 // ..........................................................................................e.....|.........................................................................................e..... - // mls v13.4s, v24.4s, v8.4s // ..........*.....................................................................................|.........*..................................................................................... - // srshr v24.4S, v14.4S, #23 // .............................*..................................................................|............................*.................................................................. 
- // mls v14.4s, v24.4s, v8.4s // ...........................................................*....................................|..........................................................*.................................... - // srshr v24.4S, v15.4S, #23 // ................................................................*...............................|...............................................................*............................... - // mls v15.4s, v24.4s, v8.4s // .............................................................................*..................|............................................................................*.................. - // srshr v24.4S, v16.4S, #23 // ......................................................*.........................................|.....................................................*......................................... - // mls v16.4s, v24.4s, v8.4s // .................................................................*..............................|................................................................*.............................. - // str q13, [x0, #(4*(1024/8))] // ......................*.........................................................................|.....................*......................................................................... - // str q14, [x0, #(5*(1024/8))] // .....................................................................*..........................|....................................................................*.......................... - // str q15, [x0, #(6*(1024/8))] // ........................................................................................*.......|.......................................................................................*....... 
- // str q16, [x0, #(7*(1024/8))] // .................................................................................*..............|................................................................................*.............. - // mul v13.4s, v9.4s, v25.4s // .....*..........................................................................................|....*.......................................................................................... - // sqrdmulh v9.4s, v9.4s, v26.4s // ....*...........................................................................................|...*........................................................................................... - // mls v13.4s, v9.4s, v8.s[0] // ..........................*.....................................................................|.........................*..................................................................... - // mul v14.4s, v10.4s, v25.4s // ............*...................................................................................|...........*................................................................................... - // sqrdmulh v10.4s, v10.4s, v26.4s // ......*.........................................................................................|.....*......................................................................................... - // mls v14.4s, v10.4s, v8.s[0] // .........................*......................................................................|........................*...................................................................... - // mul v15.4s, v11.4s, v25.4s // ..................*.............................................................................|.................*............................................................................. 
- // sqrdmulh v11.4s, v11.4s, v26.4s // ....................*...........................................................................|...................*........................................................................... - // mls v15.4s, v11.4s, v8.s[0] // .......................................................................*........................|......................................................................*........................ - // mul v16.4s, v12.4s, v25.4s // ...................................................................*............................|..................................................................*............................ - // sqrdmulh v12.4s, v12.4s, v26.4s // ............................*...................................................................|...........................*................................................................... - // mls v16.4s, v12.4s, v8.s[0] // ..................................................................................*.............|.................................................................................*............. - // str q13, [x0], #(16) // ...............................................................*................................|..............................................................*................................ - // str q14, [x0, #(-16 + 1*(1024/8))] // ...................................*............................................................|..................................*............................................................ - // str q15, [x0, #(-16 + 2*(1024/8))] // .......................................................................................*........|......................................................................................*........ 
- // str q16, [x0, #(-16 + 3*(1024/8))] // ...............................................................................................*|..............................................................................................* + // ldr q9, [x0, #0] // .e......................................................................................................................|e.................................................................................................................... + // ldr q10, [x0, #(1*(1024/8))] // e.......................................................................................................................e..................................................................................................................... + // ldr q11, [x0, #(2*(1024/8))] // ..e.....................................................................................................................|.e................................................................................................................... + // ldr q12, [x0, #(3*(1024/8))] // ......e.................................................................................................................|.....e............................................................................................................... + // ldr q13, [x0, #(4*(1024/8))] // ..............................................e.........................................................................|.............................................e....................................................................... + // ldr q14, [x0, #(5*(1024/8))] // ...............................................e........................................................................|..............................................e...................................................................... 
+ // ldr q15, [x0, #(6*(1024/8))] // .............................................................e..........................................................|............................................................e........................................................ + // ldr q16, [x0, #(7*(1024/8))] // .........................................................e..............................................................|........................................................e............................................................ + // sub v24.4s, v9.4s, v10.4s // ......................e.................................................................................................|.....................e............................................................................................... + // add v9.4s, v9.4s, v10.4s // .....................e..................................................................................................|....................e................................................................................................ + // mul v10.4s, v24.4s, v1.s[2] // ..................................................................................e.....................................|.................................................................................e................................... + // sqrdmulh v24.4s, v24.4s, v1.s[3] // ............................................................................e...........................................|...........................................................................e......................................... + // mls v10.4s, v24.4s, v8.s[0] // ...............................................................................................e........................|..............................................................................................e...................... 
+ // sub v24.4s, v11.4s, v12.4s // .........................e..............................................................................................|........................e............................................................................................ + // add v11.4s, v11.4s, v12.4s // ........................e...............................................................................................|.......................e............................................................................................. + // mul v12.4s, v24.4s, v2.s[0] // ........................................................................e...............................................|.......................................................................e............................................. + // sqrdmulh v24.4s, v24.4s, v2.s[1] // .....................................................................e..................................................|....................................................................e................................................ + // mls v12.4s, v24.4s, v8.s[0] // ....................................................................................e...................................|...................................................................................e................................. + // sub v24.4s, v13.4s, v14.4s // ..................................................................e.....................................................|.................................................................e................................................... + // add v13.4s, v13.4s, v14.4s // ...................................................................e....................................................|..................................................................e.................................................. 
+ // mul v14.4s, v24.4s, v2.s[2] // .............................................................................e..........................................|............................................................................e........................................ + // sqrdmulh v24.4s, v24.4s, v2.s[3] // ..........................................................................e.............................................|.........................................................................e........................................... + // mls v14.4s, v24.4s, v8.s[0] // ......................................................................................................e.................|.....................................................................................................e............... + // sub v24.4s, v15.4s, v16.4s // .................................................................................e......................................|................................................................................e.................................... + // add v15.4s, v15.4s, v16.4s // ...............................................................................e........................................|..............................................................................e...................................... + // mul v16.4s, v24.4s, v3.s[0] // .............................................................................................e..........................|............................................................................................e........................ + // sqrdmulh v24.4s, v24.4s, v3.s[1] // ..............................................................................................e.........................|.............................................................................................e....................... 
+ // mls v16.4s, v24.4s, v8.s[0] // .........................................................................................................e..............|........................................................................................................e............ + // sub v24.4s, v9.4s, v11.4s // ................................e.......................................................................................|...............................e..................................................................................... + // add v9.4s, v9.4s, v11.4s // ...............................e........................................................................................|..............................e...................................................................................... + // mul v11.4s, v24.4s, v0.s[2] // ............................................e...........................................................................|...........................................e......................................................................... + // sqrdmulh v24.4s, v24.4s, v0.s[3] // .............................................e..........................................................................|............................................e........................................................................ + // mls v11.4s, v24.4s, v8.s[0] // ...............................................................e........................................................|..............................................................e...................................................... + // sub v24.4s, v10.4s, v12.4s // ..........................................................................................................e.............|.........................................................................................................e........... 
+ // add v10.4s, v10.4s, v12.4s // ................................................................................................................e.......|...............................................................................................................e..... + // mul v12.4s, v24.4s, v0.s[2] // ..................................................................................................................e.....|.................................................................................................................e... + // sqrdmulh v24.4s, v24.4s, v0.s[3] // ...................................................................................................................e....|..................................................................................................................e.. + // mls v12.4s, v24.4s, v8.s[0] // ...........*............................................................................................................|..........*.......................................................................................................... + // sub v24.4s, v13.4s, v15.4s // .........................................................................................e..............................|........................................................................................e............................ + // add v13.4s, v13.4s, v15.4s // ..........................................................................................e.............................|.........................................................................................e........................... + // mul v15.4s, v24.4s, v1.s[0] // .................................................................................................e......................|................................................................................................e.................... 
+ // sqrdmulh v24.4s, v24.4s, v1.s[1] // ...................................................................................................e....................|..................................................................................................e.................. + // mls v15.4s, v24.4s, v8.s[0] // ............................................................................................................e...........|...........................................................................................................e......... + // sub v24.4s, v14.4s, v16.4s // ......................................................................................................................e.|..................................................................................................................... + // add v14.4s, v14.4s, v16.4s // .......................................................................................................................e|..................................................................................................................... + // mul v16.4s, v24.4s, v1.s[0] // ........*...............................................................................................................|.......*............................................................................................................. + // sqrdmulh v24.4s, v24.4s, v1.s[1] // .........*..............................................................................................................|........*............................................................................................................ + // mls v16.4s, v24.4s, v8.s[0] // ....................*...................................................................................................|...................*................................................................................................. 
+ // sub v24.4s, v9.4s, v13.4s // ..................................................................................................e.....................|.................................................................................................e................... + // add v9.4s, v9.4s, v13.4s // .......................................................................................................e................|......................................................................................................e.............. + // mul v13.4s, v24.4s, v0.s[0] // .............................................................................................................e..........|............................................................................................................e........ + // sqrdmulh v24.4s, v24.4s, v0.s[1] // ...........................................................................................................e............|..........................................................................................................e.......... + // mls v13.4s, v24.4s, v8.s[0] // ....*...................................................................................................................|...*................................................................................................................. + // sub v24.4s, v10.4s, v14.4s // ..........*.............................................................................................................|.........*........................................................................................................... + // add v10.4s, v10.4s, v14.4s // ............*...........................................................................................................|...........*......................................................................................................... 
+ // mul v14.4s, v24.4s, v0.s[0] // ..................*.....................................................................................................|.................*................................................................................................... + // sqrdmulh v24.4s, v24.4s, v0.s[1] // .................*......................................................................................................|................*.................................................................................................... + // mls v14.4s, v24.4s, v8.s[0] // ..............................*.........................................................................................|.............................*....................................................................................... + // sub v24.4s, v11.4s, v15.4s // .......*................................................................................................................|......*.............................................................................................................. + // add v11.4s, v11.4s, v15.4s // ...*....................................................................................................................|..*.................................................................................................................. + // mul v15.4s, v24.4s, v0.s[0] // ...............*........................................................................................................|..............*...................................................................................................... + // sqrdmulh v24.4s, v24.4s, v0.s[1] // ................*.......................................................................................................|...............*..................................................................................................... 
+ // mls v15.4s, v24.4s, v8.s[0] // ...........................*............................................................................................|..........................*.......................................................................................... + // sub v24.4s, v12.4s, v16.4s // ..................................*.....................................................................................|.................................*................................................................................... + // add v12.4s, v12.4s, v16.4s // .................................*......................................................................................|................................*.................................................................................... + // mul v16.4s, v24.4s, v0.s[0] // .......................................................*................................................................|......................................................*.............................................................. + // sqrdmulh v24.4s, v24.4s, v0.s[1] // ................................................*.......................................................................|...............................................*..................................................................... + // mls v16.4s, v24.4s, v8.s[0] // ....................................................................*...................................................|...................................................................*................................................. + // cmge v27.4s, v31.4s, v13.4s // ............................*...........................................................................................|...........................*......................................................................................... 
+ // cmge v28.4s, v13.4s, v30.4s // .............................*..........................................................................................|............................*........................................................................................ + // sub v28.4s, v27.4s, v28.4s // .....................................*..................................................................................|....................................*................................................................................ + // mls v13.4s, v28.4s, v29.4s // .................................................*......................................................................|................................................*.................................................................... + // cmge v27.4s, v31.4s, v14.4s // .........................................*..............................................................................|........................................*............................................................................ + // cmge v28.4s, v14.4s, v30.4s // ..........................................*.............................................................................|.........................................*........................................................................... + // sub v28.4s, v27.4s, v28.4s // ..................................................*.....................................................................|.................................................*................................................................... + // mls v14.4s, v28.4s, v29.4s // ..........................................................*.............................................................|.........................................................*........................................................... 
+ // cmge v27.4s, v31.4s, v15.4s // .......................................*................................................................................|......................................*.............................................................................. + // cmge v28.4s, v15.4s, v30.4s // ........................................*...............................................................................|.......................................*............................................................................. + // sub v28.4s, v27.4s, v28.4s // ...................................................*....................................................................|..................................................*.................................................................. + // mls v15.4s, v28.4s, v29.4s // ...........................................................*............................................................|..........................................................*.......................................................... + // cmge v27.4s, v31.4s, v16.4s // ......................................................................................*.................................|.....................................................................................*............................... + // cmge v28.4s, v16.4s, v30.4s // .....................................................................................*..................................|....................................................................................*................................ + // sub v28.4s, v27.4s, v28.4s // ...........................................................................................*............................|..........................................................................................*.......................... 
+ // mls v16.4s, v28.4s, v29.4s // .....................................................................................................*..................|....................................................................................................*................ + // str q13, [x0, #(4*(1024/8))] // .................................................................*......................................................|................................................................*.................................................... + // str q14, [x0, #(5*(1024/8))] // .........................................................................*..............................................|........................................................................*............................................ + // str q15, [x0, #(6*(1024/8))] // ..............................................................................*.........................................|.............................................................................*....................................... + // str q16, [x0, #(7*(1024/8))] // .....................................................................................................................*..|....................................................................................................................* + // mul v13.4s, v9.4s, v25.4s // ...............................................................................................................e........|..............................................................................................................e...... + // sqrdmulh v9.4s, v9.4s, v26.4s // ..............................................................................................................e.........|.............................................................................................................e....... 
+ // mls v13.4s, v9.4s, v8.s[0] // .....*..................................................................................................................|....*................................................................................................................ + // mul v14.4s, v10.4s, v25.4s // .......................*................................................................................................|......................*.............................................................................................. + // sqrdmulh v10.4s, v10.4s, v26.4s // ...................*....................................................................................................|..................*.................................................................................................. + // mls v14.4s, v10.4s, v8.s[0] // ...................................*....................................................................................|..................................*.................................................................................. + // mul v15.4s, v11.4s, v25.4s // .............*..........................................................................................................|............*........................................................................................................ + // sqrdmulh v11.4s, v11.4s, v26.4s // ..............*.........................................................................................................|.............*....................................................................................................... + // mls v15.4s, v11.4s, v8.s[0] // ..........................*.............................................................................................|.........................*........................................................................................... 
+ // mul v16.4s, v12.4s, v25.4s // ............................................................*...........................................................|...........................................................*......................................................... + // sqrdmulh v12.4s, v12.4s, v26.4s // ....................................................*...................................................................|...................................................*................................................................. + // mls v16.4s, v12.4s, v8.s[0] // ...........................................................................*............................................|..........................................................................*.......................................... + // cmge v27.4s, v31.4s, v13.4s // ......................................................*.................................................................|.....................................................*............................................................... + // cmge v28.4s, v13.4s, v30.4s // ..............................................................*.........................................................|.............................................................*....................................................... + // sub v28.4s, v27.4s, v28.4s // ......................................................................*.................................................|.....................................................................*............................................... + // mls v13.4s, v28.4s, v29.4s // ................................................................................*.......................................|...............................................................................*..................................... 
+ // cmge v27.4s, v31.4s, v14.4s // .....................................................*..................................................................|....................................................*................................................................ + // cmge v28.4s, v14.4s, v30.4s // ................................................................*.......................................................|...............................................................*..................................................... + // sub v28.4s, v27.4s, v28.4s // ...................................................................................*....................................|..................................................................................*.................................. + // mls v14.4s, v28.4s, v29.4s // ....................................................................................................*...................|...................................................................................................*................. + // cmge v27.4s, v31.4s, v15.4s // ....................................*...................................................................................|...................................*................................................................................. + // cmge v28.4s, v15.4s, v30.4s // ......................................*.................................................................................|.....................................*............................................................................... + // sub v28.4s, v27.4s, v28.4s // ...........................................*............................................................................|..........................................*.......................................................................... 
+ // mls v15.4s, v28.4s, v29.4s // ........................................................*...............................................................|.......................................................*............................................................. + // cmge v27.4s, v31.4s, v16.4s // ........................................................................................*...............................|.......................................................................................*............................. + // cmge v28.4s, v16.4s, v30.4s // .......................................................................................*................................|......................................................................................*.............................. + // sub v28.4s, v27.4s, v28.4s // ................................................................................................*.......................|...............................................................................................*..................... + // mls v16.4s, v28.4s, v29.4s // ........................................................................................................*...............|.......................................................................................................*............. + // str q13, [x0], #(16) // ............................................................................................*...........................|...........................................................................................*......................... + // str q14, [x0, #(-16 + 1*(1024/8))] // .................................................................................................................*......|................................................................................................................*.... 
+ // str q15, [x0, #(-16 + 2*(1024/8))] // .......................................................................*................................................|......................................................................*.............................................. + // str q16, [x0, #(-16 + 3*(1024/8))] // ....................................................................................................................*...|...................................................................................................................*. sub count, count, #1 cbnz count, layer123_start - mul v9.4S, v30.4S, v25.4S // .......*.............................. - sqrdmulh v31.4S, v17.4S, v26.4S // .*.................................... - sqrdmulh v7.4S, v30.4S, v26.4S // ...*.................................. - mls v13.4S, v22.4S, v8.S[0] // *..................................... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - mul v22.4S, v17.4S, v25.4S // ..*................................... - mul v5.4S, v28.4S, v0.S[0] // ....*................................. - sqrdmulh v27.4S, v28.4S, v0.S[1] // ........*............................. - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - mls v14.4S, v4.4S, v8.4S // .....*................................ - mls v10.4S, v21.4S, v8.S[0] // .........*............................ - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... 
- // gap // ...................................... - mls v9.4S, v7.4S, v8.S[0] // ...............*...................... - sub v19.4S, v24.4S, v13.4S // .............*........................ - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - add v17.4S, v24.4S, v13.4S // ...........*.......................... - // gap // ...................................... - mls v22.4S, v31.4S, v8.S[0] // ................*..................... - mls v5.4S, v27.4S, v8.S[0] // .......................*.............. - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - str q14, [x0, #512] // ..............*....................... - mul v31.4S, v19.4S, v0.S[0] // .................*.................... - sqrdmulh v4.4S, v19.4S, v0.S[1] // ....................*................. - sqrdmulh v27.4S, v17.4S, v26.4S // ..................*................... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - str q9, [x0, #128] // .....................*................ - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - add v7.4S, v15.4S, v6.4S // ......*............................... - // gap // ...................................... - // gap // ...................................... - srshr v15.4S, v10.4S, #23 // ...................*.................. 
- // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - mul v18.4S, v17.4S, v25.4S // .............................*........ - str q22, [x0], #(16) // ..........................*........... - mls v31.4S, v4.4S, v8.S[0] // ......................*............... - srshr v20.4S, v5.4S, #23 // ...........................*.......... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - mul v23.4S, v7.4S, v25.4S // ..........*........................... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - sqrdmulh v12.4S, v7.4S, v26.4S // ............*......................... - mls v10.4S, v15.4S, v8.4S // .........................*............ - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - mls v18.4S, v27.4S, v8.S[0] // ..................................*... - srshr v6.4S, v31.4S, #23 // ........................*............. - // gap // ...................................... - // gap // ...................................... 
- // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - mls v5.4S, v20.4S, v8.4S // ................................*..... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - mls v23.4S, v12.4S, v8.S[0] // ...............................*...... - str q10, [x0, #624] // ..............................*....... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - mls v31.4S, v6.4S, v8.4S // ............................*......... - str q18, [x0, #368] // .....................................* - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - str q5, [x0, #752] // ....................................*. - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - str q23, [x0, #240] // ...................................*.. 
- // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - str q31, [x0, #880] // .................................*.... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... + mls v28.4S, v24.4S, v8.S[0] // .......*.............................................................. + mul v13.4S, v15.4S, v1.S[0] // ....*................................................................. + sqrdmulh v21.4S, v15.4S, v1.S[1] // .....*................................................................ + sub v11.4S, v17.4S, v4.4S // ...*.................................................................. + // gap // ...................................................................... + // gap // ...................................................................... + // gap // ...................................................................... + // gap // ...................................................................... + add v20.4S, v17.4S, v4.4S // *..................................................................... + add v15.4S, v5.4S, v22.4S // ........*............................................................. + mls v14.4S, v7.4S, v8.S[0] // .*.................................................................... + mls v12.4S, v23.4S, v8.S[0] // ..*................................................................... 
+ // gap // ...................................................................... + // gap // ...................................................................... + // gap // ...................................................................... + // gap // ...................................................................... + sub v9.4S, v5.4S, v22.4S // ......*............................................................... + sqrdmulh v4.4S, v11.4S, v0.S[1] // ............*......................................................... + // gap // ...................................................................... + // gap // ...................................................................... + // gap // ...................................................................... + // gap // ...................................................................... + // gap // ...................................................................... + // gap // ...................................................................... + sqrdmulh v23.4S, v20.4S, v26.4S // ..........*........................................................... + mul v20.4S, v20.4S, v25.4S // .........*............................................................ + sqrdmulh v19.4S, v15.4S, v26.4S // ...............*...................................................... + // gap // ...................................................................... + // gap // ...................................................................... + // gap // ...................................................................... + // gap // ...................................................................... + mls v13.4S, v21.4S, v8.S[0] // ................*..................................................... + cmge v6.4S, v31.4S, v12.4S // ........................................*............................. + mul v18.4S, v9.4S, v0.S[0] // ..............*....................................................... 
+ mul v27.4S, v15.4S, v25.4S // .................*.................................................... + mul v16.4S, v11.4S, v0.S[0] // ...........*.......................................................... + // gap // ...................................................................... + // gap // ...................................................................... + // gap // ...................................................................... + // gap // ...................................................................... + cmge v10.4S, v31.4S, v14.4S // ....................*................................................. + // gap // ...................................................................... + // gap // ...................................................................... + // gap // ...................................................................... + // gap // ...................................................................... + // gap // ...................................................................... + cmge v7.4S, v14.4S, v30.4S // .....................*................................................ + cmge v17.4S, v12.4S, v30.4S // ..............................................*....................... + add v5.4S, v28.4S, v13.4S // .......................*.............................................. + sub v15.4S, v28.4S, v13.4S // ........................*............................................. + mls v20.4S, v23.4S, v8.S[0] // ..................*................................................... + // gap // ...................................................................... + // gap // ...................................................................... + // gap // ...................................................................... + // gap // ...................................................................... + // gap // ...................................................................... 
+ sqrdmulh v22.4S, v9.4S, v0.S[1] // .............*........................................................ + sub v28.4S, v6.4S, v17.4S // ..................................................*................... + mls v16.4S, v4.4S, v8.S[0] // ...................*.................................................. + mls v27.4S, v19.4S, v8.S[0] // .........................*............................................ + // gap // ...................................................................... + // gap // ...................................................................... + // gap // ...................................................................... + // gap // ...................................................................... + mul v21.4S, v5.4S, v25.4S // .............................................*........................ + sqrdmulh v9.4S, v5.4S, v26.4S // ......................................*............................... + sqrdmulh v11.4S, v15.4S, v0.S[1] // ..................................*................................... + mul v17.4S, v15.4S, v0.S[0] // .........................................*............................ + // gap // ...................................................................... + // gap // ...................................................................... + // gap // ...................................................................... + // gap // ...................................................................... + mls v12.4S, v28.4S, v29.4S // .......................................................*.............. + sub v4.4S, v10.4S, v7.4S // ...........................*.......................................... + // gap // ...................................................................... + // gap // ...................................................................... + // gap // ...................................................................... 
+ // gap // ...................................................................... + // gap // ...................................................................... + cmge v10.4S, v31.4S, v20.4S // ..........................*........................................... + mls v18.4S, v22.4S, v8.S[0] // ......................*............................................... + cmge v19.4S, v20.4S, v30.4S // ............................*......................................... + // gap // ...................................................................... + // gap // ...................................................................... + // gap // ...................................................................... + // gap // ...................................................................... + cmge v24.4S, v16.4S, v30.4S // ..............................*....................................... + cmge v15.4S, v31.4S, v16.4S // .............................*........................................ + mls v14.4S, v4.4S, v29.4S // ...................................*.................................. + mls v21.4S, v9.4S, v8.S[0] // .....................................................*................ + mls v17.4S, v11.4S, v8.S[0] // .................................................*.................... + // gap // ...................................................................... + // gap // ...................................................................... + // gap // ...................................................................... + // gap // ...................................................................... + // gap // ...................................................................... + str q12, [x0], #(16) // ..............................................................*....... + sub v5.4S, v10.4S, v19.4S // .................................*.................................... 
+ sub v4.4S, v15.4S, v24.4S // .....................................*................................ + // gap // ...................................................................... + // gap // ...................................................................... + // gap // ...................................................................... + // gap // ...................................................................... + // gap // ...................................................................... + cmge v15.4S, v18.4S, v30.4S // ................................*..................................... + cmge v23.4S, v27.4S, v30.4S // ...............................................*...................... + // gap // ...................................................................... + // gap // ...................................................................... + // gap // ...................................................................... + // gap // ...................................................................... + cmge v24.4S, v31.4S, v27.4S // .......................................*.............................. + cmge v6.4S, v31.4S, v18.4S // ...............................*...................................... + str q14, [x0, #496] // ................................................*..................... + cmge v13.4S, v17.4S, v30.4S // .........................................................*............ + cmge v28.4S, v31.4S, v17.4S // ..........................................................*........... + cmge v14.4S, v21.4S, v30.4S // ...........................................................*.......... + cmge v22.4S, v31.4S, v21.4S // ............................................................*......... + // gap // ...................................................................... + // gap // ...................................................................... 
+ // gap // ...................................................................... + mls v20.4S, v5.4S, v29.4S // ..........................................*........................... + sub v10.4S, v6.4S, v15.4S // ....................................*................................. + sub v9.4S, v24.4S, v23.4S // ........................................................*............. + // gap // ...................................................................... + // gap // ...................................................................... + // gap // ...................................................................... + // gap // ...................................................................... + // gap // ...................................................................... + mls v16.4S, v4.4S, v29.4S // ............................................*......................... + sub v12.4S, v28.4S, v13.4S // .............................................................*........ + sub v22.4S, v22.4S, v14.4S // ...............................................................*...... + // gap // ...................................................................... + // gap // ...................................................................... + // gap // ...................................................................... + // gap // ...................................................................... + // gap // ...................................................................... + mls v27.4S, v9.4S, v29.4S // ................................................................*..... + mls v18.4S, v10.4S, v29.4S // ...........................................*.......................... + // gap // ...................................................................... + // gap // ...................................................................... + // gap // ...................................................................... 
+ // gap // ...................................................................... + // gap // ...................................................................... + // gap // ...................................................................... + mls v21.4S, v22.4S, v29.4S // ..................................................................*... + mls v17.4S, v12.4S, v29.4S // .................................................................*.... + str q20, [x0, #240] // ...................................................*.................. + // gap // ...................................................................... + // gap // ...................................................................... + // gap // ...................................................................... + // gap // ...................................................................... + // gap // ...................................................................... + str q16, [x0, #752] // ......................................................*............... + // gap // ...................................................................... + // gap // ...................................................................... + // gap // ...................................................................... + // gap // ...................................................................... + // gap // ...................................................................... + // gap // ...................................................................... + // gap // ...................................................................... + str q27, [x0, #112] // ...................................................................*.. + str q18, [x0, #624] // ....................................................*................. + // gap // ...................................................................... 
+ // gap // ...................................................................... + // gap // ...................................................................... + // gap // ...................................................................... + // gap // ...................................................................... + // gap // ...................................................................... + str q21, [x0, #368] // ....................................................................*. + str q17, [x0, #880] // .....................................................................* + // gap // ...................................................................... + // gap // ...................................................................... + // gap // ...................................................................... + // gap // ...................................................................... + // gap // ...................................................................... + // gap // ...................................................................... // original source code - // mls v13.4S, v22.4S, v8.S[0] // ...*.................................. - // sqrdmulh v29.4S, v17.4S, v26.4S // .*.................................... - // mul v23.4S, v17.4S, v25.4S // ....*................................. - // sqrdmulh v22.4S, v30.4S, v26.4S // ..*................................... - // mul v18.4S, v28.4S, v0.S[0] // .....*................................ - // mls v14.4S, v4.4S, v8.4S // .......*.............................. - // add v20.4S, v15.4S, v6.4S // ...................*.................. - // mul v27.4S, v30.4S, v25.4S // *..................................... - // sqrdmulh v17.4S, v28.4S, v0.S[1] // ......*............................... - // mls v10.4S, v21.4S, v8.S[0] // ........*............................. - // mul v31.4S, v20.4S, v25.4S // .........................*............ 
- // add v5.4S, v24.4S, v13.4S // ...........*.......................... - // sqrdmulh v20.4S, v20.4S, v26.4S // ..........................*........... - // sub v28.4S, v24.4S, v13.4S // ..........*........................... - // str q14, [x0, #512] // ..............*....................... - // mls v27.4S, v22.4S, v8.S[0] // .........*............................ - // mls v23.4S, v29.4S, v8.S[0] // ............*......................... - // mul v22.4S, v28.4S, v0.S[0] // ...............*...................... - // sqrdmulh v4.4S, v5.4S, v26.4S // .................*.................... - // srshr v29.4S, v10.4S, #23 // ....................*................. - // sqrdmulh v11.4S, v28.4S, v0.S[1] // ................*..................... - // str q27, [x0, #128] // ..................*................... - // mls v22.4S, v11.4S, v8.S[0] // .......................*.............. - // mls v18.4S, v17.4S, v8.S[0] // .............*........................ - // srshr v11.4S, v22.4S, #23 // .............................*........ - // mls v10.4S, v29.4S, v8.4S // ...........................*.......... - // str q23, [x0], #(16) // ......................*............... - // srshr v12.4S, v18.4S, #23 // ........................*............. - // mls v22.4S, v11.4S, v8.4S // .................................*.... - // mul v11.4S, v5.4S, v25.4S // .....................*................ - // str q10, [x0, #624] // ................................*..... - // mls v31.4S, v20.4S, v8.S[0] // ...............................*...... - // mls v18.4S, v12.4S, v8.4S // ..............................*....... - // str q22, [x0, #880] // .....................................* - // mls v11.4S, v4.4S, v8.S[0] // ............................*......... - // str q31, [x0, #240] // ....................................*. - // str q18, [x0, #752] // ...................................*.. - // str q11, [x0, #368] // ..................................*... 
+ // add v19.4S, v17.4S, v4.4S // ....*................................................................. + // mls v14.4S, v7.4S, v8.S[0] // ......*............................................................... + // mls v12.4S, v23.4S, v8.S[0] // .......*.............................................................. + // sub v27.4S, v17.4S, v4.4S // ...*.................................................................. + // mul v4.4S, v15.4S, v1.S[0] // .*.................................................................... + // sqrdmulh v18.4S, v15.4S, v1.S[1] // ..*................................................................... + // sub v15.4S, v5.4S, v22.4S // ........*............................................................. + // mls v28.4S, v24.4S, v8.S[0] // *..................................................................... + // add v11.4S, v5.4S, v22.4S // .....*................................................................ + // mul v13.4S, v19.4S, v25.4S // ...........*.......................................................... + // sqrdmulh v10.4S, v19.4S, v26.4S // ..........*........................................................... + // mul v5.4S, v27.4S, v0.S[0] // .................*.................................................... + // sqrdmulh v21.4S, v27.4S, v0.S[1] // .........*............................................................ + // sqrdmulh v9.4S, v15.4S, v0.S[1] // ........................*............................................. + // mul v19.4S, v15.4S, v0.S[0] // ...............*...................................................... + // sqrdmulh v17.4S, v11.4S, v26.4S // ............*......................................................... + // mls v4.4S, v18.4S, v8.S[0] // .............*........................................................ + // mul v15.4S, v11.4S, v25.4S // ................*..................................................... 
+ // mls v13.4S, v10.4S, v8.S[0] // .......................*.............................................. + // mls v5.4S, v21.4S, v8.S[0] // ..........................*........................................... + // cmge v21.4S, v31.4S, v14.4S // ..................*................................................... + // cmge v10.4S, v14.4S, v30.4S // ...................*.................................................. + // mls v19.4S, v9.4S, v8.S[0] // ...................................*.................................. + // add v27.4S, v28.4S, v4.4S // .....................*................................................ + // sub v9.4S, v28.4S, v4.4S // ......................*............................................... + // mls v15.4S, v17.4S, v8.S[0] // ...........................*.......................................... + // cmge v22.4S, v31.4S, v13.4S // ..................................*................................... + // sub v10.4S, v21.4S, v10.4S // .................................*.................................... + // cmge v17.4S, v13.4S, v30.4S // ....................................*................................. + // cmge v21.4S, v31.4S, v5.4S // ......................................*............................... + // cmge v4.4S, v5.4S, v30.4S // .....................................*................................ + // cmge v6.4S, v31.4S, v19.4S // ................................................*..................... + // cmge v18.4S, v19.4S, v30.4S // .............................................*........................ + // sub v22.4S, v22.4S, v17.4S // ...........................................*.......................... + // sqrdmulh v11.4S, v9.4S, v0.S[1] // ..............................*....................................... + // mls v14.4S, v10.4S, v29.4S // .......................................*.............................. 
+ // sub v10.4S, v6.4S, v18.4S // .......................................................*.............. + // sub v18.4S, v21.4S, v4.4S // ............................................*......................... + // sqrdmulh v4.4S, v27.4S, v26.4S // .............................*........................................ + // cmge v6.4S, v31.4S, v15.4S // ...............................................*...................... + // cmge v21.4S, v31.4S, v12.4S // ..............*....................................................... + // mul v9.4S, v9.4S, v0.S[0] // ...............................*...................................... + // mls v13.4S, v22.4S, v29.4S // ......................................................*............... + // mls v19.4S, v10.4S, v29.4S // .............................................................*........ + // mls v5.4S, v18.4S, v29.4S // .........................................................*............ + // mul v18.4S, v27.4S, v25.4S // ............................*......................................... + // cmge v10.4S, v12.4S, v30.4S // ....................*................................................. + // cmge v23.4S, v15.4S, v30.4S // ..............................................*....................... + // str q14, [x0, #512] // .................................................*.................... + // mls v9.4S, v11.4S, v8.S[0] // .........................................*............................ + // sub v10.4S, v21.4S, v10.4S // .........................*............................................ + // str q13, [x0, #256] // ................................................................*..... + // str q19, [x0, #640] // ...................................................................*.. + // mls v18.4S, v4.4S, v8.S[0] // ........................................*............................. + // str q5, [x0, #768] // .................................................................*.... 
+ // mls v12.4S, v10.4S, v29.4S // ................................*..................................... + // sub v5.4S, v6.4S, v23.4S // ........................................................*............. + // cmge v6.4S, v9.4S, v30.4S // ..................................................*................... + // cmge v22.4S, v31.4S, v9.4S // ...................................................*.................. + // cmge v14.4S, v18.4S, v30.4S // ....................................................*................. + // cmge v4.4S, v31.4S, v18.4S // .....................................................*................ + // sub v6.4S, v22.4S, v6.4S // ..........................................................*........... + // str q12, [x0], #(16) // ..........................................*........................... + // sub v12.4S, v4.4S, v14.4S // ...........................................................*.......... + // mls v15.4S, v5.4S, v29.4S // ............................................................*......... + // mls v9.4S, v6.4S, v29.4S // ...............................................................*...... + // mls v18.4S, v12.4S, v29.4S // ..............................................................*....... + // str q15, [x0, #112] // ..................................................................*... + // str q18, [x0, #368] // ....................................................................*. 
+        // str q9, [x0, #880] // .....................................................................*
         pop_stack
diff --git a/tests/ntt_dilithium/manual/intt_dilithium_123_45678_opt_m1_icestorm.s b/tests/ntt_dilithium/manual/intt_dilithium_123_45678_opt_m1_icestorm.s
new file mode 100644
index 0000000..671e038
--- /dev/null
+++ b/tests/ntt_dilithium/manual/intt_dilithium_123_45678_opt_m1_icestorm.s
@@ -0,0 +1,1999 @@
+
+// Needed to provide ASM_LOAD directive
+#include <hal_env.h>
+
+// NOTE
+// We use a lot of trivial macros to simplify the parsing burden for Slothy
+// The macros are not unfolded by Slothy and thus interpreted as instructions,
+// which are easier to parse due to e.g. the lack of size specifiers and simpler
+// syntax for pre and post increment for loads and stores.
+
+// Eventually, NeLight should include a proper parser for AArch64,
+// but for initial investigations, the below is enough.
+xtmp0 .req x10
+xtmp1 .req x11
+
+.macro ldr_vo vec, base, offset // load the q-form of vec from [base + offset]
+        ldr qform_\vec, [\base, #\offset]
+.endm
+
+.macro ldr_vi vec, base, inc // load the q-form of vec from [base], then advance base by inc
+        ldr qform_\vec, [\base], #\inc
+.endm
+
+.macro str_vo vec, base, offset // store the q-form of vec to [base + offset]
+        str qform_\vec, [\base, #\offset]
+.endm
+.macro str_vi vec, base, inc // store the q-form of vec to [base], then advance base by inc
+        str qform_\vec, [\base], #\inc
+.endm
+.macro vqrdmulh d,a,b // d = sqrdmulh(a, b), vector by vector, 4 x 32-bit lanes
+        sqrdmulh \d\().4s, \a\().4s, \b\().4s
+.endm
+.macro vmls d,a,b // d = d - a * b, vector by vector, 4 x 32-bit lanes
+        mls \d\().4s, \a\().4s, \b\().4s
+.endm
+.macro vqrdmulhq d,a,b,i // d = sqrdmulh(a, lane i of b)
+        sqrdmulh \d\().4s, \a\().4s, \b\().s[\i]
+.endm
+.macro vqdmulhq d,a,b,i // d = sqdmulh(a, lane i of b); lane operand spelled like the sibling by-element macros
+        sqdmulh \d\().4s, \a\().4s, \b\().s[\i]
+.endm
+.macro vmulq d,a,b,i // d = a * (lane i of b)
+        mul \d\().4s, \a\().4s, \b\().s[\i]
+.endm
+.macro vmlsq d,a,b,i // d = d - a * (lane i of b)
+        mls \d\().4s, \a\().4s, \b\().s[\i]
+.endm
+
+.macro mulmodq dst, src, const, idx0, idx1 // dst = src * const[idx0] - sqrdmulh(src, const[idx1]) * consts[0]; overwrites src
+        vmulq \dst, \src, \const, \idx0
+        vqrdmulhq \src, \src, \const, \idx1
+        vmlsq \dst, \src, consts, 0
+.endm
+
+.macro mulmod dst, src, const, const_twisted // dst = src * const - sqrdmulh(src, const_twisted) * consts[0]; overwrites src
+        mul \dst\().4s, \src\().4s, \const\().4s
+        vqrdmulh \src, \src, \const_twisted
+        vmlsq \dst, \src, consts, 0
+.endm
+
+.macro montg_reduce a // a = a - srshr(a, 23) * consts, using tmp; NOTE(review): looks like a rounding-shift reduction rather than a Montgomery one - confirm the name
+        srshr tmp.4S, \a\().4S, #23
+        vmls \a, tmp, consts
+.endm
+
+.macro canonical_reduce a, modulus_half, neg_modulus_half, tmp1, tmp2 // subtract consts where a >= modulus_half, add consts where a <= neg_modulus_half
+        cmge \tmp1\().4s, \neg_modulus_half\().4s, \a\().4s // all-ones mask where a <= neg_modulus_half
+        cmge \tmp2\().4s, \a\().4s, \modulus_half\().4s // all-ones mask where a >= modulus_half
+        sub \tmp2\().4s, \tmp1\().4s, \tmp2\().4s // +1, 0 or -1 per lane
+        vmls \a, \tmp2, consts
+.endm
+
+.macro gs_butterfly a, b, root, idx0, idx1 // (a, b) = (a + b, (a - b) times root lane idx0, with twisted root lane idx1); uses tmp
+        sub tmp.4s, \a\().4s, \b\().4s
+        add \a\().4s, \a\().4s, \b\().4s
+        mulmodq \b, tmp, \root, \idx0, \idx1
+.endm
+
+.macro gs_butterfly_v a, b, root, root_twisted // same as gs_butterfly but with full-vector roots; uses tmp
+        sub tmp.4s, \a\().4s, \b\().4s
+        add \a\().4s, \a\().4s, \b\().4s
+        mulmod \b, tmp, \root, \root_twisted
+.endm
+
+.macro mul_ninv dst0, dst1, dst2, dst3, src0, src1, src2, src3 // dstK = mulmod(srcK, ninv, ninv_tw); overwrites the sources; ninv and ninv_tw are aliases not defined in the lines above
+        mulmod \dst0, \src0, ninv, ninv_tw
+        mulmod \dst1, \src1, ninv, ninv_tw
+        mulmod \dst2, \src2, ninv, ninv_tw
+        mulmod \dst3, \src3, ninv, ninv_tw
+.endm
+
+.macro load_vectors a0, a1, a2, a3, addr // load four consecutive 16-byte vectors starting at addr
+        ldr_vo \a0, \addr, (16*0)
+        ldr_vo \a1, \addr, (16*1)
+        ldr_vo \a2, \addr, (16*2)
+        ldr_vo \a3, \addr, (16*3)
+.endm
+
+.macro load_vectors_with_offset a0, a1, a2, a3, addr, offset // load four consecutive 16-byte vectors starting at addr + offset
+        ldr_vo \a0, \addr, (16*0 + (\offset))
+        ldr_vo \a1, \addr, (16*1 + (\offset))
+        ldr_vo \a2, \addr, (16*2 + (\offset))
+        ldr_vo \a3, \addr, (16*3 + (\offset))
+.endm
+
+.macro store_vectors_with_inc a0, a1, a2, a3, addr, inc // store four consecutive 16-byte vectors at addr and advance addr by inc
+        str_vi \a0, \addr, \inc
+        str_vo \a1, \addr, (-(\inc) + 16*1) // offsets are relative to the already advanced addr
+        str_vo \a2, \addr, (-(\inc) + 16*2)
+        str_vo \a3, \addr, (-(\inc) + 16*3)
+.endm
+
+.macro vec_to_scalar_matrix out, in // copy both 64-bit halves of in0..in3 into the GPRs out_00..out_31
+        vext \out\()_00, \in\()0, 0
+        vext \out\()_01, \in\()0, 1
+        vext \out\()_10, \in\()1, 0
+        vext \out\()_11, \in\()1, 1
+        vext \out\()_20, \in\()2, 0
+        vext \out\()_21, \in\()2, 1
+        vext \out\()_30, \in\()3, 0
+        vext \out\()_31, \in\()3, 1
+.endm
+
+.macro store_scalar_matrix_with_inc x, addr, inc // store the eight GPRs <x>t_00..<x>t_31 at 8-byte stride from addr and advance addr by inc
+        str \x\()t_00, [\addr], #( \inc)
+        str \x\()t_01, [\addr, #(-\inc + 8*1)]
+        str \x\()t_10, [\addr, #(-\inc + 8*2)]
+        str \x\()t_11, [\addr, #(-\inc + 8*3)]
+        str \x\()t_20, [\addr, #(-\inc + 8*4)]
+        str \x\()t_21, [\addr, #(-\inc + 8*5)]
+        str \x\()t_30, [\addr, #(-\inc + 8*6)]
+        str \x\()t_31, [\addr, #(-\inc + 8*7)]
+.endm
+
+.macro vext gpr_out, vec_in, lane // gpr_out = 64-bit lane of vec_in (umov)
+        umov \gpr_out\(), \vec_in\().d[\lane]
+.endm
+
+.macro load_roots_123 // load root0..root3 from r_ptr0 and advance r_ptr0 by 64 bytes
+        ldr_vi root0, r_ptr0, 64
+        ldr_vo root1, r_ptr0, (-64 + 16)
+        ldr_vo root2, r_ptr0, (-64 + 32)
+        ldr_vo root3, r_ptr0, (-64 + 48)
+.endm
+
+.macro load_roots_456 // same body as load_roots_123
+        ldr_vi root0, r_ptr0, 64
+        ldr_vo root1, r_ptr0, (-64 + 16)
+        ldr_vo root2, r_ptr0, (-64 + 32)
+        ldr_vo root3, r_ptr0, (-64 + 48)
+.endm
+
+.macro load_roots_78_part1 // load vectors 0..5 of a 12-vector group at r_ptr1 and advance r_ptr1 past the whole group
+        ldr_vi root0, r_ptr1, (12*16)
+        ldr_vo root0_tw, r_ptr1, (-12*16 + 1*16)
+        ldr_vo root1, r_ptr1, (-12*16 + 2*16)
+        ldr_vo root1_tw, r_ptr1, (-12*16 + 3*16)
+        ldr_vo root2, r_ptr1, (-12*16 + 4*16)
+        ldr_vo root2_tw, r_ptr1, (-12*16 + 5*16)
+.endm
+
+.macro load_roots_78_part2 // load vectors 6..11 of the group; offsets assume r_ptr1 was already advanced by part1
+        ldr_vo root0, r_ptr1, (-12*16 + 6*16)
+        ldr_vo root0_tw, r_ptr1, (-12*16 + 7*16)
+        ldr_vo root1, r_ptr1, (-12*16 + 8*16)
+        ldr_vo root1_tw, r_ptr1, (-12*16 + 9*16)
+        ldr_vo root2, r_ptr1, (-12*16 + 10*16)
+        ldr_vo root2_tw, r_ptr1, (-12*16 + 11*16)
+.endm
+
+.macro transpose4 data0, data1, data2, data3 // in-place 4x4 transpose of 32-bit lanes; uses t0..t3
+        trn1 t0.4s, \data0\().4s, \data1\().4s
+        trn2 t1.4s, \data0\().4s, \data1\().4s
+        trn1 t2.4s, \data2\().4s, \data3\().4s
+        trn2 t3.4s, \data2\().4s, \data3\().4s
+
+        trn2 \data2\().2d, t0.2d, t2.2d
+        trn2 \data3\().2d, t1.2d, t3.2d
+        trn1 \data0\().2d, t0.2d, t2.2d
+        trn1 \data1\().2d, t1.2d, t3.2d
+.endm
+
+.macro transpose_single data_out0, data_out1, data_out2, data_out3, data_in0, data_in1, data_in2, data_in3 // only the 32-bit trn1/trn2 stage of the transpose, written to separate outputs
+        trn1 \data_out0\().4s, \data_in0\().4s, \data_in1\().4s
+        trn2 \data_out1\().4s, \data_in0\().4s, \data_in1\().4s
+        trn1 \data_out2\().4s, \data_in2\().4s, \data_in3\().4s
+        trn2 \data_out3\().4s, \data_in2\().4s, \data_in3\().4s
+.endm
+
+.macro save_gprs // slothy:no-unfold
+        sub sp, sp, #(16*6)
+        stp x19, x20, [sp, #16*0]
+        stp x19, x20, [sp, #16*0] // NOTE(review): repeats the store above; redundant but harmless
+        stp x21, x22, [sp, #16*1]
+        stp x23, x24, [sp, #16*2]
+        stp x25, x26, [sp, #16*3]
+        stp x27, x28, [sp, #16*4]
+        stp x29, x30, [sp, #16*5]
+.endm
+
+.macro restore_gprs // slothy:no-unfold
+        ldp x19, x20, [sp, #16*0]
+        ldp x21, x22, [sp, #16*1]
+        ldp x23, x24, [sp, #16*2]
+        ldp x25, x26, [sp, #16*3]
+        ldp x27, x28, [sp, #16*4]
+        ldp x29, x30, [sp, #16*5]
+        add sp, sp, #(16*6)
+.endm
+
+.macro save_vregs // slothy:no-unfold
+        sub sp, sp, #(16*4)
+        stp d8, d9, [sp, #16*0] // only the low 64 bits (d-form) of v8..v15 are saved
+        stp d10, d11, [sp, #16*1]
+        stp d12, d13, [sp, #16*2]
+        stp d14, d15, [sp, #16*3]
+.endm
+
+.macro restore_vregs // slothy:no-unfold
+        ldp d8, d9, [sp, #16*0]
+        ldp d10, d11, [sp, #16*1]
+        ldp d12, d13, [sp, #16*2]
+        ldp d14, d15, [sp, #16*3]
+        add sp, sp, #(16*4)
+.endm
+
+#define STACK_SIZE 16
+#define STACK0 0
+
+.macro restore a, loc // slothy:no-unfold
+        ldr \a, [sp, #\loc\()]
+.endm
+.macro save loc, a // slothy:no-unfold
+        str \a, [sp, #\loc\()]
+.endm
+.macro push_stack // slothy:no-unfold
+        save_gprs
+        save_vregs
+        sub sp, sp, #STACK_SIZE
+.endm
+
+.macro pop_stack // slothy:no-unfold
+        add sp, sp, #STACK_SIZE
+        restore_vregs
+        restore_gprs
+.endm
+
+.data
+.p2align 4
+roots:
+#include "intt_dilithium_123_456_78_twiddles.s"
+.text
+
+        .global intt_dilithium_123_45678_opt_m1_icestorm
+        .global _intt_dilithium_123_45678_opt_m1_icestorm
+
+.p2align 4
+const_addr:   .word 8380417 // 2^23 - 2^13 + 1; broadcast into consts by the ld1r below
+              .word 0
+              .word 0
+              .word 0
+ninv_addr:    .quad 16382
+ninv_tw_addr: .quad 4197891
+intt_dilithium_123_45678_opt_m1_icestorm:
+_intt_dilithium_123_45678_opt_m1_icestorm:
+        push_stack
+
+    in      .req x0
+    inp     .req x1
+    inpp    .req x2
+    count   .req x3
+    r_ptr0  .req x4
+    r_ptr1  .req x5
+    xtmp    .req x6
+
+    data0  .req v9
+    data1  .req v10
+    data2  .req v11
+    data3  .req v12
+    data4  .req v13
+    data5  .req v14
+    data6  .req v15
+    data7  .req v16
+
+    qform_data0  .req q9
+    qform_data1  .req q10
+    qform_data2  .req q11
+    qform_data3  .req q12
+    qform_data4  .req q13
+    qform_data5  .req q14
+    qform_data6  .req q15
+    qform_data7  .req q16
+
+    qform_v0  .req q0
+    qform_v1  .req q1
+    qform_v2  .req q2
+    qform_v3  .req q3
+    qform_v4  .req q4
+    qform_v5  .req q5
+    qform_v6  .req q6
+    qform_v7  .req q7
+    qform_v8  .req q8
+    qform_v9  .req q9
+    qform_v10  .req q10
+    qform_v11  .req q11
+    qform_v12  .req q12
+    qform_v13  .req q13
+    qform_v14  .req q14
+    qform_v15  .req q15
+    qform_v16  .req q16
+    qform_v17  .req q17
+    qform_v18  .req q18
+    qform_v19  .req q19
+    qform_v20  .req q20
+    qform_v21  .req q21
+    qform_v22  .req q22
+    qform_v23  .req q23
+    qform_v24  .req q24
+    qform_v25  .req q25
+    qform_v26  .req q26
+    qform_v27  .req q27
+    qform_v28  .req q28
+    qform_v29  .req q29
+    qform_v30  .req q30
+    qform_v31  .req q31
+
+    x_00 .req x10
+    x_01 .req x11
+    x_10 .req x12
+    x_11 .req x13
+    x_20 .req x14
+    x_21 .req x15
+    x_30 .req x16
+    x_31 .req x17
+
+    xt_00 .req x_00 // the xt_ names alias the same eight GPRs as x_ in permuted order
+    xt_01 .req x_20
+    xt_10 .req x_10
+    xt_11 .req x_30
+    xt_20 .req x_01
+    xt_21 .req x_21
+    xt_30 .req x_11
+    xt_31 .req x_31
+
+    root0    .req v0
+    root1    .req v1
+    root2    .req v2
+    root3    .req v3
+
+    qform_root0    .req q0
+    qform_root1    .req q1
+    qform_root2    .req q2
+    qform_root3    .req q3
+
+    tmp .req v24
+    t0  .req v25
+    t1  .req v26
+    t2  .req v27
+    t3  .req v28
+    tp0 .req v17
+    tp1 .req v18
+    tp2 .req v19
+    tp3 .req v20
+
+    consts .req v8
+    qform_consts .req q8
+
+        ASM_LOAD(r_ptr0, roots_l345) // label not defined in the lines above; presumably comes from the included twiddle file - confirm
+        ASM_LOAD(r_ptr1, roots_l67)
+
+        ASM_LOAD(xtmp, const_addr)
+        ld1r {consts.4s}, [xtmp] // replicate the first word at const_addr into all four lanes
+        save STACK0, in
+
+        restore inp, STACK0 // NOTE(review): the loaded value is overwritten by the mov on the next line
+        mov inp, in
+        add inpp, inp, #64
+        mov count, #8
+
+    root0_tw .req v4
+    root1_tw .req v5
+    root2_tw .req v6
+    root3_tw .req v7
+    qform_root0_tw .req q4
+    qform_root1_tw .req q5
+    qform_root2_tw .req q6
+    qform_root3_tw .req q7
+
+        .p2align 2
+        // gap // ...........................................................................................................................
+        ldr q10, [x5, #144] // ..........*................................................................................................................
+ ld4 {v14.4S, v15.4S, v16.4S, v17.4S}, [x1] // ........*.................................................................................................................. + ld4 {v23.4S, v24.4S, v25.4S, v26.4S}, [x2] // .................*......................................................................................................... + ldr q9, [x5, #160] // ...........*............................................................................................................... + // gap // ........................................................................................................................... + // gap // ........................................................................................................................... + // gap // ........................................................................................................................... + // gap // ........................................................................................................................... + // gap // ........................................................................................................................... + ldr q21, [x5, #80] // *.......................................................................................................................... + ldr q1, [x5, #176] // .*......................................................................................................................... + // gap // ........................................................................................................................... + // gap // ........................................................................................................................... + ldr q6, [x5, #128] // ...*....................................................................................................................... 
+ ldr q30, [x5, #64] // ....*...................................................................................................................... + ldr q27, [x5, #32] // ......*.................................................................................................................... + // gap // ........................................................................................................................... + ldr q28, [x5, #48] // .....*..................................................................................................................... + // gap // ........................................................................................................................... + // gap // ........................................................................................................................... + // gap // ........................................................................................................................... + // gap // ........................................................................................................................... + // gap // ........................................................................................................................... + // gap // ........................................................................................................................... + sub v20.4S, v16.4S, v17.4S // ..............*............................................................................................................ + sub v31.4S, v14.4S, v15.4S // .............*............................................................................................................. + // gap // ........................................................................................................................... + sub v13.4S, v23.4S, v24.4S // .............................*............................................................................................. 
+ sub v7.4S, v25.4S, v26.4S // ............................*.............................................................................................. + // gap // ........................................................................................................................... + ldr q5, [x5, #112] // .......*................................................................................................................... + mul v2.4S, v31.4S, v27.4S // ..................*........................................................................................................ + ldr q27, [x5, #16] // .........*................................................................................................................. + sqrdmulh v4.4S, v31.4S, v28.4S // ...................*....................................................................................................... + // gap // ........................................................................................................................... + sqrdmulh v31.4S, v20.4S, v21.4S // ....................*...................................................................................................... + mul v12.4S, v7.4S, v9.4S // .....................................*..................................................................................... + // gap // ........................................................................................................................... + // gap // ........................................................................................................................... + ldr q21, [x5], #(12*16) // ..*........................................................................................................................ + sqrdmulh v22.4S, v13.4S, v10.4S // .......................................*................................................................................... 
+ ldr q11, [x5, #-96] // ......................*.................................................................................................... + mul v10.4S, v20.4S, v30.4S // .....................*..................................................................................................... + mls v2.4S, v4.4S, v8.S[0] // .........................*................................................................................................. + // gap // ........................................................................................................................... + // gap // ........................................................................................................................... + sqrdmulh v20.4S, v7.4S, v1.4S // .................................*......................................................................................... + add v28.4S, v25.4S, v26.4S // ...............................*........................................................................................... + add v26.4S, v14.4S, v15.4S // ................*.......................................................................................................... + // gap // ........................................................................................................................... + // gap // ........................................................................................................................... + add v18.4S, v16.4S, v17.4S // ...............*........................................................................................................... + // gap // ........................................................................................................................... + // gap // ........................................................................................................................... 
+ mls v10.4S, v31.4S, v8.S[0] // ..........................*................................................................................................ + mul v9.4S, v13.4S, v6.4S // ...................................*....................................................................................... + // gap // ........................................................................................................................... + mls v12.4S, v20.4S, v8.S[0] // .............................................*............................................................................. + // gap // ........................................................................................................................... + sub v1.4S, v26.4S, v18.4S // ........................*.................................................................................................. + // gap // ........................................................................................................................... + add v23.4S, v23.4S, v24.4S // ................................*.......................................................................................... + // gap // ........................................................................................................................... + add v20.4S, v2.4S, v10.4S // ...............................................*........................................................................... + sub v13.4S, v2.4S, v10.4S // ....................................*...................................................................................... + // gap // ........................................................................................................................... + // gap // ........................................................................................................................... 
+ sqrdmulh v31.4S, v1.4S, v27.4S // .........................................*................................................................................. + // gap // ........................................................................................................................... + // gap // ........................................................................................................................... + mls v9.4S, v22.4S, v8.S[0] // ..............................................*............................................................................ + mul v22.4S, v13.4S, v21.4S // ...........................................*............................................................................... + // gap // ........................................................................................................................... + // gap // ........................................................................................................................... + sub v4.4S, v23.4S, v28.4S // ......................................*.................................................................................... + mul v24.4S, v1.4S, v21.4S // ..................................*........................................................................................ + sqrdmulh v1.4S, v13.4S, v27.4S // ........................................*.................................................................................. + // gap // ........................................................................................................................... + // gap // ........................................................................................................................... + // gap // ........................................................................................................................... 
+ // gap // ........................................................................................................................... + add v13.4S, v9.4S, v12.4S // ......................................................*.................................................................... + sub v12.4S, v9.4S, v12.4S // ...................................................*....................................................................... + ldr q30, [x4, #48] // ...........................................................................*............................................... + sqrdmulh v9.4S, v4.4S, v5.4S // ..........................................*................................................................................ + add v26.4S, v26.4S, v18.4S // .......................*................................................................................................... + // gap // ........................................................................................................................... + // gap // ........................................................................................................................... + // gap // ........................................................................................................................... + mls v24.4S, v31.4S, v8.S[0] // ................................................*.......................................................................... + mls v22.4S, v1.4S, v8.S[0] // .................................................*......................................................................... + mul v10.4S, v4.4S, v11.4S // ............................................*.............................................................................. + add v1.4S, v23.4S, v28.4S // ....................................................*...................................................................... 
+ // gap // ........................................................................................................................... + // gap // ........................................................................................................................... + ldr q29, [x4, #16] // ............*.............................................................................................................. + ldr q19, [x4], #64 // ...........................*............................................................................................... + trn2 v16.4S, v26.4S, v20.4S // .....................................................*..................................................................... + // gap // ........................................................................................................................... + // gap // ........................................................................................................................... + // gap // ........................................................................................................................... + trn1 v20.4S, v26.4S, v20.4S // ...........................................................*............................................................... + trn1 v26.4S, v24.4S, v22.4S // ............................................................*.............................................................. + trn2 v4.4S, v24.4S, v22.4S // .........................................................*................................................................. + // gap // ........................................................................................................................... + // gap // ........................................................................................................................... 
+ mls v10.4S, v9.4S, v8.S[0] // ..................................................*........................................................................ + // gap // ........................................................................................................................... + ldr q14, [x4, #-32] // ..............................*............................................................................................ + trn2 v23.4S, v1.4S, v13.4S // ..........................................................*................................................................ + sqrdmulh v5.4S, v12.4S, v5.4S // ........................................................*.................................................................. + trn1 v17.2D, v20.2D, v26.2D // ...............................................................*........................................................... + // gap // ........................................................................................................................... + // gap // ........................................................................................................................... + mul v25.4S, v12.4S, v11.4S // .......................................................*................................................................... + // gap // ........................................................................................................................... + trn2 v9.2D, v20.2D, v26.2D // .................................................................*......................................................... + trn2 v24.2D, v16.2D, v4.2D // ..............................................................*............................................................ + // gap // ........................................................................................................................... 
+ // gap // ........................................................................................................................... + // gap // ........................................................................................................................... + trn1 v1.4S, v1.4S, v13.4S // ..................................................................*........................................................ + trn1 v7.2D, v16.2D, v4.2D // ................................................................*.......................................................... + sub v12.4S, v9.4S, v24.4S // .................................................................................*......................................... + // gap // ........................................................................................................................... + // gap // ........................................................................................................................... + mls v25.4S, v5.4S, v8.S[0] // .............................................................*............................................................. + // gap // ........................................................................................................................... + // gap // ........................................................................................................................... + add v20.4S, v9.4S, v24.4S // ......................................................................*.................................................... + add v13.4S, v17.4S, v7.4S // .....................................................................*..................................................... + mul v31.4S, v12.4S, v14.S[0] // .......................................................................................*................................... 
+ sqrdmulh v22.4S, v12.4S, v14.S[1] // ......................................................................................*.................................... + // gap // ........................................................................................................................... + // gap // ........................................................................................................................... + // gap // ........................................................................................................................... + // gap // ........................................................................................................................... + trn2 v26.4S, v10.4S, v25.4S // ....................................................................*...................................................... + trn1 v24.4S, v10.4S, v25.4S // ...................................................................*....................................................... + sub v12.4S, v13.4S, v20.4S // ...................................................................................................*....................... + add v10.4S, v13.4S, v20.4S // ...............................................................................*........................................... + // gap // ........................................................................................................................... + // gap // ........................................................................................................................... + // gap // ........................................................................................................................... + // gap // ........................................................................................................................... 
+ trn1 v20.2D, v23.2D, v26.2D // ..........................................................................*................................................ + trn1 v13.2D, v1.2D, v24.2D // .........................................................................*................................................. + // gap // ........................................................................................................................... + // gap // ........................................................................................................................... + trn2 v0.2D, v23.2D, v26.2D // ........................................................................*.................................................. + trn2 v16.2D, v1.2D, v24.2D // .......................................................................*................................................... + sub v9.4S, v13.4S, v20.4S // .............................................................................................*............................. + // gap // ........................................................................................................................... + // gap // ........................................................................................................................... + mul v4.4S, v12.4S, v19.S[2] // ..........................................................................................................................* + // gap // ........................................................................................................................... + // gap // ........................................................................................................................... + add v26.4S, v16.4S, v0.4S // .............................................................................*............................................. 
+ add v24.4S, v13.4S, v20.4S // ..............................................................................*............................................ + sqrdmulh v20.4S, v9.4S, v14.S[3] // .................................................................................................*......................... + // gap // ........................................................................................................................... + // gap // ........................................................................................................................... + mul v11.4S, v9.4S, v14.S[2] // ................................................................................................*.......................... + // gap // ........................................................................................................................... + add v27.4S, v24.4S, v26.4S // ........................................................................................*.................................. + // gap // ........................................................................................................................... + sub v1.4S, v17.4S, v7.4S // ............................................................................*.............................................. + // gap // ........................................................................................................................... + srshr v9.4S, v10.4S, #23 // .....................................................................................*..................................... + // gap // ........................................................................................................................... + sub v17.4S, v16.4S, v0.4S // ................................................................................*.......................................... 
+ mul v16.4S, v1.4S, v29.S[2] // ...................................................................................*....................................... + // gap // ........................................................................................................................... + // gap // ........................................................................................................................... + srshr v13.4S, v27.4S, #23 // ............................................................................................*.............................. + mls v31.4S, v22.4S, v8.S[0] // ...............................................................................................*........................... + // gap // ........................................................................................................................... + // gap // ........................................................................................................................... + sqrdmulh v1.4S, v1.4S, v29.S[3] // ....................................................................................*...................................... + // gap // ........................................................................................................................... + // gap // ........................................................................................................................... + mls v10.4S, v9.4S, v8.4S // ...........................................................................................*............................... + mls v11.4S, v20.4S, v8.S[0] // .......................................................................................................*................... + // gap // ........................................................................................................................... 
+ // gap // ........................................................................................................................... + mls v27.4S, v13.4S, v8.4S // ..................................................................................................*........................ + mul v15.4S, v17.4S, v30.S[0] // .........................................................................................*................................. + // gap // ........................................................................................................................... + mls v16.4S, v1.4S, v8.S[0] // ..........................................................................................*................................ + // gap // ........................................................................................................................... + sqrdmulh v7.4S, v17.4S, v30.S[1] // ..............................................................................................*............................ + sqrdmulh v30.4S, v12.4S, v19.S[3] // ......................................................................................................................*.... + // gap // ........................................................................................................................... + // gap // ........................................................................................................................... + sub v9.4S, v24.4S, v26.4S // ..................................................................................*........................................ + // gap // ........................................................................................................................... + add v24.4S, v10.4S, v27.4S // ..............................................................................................................*............ 
+ sub v22.4S, v10.4S, v27.4S // ........................................................................................................*.................. + // gap // ........................................................................................................................... + // gap // ........................................................................................................................... + sub v10.4S, v16.4S, v31.4S // ......................................................................................................*.................... + // gap // ........................................................................................................................... + mls v15.4S, v7.4S, v8.S[0] // .....................................................................................................*..................... + // gap // ........................................................................................................................... + // gap // ........................................................................................................................... + mul v17.4S, v22.4S, v19.S[0] // .............................................................................................................*............. + sqrdmulh v12.4S, v22.4S, v19.S[1] // ............................................................................................................*.............. + str q24, [x1], #(16*4) // .....................................................................................................................*..... + sqrdmulh v22.4S, v10.4S, v19.S[3] // ...........................................................................................................*............... + // gap // ........................................................................................................................... 
+ mul v10.4S, v10.4S, v19.S[2] // ..........................................................................................................*................ + // gap // ........................................................................................................................... + add v26.4S, v16.4S, v31.4S // ....................................................................................................*...................... + sub v13.4S, v11.4S, v15.4S // ...............................................................................................................*........... + // gap // ........................................................................................................................... + add v31.4S, v11.4S, v15.4S // .................................................................................................................*......... + mls v17.4S, v12.4S, v8.S[0] // ..................................................................................................................*........ + // gap // ........................................................................................................................... + // gap // ........................................................................................................................... + mls v10.4S, v22.4S, v8.S[0] // ................................................................................................................*.......... + srshr v22.4S, v26.4S, #23 // .........................................................................................................*................. + // gap // ........................................................................................................................... + // gap // ........................................................................................................................... 
+ sqrdmulh v20.4S, v13.4S, v29.S[1] // .......................................................................................................................*... + // gap // ........................................................................................................................... + // gap // ........................................................................................................................... + mul v1.4S, v13.4S, v29.S[0] // ...................................................................................................................*....... + str q17, [x2], #(16*4) // ........................................................................................................................*.. + // gap // ........................................................................................................................... + mul v7.4S, v9.4S, v29.S[0] // .........................................................................................................................*. + srshr v12.4S, v31.4S, #23 // ....................................................................................................................*...... + + // original source code + // ldr q17, [x5, #80] // ....*...................................................................................................................... + // ldr q16, [x5, #176] // .....*..................................................................................................................... + // ldr q15, [x5], #(12*16) // ....................*...................................................................................................... + // ldr q24, [x5, #-64] // ......*.................................................................................................................... + // ldr q28, [x5, #-128] // .......*................................................................................................................... 
+ // ldr q18, [x5, #-144] // .........*................................................................................................................. + // ldr q25, [x5, #-160] // ........*.................................................................................................................. + // ldr q7, [x5, #-80] // ..............*............................................................................................................ + // ld4 {v3.4S, v4.4S, v5.4S, v6.4S}, [x1] // .*......................................................................................................................... + // ldr q0, [x5, #-176] // ................*.......................................................................................................... + // ldr q30, [x5, #-48] // *.......................................................................................................................... + // ldr q27, [x5, #-32] // ...*....................................................................................................................... + // ldr q29, [x4, #16] // ...................................................*....................................................................... + // sub v22.4S, v3.4S, v4.4S // ...........*............................................................................................................... + // sub v12.4S, v5.4S, v6.4S // ..........*................................................................................................................ + // add v11.4S, v5.4S, v6.4S // ............................*.............................................................................................. + // add v20.4S, v3.4S, v4.4S // ...........................*............................................................................................... 
+ // ld4 {v1.4S, v2.4S, v3.4S, v4.4S}, [x2] // ..*........................................................................................................................ + // mul v10.4S, v22.4S, v25.4S // ...............*........................................................................................................... + // sqrdmulh v18.4S, v22.4S, v18.4S // .................*......................................................................................................... + // sqrdmulh v26.4S, v12.4S, v17.4S // ..................*........................................................................................................ + // mul v28.4S, v12.4S, v28.4S // .......................*................................................................................................... + // ldr q31, [x5, #-96] // ......................*.................................................................................................... + // add v5.4S, v20.4S, v11.4S // ..............................................*............................................................................ + // sub v12.4S, v20.4S, v11.4S // ................................*.......................................................................................... + // mls v10.4S, v18.4S, v8.S[0] // ........................*.................................................................................................. + // mls v28.4S, v26.4S, v8.S[0] // .............................*............................................................................................. + // ldr q19, [x4], #64 // ....................................................*...................................................................... + // sub v9.4S, v3.4S, v4.4S // .............*............................................................................................................. 
+ // sub v17.4S, v1.4S, v2.4S // ............*.............................................................................................................. + // ldr q26, [x4, #-32] // ..........................................................*................................................................ + // add v11.4S, v3.4S, v4.4S // ..........................*................................................................................................ + // add v22.4S, v1.4S, v2.4S // .................................*......................................................................................... + // sqrdmulh v4.4S, v9.4S, v16.4S // .........................*................................................................................................. + // mul v6.4S, v12.4S, v15.4S // ........................................*.................................................................................. + // mul v13.4S, v17.4S, v24.4S // ..............................*............................................................................................ + // sub v18.4S, v10.4S, v28.4S // ...................................*....................................................................................... + // mul v23.4S, v9.4S, v27.4S // ...................*....................................................................................................... + // sub v20.4S, v22.4S, v11.4S // .......................................*................................................................................... + // sqrdmulh v3.4S, v17.4S, v30.4S // .....................*..................................................................................................... + // sqrdmulh v1.4S, v18.4S, v0.4S // .........................................*................................................................................. 
+ // sqrdmulh v17.4S, v12.4S, v0.4S // ....................................*...................................................................................... + // sqrdmulh v12.4S, v20.4S, v7.4S // .............................................*............................................................................. + // mul v24.4S, v18.4S, v15.4S // ......................................*.................................................................................... + // mul v25.4S, v20.4S, v31.4S // .................................................*......................................................................... + // mls v23.4S, v4.4S, v8.S[0] // ...............................*........................................................................................... + // mls v13.4S, v3.4S, v8.S[0] // .....................................*..................................................................................... + // add v14.4S, v10.4S, v28.4S // ..................................*........................................................................................ + // mls v6.4S, v17.4S, v8.S[0] // ...............................................*........................................................................... + // mls v24.4S, v1.4S, v8.S[0] // ................................................*.......................................................................... + // mls v25.4S, v12.4S, v8.S[0] // .........................................................*................................................................. + // sub v20.4S, v13.4S, v23.4S // ...........................................*............................................................................... + // add v27.4S, v22.4S, v11.4S // ..................................................*........................................................................ 
+ // trn2 v9.4S, v5.4S, v14.4S // .....................................................*..................................................................... + // add v16.4S, v13.4S, v23.4S // ..........................................*................................................................................ + // mul v31.4S, v20.4S, v31.4S // ..............................................................*............................................................ + // sqrdmulh v30.4S, v20.4S, v7.4S // ............................................................*.............................................................. + // trn2 v12.4S, v6.4S, v24.4S // ........................................................*.................................................................. + // trn2 v13.4S, v27.4S, v16.4S // ...........................................................*............................................................... + // trn1 v4.4S, v5.4S, v14.4S // ......................................................*.................................................................... + // trn1 v20.4S, v6.4S, v24.4S // .......................................................*................................................................... + // mls v31.4S, v30.4S, v8.S[0] // ....................................................................*...................................................... + // trn2 v1.2D, v9.2D, v12.2D // ................................................................*.......................................................... + // trn1 v22.2D, v4.2D, v20.2D // .............................................................*............................................................. + // trn1 v17.2D, v9.2D, v12.2D // ..................................................................*........................................................ 
+ // trn2 v4.2D, v4.2D, v20.2D // ...............................................................*........................................................... + // trn1 v9.4S, v27.4S, v16.4S // .................................................................*......................................................... + // trn1 v20.4S, v25.4S, v31.4S // ..........................................................................*................................................ + // trn2 v16.4S, v25.4S, v31.4S // .........................................................................*................................................. + // add v10.4S, v22.4S, v17.4S // ......................................................................*.................................................... + // add v31.4S, v4.4S, v1.4S // .....................................................................*..................................................... + // trn2 v12.2D, v9.2D, v20.2D // ................................................................................*.......................................... + // trn2 v30.2D, v13.2D, v16.2D // ...............................................................................*........................................... + // trn1 v23.2D, v9.2D, v20.2D // ..............................................................................*............................................ + // trn1 v28.2D, v13.2D, v16.2D // .............................................................................*............................................. + // ldr q24, [x4, #-16] // ............................................*.............................................................................. + // sub v22.4S, v22.4S, v17.4S // ........................................................................................*.................................. 
+ // add v13.4S, v12.4S, v30.4S // ...................................................................................*....................................... + // add v20.4S, v23.4S, v28.4S // ....................................................................................*...................................... + // add v17.4S, v10.4S, v31.4S // ............................................................................*.............................................. + // sub v12.4S, v12.4S, v30.4S // ..........................................................................................*................................ + // sub v4.4S, v4.4S, v1.4S // ...................................................................*....................................................... + // sub v9.4S, v20.4S, v13.4S // ......................................................................................................*.................... + // mul v16.4S, v22.4S, v29.S[2] // ...........................................................................................*............................... + // sqrdmulh v30.4S, v22.4S, v29.S[3] // ..............................................................................................*............................ + // srshr v22.4S, v17.4S, #23 // .........................................................................................*................................. + // sqrdmulh v1.4S, v4.4S, v26.S[1] // ........................................................................*.................................................. + // mul v6.4S, v4.4S, v26.S[0] // .......................................................................*................................................... + // add v4.4S, v20.4S, v13.4S // .......................................................................................*................................... 
+ // mul v25.4S, v12.4S, v24.S[0] // ..................................................................................................*........................ + // mls v16.4S, v30.4S, v8.S[0] // ...................................................................................................*....................... + // mls v17.4S, v22.4S, v8.4S // ...............................................................................................*........................... + // srshr v20.4S, v4.4S, #23 // ............................................................................................*.............................. + // sub v22.4S, v23.4S, v28.4S // .................................................................................*......................................... + // sqrdmulh v12.4S, v12.4S, v24.S[1] // ....................................................................................................*...................... + // mls v6.4S, v1.4S, v8.S[0] // .............................................................................................*............................. + // mul v1.4S, v22.4S, v26.S[2] // ......................................................................................*.................................... + // sqrdmulh v22.4S, v22.4S, v26.S[3] // .....................................................................................*..................................... + // mls v4.4S, v20.4S, v8.4S // .................................................................................................*......................... + // sub v24.4S, v10.4S, v31.4S // ...........................................................................*............................................... + // add v26.4S, v16.4S, v6.4S // ................................................................................................................*.......... 
+ // mls v25.4S, v12.4S, v8.S[0] // ..........................................................................................................*................ + // sub v30.4S, v16.4S, v6.4S // .........................................................................................................*................. + // mls v1.4S, v22.4S, v8.S[0] // ................................................................................................*.......................... + // sub v20.4S, v17.4S, v4.4S // ........................................................................................................*.................. + // srshr v22.4S, v26.4S, #23 // .....................................................................................................................*..... + // mul v10.4S, v30.4S, v19.S[2] // ...............................................................................................................*........... + // sqrdmulh v16.4S, v30.4S, v19.S[3] // ..............................................................................................................*............ + // sqrdmulh v12.4S, v20.4S, v19.S[1] // ............................................................................................................*.............. + // mul v13.4S, v20.4S, v19.S[0] // ...........................................................................................................*............... + // add v20.4S, v17.4S, v4.4S // .......................................................................................................*................... + // sub v4.4S, v1.4S, v25.4S // .................................................................................................................*......... + // mls v10.4S, v16.4S, v8.S[0] // ....................................................................................................................*...... 
+ // add v31.4S, v1.4S, v25.4S // ..................................................................................................................*........ + // mls v13.4S, v12.4S, v8.S[0] // ...................................................................................................................*....... + // mul v1.4S, v4.4S, v29.S[0] // .......................................................................................................................*... + // srshr v12.4S, v31.4S, #23 // ..........................................................................................................................* + // str q20, [x1], #(16*4) // .............................................................................................................*............. + // sqrdmulh v30.4S, v24.4S, v19.S[3] // .....................................................................................................*..................... + // sqrdmulh v20.4S, v4.4S, v29.S[1] // ......................................................................................................................*.... + // str q13, [x2], #(16*4) // ........................................................................................................................*.. + // mul v7.4S, v9.4S, v29.S[0] // .........................................................................................................................*. + // mul v4.4S, v24.4S, v19.S[2] // ..................................................................................*........................................ + + sub count, count, #1 +layer45678_start: + sqrdmulh v13.4S, v9.4S, v29.S[1] // ...........................................................................................................*............................................ + // gap // ........................................................................................................................................................ 
+ // gap // ........................................................................................................................................................ + mls v26.4S, v22.4S, v8.4S // .....................................................................................................................*.................................. + mls v31.4S, v12.4S, v8.4S // .........................................................................................................................*.............................. + ldr q17, [x5, #80] // .......e................................................................................................................................................ + ldr q16, [x5, #176] // .................................e...................................................................................................................... + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + mls v1.4S, v20.4S, v8.S[0] // .................................................................................................................*...................................... + // gap // ........................................................................................................................................................ 
+ ldr q15, [x5], #(12*16) // ..e..................................................................................................................................................... + mls v7.4S, v13.4S, v8.S[0] // ............................................................................................................*........................................... + mls v4.4S, v30.4S, v8.S[0] // ..................................................................................................*..................................................... + sub v30.4S, v26.4S, v31.4S // ...............................................................................................................................*........................ + add v22.4S, v26.4S, v31.4S // ................................................................................................................................*....................... + ldr q24, [x5, #-64] // ..............................e......................................................................................................................... + ldr q28, [x5, #-128] // ......e................................................................................................................................................. + add v12.4S, v10.4S, v1.4S // ..........................................................................................................................................*............. + // gap // ........................................................................................................................................................ + ldr q18, [x5, #-144] // .....e.................................................................................................................................................. + sub v1.4S, v10.4S, v1.4S // .........................................................................................................................................*.............. 
+ ldr q25, [x5, #-160] // ....e................................................................................................................................................... + add v9.4S, v4.4S, v7.4S // .....................................................................................................................................*.................. + sub v4.4S, v4.4S, v7.4S // ....................................................................................................................................*................... + str q22, [x1, #-48] // ...............................................................................................................................................*........ + str q12, [x1, #-16] // .................................................................................................................................................*...... + ldr q7, [x5, #-80] // .............................e.......................................................................................................................... + mul v20.4S, v1.4S, v19.S[0] // ...........................................................................................................................................*............ + sqrdmulh v12.4S, v1.4S, v19.S[1] // ............................................................................................................................................*........... + sqrdmulh v31.4S, v4.4S, v19.S[1] // .......................................................................................................................................*................ + mul v13.4S, v4.4S, v19.S[0] // ......................................................................................................................................*................. 
+ str q9, [x1, #-32] // ................................................................................................................................................*....... + add x1, x1, #64 // ......................................................................................................................................................*. + ld4 {v3.4S, v4.4S, v5.4S, v6.4S}, [x1] // e....................................................................................................................................................... + sqrdmulh v22.4S, v30.4S, v19.S[1] // ..................................................................................................................................*..................... + mul v1.4S, v30.4S, v19.S[0] // .................................................................................................................................*...................... + ldr q0, [x5, #-176] // ...e.................................................................................................................................................... + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + ldr q30, [x5, #-48] // ...............................e........................................................................................................................ + mls v20.4S, v12.4S, v8.S[0] // .............................................................................................................................................*.......... + // gap // ........................................................................................................................................................ 
+ ldr q27, [x5, #-32] // ................................e....................................................................................................................... + // gap // ........................................................................................................................................................ + mls v13.4S, v31.4S, v8.S[0] // ........................................................................................................................................*............... + mls v1.4S, v22.4S, v8.S[0] // ...................................................................................................................................*.................... + // gap // ........................................................................................................................................................ + ldr q29, [x4, #16] // .......................................................................e................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + str q20, [x2, #-16] // .....................................................................................................................................................*.. 
+ // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + str q13, [x2, #-32] // ....................................................................................................................................................*... + sub v22.4S, v3.4S, v4.4S // ........e............................................................................................................................................... + str q1, [x2, #-48] // ...................................................................................................................................................*.... + add x2, x2, #64 // .......................................................................................................................................................* + sub v12.4S, v5.4S, v6.4S // .............e.......................................................................................................................................... + add v11.4S, v5.4S, v6.4S // ..............e......................................................................................................................................... + add v20.4S, v3.4S, v4.4S // .........e.............................................................................................................................................. + // gap // ........................................................................................................................................................ 
+ ld4 {v1.4S, v2.4S, v3.4S, v4.4S}, [x2] // .e...................................................................................................................................................... + mul v10.4S, v22.4S, v25.4S // ..........e............................................................................................................................................. + sqrdmulh v18.4S, v22.4S, v18.4S // ...........e............................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + sqrdmulh v26.4S, v12.4S, v17.4S // ................e....................................................................................................................................... + mul v28.4S, v12.4S, v28.4S // ...............e........................................................................................................................................ + // gap // ........................................................................................................................................................ + ldr q31, [x5, #-96] // ............................e........................................................................................................................... + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ 
+ // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + add v5.4S, v20.4S, v11.4S // ...................e.................................................................................................................................... + sub v12.4S, v20.4S, v11.4S // ..................e..................................................................................................................................... + // gap // ........................................................................................................................................................ + mls v10.4S, v18.4S, v8.S[0] // ............e........................................................................................................................................... + // gap // ........................................................................................................................................................ + mls v28.4S, v26.4S, v8.S[0] // .................e...................................................................................................................................... + ldr q19, [x4], #64 // ......................................................................e................................................................................. + // gap // ........................................................................................................................................................ 
+ // gap // ........................................................................................................................................................ + sub v9.4S, v3.4S, v4.4S // .......................................e................................................................................................................ + sub v17.4S, v1.4S, v2.4S // ..................................e..................................................................................................................... + // gap // ........................................................................................................................................................ + ldr q26, [x4, #-32] // ........................................................................e............................................................................... + add v11.4S, v3.4S, v4.4S // ........................................e............................................................................................................... + add v22.4S, v1.4S, v2.4S // ...................................e.................................................................................................................... + // gap // ........................................................................................................................................................ + sqrdmulh v4.4S, v9.4S, v16.4S // ..........................................e............................................................................................................. + mul v6.4S, v12.4S, v15.4S // ....................e................................................................................................................................... + // gap // ........................................................................................................................................................ 
+ mul v13.4S, v17.4S, v24.4S // ....................................e................................................................................................................... + sub v18.4S, v10.4S, v28.4S // .......................e................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + mul v23.4S, v9.4S, v27.4S // .........................................e.............................................................................................................. + // gap // ........................................................................................................................................................ + sub v20.4S, v22.4S, v11.4S // ............................................e........................................................................................................... + // gap // ........................................................................................................................................................ + sqrdmulh v3.4S, v17.4S, v30.4S // .....................................e.................................................................................................................. + sqrdmulh v1.4S, v18.4S, v0.4S // ..........................e............................................................................................................................. + // gap // ........................................................................................................................................................ 
+ // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + sqrdmulh v17.4S, v12.4S, v0.4S // .....................e.................................................................................................................................. + sqrdmulh v12.4S, v20.4S, v7.4S // ...............................................e........................................................................................................ + // gap // ........................................................................................................................................................ + mul v24.4S, v18.4S, v15.4S // .........................e.............................................................................................................................. + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + mul v25.4S, v20.4S, v31.4S // ..............................................e......................................................................................................... + mls v23.4S, v4.4S, v8.S[0] // ...........................................e............................................................................................................ + mls v13.4S, v3.4S, v8.S[0] // ......................................e................................................................................................................. 
+ // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + add v14.4S, v10.4S, v28.4S // ........................e............................................................................................................................... + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + mls v6.4S, v17.4S, v8.S[0] // ......................e................................................................................................................................. + mls v24.4S, v1.4S, v8.S[0] // ...........................e............................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + mls v25.4S, v12.4S, v8.S[0] // ................................................e....................................................................................................... + sub v20.4S, v13.4S, v23.4S // .................................................e...................................................................................................... 
+ add v27.4S, v22.4S, v11.4S // .............................................e.......................................................................................................... + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + trn2 v9.4S, v5.4S, v14.4S // .......................................................e................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + add v16.4S, v13.4S, v23.4S // ..................................................e..................................................................................................... + mul v31.4S, v20.4S, v31.4S // ...................................................e.................................................................................................... + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + sqrdmulh v30.4S, v20.4S, v7.4S // ....................................................e................................................................................................... 
+ // gap // ........................................................................................................................................................ + trn2 v12.4S, v6.4S, v24.4S // .........................................................e.............................................................................................. + // gap // ........................................................................................................................................................ + trn2 v13.4S, v27.4S, v16.4S // ...............................................................e........................................................................................ + trn1 v4.4S, v5.4S, v14.4S // ......................................................e................................................................................................. + trn1 v20.4S, v6.4S, v24.4S // ........................................................e............................................................................................... + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + mls v31.4S, v30.4S, v8.S[0] // .....................................................e.................................................................................................. 
+ trn2 v1.2D, v9.2D, v12.2D // ...........................................................e............................................................................................ + trn1 v22.2D, v4.2D, v20.2D // ............................................................e........................................................................................... + trn1 v17.2D, v9.2D, v12.2D // .............................................................e.......................................................................................... + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + trn2 v4.2D, v4.2D, v20.2D // ..........................................................e............................................................................................. + trn1 v9.4S, v27.4S, v16.4S // ..............................................................e......................................................................................... + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + trn1 v20.4S, v25.4S, v31.4S // ................................................................e....................................................................................... + // gap // ........................................................................................................................................................ 
+ // gap // ........................................................................................................................................................ + trn2 v16.4S, v25.4S, v31.4S // .................................................................e...................................................................................... + add v10.4S, v22.4S, v17.4S // ...........................................................................e............................................................................ + add v31.4S, v4.4S, v1.4S // ................................................................................e....................................................................... + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + trn2 v12.2D, v9.2D, v20.2D // ..................................................................e..................................................................................... + trn2 v30.2D, v13.2D, v16.2D // ...................................................................e.................................................................................... + // gap // ........................................................................................................................................................ 
+ trn1 v23.2D, v9.2D, v20.2D // ....................................................................e................................................................................... + trn1 v28.2D, v13.2D, v16.2D // .....................................................................e.................................................................................. + // gap // ........................................................................................................................................................ + ldr q24, [x4, #-16] // .........................................................................e.............................................................................. + // gap // ........................................................................................................................................................ + sub v22.4S, v22.4S, v17.4S // ..........................................................................e............................................................................. + add v13.4S, v12.4S, v30.4S // ..........................................................................................e............................................................. + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + add v20.4S, v23.4S, v28.4S // .....................................................................................e.................................................................. + add v17.4S, v10.4S, v31.4S // ...............................................................................................e........................................................ 
+ sub v12.4S, v12.4S, v30.4S // .........................................................................................e.............................................................. + sub v4.4S, v4.4S, v1.4S // ...............................................................................e........................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + sub v9.4S, v20.4S, v13.4S // ........................................................................................................e............................................... + mul v16.4S, v22.4S, v29.S[2] // ............................................................................e........................................................................... + // gap // ........................................................................................................................................................ + sqrdmulh v30.4S, v22.4S, v29.S[3] // .............................................................................e.......................................................................... + srshr v22.4S, v17.4S, #23 // ..................................................................................................................e..................................... 
+ // gap // ........................................................................................................................................................ + sqrdmulh v1.4S, v4.4S, v26.S[1] // ..................................................................................e..................................................................... + mul v6.4S, v4.4S, v26.S[0] // .................................................................................e...................................................................... + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + add v4.4S, v20.4S, v13.4S // .........................................................................................................e.............................................. + mul v25.4S, v12.4S, v24.S[0] // ...........................................................................................e............................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + mls v16.4S, v30.4S, v8.S[0] // ..............................................................................e......................................................................... + // gap // ........................................................................................................................................................ 
+ mls v17.4S, v22.4S, v8.4S // ...................................................................................................................e.................................... + // gap // ........................................................................................................................................................ + srshr v20.4S, v4.4S, #23 // ......................................................................................................................e................................. + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + sub v22.4S, v23.4S, v28.4S // ....................................................................................e................................................................... + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + sqrdmulh v12.4S, v12.4S, v24.S[1] // ............................................................................................e........................................................... + mls v6.4S, v1.4S, v8.S[0] // ...................................................................................e.................................................................... + // gap // ........................................................................................................................................................ 
+ // gap // ........................................................................................................................................................ + mul v1.4S, v22.4S, v26.S[2] // ......................................................................................e................................................................. + sqrdmulh v22.4S, v22.4S, v26.S[3] // .......................................................................................e................................................................ + mls v4.4S, v20.4S, v8.4S // .......................................................................................................................e................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + sub v24.4S, v10.4S, v31.4S // ..............................................................................................e......................................................... + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + add v26.4S, v16.4S, v6.4S // ....................................................................................................e................................................... + mls v25.4S, v12.4S, v8.S[0] // .............................................................................................e.......................................................... 
+ // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + sub v30.4S, v16.4S, v6.4S // ...................................................................................................e.................................................... + mls v1.4S, v22.4S, v8.S[0] // ........................................................................................e............................................................... + sub v20.4S, v17.4S, v4.4S // ..........................................................................................................................e............................. + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + srshr v22.4S, v26.4S, #23 // ....................................................................................................................e................................... + mul v10.4S, v30.4S, v19.S[2] // .....................................................................................................e.................................................. + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ 
+ sqrdmulh v16.4S, v30.4S, v19.S[3] // ......................................................................................................e................................................. + sqrdmulh v12.4S, v20.4S, v19.S[1] // .............................................................................................................................e.......................... + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + mul v13.4S, v20.4S, v19.S[0] // ............................................................................................................................e........................... + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + add v20.4S, v17.4S, v4.4S // ...........................................................................................................................e............................ + sub v4.4S, v1.4S, v25.4S // .............................................................................................................e.......................................... + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ 
+ mls v10.4S, v16.4S, v8.S[0] // .......................................................................................................e................................................ + add v31.4S, v1.4S, v25.4S // ..............................................................................................................e......................................... + mls v13.4S, v12.4S, v8.S[0] // ..............................................................................................................................e......................... + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + mul v1.4S, v4.4S, v29.S[0] // ...............................................................................................................e........................................ + srshr v12.4S, v31.4S, #23 // ........................................................................................................................e............................... + str q20, [x1], #(16*4) // ..............................................................................................................................................e......... + // gap // ........................................................................................................................................................ + sqrdmulh v30.4S, v24.4S, v19.S[3] // .................................................................................................e...................................................... + // gap // ........................................................................................................................................................ 
+ // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + sqrdmulh v20.4S, v4.4S, v29.S[1] // ................................................................................................................e....................................... + // gap // ........................................................................................................................................................ + str q13, [x2], #(16*4) // ..................................................................................................................................................e..... + mul v7.4S, v9.4S, v29.S[0] // ..........................................................................................................e............................................. + mul v4.4S, v24.4S, v19.S[2] // ................................................................................................e....................................................... + + // original source code + // ld4 {v9.4S, v10.4S, v11.4S, v12.4S}, [x1] // .........................e...........................................................................................................................|...........................e.............. + // ld4 {v13.4S, v14.4S, v15.4S, v16.4S}, [x2] // ...........................................e.........................................................................................................|.......................................... + // ldr q0, [x5], #(12*16) // ...e.................................................................................................................................................|.....e.................................... 
+ // ldr q4, [x5, #(-12*16 + 1*16)] // ............................e........................................................................................................................|..............................e........... + // ldr q1, [x5, #(-12*16 + 2*16)] // .............e.......................................................................................................................................|...............e.......................... + // ldr q5, [x5, #(-12*16 + 3*16)] // ...........e.........................................................................................................................................|.............e............................ + // ldr q2, [x5, #(-12*16 + 4*16)] // .........e...........................................................................................................................................|...........e.............................. + // ldr q6, [x5, #(-12*16 + 5*16)] // e....................................................................................................................................................|..e....................................... + // sub v24.4s, v9.4s, v10.4s // .....................................e...............................................................................................................|.......................................e.. + // add v9.4s, v9.4s, v10.4s // ..........................................e..........................................................................................................|.......................................... + // mul v10.4s, v24.4s, v1.4s // ............................................e........................................................................................................|.......................................... 
+ // sqrdmulh v24.4s, v24.4s, v5.4s // .............................................e.......................................................................................................|.......................................... + // mls v10.4s, v24.4s, v8.s[0] // ...................................................e.................................................................................................|.......................................... + // sub v24.4s, v11.4s, v12.4s // ........................................e............................................................................................................|.......................................... + // add v11.4s, v11.4s, v12.4s // .........................................e...........................................................................................................|.......................................... + // mul v12.4s, v24.4s, v2.4s // ...............................................e.....................................................................................................|.......................................... + // sqrdmulh v24.4s, v24.4s, v6.4s // ..............................................e......................................................................................................|.......................................... + // mls v12.4s, v24.4s, v8.s[0] // ....................................................e................................................................................................|.......................................... + // sub v24.4s, v9.4s, v11.4s // ..................................................e..................................................................................................|.......................................... 
+ // add v9.4s, v9.4s, v11.4s // .................................................e...................................................................................................|.......................................... + // mul v11.4s, v24.4s, v0.4s // ............................................................e........................................................................................|.......................................... + // sqrdmulh v24.4s, v24.4s, v4.4s // ...................................................................e.................................................................................|.......................................... + // mls v11.4s, v24.4s, v8.s[0] // ..........................................................................e..........................................................................|.......................................... + // sub v24.4s, v10.4s, v12.4s // ..............................................................e......................................................................................|.......................................... + // add v10.4s, v10.4s, v12.4s // .........................................................................e...........................................................................|.......................................... + // mul v12.4s, v24.4s, v0.4s // .....................................................................e...............................................................................|.......................................... + // sqrdmulh v24.4s, v24.4s, v4.4s // ..................................................................e..................................................................................|.......................................... 
+ // mls v12.4s, v24.4s, v8.s[0] // ...........................................................................e.........................................................................|.......................................... + // ldr q0, [x5, #(-12*16 + 6*16)] // ................................................e....................................................................................................|.......................................... + // ldr q4, [x5, #(-12*16 + 7*16)] // ..................e..................................................................................................................................|....................e..................... + // ldr q1, [x5, #(-12*16 + 8*16)] // ........e............................................................................................................................................|..........e............................... + // ldr q5, [x5, #(-12*16 + 9*16)] // .............................e.......................................................................................................................|...............................e.......... + // ldr q2, [x5, #(-12*16 + 10*16)] // ...............................e.....................................................................................................................|.................................e........ + // ldr q6, [x5, #(-12*16 + 11*16)] // .e...................................................................................................................................................|...e...................................... + // sub v24.4s, v13.4s, v14.4s // .......................................................e.............................................................................................|.......................................... 
+ // add v13.4s, v13.4s, v14.4s // ..........................................................e..........................................................................................|.......................................... + // mul v14.4s, v24.4s, v1.4s // .............................................................e.......................................................................................|.......................................... + // sqrdmulh v24.4s, v24.4s, v5.4s // .................................................................e...................................................................................|.......................................... + // mls v14.4s, v24.4s, v8.s[0] // ........................................................................e............................................................................|.......................................... + // sub v24.4s, v15.4s, v16.4s // ......................................................e..............................................................................................|.......................................... + // add v15.4s, v15.4s, v16.4s // .........................................................e...........................................................................................|.......................................... + // mul v16.4s, v24.4s, v2.4s // ...............................................................e.....................................................................................|.......................................... + // sqrdmulh v24.4s, v24.4s, v6.4s // ...........................................................e.........................................................................................|.......................................... 
+ // mls v16.4s, v24.4s, v8.s[0] // .......................................................................e.............................................................................|.......................................... + // sub v24.4s, v13.4s, v15.4s // ................................................................e....................................................................................|.......................................... + // add v13.4s, v13.4s, v15.4s // ..............................................................................e......................................................................|.......................................... + // mul v15.4s, v24.4s, v0.4s // ......................................................................e..............................................................................|.......................................... + // sqrdmulh v24.4s, v24.4s, v4.4s // ....................................................................e................................................................................|.......................................... + // mls v15.4s, v24.4s, v8.s[0] // ............................................................................e........................................................................|.......................................... + // sub v24.4s, v14.4s, v16.4s // .............................................................................e.......................................................................|.......................................... + // add v14.4s, v14.4s, v16.4s // ................................................................................e....................................................................|.......................................... 
+ // mul v16.4s, v24.4s, v0.4s // .................................................................................e...................................................................|.......................................... + // sqrdmulh v24.4s, v24.4s, v4.4s // ..................................................................................e..................................................................|.......................................... + // mls v16.4s, v24.4s, v8.s[0] // .......................................................................................e.............................................................|.......................................... + // trn1 v25.4s, v9.4s, v10.4s // .....................................................................................e...............................................................|.......................................... + // trn2 v26.4s, v9.4s, v10.4s // ...............................................................................e.....................................................................|.......................................... + // trn1 v27.4s, v11.4s, v12.4s // ......................................................................................e..............................................................|.......................................... + // trn2 v28.4s, v11.4s, v12.4s // ...................................................................................e.................................................................|.......................................... + // trn2 v11.2d, v25.2d, v27.2d // ...........................................................................................e.........................................................|.......................................... 
+ // trn2 v12.2d, v26.2d, v28.2d // ........................................................................................e............................................................|.......................................... + // trn1 v9.2d, v25.2d, v27.2d // .........................................................................................e...........................................................|.......................................... + // trn1 v10.2d, v26.2d, v28.2d // ..........................................................................................e..........................................................|.......................................... + // trn1 v25.4s, v13.4s, v14.4s // ............................................................................................e........................................................|.......................................... + // trn2 v26.4s, v13.4s, v14.4s // ....................................................................................e................................................................|.......................................... + // trn1 v27.4s, v15.4s, v16.4s // .............................................................................................e.......................................................|.......................................... + // trn2 v28.4s, v15.4s, v16.4s // ..............................................................................................e......................................................|.......................................... + // trn2 v15.2d, v25.2d, v27.2d // .................................................................................................e...................................................|.......................................... 
+ // trn2 v16.2d, v26.2d, v28.2d // ..................................................................................................e..................................................|.......................................... + // trn1 v13.2d, v25.2d, v27.2d // ...................................................................................................e.................................................|.......................................... + // trn1 v14.2d, v26.2d, v28.2d // ....................................................................................................e................................................|.......................................... + // ldr q0, [x4], #64 // .....................................................e...............................................................................................|.......................................... + // ldr q1, [x4, #(-64 + 16)] // ..................................e..................................................................................................................|....................................e..... + // ldr q2, [x4, #(-64 + 32)] // ........................................................e............................................................................................|.......................................... + // ldr q3, [x4, #(-64 + 48)] // .....................................................................................................e...............................................|.......................................... + // sub v24.4s, v9.4s, v10.4s // ......................................................................................................e..............................................|.......................................... 
+ // add v9.4s, v9.4s, v10.4s // ...............................................................................................e.....................................................|.......................................... + // mul v10.4s, v24.4s, v1.s[2] // .............................................................................................................e.......................................|.......................................... + // sqrdmulh v24.4s, v24.4s, v1.s[3] // ..............................................................................................................e......................................|.......................................... + // mls v10.4s, v24.4s, v8.s[0] // ....................................................................................................................e................................|.......................................... + // sub v24.4s, v11.4s, v12.4s // ...........................................................................................................e.........................................|.......................................... + // add v11.4s, v11.4s, v12.4s // ................................................................................................e....................................................|.......................................... + // mul v12.4s, v24.4s, v2.s[0] // .................................................................................................................e...................................|.......................................... + // sqrdmulh v24.4s, v24.4s, v2.s[1] // ................................................................................................................e....................................|.......................................... 
+ // mls v12.4s, v24.4s, v8.s[0] // .........................................................................................................................e...........................|.......................................... + // sub v24.4s, v13.4s, v14.4s // .......................................................................................................................e.............................|.......................................... + // add v13.4s, v13.4s, v14.4s // ........................................................................................................e............................................|.......................................... + // mul v14.4s, v24.4s, v2.s[2] // ..........................................................................................................................e..........................|.......................................... + // sqrdmulh v24.4s, v24.4s, v2.s[3] // ...........................................................................................................................e.........................|.......................................... + // mls v14.4s, v24.4s, v8.s[0] // .................................................................................................................................e...................|.......................................... + // sub v24.4s, v15.4s, v16.4s // ..........................................................................................................e..........................................|.......................................... + // add v15.4s, v15.4s, v16.4s // .......................................................................................................e.............................................|.......................................... 
+ // mul v16.4s, v24.4s, v3.s[0] // ...................................................................................................................e.................................|.......................................... + // sqrdmulh v24.4s, v24.4s, v3.s[1] // ........................................................................................................................e............................|.......................................... + // mls v16.4s, v24.4s, v8.s[0] // ...............................................................................................................................e.....................|.......................................... + // sub v24.4s, v9.4s, v11.4s // .............................................................................................................................e.......................|.......................................... + // add v9.4s, v9.4s, v11.4s // .........................................................................................................e...........................................|.......................................... + // mul v11.4s, v24.4s, v0.s[2] // ....................................................................................................................................................e|.......................................... + // sqrdmulh v24.4s, v24.4s, v0.s[3] // ................................................................................................................................................e....|.......................................... + // mls v11.4s, v24.4s, v8.s[0] // .....*...............................................................................................................................................|.......*.................................. 
+ // sub v24.4s, v10.4s, v12.4s // ................................................................................................................................e....................|.......................................... + // add v10.4s, v10.4s, v12.4s // ..............................................................................................................................e......................|.......................................... + // mul v12.4s, v24.4s, v0.s[2] // ....................................................................................................................................e................|.......................................... + // sqrdmulh v24.4s, v24.4s, v0.s[3] // .....................................................................................................................................e...............|.......................................... + // mls v12.4s, v24.4s, v8.s[0] // ..........................................................................................................................................e..........|.......................................... + // sub v24.4s, v13.4s, v15.4s // ............................................................................................................e........................................|.......................................... + // add v13.4s, v13.4s, v15.4s // ..................................................................................................................e..................................|.......................................... + // mul v15.4s, v24.4s, v1.s[0] // ...................................................................................................................................................e.|.......................................... 
+ // sqrdmulh v24.4s, v24.4s, v1.s[1] // .....................................................................................................................................................*.......................................... + // mls v15.4s, v24.4s, v8.s[0] // ....*................................................................................................................................................|......*................................... + // sub v24.4s, v14.4s, v16.4s // .........................................................................................................................................e...........|.......................................... + // add v14.4s, v14.4s, v16.4s // ...........................................................................................................................................e.........|.......................................... + // mul v16.4s, v24.4s, v1.s[0] // .............................................................................................................................................e.......|.......................................... + // sqrdmulh v24.4s, v24.4s, v1.s[1] // .................................................................................................................................................e...|.......................................... + // mls v16.4s, v24.4s, v8.s[0] // ..*..................................................................................................................................................|....*..................................... + // srshr v24.4S, v9.4S, #23 // ...............................................................................................................e.....................................|.......................................... 
+ // mls v9.4s, v24.4s, v8.4s // .....................................................................................................................e...............................|.......................................... + // srshr v24.4S, v10.4S, #23 // ...................................................................................................................................e.................|.......................................... + // mls v10.4s, v24.4s, v8.4s // .....................................................................................................................................................|*......................................... + // srshr v24.4S, v13.4S, #23 // ......................................................................................................................e..............................|.......................................... + // mls v13.4s, v24.4s, v8.4s // ............................................................................................................................e........................|.......................................... + // srshr v24.4S, v14.4S, #23 // ..............................................................................................................................................e......|.......................................... + // mls v14.4s, v24.4s, v8.4s // .....................................................................................................................................................|.*........................................ + // sub v24.4s, v9.4s, v13.4s // ..................................................................................................................................e..................|.......................................... 
+ // add v9.4s, v9.4s, v13.4s // ........................................................................................................................................e............|.......................................... + // mul v13.4s, v24.4s, v0.s[0] // .......................................................................................................................................e.............|.......................................... + // sqrdmulh v24.4s, v24.4s, v0.s[1] // ......................................................................................................................................e..............|.......................................... + // mls v13.4s, v24.4s, v8.s[0] // ............................................................................................................................................e........|.......................................... + // sub v24.4s, v10.4s, v14.4s // ......*..............................................................................................................................................|........*................................. + // add v10.4s, v10.4s, v14.4s // .......*.............................................................................................................................................|.........*................................ + // mul v14.4s, v24.4s, v0.s[0] // ...........................*.........................................................................................................................|.............................*............ + // sqrdmulh v24.4s, v24.4s, v0.s[1] // ..........................*..........................................................................................................................|............................*............. 
+ // mls v14.4s, v24.4s, v8.s[0] // .................................*...................................................................................................................|...................................*...... + // sub v24.4s, v11.4s, v15.4s // ...............*.....................................................................................................................................|.................*........................ + // add v11.4s, v11.4s, v15.4s // ..............*......................................................................................................................................|................*......................... + // mul v15.4s, v24.4s, v0.s[0] // ......................*..............................................................................................................................|........................*................. + // sqrdmulh v24.4s, v24.4s, v0.s[1] // .....................*...............................................................................................................................|.......................*.................. + // mls v15.4s, v24.4s, v8.s[0] // ................................*....................................................................................................................|..................................*....... + // sub v24.4s, v12.4s, v16.4s // ............*........................................................................................................................................|..............*........................... + // add v12.4s, v12.4s, v16.4s // ..........*..........................................................................................................................................|............*............................. 
+ // mul v16.4s, v24.4s, v0.s[0] // ...................*.................................................................................................................................|.....................*.................... + // sqrdmulh v24.4s, v24.4s, v0.s[1] // ....................*................................................................................................................................|......................*................... + // mls v16.4s, v24.4s, v8.s[0] // ..............................*......................................................................................................................|................................*......... + // str q9, [x1], #(16*4) // ...............................................................................................................................................e.....|.......................................... + // str q10, [x1, #(-16*4 + 1*16)] // ................*....................................................................................................................................|..................*....................... + // str q11, [x1, #(-16*4 + 2*16)] // .......................*.............................................................................................................................|.........................*................ + // str q12, [x1, #(-16*4 + 3*16)] // .................*...................................................................................................................................|...................*...................... + // str q13, [x2], #(16*4) // ..................................................................................................................................................e..|.......................................... 
+ // str q14, [x2, #(-16*4 + 1*16)] // ......................................*..............................................................................................................|........................................*. + // str q15, [x2, #(-16*4 + 2*16)] // ....................................*................................................................................................................|......................................*... + // str q16, [x2, #(-16*4 + 3*16)] // ...................................*.................................................................................................................|.....................................*.... + // add x1, x1, #64 // ........................*............................................................................................................................|..........................*............... + // add x2, x2, #64 // .......................................*.............................................................................................................|.........................................* + + sub count, count, #1 + cbnz count, layer45678_start + sqrdmulh v13.4S, v9.4S, v29.S[1] // *............................ + mls v1.4S, v20.4S, v8.S[0] // ...*......................... + // gap // ............................. + // gap // ............................. + mls v26.4S, v22.4S, v8.4S // .*........................... + mls v31.4S, v12.4S, v8.4S // ..*.......................... + // gap // ............................. + // gap // ............................. + mls v4.4S, v30.4S, v8.S[0] // .....*....................... + // gap // ............................. + // gap // ............................. + // gap // ............................. + mls v7.4S, v13.4S, v8.S[0] // ....*........................ + sub v18.4S, v10.4S, v1.4S // .........*................... + // gap // ............................. 
+ // gap // ............................. + sub v28.4S, v26.4S, v31.4S // ......*...................... + add v12.4S, v10.4S, v1.4S // ........*.................... + // gap // ............................. + // gap // ............................. + add v2.4S, v26.4S, v31.4S // .......*..................... + mul v1.4S, v18.4S, v19.S[0] // ..............*.............. + // gap // ............................. + // gap // ............................. + sub v22.4S, v4.4S, v7.4S // ...........*................. + str q12, [x1, #-16] // .............*............... + sqrdmulh v13.4S, v18.4S, v19.S[1] // ...............*............. + // gap // ............................. + sqrdmulh v20.4S, v28.4S, v19.S[1] // ....................*........ + mul v12.4S, v28.4S, v19.S[0] // .....................*....... + str q2, [x1, #-48] // ............*................ + // gap // ............................. + mul v18.4S, v22.4S, v19.S[0] // .................*........... + sqrdmulh v22.4S, v22.4S, v19.S[1] // ................*............ + // gap // ............................. + // gap // ............................. + add v7.4S, v4.4S, v7.4S // ..........*.................. + mls v1.4S, v13.4S, v8.S[0] // ......................*...... + // gap // ............................. + // gap // ............................. + mls v12.4S, v20.4S, v8.S[0] // ........................*.... + // gap // ............................. + // gap // ............................. + // gap // ............................. + str q7, [x1, #-32] // ..................*.......... + add x1, x1, #64 // ...................*......... + mls v18.4S, v22.4S, v8.S[0] // .......................*..... + // gap // ............................. + str q1, [x2, #-16] // .........................*... + // gap // ............................. + // gap // ............................. + // gap // ............................. + str q12, [x2, #-48] // ...........................*. 
+ // gap // ............................. + // gap // ............................. + // gap // ............................. + str q18, [x2, #-32] // ..........................*.. + add x2, x2, #64 // ............................* + // gap // ............................. + // gap // ............................. + + // original source code + // sqrdmulh v13.4S, v9.4S, v29.S[1] // *............................ + // mls v26.4S, v22.4S, v8.4S // ..*.......................... + // mls v31.4S, v12.4S, v8.4S // ...*......................... + // mls v1.4S, v20.4S, v8.S[0] // .*........................... + // mls v7.4S, v13.4S, v8.S[0] // .....*....................... + // mls v4.4S, v30.4S, v8.S[0] // ....*........................ + // sub v30.4S, v26.4S, v31.4S // .......*..................... + // add v22.4S, v26.4S, v31.4S // .........*................... + // add v12.4S, v10.4S, v1.4S // ........*.................... + // sub v1.4S, v10.4S, v1.4S // ......*...................... + // add v9.4S, v4.4S, v7.4S // ...................*......... + // sub v4.4S, v4.4S, v7.4S // ...........*................. + // str q22, [x1, #-48] // ................*............ + // str q12, [x1, #-16] // ............*................ + // mul v20.4S, v1.4S, v19.S[0] // ..........*.................. + // sqrdmulh v12.4S, v1.4S, v19.S[1] // .............*............... + // sqrdmulh v31.4S, v4.4S, v19.S[1] // ..................*.......... + // mul v13.4S, v4.4S, v19.S[0] // .................*........... + // str q9, [x1, #-32] // ......................*...... + // add x1, x1, #64 // .......................*..... + // sqrdmulh v22.4S, v30.4S, v19.S[1] // ..............*.............. + // mul v1.4S, v30.4S, v19.S[0] // ...............*............. + // mls v20.4S, v12.4S, v8.S[0] // ....................*........ + // mls v13.4S, v31.4S, v8.S[0] // ........................*.... + // mls v1.4S, v22.4S, v8.S[0] // .....................*....... 
+ // str q20, [x2, #-16] // .........................*... + // str q13, [x2, #-32] // ...........................*. + // str q1, [x2, #-48] // ..........................*.. + // add x2, x2, #64 // ............................* + + +// ----------------------------------------------------------------------------- + + ninv .req v25 + ninv_tw .req v26 + modulus_half .req v30 + neg_modulus_half .req v31 + + ASM_LOAD(xtmp, ninv_addr) + ld1r {ninv.4s}, [xtmp] + ASM_LOAD(xtmp, ninv_tw_addr) + ld1r {ninv_tw.4s}, [xtmp] + + ushr modulus_half.4S, consts.4S, #1 + neg neg_modulus_half.4S, modulus_half.4S + + mov count, #8 + ASM_LOAD(r_ptr0, roots_l012) + load_roots_123 + + .p2align 2 + ldr q15, [x0, #0] // *................. + // gap // .................. + // gap // .................. + ldr q5, [x0, #128] // .*................ + ldr q9, [x0, #256] // ....*............. + ldr q17, [x0, #384] // .....*............ + // gap // .................. + // gap // .................. + ldr q10, [x0, #512] // ......*........... + // gap // .................. + // gap // .................. + // gap // .................. + ldr q23, [x0, #768] // ........*......... + // gap // .................. + // gap // .................. + // gap // .................. + sub v27.4S, v15.4S, v5.4S // ..*............... + ldr q6, [x0, #896] // .........*........ + // gap // .................. + // gap // .................. + add v14.4S, v15.4S, v5.4S // ...............*.. + ldr q18, [x0, #640] // .......*.......... + // gap // .................. + sub v12.4S, v9.4S, v17.4S // ..........*....... + sqrdmulh v5.4S, v27.4S, v1.S[3] // ...........*...... + mul v16.4S, v27.4S, v1.S[2] // ...*.............. + // gap // .................. + // gap // .................. + add v27.4S, v9.4S, v17.4S // ..............*... + // gap // .................. + // gap // .................. + // gap // .................. + // gap // .................. + // gap // .................. 
+ add v28.4S, v23.4S, v6.4S // .................* + mul v13.4S, v12.4S, v2.S[0] // .............*.... + mls v16.4S, v5.4S, v8.S[0] // ................*. + // gap // .................. + // gap // .................. + sub v17.4S, v10.4S, v18.4S // ............*..... + + // original source code + // ldr q11, [x0, #0] // *................. + // ldr q7, [x0, #128] // .*................ + // sub v19.4S, v11.4S, v7.4S // ......*........... + // mul v16.4S, v19.4S, v1.S[2] // ............*..... + // ldr q24, [x0, #256] // ..*............... + // ldr q14, [x0, #384] // ...*.............. + // ldr q10, [x0, #512] // ....*............. + // ldr q18, [x0, #640] // .........*........ + // ldr q23, [x0, #768] // .....*............ + // ldr q6, [x0, #896] // .......*.......... + // sub v12.4S, v24.4S, v14.4S // ..........*....... + // sqrdmulh v22.4S, v19.4S, v1.S[3] // ...........*...... + // sub v17.4S, v10.4S, v18.4S // .................* + // mul v13.4S, v12.4S, v2.S[0] // ...............*.. + // add v27.4S, v24.4S, v14.4S // .............*.... + // add v14.4S, v11.4S, v7.4S // ........*......... + // mls v16.4S, v22.4S, v8.S[0] // ................*. + // add v28.4S, v23.4S, v6.4S // ..............*... + + sub count, count, #1 +layer123_start: + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + sqrdmulh v7.4S, v12.4S, v2.S[1] // ................*....................................................................................................... + sub v9.4S, v23.4S, v6.4S // .......................*................................................................................................ + // gap // ........................................................................................................................ 
+ mul v24.4S, v17.4S, v2.S[2] // ....................*................................................................................................... + sqrdmulh v17.4S, v17.4S, v2.S[3] // .....................*.................................................................................................. + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + mul v20.4S, v9.4S, v3.S[0] // .........................*.............................................................................................. + add v15.4S, v10.4S, v18.4S // ...................*.................................................................................................... + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + mls v13.4S, v7.4S, v8.S[0] // .................*...................................................................................................... + sub v23.4S, v14.4S, v27.4S // ............................*........................................................................................... + ldr q11, [x0, #16] // e....................................................................................................................... + ldr q7, [x0, #144] // .e...................................................................................................................... 
+ mls v24.4S, v17.4S, v8.S[0] // ......................*................................................................................................. + sqrdmulh v18.4S, v9.4S, v3.S[1] // ..........................*............................................................................................. + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + mul v22.4S, v23.4S, v0.S[2] // ..............................*......................................................................................... + sqrdmulh v5.4S, v23.4S, v0.S[3] // ...............................*........................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + sub v12.4S, v15.4S, v28.4S // ......................................*................................................................................. + sub v17.4S, v16.4S, v13.4S // .................................*...................................................................................... + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + mls v20.4S, v18.4S, v8.S[0] // ...........................*............................................................................................ 
+ add v10.4S, v14.4S, v27.4S // .............................*.......................................................................................... + // gap // ........................................................................................................................ + mls v22.4S, v5.4S, v8.S[0] // ................................*....................................................................................... + // gap // ........................................................................................................................ + sub v19.4S, v11.4S, v7.4S // ........e............................................................................................................... + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + mul v9.4S, v17.4S, v0.S[2] // ...................................*.................................................................................... + sqrdmulh v18.4S, v17.4S, v0.S[3] // ....................................*................................................................................... + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + mul v23.4S, v12.4S, v1.S[0] // ........................................*............................................................................... + sqrdmulh v29.4S, v12.4S, v1.S[1] // .........................................*.............................................................................. 
+ // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + sub v14.4S, v24.4S, v20.4S // ...........................................*............................................................................ + add v4.4S, v16.4S, v13.4S // ..................................*..................................................................................... + mul v16.4S, v19.4S, v1.S[2] // ..........e............................................................................................................. + // gap // ........................................................................................................................ + mls v9.4S, v18.4S, v8.S[0] // .....................................*.................................................................................. + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + mls v23.4S, v29.4S, v8.S[0] // ..........................................*............................................................................. + add v18.4S, v24.4S, v20.4S // ............................................*........................................................................... + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ 
+ add v20.4S, v15.4S, v28.4S // .......................................*................................................................................ + mul v13.4S, v14.4S, v1.S[0] // .............................................*.......................................................................... + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + sqrdmulh v15.4S, v14.4S, v1.S[1] // ..............................................*......................................................................... + sub v17.4S, v4.4S, v18.4S // .....................................................*.................................................................. + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + add v14.4S, v10.4S, v20.4S // .................................................*...................................................................... + sub v29.4S, v10.4S, v20.4S // ................................................*....................................................................... + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + mul v12.4S, v17.4S, v0.S[0] // .......................................................*................................................................ 
+ sqrdmulh v20.4S, v17.4S, v0.S[1] // ........................................................*............................................................... + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + add v27.4S, v4.4S, v18.4S // ......................................................*................................................................. + mul v18.4S, v29.4S, v0.S[0] // ..................................................*..................................................................... + // gap // ........................................................................................................................ + mls v13.4S, v15.4S, v8.S[0] // ...............................................*........................................................................ + // gap // ........................................................................................................................ + sqrdmulh v24.4S, v29.4S, v0.S[1] // ...................................................*.................................................................... + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + mls v12.4S, v20.4S, v8.S[0] // .........................................................*.............................................................. + sub v4.4S, v22.4S, v23.4S // ..........................................................*............................................................. 
+ // gap // ........................................................................................................................ + sqrdmulh v5.4S, v14.4S, v26.4S // .........................................................................................*.............................. + // gap // ........................................................................................................................ + add v17.4S, v22.4S, v23.4S // ...........................................................*............................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + mul v20.4S, v4.4S, v0.S[0] // ............................................................*........................................................... + sub v28.4S, v9.4S, v13.4S // ...............................................................*........................................................ + // gap // ........................................................................................................................ + mls v18.4S, v24.4S, v8.S[0] // ....................................................*................................................................... + // gap // ........................................................................................................................ + add v23.4S, v9.4S, v13.4S // ................................................................*....................................................... + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ 
+ sqrdmulh v6.4S, v4.4S, v0.S[1] // .............................................................*.......................................................... + mul v22.4S, v28.4S, v0.S[0] // .................................................................*...................................................... + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + sqrdmulh v29.4S, v17.4S, v26.4S // ...............................................................................................*........................ + mul v13.4S, v14.4S, v25.4S // ........................................................................................*............................... + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + cmge v15.4S, v31.4S, v18.4S // ....................................................................*................................................... + cmge v21.4S, v18.4S, v30.4S // .....................................................................*.................................................. + // gap // ........................................................................................................................ + mls v20.4S, v6.4S, v8.S[0] // ..............................................................*......................................................... + // gap // ........................................................................................................................ 
+ sqrdmulh v9.4S, v28.4S, v0.S[1] // ..................................................................*..................................................... + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + sqrdmulh v28.4S, v23.4S, v26.4S // ..................................................................................................*..................... + sub v24.4S, v15.4S, v21.4S // ......................................................................*................................................. + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + cmge v4.4S, v12.4S, v30.4S // .........................................................................*.............................................. + cmge v6.4S, v31.4S, v12.4S // ........................................................................*............................................... + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + cmge v21.4S, v31.4S, v20.4S // ............................................................................*........................................... + cmge v10.4S, v20.4S, v30.4S // .............................................................................*.......................................... 
+ // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + mls v22.4S, v9.4S, v8.S[0] // ...................................................................*.................................................... + mls v18.4S, v24.4S, v8.4S // .......................................................................*................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + sub v15.4S, v6.4S, v4.4S // ..........................................................................*............................................. + sub v21.4S, v21.4S, v10.4S // ..............................................................................*......................................... + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + mul v4.4S, v27.4S, v25.4S // ...........................................................................................*............................ + sqrdmulh v14.4S, v27.4S, v26.4S // ............................................................................................*........................... + cmge v24.4S, v22.4S, v30.4S // .................................................................................*...................................... 
+ // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + mul v9.4S, v17.4S, v25.4S // ..............................................................................................*......................... + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + cmge v6.4S, v31.4S, v22.4S // ................................................................................*....................................... + mls v20.4S, v21.4S, v8.4S // ...............................................................................*........................................ + mls v4.4S, v14.4S, v8.S[0] // .............................................................................................*.......................... + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + mls v13.4S, v5.4S, v8.S[0] // ..........................................................................................*............................. + // gap // ........................................................................................................................ + mls v12.4S, v15.4S, v8.4S // ...........................................................................*............................................ + mls v9.4S, v29.4S, v8.S[0] // ................................................................................................*....................... 
+ // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + str q20, [x0, #768] // ......................................................................................*................................. + mul v20.4S, v23.4S, v25.4S // .................................................................................................*...................... + sub v23.4S, v6.4S, v24.4S // ..................................................................................*..................................... + cmge v29.4S, v31.4S, v13.4S // ....................................................................................................*................... + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + cmge v21.4S, v13.4S, v30.4S // .....................................................................................................*.................. + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + cmge v15.4S, v31.4S, v4.4S // ........................................................................................................*............... + cmge v5.4S, v4.4S, v30.4S // .........................................................................................................*.............. + // gap // ........................................................................................................................ 
+ // gap // ........................................................................................................................ + mls v20.4S, v28.4S, v8.S[0] // ...................................................................................................*.................... + sub v29.4S, v29.4S, v21.4S // ......................................................................................................*................. + // gap // ........................................................................................................................ + ldr q24, [x0, #272] // ..e..................................................................................................................... + mls v22.4S, v23.4S, v8.4S // ...................................................................................*.................................... + sub v21.4S, v15.4S, v5.4S // ..........................................................................................................*............. + ldr q14, [x0, #400] // ...e.................................................................................................................... + str q18, [x0, #512] // ....................................................................................*................................... + cmge v27.4S, v31.4S, v9.4S // ............................................................................................................*........... + cmge v28.4S, v9.4S, v30.4S // .............................................................................................................*.......... + ldr q10, [x0, #528] // ....e................................................................................................................... + ldr q18, [x0, #656] // .....e.................................................................................................................. 
+ cmge v17.4S, v20.4S, v30.4S // .................................................................................................................*...... + cmge v6.4S, v31.4S, v20.4S // ................................................................................................................*....... + sub v15.4S, v27.4S, v28.4S // ..............................................................................................................*......... + str q12, [x0, #640] // .....................................................................................*.................................. + mls v13.4S, v29.4S, v8.4S // .......................................................................................................*................ + // gap // ........................................................................................................................ + mls v4.4S, v21.4S, v8.4S // ...........................................................................................................*............ + ldr q23, [x0, #784] // ......e................................................................................................................. + // gap // ........................................................................................................................ + sub v5.4S, v6.4S, v17.4S // ..................................................................................................................*..... + mls v9.4S, v15.4S, v8.4S // ...............................................................................................................*........ + ldr q6, [x0, #912] // .......e................................................................................................................ + str q22, [x0, #896] // .......................................................................................*................................ 
+ sub v12.4S, v24.4S, v14.4S // .............e.......................................................................................................... + sqrdmulh v22.4S, v19.4S, v1.S[3] // ...........e............................................................................................................ + // gap // ........................................................................................................................ + mls v20.4S, v5.4S, v8.4S // ...................................................................................................................*.... + str q13, [x0], #(16) // ....................................................................................................................*... + sub v17.4S, v10.4S, v18.4S // ..................e..................................................................................................... + mul v13.4S, v12.4S, v2.S[0] // ...............e........................................................................................................ + // gap // ........................................................................................................................ + str q4, [x0, #112] // .....................................................................................................................*.. + add v27.4S, v24.4S, v14.4S // ..............e......................................................................................................... + add v14.4S, v11.4S, v7.4S // .........e.............................................................................................................. + // gap // ........................................................................................................................ + str q9, [x0, #240] // ......................................................................................................................*. 
+ mls v16.4S, v22.4S, v8.S[0] // ............e........................................................................................................... + // gap // ........................................................................................................................ + str q20, [x0, #368] // .......................................................................................................................* + add v28.4S, v23.4S, v6.4S // ........................e............................................................................................... + + // original source code + // ldr q9, [x0, #0] // e...............................................................................................................|.......e.............................................................................................................. + // ldr q10, [x0, #(1*(1024/8))] // .e..............................................................................................................|........e............................................................................................................. + // ldr q11, [x0, #(2*(1024/8))] // ...............................................................................e................................|......................................................................................e............................... + // ldr q12, [x0, #(3*(1024/8))] // ..................................................................................e.............................|.........................................................................................e............................ + // ldr q13, [x0, #(4*(1024/8))] // ......................................................................................e.........................|.............................................................................................e........................ 
+ // ldr q14, [x0, #(5*(1024/8))] // .......................................................................................e........................|..............................................................................................e....................... + // ldr q15, [x0, #(6*(1024/8))] // ..............................................................................................e.................|.....................................................................................................e................ + // ldr q16, [x0, #(7*(1024/8))] // .................................................................................................e..............|........................................................................................................e............. + // sub v24.4s, v9.4s, v10.4s // ...........e....................................................................................................|..................e................................................................................................... + // add v9.4s, v9.4s, v10.4s // ...........................................................................................................e....|..................................................................................................................e... + // mul v10.4s, v24.4s, v1.s[2] // ..................e.............................................................................................|.........................e............................................................................................ + // sqrdmulh v24.4s, v24.4s, v1.s[3] // ....................................................................................................e...........|...........................................................................................................e.......... 
+ // mls v10.4s, v24.4s, v8.s[0] // .............................................................................................................e..|....................................................................................................................e. + // sub v24.4s, v11.4s, v12.4s // ...................................................................................................e............|..........................................................................................................e........... + // add v11.4s, v11.4s, v12.4s // ..........................................................................................................e.....|.................................................................................................................e.... + // mul v12.4s, v24.4s, v2.s[0] // ........................................................................................................e.......|...............................................................................................................e...... + // sqrdmulh v24.4s, v24.4s, v2.s[1] // ................................................................................................................*...................................................................................................................... + // mls v12.4s, v24.4s, v8.s[0] // ................................................................................................................|.....*................................................................................................................ + // sub v24.4s, v13.4s, v14.4s // .......................................................................................................e........|..............................................................................................................e....... 
+ // add v13.4s, v13.4s, v14.4s // ................................................................................................................|....*................................................................................................................. + // mul v14.4s, v24.4s, v2.s[2] // ................................................................................................................|.*.................................................................................................................... + // sqrdmulh v24.4s, v24.4s, v2.s[3] // ................................................................................................................|..*................................................................................................................... + // mls v14.4s, v24.4s, v8.s[0] // ..*.............................................................................................................|.........*............................................................................................................ + // sub v24.4s, v15.4s, v16.4s // ................................................................................................................|*..................................................................................................................... + // add v15.4s, v15.4s, v16.4s // ...............................................................................................................e|...................................................................................................................... + // mul v16.4s, v24.4s, v3.s[0] // ................................................................................................................|...*.................................................................................................................. 
+ // sqrdmulh v24.4s, v24.4s, v3.s[1] // ...*............................................................................................................|..........*........................................................................................................... + // mls v16.4s, v24.4s, v8.s[0] // ........*.......................................................................................................|...............*...................................................................................................... + // sub v24.4s, v9.4s, v11.4s // ................................................................................................................|......*............................................................................................................... + // add v9.4s, v9.4s, v11.4s // .........*......................................................................................................|................*..................................................................................................... + // mul v11.4s, v24.4s, v0.s[2] // ....*...........................................................................................................|...........*.......................................................................................................... + // sqrdmulh v24.4s, v24.4s, v0.s[3] // .....*..........................................................................................................|............*......................................................................................................... + // mls v11.4s, v24.4s, v8.s[0] // ..........*.....................................................................................................|.................*.................................................................................................... 
+ // sub v24.4s, v10.4s, v12.4s // .......*........................................................................................................|..............*....................................................................................................... + // add v10.4s, v10.4s, v12.4s // .................*..............................................................................................|........................*............................................................................................. + // mul v12.4s, v24.4s, v0.s[2] // ............*...................................................................................................|...................*.................................................................................................. + // sqrdmulh v24.4s, v24.4s, v0.s[3] // .............*..................................................................................................|....................*................................................................................................. + // mls v12.4s, v24.4s, v8.s[0] // ...................*............................................................................................|..........................*........................................................................................... + // sub v24.4s, v13.4s, v15.4s // ......*.........................................................................................................|.............*........................................................................................................ + // add v13.4s, v13.4s, v15.4s // ......................*.........................................................................................|.............................*........................................................................................ 
+ // mul v15.4s, v24.4s, v1.s[0] // ..............*.................................................................................................|.....................*................................................................................................ + // sqrdmulh v24.4s, v24.4s, v1.s[1] // ...............*................................................................................................|......................*............................................................................................... + // mls v15.4s, v24.4s, v8.s[0] // ....................*...........................................................................................|...........................*.......................................................................................... + // sub v24.4s, v14.4s, v16.4s // ................*...............................................................................................|.......................*.............................................................................................. + // add v14.4s, v14.4s, v16.4s // .....................*..........................................................................................|............................*......................................................................................... + // mul v16.4s, v24.4s, v1.s[0] // .......................*........................................................................................|..............................*....................................................................................... + // sqrdmulh v24.4s, v24.4s, v1.s[1] // ........................*.......................................................................................|...............................*...................................................................................... 
+ // mls v16.4s, v24.4s, v8.s[0] // ................................*...............................................................................|.......................................*.............................................................................. + // sub v24.4s, v9.4s, v13.4s // ...........................*....................................................................................|..................................*................................................................................... + // add v9.4s, v9.4s, v13.4s // ..........................*.....................................................................................|.................................*.................................................................................... + // mul v13.4s, v24.4s, v0.s[0] // ...............................*................................................................................|......................................*............................................................................... + // sqrdmulh v24.4s, v24.4s, v0.s[1] // .................................*..............................................................................|........................................*............................................................................. + // mls v13.4s, v24.4s, v8.s[0] // ........................................*.......................................................................|...............................................*...................................................................... + // sub v24.4s, v10.4s, v14.4s // .........................*......................................................................................|................................*..................................................................................... 
+ // add v10.4s, v10.4s, v14.4s // ..............................*.................................................................................|.....................................*................................................................................ + // mul v14.4s, v24.4s, v0.s[0] // ............................*...................................................................................|...................................*.................................................................................. + // sqrdmulh v24.4s, v24.4s, v0.s[1] // .............................*..................................................................................|....................................*................................................................................. + // mls v14.4s, v24.4s, v8.s[0] // ..................................*.............................................................................|.........................................*............................................................................ + // sub v24.4s, v11.4s, v15.4s // ...................................*............................................................................|..........................................*........................................................................... + // add v11.4s, v11.4s, v15.4s // .....................................*..........................................................................|............................................*......................................................................... + // mul v15.4s, v24.4s, v0.s[0] // ......................................*.........................................................................|.............................................*........................................................................ 
+ // sqrdmulh v24.4s, v24.4s, v0.s[1] // ..........................................*.....................................................................|.................................................*.................................................................... + // mls v15.4s, v24.4s, v8.s[0] // ................................................*...............................................................|.......................................................*.............................................................. + // sub v24.4s, v12.4s, v16.4s // .......................................*........................................................................|..............................................*....................................................................... + // add v12.4s, v12.4s, v16.4s // .........................................*......................................................................|................................................*..................................................................... + // mul v16.4s, v24.4s, v0.s[0] // ...........................................*....................................................................|..................................................*................................................................... + // sqrdmulh v24.4s, v24.4s, v0.s[1] // .................................................*..............................................................|........................................................*............................................................. + // mls v16.4s, v24.4s, v8.s[0] // ........................................................*.......................................................|...............................................................*...................................................... 
+ // cmge v27.4s, v31.4s, v13.4s // ..............................................*.................................................................|.....................................................*................................................................ + // cmge v28.4s, v13.4s, v30.4s // ...............................................*................................................................|......................................................*............................................................... + // sub v28.4s, v27.4s, v28.4s // ...................................................*............................................................|..........................................................*........................................................... + // mls v13.4s, v28.4s, v8.4s // .........................................................*......................................................|................................................................*..................................................... + // cmge v27.4s, v31.4s, v14.4s // .....................................................*..........................................................|............................................................*......................................................... + // cmge v28.4s, v14.4s, v30.4s // ....................................................*...........................................................|...........................................................*.......................................................... + // sub v28.4s, v27.4s, v28.4s // ..........................................................*.....................................................|.................................................................*.................................................... 
+ // mls v14.4s, v28.4s, v8.4s // ....................................................................*...........................................|...........................................................................*.......................................... + // cmge v27.4s, v31.4s, v15.4s // ......................................................*.........................................................|.............................................................*........................................................ + // cmge v28.4s, v15.4s, v30.4s // .......................................................*........................................................|..............................................................*....................................................... + // sub v28.4s, v27.4s, v28.4s // ...........................................................*....................................................|..................................................................*................................................... + // mls v15.4s, v28.4s, v8.4s // .................................................................*..............................................|........................................................................*............................................. + // cmge v27.4s, v31.4s, v16.4s // ................................................................*...............................................|.......................................................................*.............................................. + // cmge v28.4s, v16.4s, v30.4s // ..............................................................*.................................................|.....................................................................*................................................ 
+ // sub v28.4s, v27.4s, v28.4s // ........................................................................*.......................................|...............................................................................*...................................... + // mls v16.4s, v28.4s, v8.4s // ................................................................................*...............................|.......................................................................................*.............................. + // str q13, [x0, #(4*(1024/8))] // ...................................................................................*............................|..........................................................................................*........................... + // str q14, [x0, #(5*(1024/8))] // ...........................................................................................*....................|..................................................................................................*................... + // str q15, [x0, #(6*(1024/8))] // ......................................................................*.........................................|.............................................................................*........................................ + // str q16, [x0, #(7*(1024/8))] // ..................................................................................................*.............|.........................................................................................................*............ + // mul v13.4s, v9.4s, v25.4s // .............................................*..................................................................|....................................................*................................................................. 
+ // sqrdmulh v9.4s, v9.4s, v26.4s // ....................................*...........................................................................|...........................................*.......................................................................... + // mls v13.4s, v9.4s, v8.s[0] // ...................................................................*............................................|..........................................................................*........................................... + // mul v14.4s, v10.4s, v25.4s // ............................................................*...................................................|...................................................................*.................................................. + // sqrdmulh v10.4s, v10.4s, v26.4s // .............................................................*..................................................|....................................................................*................................................. + // mls v14.4s, v10.4s, v8.s[0] // ..................................................................*.............................................|.........................................................................*............................................ + // mul v15.4s, v11.4s, v25.4s // ...............................................................*................................................|......................................................................*............................................... + // sqrdmulh v11.4s, v11.4s, v26.4s // ............................................*...................................................................|...................................................*.................................................................. 
+ // mls v15.4s, v11.4s, v8.s[0] // .....................................................................*..........................................|............................................................................*......................................... + // mul v16.4s, v12.4s, v25.4s // .......................................................................*........................................|..............................................................................*....................................... + // sqrdmulh v12.4s, v12.4s, v26.4s // ..................................................*.............................................................|.........................................................*............................................................ + // mls v16.4s, v12.4s, v8.s[0] // .............................................................................*..................................|....................................................................................*................................. + // cmge v27.4s, v31.4s, v13.4s // .........................................................................*......................................|................................................................................*..................................... + // cmge v28.4s, v13.4s, v30.4s // ..........................................................................*.....................................|.................................................................................*.................................... + // sub v28.4s, v27.4s, v28.4s // ..............................................................................*.................................|.....................................................................................*................................ 
+ // mls v13.4s, v28.4s, v8.4s // ............................................................................................*...................|...................................................................................................*.................. + // cmge v27.4s, v31.4s, v14.4s // ...........................................................................*....................................|..................................................................................*................................... + // cmge v28.4s, v14.4s, v30.4s // ............................................................................*...................................|...................................................................................*.................................. + // sub v28.4s, v27.4s, v28.4s // .................................................................................*..............................|........................................................................................*............................. + // mls v14.4s, v28.4s, v8.4s // .............................................................................................*..................|....................................................................................................*................. + // cmge v27.4s, v31.4s, v15.4s // ....................................................................................*...........................|...........................................................................................*.......................... + // cmge v28.4s, v15.4s, v30.4s // .....................................................................................*..........................|............................................................................................*......................... 
+ // sub v28.4s, v27.4s, v28.4s // ..........................................................................................*.....................|.................................................................................................*.................... + // mls v15.4s, v28.4s, v8.4s // ................................................................................................*...............|.......................................................................................................*.............. + // cmge v27.4s, v31.4s, v16.4s // .........................................................................................*......................|................................................................................................*..................... + // cmge v28.4s, v16.4s, v30.4s // ........................................................................................*.......................|...............................................................................................*...................... + // sub v28.4s, v27.4s, v28.4s // ...............................................................................................*................|......................................................................................................*............... + // mls v16.4s, v28.4s, v8.4s // .....................................................................................................*..........|............................................................................................................*......... + // str q13, [x0], #(16) // ......................................................................................................*.........|.............................................................................................................*........ 
+ // str q14, [x0, #(-16 + 1*(1024/8))] // .........................................................................................................*......|................................................................................................................*..... + // str q15, [x0, #(-16 + 2*(1024/8))] // ............................................................................................................*...|...................................................................................................................*.. + // str q16, [x0, #(-16 + 3*(1024/8))] // ..............................................................................................................*.|.....................................................................................................................* + + sub count, count, #1 + cbnz count, layer123_start + // gap // ...................................................................................................... + // gap // ...................................................................................................... + sqrdmulh v12.4S, v12.4S, v2.S[1] // *..................................................................................................... + sub v23.4S, v23.4S, v6.4S // .*.................................................................................................... + sqrdmulh v5.4S, v17.4S, v2.S[3] // ...*.................................................................................................. + mul v7.4S, v17.4S, v2.S[2] // ..*................................................................................................... + // gap // ...................................................................................................... + // gap // ...................................................................................................... 
+ // gap // ...................................................................................................... + sqrdmulh v20.4S, v23.4S, v3.S[1] // .........*............................................................................................ + // gap // ...................................................................................................... + mul v17.4S, v23.4S, v3.S[0] // ....*................................................................................................. + add v9.4S, v14.4S, v27.4S // ...............*...................................................................................... + // gap // ...................................................................................................... + // gap // ...................................................................................................... + mls v13.4S, v12.4S, v8.S[0] // ......*............................................................................................... + add v10.4S, v10.4S, v18.4S // .....*................................................................................................ + // gap // ...................................................................................................... + mls v7.4S, v5.4S, v8.S[0] // ........*............................................................................................. + // gap // ...................................................................................................... + sub v23.4S, v14.4S, v27.4S // .......*.............................................................................................. + // gap // ...................................................................................................... + mls v17.4S, v20.4S, v8.S[0] // ..............*....................................................................................... 
+ // gap // ...................................................................................................... + sub v27.4S, v10.4S, v28.4S // ............*......................................................................................... + sub v11.4S, v16.4S, v13.4S // .............*........................................................................................ + // gap // ...................................................................................................... + // gap // ...................................................................................................... + sqrdmulh v12.4S, v23.4S, v0.S[3] // ...........*.......................................................................................... + // gap // ...................................................................................................... + // gap // ...................................................................................................... + add v13.4S, v16.4S, v13.4S // ......................*............................................................................... + // gap // ...................................................................................................... + // gap // ...................................................................................................... + mul v5.4S, v11.4S, v0.S[2] // .................*.................................................................................... + sub v20.4S, v7.4S, v17.4S // .....................*................................................................................ + mul v23.4S, v23.4S, v0.S[2] // ..........*........................................................................................... + sqrdmulh v15.4S, v11.4S, v0.S[3] // ..................*................................................................................... 
+ // gap // ...................................................................................................... + // gap // ...................................................................................................... + mul v29.4S, v20.4S, v1.S[0] // ...........................*.......................................................................... + // gap // ...................................................................................................... + sqrdmulh v19.4S, v20.4S, v1.S[1] // ............................*......................................................................... + // gap // ...................................................................................................... + // gap // ...................................................................................................... + add v17.4S, v7.4S, v17.4S // .........................*............................................................................ + // gap // ...................................................................................................... + mul v4.4S, v27.4S, v1.S[0] // ...................*.................................................................................. + sqrdmulh v11.4S, v27.4S, v1.S[1] // ....................*................................................................................. + mls v5.4S, v15.4S, v8.S[0] // .......................*.............................................................................. + // gap // ...................................................................................................... + // gap // ...................................................................................................... + add v15.4S, v13.4S, v17.4S // ..................................*................................................................... + // gap // ...................................................................................................... 
+ // gap // ...................................................................................................... + mls v29.4S, v19.4S, v8.S[0] // ....................................*................................................................. + add v22.4S, v10.4S, v28.4S // ..........................*........................................................................... + mls v23.4S, v12.4S, v8.S[0] // ................*..................................................................................... + // gap // ...................................................................................................... + // gap // ...................................................................................................... + sub v13.4S, v13.4S, v17.4S // .............................*........................................................................ + mls v4.4S, v11.4S, v8.S[0] // ........................*............................................................................. + // gap // ...................................................................................................... + // gap // ...................................................................................................... + sqrdmulh v11.4S, v15.4S, v26.4S // .................................................................*.................................... + // gap // ...................................................................................................... + // gap // ...................................................................................................... + add v24.4S, v5.4S, v29.4S // .............................................*........................................................ + add v7.4S, v9.4S, v22.4S // ..............................*....................................................................... 
+ sub v27.4S, v9.4S, v22.4S // ...............................*...................................................................... + // gap // ...................................................................................................... + // gap // ...................................................................................................... + mul v10.4S, v24.4S, v25.4S // ...........................................................................*.......................... + sqrdmulh v20.4S, v24.4S, v26.4S // ......................................................*............................................... + // gap // ...................................................................................................... + // gap // ...................................................................................................... + mul v17.4S, v7.4S, v25.4S // .................................................*.................................................... + sub v22.4S, v23.4S, v4.4S // .......................................*.............................................................. + // gap // ...................................................................................................... + // gap // ...................................................................................................... + sqrdmulh v12.4S, v7.4S, v26.4S // ........................................*............................................................. + mul v6.4S, v27.4S, v0.S[0] // ...................................*.................................................................. + // gap // ...................................................................................................... + // gap // ...................................................................................................... 
+ sqrdmulh v9.4S, v22.4S, v0.S[1] // ..............................................*....................................................... + // gap // ...................................................................................................... + mls v10.4S, v20.4S, v8.S[0] // .................................................................................*.................... + // gap // ...................................................................................................... + mul v14.4S, v22.4S, v0.S[0] // ..........................................*........................................................... + // gap // ...................................................................................................... + // gap // ...................................................................................................... + sub v5.4S, v5.4S, v29.4S // ...........................................*.......................................................... + sqrdmulh v16.4S, v27.4S, v0.S[1] // .....................................*................................................................ + mls v17.4S, v12.4S, v8.S[0] // .......................................................................*.............................. + // gap // ...................................................................................................... + // gap // ...................................................................................................... + // gap // ...................................................................................................... + // gap // ...................................................................................................... + cmge v12.4S, v31.4S, v10.4S // .........................................................................................*............ 
+ cmge v20.4S, v10.4S, v30.4S // ........................................................................................*............. + mls v14.4S, v9.4S, v8.S[0] // ....................................................*................................................. + // gap // ...................................................................................................... + // gap // ...................................................................................................... + sqrdmulh v24.4S, v5.4S, v0.S[1] // .....................................................*................................................ + cmge v22.4S, v31.4S, v17.4S // .............................................................................*........................ + // gap // ...................................................................................................... + sub v12.4S, v12.4S, v20.4S // ..............................................................................................*....... + // gap // ...................................................................................................... + cmge v20.4S, v17.4S, v30.4S // ..............................................................................*....................... + add v7.4S, v23.4S, v4.4S // .........................................*............................................................ + // gap // ...................................................................................................... + // gap // ...................................................................................................... + mls v10.4S, v12.4S, v8.4S // .................................................................................................*.... + cmge v23.4S, v31.4S, v14.4S // ..........................................................*........................................... 
+ // gap // ...................................................................................................... + // gap // ...................................................................................................... + mls v6.4S, v16.4S, v8.S[0] // ............................................*......................................................... + cmge v29.4S, v14.4S, v30.4S // ...........................................................*.......................................... + // gap // ...................................................................................................... + // gap // ...................................................................................................... + sub v4.4S, v22.4S, v20.4S // ..................................................................................*................... + mul v18.4S, v7.4S, v25.4S // ...................................................................*.................................. + // gap // ...................................................................................................... + // gap // ...................................................................................................... + str q10, [x0, #384] // .....................................................................................................* + mul v20.4S, v15.4S, v25.4S // ................................................................*..................................... + // gap // ...................................................................................................... + sub v28.4S, v23.4S, v29.4S // ...............................................................*...................................... + cmge v9.4S, v31.4S, v6.4S // ..................................................*................................................... 
+ mul v16.4S, v5.4S, v0.S[0] // ...............................................*...................................................... + // gap // ...................................................................................................... + // gap // ...................................................................................................... + mls v17.4S, v4.4S, v8.4S // ............................................................................................*......... + cmge v19.4S, v6.4S, v30.4S // ...................................................*.................................................. + // gap // ...................................................................................................... + // gap // ...................................................................................................... + sqrdmulh v12.4S, v7.4S, v26.4S // ................................................*..................................................... + // gap // ...................................................................................................... + mls v20.4S, v11.4S, v8.S[0] // ......................................................................*............................... + // gap // ...................................................................................................... + sub v22.4S, v9.4S, v19.4S // .......................................................*.............................................. + sqrdmulh v9.4S, v13.4S, v0.S[1] // .................................*.................................................................... + // gap // ...................................................................................................... + // gap // ...................................................................................................... 
+ mls v14.4S, v28.4S, v8.4S // .....................................................................*................................ + mul v27.4S, v13.4S, v0.S[0] // ................................*..................................................................... + str q17, [x0], #(16) // ..................................................................................................*... + // gap // ...................................................................................................... + mls v6.4S, v22.4S, v8.4S // .............................................................*........................................ + mls v18.4S, v12.4S, v8.S[0] // .........................................................................*............................ + // gap // ...................................................................................................... + // gap // ...................................................................................................... + cmge v4.4S, v31.4S, v20.4S // ...............................................................................*...................... + mls v16.4S, v24.4S, v8.S[0] // ............................................................*......................................... + // gap // ...................................................................................................... + // gap // ...................................................................................................... + cmge v13.4S, v20.4S, v30.4S // ................................................................................*..................... + mls v27.4S, v9.4S, v8.S[0] // ......................................*............................................................... + str q14, [x0, #752] // ..........................................................................*........................... 
+ // gap // ...................................................................................................... + cmge v22.4S, v31.4S, v18.4S // ......................................................................................*............... + str q6, [x0, #496] // .....................................................................................*................ + cmge v12.4S, v18.4S, v30.4S // .......................................................................................*.............. + // gap // ...................................................................................................... + sub v13.4S, v4.4S, v13.4S // ....................................................................................*................. + cmge v21.4S, v31.4S, v16.4S // ....................................................................*................................. + // gap // ...................................................................................................... + // gap // ...................................................................................................... + cmge v15.4S, v16.4S, v30.4S // ..................................................................*................................... + cmge v4.4S, v31.4S, v27.4S // .........................................................*............................................ + // gap // ...................................................................................................... + // gap // ...................................................................................................... + // gap // ...................................................................................................... + // gap // ...................................................................................................... + cmge v24.4S, v27.4S, v30.4S // ........................................................*............................................. 
+ sub v12.4S, v22.4S, v12.4S // ..........................................................................................*........... + mls v20.4S, v13.4S, v8.4S // .............................................................................................*........ + sub v23.4S, v21.4S, v15.4S // ............................................................................*......................... + // gap // ...................................................................................................... + // gap // ...................................................................................................... + mls v18.4S, v12.4S, v8.4S // ...............................................................................................*...... + sub v24.4S, v4.4S, v24.4S // ..............................................................*....................................... + // gap // ...................................................................................................... + // gap // ...................................................................................................... + mls v16.4S, v23.4S, v8.4S // ...................................................................................*.................. + // gap // ...................................................................................................... + // gap // ...................................................................................................... + // gap // ...................................................................................................... + str q20, [x0, #112] // ...................................................................................................*.. + mls v27.4S, v24.4S, v8.4S // ........................................................................*............................. + // gap // ...................................................................................................... 
+ // gap // ...................................................................................................... + str q18, [x0, #240] // ....................................................................................................*. + // gap // ...................................................................................................... + // gap // ...................................................................................................... + // gap // ...................................................................................................... + str q16, [x0, #880] // ................................................................................................*..... + // gap // ...................................................................................................... + // gap // ...................................................................................................... + // gap // ...................................................................................................... + str q27, [x0, #624] // ...........................................................................................*.......... + // gap // ...................................................................................................... + // gap // ...................................................................................................... + // gap // ...................................................................................................... + + // original source code + // sqrdmulh v7.4S, v12.4S, v2.S[1] // *..................................................................................................... + // sub v9.4S, v23.4S, v6.4S // .*.................................................................................................... + // mul v24.4S, v17.4S, v2.S[2] // ...*.................................................................................................. 
+ // sqrdmulh v17.4S, v17.4S, v2.S[3] // ..*................................................................................................... + // mul v20.4S, v9.4S, v3.S[0] // .....*................................................................................................ + // add v15.4S, v10.4S, v18.4S // ........*............................................................................................. + // mls v13.4S, v7.4S, v8.S[0] // .......*.............................................................................................. + // sub v23.4S, v14.4S, v27.4S // ..........*........................................................................................... + // mls v24.4S, v17.4S, v8.S[0] // .........*............................................................................................ + // sqrdmulh v18.4S, v9.4S, v3.S[1] // ....*................................................................................................. + // mul v22.4S, v23.4S, v0.S[2] // ..................*................................................................................... + // sqrdmulh v5.4S, v23.4S, v0.S[3] // ..............*....................................................................................... + // sub v12.4S, v15.4S, v28.4S // ............*......................................................................................... + // sub v17.4S, v16.4S, v13.4S // .............*........................................................................................ + // mls v20.4S, v18.4S, v8.S[0] // ...........*.......................................................................................... + // add v10.4S, v14.4S, v27.4S // ......*............................................................................................... + // mls v22.4S, v5.4S, v8.S[0] // .............................*........................................................................ 
+ // mul v9.4S, v17.4S, v0.S[2] // ................*..................................................................................... + // sqrdmulh v18.4S, v17.4S, v0.S[3] // ...................*.................................................................................. + // mul v23.4S, v12.4S, v1.S[0] // .......................*.............................................................................. + // sqrdmulh v29.4S, v12.4S, v1.S[1] // ........................*............................................................................. + // sub v14.4S, v24.4S, v20.4S // .................*.................................................................................... + // add v4.4S, v16.4S, v13.4S // ...............*...................................................................................... + // mls v9.4S, v18.4S, v8.S[0] // .........................*............................................................................ + // mls v23.4S, v29.4S, v8.S[0] // ...............................*...................................................................... + // add v18.4S, v24.4S, v20.4S // ......................*............................................................................... + // add v20.4S, v15.4S, v28.4S // ............................*......................................................................... + // mul v13.4S, v14.4S, v1.S[0] // ....................*................................................................................. + // sqrdmulh v15.4S, v14.4S, v1.S[1] // .....................*................................................................................ + // sub v17.4S, v4.4S, v18.4S // ..............................*....................................................................... + // add v14.4S, v10.4S, v20.4S // ..................................*................................................................... 
+ // sub v29.4S, v10.4S, v20.4S // ...................................*.................................................................. + // mul v12.4S, v17.4S, v0.S[0] // ..........................................................................*........................... + // sqrdmulh v20.4S, v17.4S, v0.S[1] // ........................................................................*............................. + // add v27.4S, v4.4S, v18.4S // ..........................*........................................................................... + // mul v18.4S, v29.4S, v0.S[0] // .........................................*............................................................ + // mls v13.4S, v15.4S, v8.S[0] // ...........................*.......................................................................... + // sqrdmulh v24.4S, v29.4S, v0.S[1] // ..............................................*....................................................... + // mls v12.4S, v20.4S, v8.S[0] // .................................................................................*.................... + // sub v4.4S, v22.4S, v23.4S // .......................................*.............................................................. + // sqrdmulh v5.4S, v14.4S, v26.4S // ........................................*............................................................. + // add v17.4S, v22.4S, v23.4S // .......................................................*.............................................. + // mul v20.4S, v4.4S, v0.S[0] // ............................................*......................................................... + // sub v28.4S, v9.4S, v13.4S // .............................................*........................................................ + // mls v18.4S, v24.4S, v8.S[0] // ..........................................................*........................................... 
+ // add v23.4S, v9.4S, v13.4S // .................................*.................................................................... + // sqrdmulh v6.4S, v4.4S, v0.S[1] // ..........................................*........................................................... + // mul v22.4S, v28.4S, v0.S[0] // ..................................................................*................................... + // sqrdmulh v29.4S, v17.4S, v26.4S // .....................................................................*................................ + // mul v13.4S, v14.4S, v25.4S // ......................................*............................................................... + // cmge v15.4S, v31.4S, v18.4S // .................................................................*.................................... + // cmge v21.4S, v18.4S, v30.4S // ....................................................................*................................. + // mls v20.4S, v6.4S, v8.S[0] // ..................................................*................................................... + // sqrdmulh v9.4S, v28.4S, v0.S[1] // ...................................................*.................................................. + // sqrdmulh v28.4S, v23.4S, v26.4S // .....................................*................................................................ + // sub v24.4S, v15.4S, v21.4S // .......................................................................*.............................. + // cmge v4.4S, v12.4S, v30.4S // ..........................................................................................*........... + // cmge v6.4S, v31.4S, v12.4S // .........................................................................................*............ + // cmge v21.4S, v31.4S, v20.4S // .........................................................*............................................ 
+ // cmge v10.4S, v20.4S, v30.4S // ...........................................................*.......................................... + // mls v22.4S, v9.4S, v8.S[0] // ...............................................................................*...................... + // mls v18.4S, v24.4S, v8.4S // ............................................................................*......................... + // sub v15.4S, v6.4S, v4.4S // ...............................................................................................*...... + // sub v21.4S, v21.4S, v10.4S // ................................................................*..................................... + // mul v4.4S, v27.4S, v25.4S // ...............................................................*...................................... + // sqrdmulh v14.4S, v27.4S, v26.4S // ................................*..................................................................... + // cmge v24.4S, v22.4S, v30.4S // ........................................................................................*............. + // mul v9.4S, v17.4S, v25.4S // .............................................................*........................................ + // cmge v6.4S, v31.4S, v22.4S // .......................................................................................*.............. + // mls v20.4S, v21.4S, v8.4S // .........................................................................*............................ + // mls v4.4S, v14.4S, v8.S[0] // ......................................................................*............................... + // mls v13.4S, v5.4S, v8.S[0] // ...............................................*...................................................... + // mls v12.4S, v15.4S, v8.4S // ..................................................................................................*... 
+ // mls v9.4S, v29.4S, v8.S[0] // .............................................................................*........................ + // str q20, [x0, #768] // ..................................................................................*................... + // mul v20.4S, v23.4S, v25.4S // ....................................*................................................................. + // sub v23.4S, v6.4S, v24.4S // .............................................................................................*........ + // cmge v29.4S, v31.4S, v13.4S // ....................................................*................................................. + // cmge v21.4S, v13.4S, v30.4S // ......................................................*............................................... + // cmge v15.4S, v31.4S, v4.4S // ..............................................................................*....................... + // cmge v5.4S, v4.4S, v30.4S // ................................................................................*..................... + // mls v20.4S, v28.4S, v8.S[0] // ...........................................*.......................................................... + // sub v29.4S, v29.4S, v21.4S // ............................................................*......................................... + // mls v22.4S, v23.4S, v8.4S // ................................................................................................*..... + // sub v21.4S, v15.4S, v5.4S // ......................................................................................*............... + // str q18, [x0, #512] // ....................................................................................*................. + // cmge v27.4S, v31.4S, v9.4S // ...................................................................................*.................. 
+ // cmge v28.4S, v9.4S, v30.4S // .....................................................................................*................ + // cmge v17.4S, v20.4S, v30.4S // .................................................*.................................................... + // cmge v6.4S, v31.4S, v20.4S // ................................................*..................................................... + // sub v15.4S, v27.4S, v28.4S // ...........................................................................................*.......... + // str q12, [x0, #640] // .....................................................................................................* + // mls v13.4S, v29.4S, v8.4S // ...................................................................*.................................. + // mls v4.4S, v21.4S, v8.4S // ............................................................................................*......... + // sub v5.4S, v6.4S, v17.4S // .....................................................*................................................ + // mls v9.4S, v15.4S, v8.4S // ..............................................................................................*....... + // str q22, [x0, #896] // ....................................................................................................*. + // mls v20.4S, v5.4S, v8.4S // ........................................................*............................................. + // str q13, [x0], #(16) // ...........................................................................*.......................... + // str q4, [x0, #112] // .................................................................................................*.... + // str q9, [x0, #240] // ...................................................................................................*.. 
+ // str q20, [x0, #368] // ..............................................................*....................................... + + + pop_stack + ret \ No newline at end of file diff --git a/tests/ntt_dilithium/manual/ntt_dilithium_1234_5678.s b/tests/ntt_dilithium/manual/ntt_dilithium_1234_5678.s index ae63345..2709e63 100644 --- a/tests/ntt_dilithium/manual/ntt_dilithium_1234_5678.s +++ b/tests/ntt_dilithium/manual/ntt_dilithium_1234_5678.s @@ -126,15 +126,15 @@ .endm .macro transpose4 data - trn1 t0.4s, \data\()0\().4s, \data\()1\().4s - trn2 t1.4s, \data\()0\().4s, \data\()1\().4s - trn1 t2.4s, \data\()2\().4s, \data\()3\().4s - trn2 t3.4s, \data\()2\().4s, \data\()3\().4s - - trn2 \data\()2\().2d, t0.2d, t2.2d - trn2 \data\()3\().2d, t1.2d, t3.2d - trn1 \data\()0\().2d, t0.2d, t2.2d - trn1 \data\()1\().2d, t1.2d, t3.2d + trn1 t0.4s, \data\()0.4s, \data\()1.4s + trn2 t1.4s, \data\()0.4s, \data\()1.4s + trn1 t2.4s, \data\()2.4s, \data\()3.4s + trn2 t3.4s, \data\()2.4s, \data\()3.4s + + trn2 \data\()2.2d, t0.2d, t2.2d + trn2 \data\()3.2d, t1.2d, t3.2d + trn1 \data\()0.2d, t0.2d, t2.2d + trn1 \data\()1.2d, t1.2d, t3.2d .endm .macro save_gprs // @slothy:no-unfold diff --git a/tests/ntt_dilithium/manual/ntt_dilithium_123_45678.s b/tests/ntt_dilithium/manual/ntt_dilithium_123_45678.s index cb85b7f..d0a2b94 100644 --- a/tests/ntt_dilithium/manual/ntt_dilithium_123_45678.s +++ b/tests/ntt_dilithium/manual/ntt_dilithium_123_45678.s @@ -186,7 +186,7 @@ xtmp1 .req x11 trn2 \data_out3\().4s, \data_in2\().4s, \data_in3\().4s .endm -.macro save_gprs // slothy:no-unfold +.macro save_gprs // @slothy:no-unfold sub sp, sp, #(16*6) stp x19, x20, [sp, #16*0] stp x19, x20, [sp, #16*0] @@ -197,7 +197,7 @@ xtmp1 .req x11 stp x29, x30, [sp, #16*5] .endm -.macro restore_gprs // slothy:no-unfold +.macro restore_gprs // @slothy:no-unfold ldp x19, x20, [sp, #16*0] ldp x21, x22, [sp, #16*1] ldp x23, x24, [sp, #16*2] @@ -207,7 +207,7 @@ xtmp1 .req x11 add sp, sp, #(16*6) .endm -.macro 
save_vregs // slothy:no-unfold +.macro save_vregs // @slothy:no-unfold sub sp, sp, #(16*4) stp d8, d9, [sp, #16*0] stp d10, d11, [sp, #16*1] @@ -215,7 +215,7 @@ xtmp1 .req x11 stp d14, d15, [sp, #16*3] .endm -.macro restore_vregs // slothy:no-unfold +.macro restore_vregs // @slothy:no-unfold ldp d8, d9, [sp, #16*0] ldp d10, d11, [sp, #16*1] ldp d12, d13, [sp, #16*2] @@ -226,19 +226,19 @@ xtmp1 .req x11 #define STACK_SIZE 16 #define STACK0 0 -.macro restore a, loc // slothy:no-unfold +.macro restore a, loc // @slothy:no-unfold ldr \a, [sp, #\loc\()] .endm -.macro save loc, a // slothy:no-unfold +.macro save loc, a // @slothy:no-unfold str \a, [sp, #\loc\()] .endm -.macro push_stack // slothy:no-unfold +.macro push_stack // @slothy:no-unfold save_gprs save_vregs sub sp, sp, #STACK_SIZE .endm -.macro pop_stack // slothy:no-unfold +.macro pop_stack // @slothy:no-unfold add sp, sp, #STACK_SIZE restore_vregs restore_gprs diff --git a/tests/ntt_dilithium/manual/ntt_dilithium_123_45678_manual_st4.s b/tests/ntt_dilithium/manual/ntt_dilithium_123_45678_manual_st4.s index 8bc6322..0b31709 100644 --- a/tests/ntt_dilithium/manual/ntt_dilithium_123_45678_manual_st4.s +++ b/tests/ntt_dilithium/manual/ntt_dilithium_123_45678_manual_st4.s @@ -186,7 +186,7 @@ xtmp1 .req x11 trn2 \data_out3\().4s, \data_in2\().4s, \data_in3\().4s .endm -.macro save_gprs // slothy:no-unfold +.macro save_gprs // @slothy:no-unfold sub sp, sp, #(16*6) stp x19, x20, [sp, #16*0] stp x19, x20, [sp, #16*0] @@ -197,7 +197,7 @@ xtmp1 .req x11 stp x29, x30, [sp, #16*5] .endm -.macro restore_gprs // slothy:no-unfold +.macro restore_gprs // @slothy:no-unfold ldp x19, x20, [sp, #16*0] ldp x21, x22, [sp, #16*1] ldp x23, x24, [sp, #16*2] @@ -207,7 +207,7 @@ xtmp1 .req x11 add sp, sp, #(16*6) .endm -.macro save_vregs // slothy:no-unfold +.macro save_vregs // @slothy:no-unfold sub sp, sp, #(16*4) stp d8, d9, [sp, #16*0] stp d10, d11, [sp, #16*1] @@ -215,7 +215,7 @@ xtmp1 .req x11 stp d14, d15, [sp, #16*3] .endm 
-.macro restore_vregs // slothy:no-unfold +.macro restore_vregs // @slothy:no-unfold ldp d8, d9, [sp, #16*0] ldp d10, d11, [sp, #16*1] ldp d12, d13, [sp, #16*2] @@ -226,19 +226,19 @@ xtmp1 .req x11 #define STACK_SIZE 16 #define STACK0 0 -.macro restore a, loc // slothy:no-unfold +.macro restore a, loc // @slothy:no-unfold ldr \a, [sp, #\loc\()] .endm -.macro save loc, a // slothy:no-unfold +.macro save loc, a // @slothy:no-unfold str \a, [sp, #\loc\()] .endm -.macro push_stack // slothy:no-unfold +.macro push_stack // @slothy:no-unfold save_gprs save_vregs sub sp, sp, #STACK_SIZE .endm -.macro pop_stack // slothy:no-unfold +.macro pop_stack // @slothy:no-unfold add sp, sp, #STACK_SIZE restore_vregs restore_gprs diff --git a/tests/ntt_dilithium/manual/ntt_dilithium_123_45678_w_scalar.s b/tests/ntt_dilithium/manual/ntt_dilithium_123_45678_w_scalar.s index 479d11b..3924934 100644 --- a/tests/ntt_dilithium/manual/ntt_dilithium_123_45678_w_scalar.s +++ b/tests/ntt_dilithium/manual/ntt_dilithium_123_45678_w_scalar.s @@ -196,7 +196,7 @@ xtmp1 .req x11 trn2 \data_out3\().4s, \data_in2\().4s, \data_in3\().4s .endm -.macro save_gprs // slothy:no-unfold +.macro save_gprs // @slothy:no-unfold sub sp, sp, #(16*6) stp x19, x20, [sp, #16*0] stp x19, x20, [sp, #16*0] @@ -207,7 +207,7 @@ xtmp1 .req x11 stp x29, x30, [sp, #16*5] .endm -.macro restore_gprs // slothy:no-unfold +.macro restore_gprs // @slothy:no-unfold ldp x19, x20, [sp, #16*0] ldp x21, x22, [sp, #16*1] ldp x23, x24, [sp, #16*2] @@ -217,7 +217,7 @@ xtmp1 .req x11 add sp, sp, #(16*6) .endm -.macro save_vregs // slothy:no-unfold +.macro save_vregs // @slothy:no-unfold sub sp, sp, #(16*4) stp d8, d9, [sp, #16*0] stp d10, d11, [sp, #16*1] @@ -225,7 +225,7 @@ xtmp1 .req x11 stp d14, d15, [sp, #16*3] .endm -.macro restore_vregs // slothy:no-unfold +.macro restore_vregs // @slothy:no-unfold ldp d8, d9, [sp, #16*0] ldp d10, d11, [sp, #16*1] ldp d12, d13, [sp, #16*2] @@ -236,19 +236,19 @@ xtmp1 .req x11 #define STACK_SIZE 16 
#define STACK0 0 -.macro restore a, loc // slothy:no-unfold +.macro restore a, loc // @slothy:no-unfold ldr \a, [sp, #\loc\()] .endm -.macro save loc, a // slothy:no-unfold +.macro save loc, a // @slothy:no-unfold str \a, [sp, #\loc\()] .endm -.macro push_stack // slothy:no-unfold +.macro push_stack // @slothy:no-unfold save_gprs save_vregs sub sp, sp, #STACK_SIZE .endm -.macro pop_stack // slothy:no-unfold +.macro pop_stack // @slothy:no-unfold add sp, sp, #STACK_SIZE restore_vregs restore_gprs diff --git a/tests/ntt_kyber/main.c b/tests/ntt_kyber/main.c index 1ef71e8..0b61998 100644 --- a/tests/ntt_kyber/main.c +++ b/tests/ntt_kyber/main.c @@ -377,6 +377,7 @@ MAKE_TEST_FWD(asm_123_4567_inv_manual_ld4, 1, intt_kyber_123_4567_manual_ld4, in // (both results are not additionally reduced for comparison reasons) MAKE_TEST_FWD(asm_vs_pqclean_123_4567_inv, 1, intt_kyber_123_4567, pqclean_invntt,0,1,0) MAKE_TEST_FWD(asm_vs_pqclean_123_4567_inv_manual_ld4, 1, intt_kyber_123_4567_manual_ld4, pqclean_invntt,0,1,0) + // A55 MAKE_TEST_FWD(asm_123_4567_manual_st4_opt_a55, 0, ntt_kyber_123_4567_manual_st4_opt_a55, ntt_ct,0,1,1) MAKE_TEST_FWD(asm_123_4567_opt_a55, 0, ntt_kyber_123_4567_opt_a55, ntt_ct,0,1,1) diff --git a/tests/ntt_kyber/manual/intt_kyber_123_4567.s b/tests/ntt_kyber/manual/intt_kyber_123_4567.s index 6dda0d2..77029b5 100644 --- a/tests/ntt_kyber/manual/intt_kyber_123_4567.s +++ b/tests/ntt_kyber/manual/intt_kyber_123_4567.s @@ -381,6 +381,8 @@ layer4567_start: barrett_reduce data0 barrett_reduce data2 + barrett_reduce data1 + barrett_reduce data3 // Layer 4 gs_butterfly data0, data2, root0, 0, 1 @@ -431,9 +433,6 @@ layer123_start: gs_butterfly data4, data6, root0, 4, 5 gs_butterfly data5, data7, root0, 4, 5 - barrett_reduce data0 - barrett_reduce data4 - gs_butterfly data0, data4, root0, 0, 1 gs_butterfly data1, data5, root0, 0, 1 gs_butterfly data2, data6, root0, 0, 1 diff --git a/tests/ntt_kyber/manual/intt_kyber_123_4567_manual_ld4.s 
b/tests/ntt_kyber/manual/intt_kyber_123_4567_manual_ld4.s index 0285f66..4f8df6a 100644 --- a/tests/ntt_kyber/manual/intt_kyber_123_4567_manual_ld4.s +++ b/tests/ntt_kyber/manual/intt_kyber_123_4567_manual_ld4.s @@ -375,7 +375,9 @@ layer4567_start: gs_butterfly data2, data3, root0, 4, 5 barrett_reduce data0 + barrett_reduce data1 barrett_reduce data2 + barrett_reduce data3 // Layer 4 gs_butterfly data0, data2, root0, 0, 1 @@ -426,9 +428,6 @@ layer123_start: gs_butterfly data4, data6, root0, 4, 5 gs_butterfly data5, data7, root0, 4, 5 - barrett_reduce data0 - barrett_reduce data4 - gs_butterfly data0, data4, root0, 0, 1 gs_butterfly data1, data5, root0, 0, 1 gs_butterfly data2, data6, root0, 0, 1 diff --git a/tests/ntt_kyber/manual/intt_kyber_123_4567_manual_ld4_opt_a55.s b/tests/ntt_kyber/manual/intt_kyber_123_4567_manual_ld4_opt_a55.s index 1795f87..3b1ae53 100644 --- a/tests/ntt_kyber/manual/intt_kyber_123_4567_manual_ld4_opt_a55.s +++ b/tests/ntt_kyber/manual/intt_kyber_123_4567_manual_ld4_opt_a55.s @@ -354,500 +354,536 @@ _intt_kyber_123_4567_manual_ld4_opt_a55: mov count, #8 .p2align 2 - ld4 {v26.4S, v27.4S, v28.4S, v29.4S}, [x1] // ..*.............................................. - // gap // ................................................. - // gap // ................................................. - // gap // ................................................. - // gap // ................................................. - // gap // ................................................. - // gap // ................................................. - // gap // ................................................. - // gap // ................................................. - // gap // ................................................. - // gap // ................................................. - // gap // ................................................. - // gap // ................................................. 
- // gap // ................................................. - // gap // ................................................. - // gap // ................................................. - // gap // ................................................. - // gap // ................................................. - ldr q2, [x4, #48] // ........*........................................ - // gap // ................................................. - // gap // ................................................. - // gap // ................................................. - sub v16.8H, v28.8H, v29.8H // .....*........................................... - // gap // ................................................. - sub v21.8H, v26.8H, v27.8H // ....*............................................ - // gap // ................................................. - ldr q23, [x4, #64] // ...*............................................. - // gap // ................................................. - // gap // ................................................. - // gap // ................................................. - sqrdmulh v0.8H, v21.8H, v2.8H // ...........*..................................... - // gap // ................................................. - ldr q2, [x4, #80] // ......*.......................................... - // gap // ................................................. - // gap // ................................................. - // gap // ................................................. - ldr q20, [x4, #32] // .*............................................... - // gap // ................................................. - // gap // ................................................. - // gap // ................................................. - sqrdmulh v2.8H, v16.8H, v2.8H // .........*....................................... - // gap // ................................................. 
- mul v25.8H, v16.8H, v23.8H // .......*......................................... - // gap // ................................................. - mul v4.8H, v21.8H, v20.8H // ..........*...................................... - // gap // ................................................. - add v17.8H, v26.8H, v27.8H // .............*................................... - // gap // ................................................. - add v30.8H, v28.8H, v29.8H // ............*.................................... - // gap // ................................................. - mls v25.8H, v2.8H, v7.H[0] // ..............*.................................. - // gap // ................................................. - mls v4.8H, v0.8H, v7.H[0] // ...............*................................. - // gap // ................................................. - sub v21.8H, v17.8H, v30.8H // ................*................................ - // gap // ................................................. - ldr q2, [x4, #16] // *................................................ - // gap // ................................................. - // gap // ................................................. - // gap // ................................................. - sub v0.8H, v4.8H, v25.8H // ..................*.............................. - // gap // ................................................. - ldr q23, [x4], #(6*16) // .................*............................... - // gap // ................................................. - // gap // ................................................. - // gap // ................................................. - sqrdmulh v16.8H, v21.8H, v2.8H // ......................*.......................... - // gap // ................................................. - sqrdmulh v2.8H, v0.8H, v2.8H // .......................*......................... - // gap // ................................................. 
- mul v26.8H, v0.8H, v23.8H // ....................*............................ - // gap // ................................................. - mul v20.8H, v21.8H, v23.8H // ...................*............................. - // gap // ................................................. - add v0.8H, v4.8H, v25.8H // ........................*........................ - // gap // ................................................. - add v21.8H, v17.8H, v30.8H // .....................*........................... - // gap // ................................................. - mls v26.8H, v2.8H, v7.H[0] // ............................*.................... - // gap // ................................................. - mls v20.8H, v16.8H, v7.H[0] // .........................*....................... - // gap // ................................................. - trn1 v23.4S, v21.4S, v0.4S // ..........................*...................... - // gap // ................................................. - trn2 v16.4S, v21.4S, v0.4S // ...........................*..................... - // gap // ................................................. - // gap // ................................................. - // gap // ................................................. - trn2 v21.4S, v20.4S, v26.4S // ..............................*.................. - // gap // ................................................. - trn1 v2.4S, v20.4S, v26.4S // .............................*................... - // gap // ................................................. - // gap // ................................................. - // gap // ................................................. - trn1 v0.2D, v16.2D, v21.2D // ..................................*.............. - // gap // ................................................. - trn2 v26.2D, v16.2D, v21.2D // ................................*................ - // gap // ................................................. 
- trn2 v20.2D, v23.2D, v2.2D // ...............................*................. - // gap // ................................................. - trn1 v2.2D, v23.2D, v2.2D // .................................*............... - // gap // ................................................. - add v23.8H, v20.8H, v26.8H // .....................................*........... - // gap // ................................................. - add v21.8H, v2.8H, v0.8H // ....................................*............ - // gap // ................................................. - sub v0.8H, v2.8H, v0.8H // ........................................*........ - // gap // ................................................. - sqdmulh v2.8H, v23.8H, v7.H[1] // .......................................*......... - // gap // ................................................. - sqdmulh v16.8H, v21.8H, v7.H[1] // ......................................*.......... - // gap // ................................................. - ldr q11, [x3], #16 // .............................................*... - // gap // ................................................. - // gap // ................................................. - // gap // ................................................. - srshr v2.8H, v2.8H, #11 // ..........................................*...... - // gap // ................................................. - srshr v16.8H, v16.8H, #11 // .........................................*....... - // gap // ................................................. - sub v15.8H, v20.8H, v26.8H // ...................................*............. - // gap // ................................................. - mls v23.8H, v2.8H, v7.H[0] // ............................................*.... - // gap // ................................................. - mls v21.8H, v16.8H, v7.H[0] // ...........................................*..... - // gap // ................................................. 
- // gap // ................................................. - // gap // ................................................. - // gap // ................................................. - // gap // ................................................. - // gap // ................................................. - // gap // ................................................. - add v2.8H, v21.8H, v23.8H // ..............................................*.. - // gap // ................................................. - sub v31.8H, v21.8H, v23.8H // ...............................................*. - // gap // ................................................. - // gap // ................................................. - // gap // ................................................. - str q2, [x1], #(64) // ................................................* - // gap // ................................................. + ld4 {v3.4S, v4.4S, v5.4S, v6.4S}, [x1] // .*........................................................ + // gap // .......................................................... + // gap // .......................................................... + // gap // .......................................................... + // gap // .......................................................... + // gap // .......................................................... + // gap // .......................................................... + // gap // .......................................................... + // gap // .......................................................... + // gap // .......................................................... + // gap // .......................................................... + // gap // .......................................................... + // gap // .......................................................... + // gap // .......................................................... 
+ // gap // .......................................................... + // gap // .......................................................... + // gap // .......................................................... + // gap // .......................................................... + ldr q22, [x4, #64] // ..*....................................................... + // gap // .......................................................... + // gap // .......................................................... + // gap // .......................................................... + sub v19.8H, v5.8H, v6.8H // ...*...................................................... + // gap // .......................................................... + ldr q23, [x4, #80] // ...........*.............................................. + // gap // .......................................................... + // gap // .......................................................... + // gap // .......................................................... + mul v29.8H, v19.8H, v22.8H // ....*..................................................... + // gap // .......................................................... + ldr q26, [x4, #48] // .....*.................................................... + // gap // .......................................................... + // gap // .......................................................... + // gap // .......................................................... + sub v0.8H, v3.8H, v4.8H // .......*.................................................. + // gap // .......................................................... + ldr q28, [x4, #32] // .........*................................................ + // gap // .......................................................... + // gap // .......................................................... + // gap // .......................................................... 
+ sqrdmulh v19.8H, v19.8H, v23.8H // .............*............................................ + // gap // .......................................................... + sqrdmulh v22.8H, v0.8H, v26.8H // ..........*............................................... + // gap // .......................................................... + mul v24.8H, v0.8H, v28.8H // ............*............................................. + // gap // .......................................................... + add v27.8H, v5.8H, v6.8H // ...............*.......................................... + // gap // .......................................................... + mls v29.8H, v19.8H, v7.H[0] // ................*......................................... + // gap // .......................................................... + add v3.8H, v3.8H, v4.8H // ........*................................................. + // gap // .......................................................... + mls v24.8H, v22.8H, v7.H[0] // ..............*........................................... + // gap // .......................................................... + ldr q23, [x4, #16] // *......................................................... + // gap // .......................................................... + // gap // .......................................................... + // gap // .......................................................... + sub v28.8H, v3.8H, v27.8H // .................*........................................ + // gap // .......................................................... + sub v22.8H, v24.8H, v29.8H // ...................*...................................... + // gap // .......................................................... + ldr q0, [x4], #(6*16) // ......*................................................... + // gap // .......................................................... + // gap // .......................................................... 
+ // gap // .......................................................... + sqrdmulh v19.8H, v22.8H, v23.8H // .......................*.................................. + // gap // .......................................................... + sqrdmulh v23.8H, v28.8H, v23.8H // .....................*.................................... + // gap // .......................................................... + mul v22.8H, v22.8H, v0.8H // ......................*................................... + // gap // .......................................................... + mul v0.8H, v28.8H, v0.8H // ....................*..................................... + // gap // .......................................................... + add v28.8H, v24.8H, v29.8H // ........................*................................. + // gap // .......................................................... + add v27.8H, v3.8H, v27.8H // ..................*....................................... + // gap // .......................................................... + mls v22.8H, v19.8H, v7.H[0] // ..........................*............................... + // gap // .......................................................... + mls v0.8H, v23.8H, v7.H[0] // .........................*................................ + // gap // .......................................................... + trn1 v23.4S, v27.4S, v28.4S // ..............................*........................... + // gap // .......................................................... + ldr q11, [x3], #16 // ............................*............................. + // gap // .......................................................... + // gap // .......................................................... + // gap // .......................................................... + trn1 v19.4S, v0.4S, v22.4S // ...............................*.......................... + // gap // .......................................................... 
+ trn2 v22.4S, v0.4S, v22.4S // .............................*............................ + // gap // .......................................................... + trn2 v28.4S, v27.4S, v28.4S // ...........................*.............................. + // gap // .......................................................... + trn1 v0.2D, v23.2D, v19.2D // ...................................*...................... + // gap // .......................................................... + trn2 v19.2D, v23.2D, v19.2D // ..................................*....................... + // gap // .......................................................... + trn1 v3.2D, v28.2D, v22.2D // .................................*........................ + // gap // .......................................................... + trn2 v23.2D, v28.2D, v22.2D // ................................*......................... + // gap // .......................................................... + add v27.8H, v0.8H, v3.8H // .....................................*.................... + // gap // .......................................................... + add v24.8H, v19.8H, v23.8H // ....................................*..................... + // gap // .......................................................... + sub v23.8H, v19.8H, v23.8H // ...........................................*.............. + // gap // .......................................................... + sqdmulh v22.8H, v27.8H, v7.H[1] // .......................................*.................. + // gap // .......................................................... + sqdmulh v28.8H, v24.8H, v7.H[1] // ......................................*................... + // gap // .......................................................... + sqrdmulh v19.8H, v23.8H, v11.H[5] // ..................................................*....... + // gap // .......................................................... 
+ mul v8.8H, v23.8H, v11.H[4] // ................................................*......... + // gap // .......................................................... + srshr v23.8H, v22.8H, #11 // ..........................................*............... + // gap // .......................................................... + srshr v22.8H, v28.8H, #11 // .........................................*................ + // gap // .......................................................... + sub v28.8H, v0.8H, v3.8H // ........................................*................. + // gap // .......................................................... + mls v27.8H, v23.8H, v7.H[0] // .............................................*............ + // gap // .......................................................... + mls v24.8H, v22.8H, v7.H[0] // ............................................*............. + // gap // .......................................................... + mls v8.8H, v19.8H, v7.H[0] // .......................................................*.. + // gap // .......................................................... + sqrdmulh v0.8H, v28.8H, v11.H[3] // ..............................................*........... + // gap // .......................................................... + // gap // .......................................................... + // gap // .......................................................... + sub v22.8H, v27.8H, v24.8H // .................................................*........ + // gap // .......................................................... + add v23.8H, v27.8H, v24.8H // ...................................................*...... + // gap // .......................................................... + mul v28.8H, v28.8H, v11.H[2] // ...............................................*.......... + // gap // .......................................................... 
+ sqrdmulh v19.8H, v22.8H, v11.H[1] // .....................................................*.... + // gap // .......................................................... + mul v27.8H, v22.8H, v11.H[0] // ....................................................*..... + // gap // .......................................................... + str q23, [x1], #(64) // ........................................................*. + // gap // .......................................................... + mls v28.8H, v0.8H, v7.H[0] // ......................................................*... + // gap // .......................................................... + // gap // .......................................................... + // gap // .......................................................... + mls v27.8H, v19.8H, v7.H[0] // .........................................................* + // gap // .......................................................... // original source code - // ldr q13, [x4, #16] // ................*................................ - // ldr q10, [x4, #32] // .......*......................................... - // ld4 {v3.4S, v4.4S, v5.4S, v6.4S}, [x1] // *................................................ - // ldr q28, [x4, #64] // ....*............................................ - // sub v30.8H, v3.8H, v4.8H // ...*............................................. - // sub v16.8H, v5.8H, v6.8H // ..*.............................................. - // ldr q21, [x4, #80] // ......*.......................................... - // mul v27.8H, v16.8H, v28.8H // .........*....................................... - // ldr q20, [x4, #48] // .*............................................... - // sqrdmulh v17.8H, v16.8H, v21.8H // ........*........................................ - // mul v16.8H, v30.8H, v10.8H // ..........*...................................... - // sqrdmulh v20.8H, v30.8H, v20.8H // .....*........................................... 
- // add v8.8H, v5.8H, v6.8H // ............*.................................... - // add v4.8H, v3.8H, v4.8H // ...........*..................................... - // mls v27.8H, v17.8H, v7.H[0] // .............*................................... - // mls v16.8H, v20.8H, v7.H[0] // ..............*.................................. - // sub v26.8H, v4.8H, v8.8H // ...............*................................. - // ldr q25, [x4], #(6*16) // ..................*.............................. - // sub v12.8H, v16.8H, v27.8H // .................*............................... - // mul v24.8H, v26.8H, v25.8H // ......................*.......................... - // mul v14.8H, v12.8H, v25.8H // .....................*........................... - // add v20.8H, v4.8H, v8.8H // ........................*........................ - // sqrdmulh v26.8H, v26.8H, v13.8H // ...................*............................. - // sqrdmulh v0.8H, v12.8H, v13.8H // ....................*............................ - // add v16.8H, v16.8H, v27.8H // .......................*......................... - // mls v24.8H, v26.8H, v7.H[0] // ..........................*...................... - // trn1 v2.4S, v20.4S, v16.4S // ...........................*..................... - // trn2 v28.4S, v20.4S, v16.4S // ............................*.................... - // mls v14.8H, v0.8H, v7.H[0] // .........................*....................... - // trn1 v23.4S, v24.4S, v14.4S // ..............................*.................. - // trn2 v0.4S, v24.4S, v14.4S // .............................*................... - // trn2 v21.2D, v2.2D, v23.2D // .................................*............... - // trn2 v29.2D, v28.2D, v0.2D // ................................*................ - // trn1 v2.2D, v2.2D, v23.2D // ..................................*.............. - // trn1 v0.2D, v28.2D, v0.2D // ...............................*................. 
- // sub v15.8H, v21.8H, v29.8H // ...........................................*..... - // add v23.8H, v2.8H, v0.8H // ....................................*............ - // add v16.8H, v21.8H, v29.8H // ...................................*............. - // sqdmulh v17.8H, v23.8H, v7.H[1] // .......................................*......... - // sqdmulh v20.8H, v16.8H, v7.H[1] // ......................................*.......... - // sub v0.8H, v2.8H, v0.8H // .....................................*........... - // srshr v17.8H, v17.8H, #11 // ..........................................*...... - // srshr v20.8H, v20.8H, #11 // .........................................*....... - // mls v23.8H, v17.8H, v7.H[0] // .............................................*... - // mls v16.8H, v20.8H, v7.H[0] // ............................................*.... - // ldr q11, [x3], #16 // ........................................*........ - // add v20.8H, v23.8H, v16.8H // ..............................................*.. - // sub v31.8H, v23.8H, v16.8H // ...............................................*. - // str q20, [x1], #(64) // ................................................* + // ldr q22, [x4, #16] // ...............*.......................................... + // ld4 {v13.4S, v14.4S, v15.4S, v16.4S}, [x1] // *......................................................... + // ldr q24, [x4, #64] // .*........................................................ + // sub v29.8H, v15.8H, v16.8H // ..*....................................................... + // mul v18.8H, v29.8H, v24.8H // ....*..................................................... + // ldr q27, [x4, #48] // .....*.................................................... + // ldr q19, [x4], #(6*16) // ..................*....................................... + // sub v26.8H, v13.8H, v14.8H // ......*................................................... 
+ // add v0.8H, v13.8H, v14.8H // .............*............................................ + // ldr q23, [x4, #-64] // .......*.................................................. + // sqrdmulh v27.8H, v26.8H, v27.8H // .........*................................................ + // ldr q20, [x4, #-16] // ...*...................................................... + // mul v23.8H, v26.8H, v23.8H // ..........*............................................... + // sqrdmulh v3.8H, v29.8H, v20.8H // ........*................................................. + // mls v23.8H, v27.8H, v7.H[0] // ..............*........................................... + // add v27.8H, v15.8H, v16.8H // ...........*.............................................. + // mls v18.8H, v3.8H, v7.H[0] // ............*............................................. + // sub v29.8H, v0.8H, v27.8H // ................*......................................... + // add v28.8H, v0.8H, v27.8H // ........................*................................. + // sub v3.8H, v23.8H, v18.8H // .................*........................................ + // mul v20.8H, v29.8H, v19.8H // ......................*................................... + // sqrdmulh v29.8H, v29.8H, v22.8H // ....................*..................................... + // mul v19.8H, v3.8H, v19.8H // .....................*.................................... + // sqrdmulh v22.8H, v3.8H, v22.8H // ...................*...................................... + // add v9.8H, v23.8H, v18.8H // .......................*.................................. + // mls v20.8H, v29.8H, v7.H[0] // ..........................*............................... + // mls v19.8H, v22.8H, v7.H[0] // .........................*................................ + // trn2 v31.4S, v28.4S, v9.4S // ...............................*.......................... + // ldr q11, [x3], #16 // ............................*............................. 
+ // trn2 v21.4S, v20.4S, v19.4S // ..............................*........................... + // trn1 v13.4S, v28.4S, v9.4S // ...........................*.............................. + // trn1 v28.4S, v20.4S, v19.4S // .............................*............................ + // trn2 v27.2D, v31.2D, v21.2D // ...................................*...................... + // trn1 v19.2D, v31.2D, v21.2D // ..................................*....................... + // trn2 v0.2D, v13.2D, v28.2D // .................................*........................ + // trn1 v22.2D, v13.2D, v28.2D // ................................*......................... + // add v9.8H, v0.8H, v27.8H // .....................................*.................... + // add v24.8H, v22.8H, v19.8H // ....................................*..................... + // sqdmulh v23.8H, v9.8H, v7.H[1] // ........................................*................. + // sqdmulh v13.8H, v24.8H, v7.H[1] // .......................................*.................. + // sub v31.8H, v22.8H, v19.8H // .............................................*............ + // srshr v23.8H, v23.8H, #11 // ............................................*............. + // srshr v1.8H, v13.8H, #11 // ...........................................*.............. + // sub v21.8H, v0.8H, v27.8H // ......................................*................... + // mls v9.8H, v23.8H, v7.H[0] // ...............................................*.......... + // mls v24.8H, v1.8H, v7.H[0] // ..............................................*........... + // sqrdmulh v19.8H, v31.8H, v11.H[3] // .................................................*........ + // mul v28.8H, v31.8H, v11.H[2] // ....................................................*..... + // mul v8.8H, v21.8H, v11.H[4] // ..........................................*............... + // sub v22.8H, v24.8H, v9.8H // ..................................................*....... 
+ // sqrdmulh v23.8H, v21.8H, v11.H[5] // .........................................*................ + // add v12.8H, v24.8H, v9.8H // ...................................................*...... + // mul v27.8H, v22.8H, v11.H[0] // ......................................................*... + // sqrdmulh v13.8H, v22.8H, v11.H[1] // .....................................................*.... + // mls v28.8H, v19.8H, v7.H[0] // ........................................................*. + // mls v8.8H, v23.8H, v7.H[0] // ................................................*......... + // str q12, [x1], #(64) // .......................................................*.. + // mls v27.8H, v13.8H, v7.H[0] // .........................................................* sub count, count, #1 layer4567_start: - ldr q13, [x4, #16] // ..e............................................................... - // gap // .................................................................. - // gap // .................................................................. - // gap // .................................................................. - ldr q10, [x4, #32] // ...e.............................................................. - // gap // .................................................................. - // gap // .................................................................. - // gap // .................................................................. - ld4 {v3.4S, v4.4S, v5.4S, v6.4S}, [x1] // e................................................................. - // gap // .................................................................. - // gap // .................................................................. - // gap // .................................................................. - // gap // .................................................................. - // gap // .................................................................. 
- // gap // .................................................................. - // gap // .................................................................. - // gap // .................................................................. - // gap // .................................................................. - // gap // .................................................................. - // gap // .................................................................. - // gap // .................................................................. - // gap // .................................................................. - // gap // .................................................................. - // gap // .................................................................. - // gap // .................................................................. - // gap // .................................................................. - ldr q28, [x4, #64] // .....e............................................................ - // gap // .................................................................. - // gap // .................................................................. - // gap // .................................................................. - sub v30.8H, v3.8H, v4.8H // .......e.......................................................... - // gap // .................................................................. - sub v16.8H, v5.8H, v6.8H // ............e..................................................... - // gap // .................................................................. - ldr q21, [x4, #80] // ......e........................................................... - // gap // .................................................................. - // gap // .................................................................. - // gap // .................................................................. 
- mul v27.8H, v16.8H, v28.8H // ..............e................................................... - // gap // .................................................................. - ldr q20, [x4, #48] // ....e............................................................. - // gap // .................................................................. - // gap // .................................................................. - // gap // .................................................................. - sqrdmulh v17.8H, v16.8H, v21.8H // ...............e.................................................. - // gap // .................................................................. - mul v16.8H, v30.8H, v10.8H // .........e........................................................ - // gap // .................................................................. - sqrdmulh v20.8H, v30.8H, v20.8H // ..........e....................................................... - // gap // .................................................................. - add v8.8H, v5.8H, v6.8H // .............e.................................................... - // gap // .................................................................. - add v4.8H, v3.8H, v4.8H // ........e......................................................... - // gap // .................................................................. - mls v27.8H, v17.8H, v7.H[0] // ................e................................................. - // gap // .................................................................. - mls v16.8H, v20.8H, v7.H[0] // ...........e...................................................... - // gap // .................................................................. - sub v26.8H, v4.8H, v8.8H // .................e................................................ - // gap // .................................................................. 
- ldr q25, [x4], #(6*16) // .e................................................................ - // gap // .................................................................. - // gap // .................................................................. - // gap // .................................................................. - sub v12.8H, v16.8H, v27.8H // ......................e........................................... - // gap // .................................................................. - mul v21.8H, v15.8H, v11.H[4] // ...........................................*...................... - // gap // .................................................................. - mul v24.8H, v26.8H, v25.8H // ...................e.............................................. - // gap // .................................................................. - mul v14.8H, v12.8H, v25.8H // ........................e......................................... - // gap // .................................................................. - sqrdmulh v2.8H, v15.8H, v11.H[5] // ............................................*..................... - // gap // .................................................................. - mul v23.8H, v0.8H, v11.H[2] // ......................................*........................... - // gap // .................................................................. - sqrdmulh v0.8H, v0.8H, v11.H[3] // .......................................*.......................... - // gap // .................................................................. - add v20.8H, v4.8H, v8.8H // ..................e............................................... - // gap // .................................................................. - mls v21.8H, v2.8H, v7.H[0] // .............................................*.................... - // gap // .................................................................. 
- sqrdmulh v26.8H, v26.8H, v13.8H // ....................e............................................. - // gap // .................................................................. - mls v23.8H, v0.8H, v7.H[0] // ........................................*......................... - // gap // .................................................................. - sqrdmulh v0.8H, v12.8H, v13.8H // .........................e........................................ - // gap // .................................................................. - add v16.8H, v16.8H, v27.8H // .......................e.......................................... - // gap // .................................................................. - mls v24.8H, v26.8H, v7.H[0] // .....................e............................................ - // gap // .................................................................. - add v30.8H, v23.8H, v21.8H // ..........................................................*....... - // gap // .................................................................. - trn1 v2.4S, v20.4S, v16.4S // ...........................e...................................... - // gap // .................................................................. - trn2 v28.4S, v20.4S, v16.4S // ............................e..................................... - // gap // .................................................................. - sqrdmulh v16.8H, v31.8H, v11.H[1] // .......................................................*.......... - // gap // .................................................................. - mls v14.8H, v0.8H, v7.H[0] // ..........................e....................................... - // gap // .................................................................. - sub v26.8H, v23.8H, v21.8H // .........................................................*........ - // gap // .................................................................. 
- mul v22.8H, v31.8H, v11.H[0] // ......................................................*........... - // gap // .................................................................. - str q30, [x1, #-48] // ...............................................................*.. - // gap // .................................................................. - trn1 v23.4S, v24.4S, v14.4S // .............................e.................................... - // gap // .................................................................. - trn2 v0.4S, v24.4S, v14.4S // ..............................e................................... - // gap // .................................................................. - mls v22.8H, v16.8H, v7.H[0] // ........................................................*......... - // gap // .................................................................. - trn2 v21.2D, v2.2D, v23.2D // ...............................e.................................. - // gap // .................................................................. - trn2 v29.2D, v28.2D, v0.2D // ................................e................................. - // gap // .................................................................. - trn1 v2.2D, v2.2D, v23.2D // .................................e................................ - // gap // .................................................................. - trn1 v0.2D, v28.2D, v0.2D // ..................................e............................... - // gap // .................................................................. - sub v15.8H, v21.8H, v29.8H // .........................................e........................ - // gap // .................................................................. - add v23.8H, v2.8H, v0.8H // .....................................e............................ - // gap // .................................................................. 
- add v16.8H, v21.8H, v29.8H // ..........................................e....................... - // gap // .................................................................. - mul v21.8H, v26.8H, v11.H[0] // ...........................................................*...... - // gap // .................................................................. - sqdmulh v17.8H, v23.8H, v7.H[1] // ..............................................e................... - // gap // .................................................................. - sqdmulh v20.8H, v16.8H, v7.H[1] // .................................................e................ - // gap // .................................................................. - sub v0.8H, v2.8H, v0.8H // ....................................e............................. - // gap // .................................................................. - sqrdmulh v2.8H, v26.8H, v11.H[1] // ............................................................*..... - // gap // .................................................................. - srshr v17.8H, v17.8H, #11 // ...............................................e.................. - // gap // .................................................................. - srshr v20.8H, v20.8H, #11 // ..................................................e............... - // gap // .................................................................. - str q22, [x1, #-32] // ................................................................*. - // gap // .................................................................. - mls v23.8H, v17.8H, v7.H[0] // ................................................e................. - // gap // .................................................................. - mls v16.8H, v20.8H, v7.H[0] // ...................................................e.............. - // gap // .................................................................. 
- mls v21.8H, v2.8H, v7.H[0] // .............................................................*.... - // gap // .................................................................. - ldr q11, [x3], #16 // ...................................e.............................. - // gap // .................................................................. - // gap // .................................................................. - // gap // .................................................................. - add v20.8H, v23.8H, v16.8H // .....................................................e............ - // gap // .................................................................. - str q21, [x1, #-16] // .................................................................* - // gap // .................................................................. - sub v31.8H, v23.8H, v16.8H // ....................................................e............. - // gap // .................................................................. - str q20, [x1], #(64) // ..............................................................e... - // gap // .................................................................. + ldr q22, [x4, #16] // ..e..................................................................... + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + ld4 {v13.4S, v14.4S, v15.4S, v16.4S}, [x1] // e....................................................................... + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ 
+ // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + ldr q24, [x4, #64] // .....e.................................................................. + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + sub v29.8H, v15.8H, v16.8H // ............e........................................................... + // gap // ........................................................................ + sqdmulh v23.8H, v28.8H, v7.H[1] // .................................................*...................... + // gap // ........................................................................ 
+ str q27, [x1, #-32] // ......................................................................*. + // gap // ........................................................................ + mul v18.8H, v29.8H, v24.8H // ..............e......................................................... + // gap // ........................................................................ + ldr q27, [x4, #48] // ....e................................................................... + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + ldr q19, [x4], #(6*16) // .e...................................................................... + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + srshr v23.8H, v23.8H, #11 // ..................................................*..................... + // gap // ........................................................................ + sqdmulh v24.8H, v8.8H, v7.H[1] // .......................................................*................ + // gap // ........................................................................ + sub v26.8H, v13.8H, v14.8H // .......e................................................................ + // gap // ........................................................................ + add v0.8H, v13.8H, v14.8H // ........e............................................................... + // gap // ........................................................................ + mls v28.8H, v23.8H, v7.H[0] // ...................................................*.................... 
+ // gap // ........................................................................ + srshr v24.8H, v24.8H, #11 // ........................................................*............... + // gap // ........................................................................ + ldr q23, [x4, #-64] // ...e.................................................................... + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + mls v8.8H, v24.8H, v7.H[0] // .........................................................*.............. + // gap // ........................................................................ + sqrdmulh v27.8H, v26.8H, v27.8H // ..........e............................................................. + // gap // ........................................................................ + ldr q20, [x4, #-16] // ......e................................................................. + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + sub v30.8H, v28.8H, v8.8H // ...............................................................*........ + // gap // ........................................................................ + mul v23.8H, v26.8H, v23.8H // .........e.............................................................. + // gap // ........................................................................ + sqrdmulh v3.8H, v29.8H, v20.8H // ...............e........................................................ + // gap // ........................................................................ 
+ add v28.8H, v28.8H, v8.8H // ................................................................*....... + // gap // ........................................................................ + sqrdmulh v26.8H, v30.8H, v11.H[1] // ..................................................................*..... + // gap // ........................................................................ + mls v23.8H, v27.8H, v7.H[0] // ...........e............................................................ + // gap // ........................................................................ + add v27.8H, v15.8H, v16.8H // .............e.......................................................... + // gap // ........................................................................ + mls v18.8H, v3.8H, v7.H[0] // ................e....................................................... + // gap // ........................................................................ + str q28, [x1, #-48] // .....................................................................*.. + // gap // ........................................................................ + sub v29.8H, v0.8H, v27.8H // .................e...................................................... + // gap // ........................................................................ + add v28.8H, v0.8H, v27.8H // ..................e..................................................... + // gap // ........................................................................ + sub v3.8H, v23.8H, v18.8H // ......................e................................................. + // gap // ........................................................................ + mul v20.8H, v29.8H, v19.8H // ...................e.................................................... + // gap // ........................................................................ 
+ sqrdmulh v29.8H, v29.8H, v22.8H // ....................e................................................... + // gap // ........................................................................ + mul v19.8H, v3.8H, v19.8H // ........................e............................................... + // gap // ........................................................................ + sqrdmulh v22.8H, v3.8H, v22.8H // .........................e.............................................. + // gap // ........................................................................ + add v9.8H, v23.8H, v18.8H // .......................e................................................ + // gap // ........................................................................ + mls v20.8H, v29.8H, v7.H[0] // .....................e.................................................. + // gap // ........................................................................ + mul v3.8H, v30.8H, v11.H[0] // .................................................................*...... + // gap // ........................................................................ + mls v19.8H, v22.8H, v7.H[0] // ..........................e............................................. + // gap // ........................................................................ + trn2 v31.4S, v28.4S, v9.4S // ............................e........................................... + // gap // ........................................................................ + ldr q11, [x3], #16 // ...................................e.................................... + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + trn2 v21.4S, v20.4S, v19.4S // ..............................e......................................... 
+ // gap // ........................................................................ + trn1 v13.4S, v28.4S, v9.4S // ...........................e............................................ + // gap // ........................................................................ + trn1 v28.4S, v20.4S, v19.4S // .............................e.......................................... + // gap // ........................................................................ + trn2 v27.2D, v31.2D, v21.2D // ................................e....................................... + // gap // ........................................................................ + trn1 v19.2D, v31.2D, v21.2D // ..................................e..................................... + // gap // ........................................................................ + trn2 v0.2D, v13.2D, v28.2D // ...............................e........................................ + // gap // ........................................................................ + trn1 v22.2D, v13.2D, v28.2D // .................................e...................................... + // gap // ........................................................................ + add v9.8H, v0.8H, v27.8H // ..........................................e............................. + // gap // ........................................................................ + add v24.8H, v22.8H, v19.8H // .....................................e.................................. + // gap // ........................................................................ + mls v3.8H, v26.8H, v7.H[0] // ...................................................................*.... + // gap // ........................................................................ + sqdmulh v23.8H, v9.8H, v7.H[1] // ....................................................e................... + // gap // ........................................................................ 
+ sqdmulh v13.8H, v24.8H, v7.H[1] // ..............................................e......................... + // gap // ........................................................................ + sub v31.8H, v22.8H, v19.8H // ....................................e................................... + // gap // ........................................................................ + str q3, [x1, #-16] // .......................................................................* + // gap // ........................................................................ + srshr v23.8H, v23.8H, #11 // .....................................................e.................. + // gap // ........................................................................ + srshr v1.8H, v13.8H, #11 // ...............................................e........................ + // gap // ........................................................................ + sub v21.8H, v0.8H, v27.8H // .........................................e.............................. + // gap // ........................................................................ + mls v9.8H, v23.8H, v7.H[0] // ......................................................e................. + // gap // ........................................................................ + mls v24.8H, v1.8H, v7.H[0] // ................................................e....................... + // gap // ........................................................................ + sqrdmulh v19.8H, v31.8H, v11.H[3] // .......................................e................................ + // gap // ........................................................................ + mul v28.8H, v31.8H, v11.H[2] // ......................................e................................. + // gap // ........................................................................ 
+ mul v8.8H, v21.8H, v11.H[4] // ...........................................e............................ + // gap // ........................................................................ + sub v22.8H, v24.8H, v9.8H // ..........................................................e............. + // gap // ........................................................................ + sqrdmulh v23.8H, v21.8H, v11.H[5] // ............................................e........................... + // gap // ........................................................................ + add v12.8H, v24.8H, v9.8H // ...........................................................e............ + // gap // ........................................................................ + mul v27.8H, v22.8H, v11.H[0] // ............................................................e........... + // gap // ........................................................................ + sqrdmulh v13.8H, v22.8H, v11.H[1] // .............................................................e.......... + // gap // ........................................................................ + mls v28.8H, v19.8H, v7.H[0] // ........................................e............................... + // gap // ........................................................................ + mls v8.8H, v23.8H, v7.H[0] // .............................................e.......................... + // gap // ........................................................................ + str q12, [x1], #(64) // ....................................................................e... + // gap // ........................................................................ + mls v27.8H, v13.8H, v7.H[0] // ..............................................................e......... + // gap // ........................................................................ 
// original source code - // ld4 {v8.4S, v9.4S, v10.4S, v11.4S}, [x1] // ..e...............................................................|.e............................................................. - // ldr q0, [x4], #(6*16) // .................e................................................|................e.............................................. - // ldr q4, [x4, #(-6*16 + 1*16)] // e.................................................................e............................................................... - // ldr q1, [x4, #(-6*16 + 2*16)] // .e................................................................|e.............................................................. - // ldr q5, [x4, #(-6*16 + 3*16)] // ........e.........................................................|.......e....................................................... - // ldr q2, [x4, #(-6*16 + 4*16)] // ...e..............................................................|..e............................................................ - // ldr q6, [x4, #(-6*16 + 5*16)] // ......e...........................................................|.....e......................................................... - // sub v24.8h, v8.8h, v9.8h // ....e.............................................................|...e........................................................... - // add v8.8h, v8.8h, v9.8h // .............e....................................................|............e.................................................. - // mul v9.8h, v24.8h, v1.8h // ..........e.......................................................|.........e..................................................... - // sqrdmulh v24.8h, v24.8h, v5.8h // ...........e......................................................|..........e.................................................... 
- // mls v9.8h, v24.8h, v7.h[0] // ...............e..................................................|..............e................................................ - // sub v24.8h, v10.8h, v11.8h // .....e............................................................|....e.......................................................... - // add v10.8h, v10.8h, v11.8h // ............e.....................................................|...........e................................................... - // mul v11.8h, v24.8h, v2.8h // .......e..........................................................|......e........................................................ - // sqrdmulh v24.8h, v24.8h, v6.8h // .........e........................................................|........e...................................................... - // mls v11.8h, v24.8h, v7.h[0] // ..............e...................................................|.............e................................................. - // sub v24.8h, v8.8h, v10.8h // ................e.................................................|...............e............................................... - // add v8.8h, v8.8h, v10.8h // .........................e........................................|........................e...................................... - // mul v10.8h, v24.8h, v0.8h // ....................e.............................................|...................e........................................... - // sqrdmulh v24.8h, v24.8h, v4.8h // ...........................e......................................|..........................e.................................... - // mls v10.8h, v24.8h, v7.h[0] // ...............................e..................................|..............................e................................ - // sub v24.8h, v9.8h, v11.8h // ..................e...............................................|.................e............................................. 
- // add v9.8h, v9.8h, v11.8h // ..............................e...................................|.............................e................................. - // mul v11.8h, v24.8h, v0.8h // .....................e............................................|....................e.......................................... - // sqrdmulh v24.8h, v24.8h, v4.8h // .............................e....................................|............................e.................................. - // mls v11.8h, v24.8h, v7.h[0] // ....................................e.............................|...................................e........................... - // trn1 v25.4s, v8.4s, v9.4s // .................................e................................|................................e.............................. - // trn2 v26.4s, v8.4s, v9.4s // ..................................e...............................|.................................e............................. - // trn1 v27.4s, v10.4s, v11.4s // ........................................e.........................|.......................................e....................... - // trn2 v28.4s, v10.4s, v11.4s // .........................................e........................|........................................e...................... - // trn2 v10.2d, v25.2d, v27.2d // ...........................................e......................|..........................................e.................... - // trn2 v11.2d, v26.2d, v28.2d // ............................................e.....................|...........................................e................... - // trn1 v8.2d, v25.2d, v27.2d // .............................................e....................|............................................e.................. - // trn1 v9.2d, v26.2d, v28.2d // ..............................................e...................|.............................................e................. 
- // ldr q0, [x3], #16 // .............................................................e....|............................................................e.. - // sub v24.8h, v8.8h, v9.8h // .....................................................e............|....................................................e.......... - // add v8.8h, v8.8h, v9.8h // ................................................e.................|...............................................e............... - // mul v9.8h, v24.8h, v0.h[2] // .......................*..........................................|......................*........................................ - // sqrdmulh v24.8h, v24.8h, v0.h[3] // ........................*.........................................|.......................*....................................... - // mls v9.8h, v24.8h, v7.h[0] // ............................*.....................................|...........................*................................... - // sub v24.8h, v10.8h, v11.8h // ...............................................e..................|..............................................e................ - // add v10.8h, v10.8h, v11.8h // .................................................e................|................................................e.............. - // mul v11.8h, v24.8h, v0.h[4] // ...................*..............................................|..................*............................................ - // sqrdmulh v24.8h, v24.8h, v0.h[5] // ......................*...........................................|.....................*......................................... - // mls v11.8h, v24.8h, v7.h[0] // ..........................*.......................................|.........................*..................................... - // sqdmulh v25.8h, v8.8h, v7.h[1] // ...................................................e..............|..................................................e............ 
- // srshr v25.8h, v25.8h, #11 // .......................................................e..........|......................................................e........ - // mls v8.8h, v25.8h, v7.h[0] // ..........................................................e.......|.........................................................e..... - // sqdmulh v25.8h, v10.8h, v7.h[1] // ....................................................e.............|...................................................e........... - // srshr v25.8h, v25.8h, #11 // ........................................................e.........|.......................................................e....... - // mls v10.8h, v25.8h, v7.h[0] // ...........................................................e......|..........................................................e.... - // sub v24.8h, v8.8h, v10.8h // ................................................................e.|............................................................... - // add v8.8h, v8.8h, v10.8h // ..............................................................e...|.............................................................e. - // mul v10.8h, v24.8h, v0.h[0] // ......................................*...........................|.....................................*......................... - // sqrdmulh v24.8h, v24.8h, v0.h[1] // ...................................*..............................|..................................*............................ - // mls v10.8h, v24.8h, v7.h[0] // ..........................................*.......................|.........................................*..................... - // sub v24.8h, v9.8h, v11.8h // .....................................*............................|....................................*.......................... - // add v9.8h, v9.8h, v11.8h // ................................*.................................|...............................*............................... 
- // mul v11.8h, v24.8h, v0.h[0] // ..................................................*...............|.................................................*............. - // sqrdmulh v24.8h, v24.8h, v0.h[1] // ......................................................*...........|.....................................................*......... - // mls v11.8h, v24.8h, v7.h[0] // ............................................................*.....|...........................................................*... - // str q8, [x1], #(64) // .................................................................e|............................................................... - // str q9, [x1, #(-64 + 16*1)] // .......................................*..........................|......................................*........................ - // str q10, [x1, #(-64 + 16*2)] // .........................................................*........|........................................................*...... - // str q11, [x1, #(-64 + 16*3)] // ...............................................................*..|..............................................................* + // ld4 {v8.4S, v9.4S, v10.4S, v11.4S}, [x1] // .e......................................................................|e..................................................... + // ldr q0, [x4], #(6*16) // ........e...............................................................|.......e.............................................. + // ldr q4, [x4, #(-6*16 + 1*16)] // e.......................................................................e...................................................... + // ldr q1, [x4, #(-6*16 + 2*16)] // ...............e........................................................|..............e....................................... 
+ // ldr q5, [x4, #(-6*16 + 3*16)] // .......e................................................................|......e............................................... + // ldr q2, [x4, #(-6*16 + 4*16)] // ..e.....................................................................|.e.................................................... + // ldr q6, [x4, #(-6*16 + 5*16)] // ..................e.....................................................|.................e.................................... + // sub v24.8h, v8.8h, v9.8h // ...........e............................................................|..........e........................................... + // add v8.8h, v8.8h, v9.8h // ............e...........................................................|...........e.......................................... + // mul v9.8h, v24.8h, v1.8h // ....................e...................................................|...................e.................................. + // sqrdmulh v24.8h, v24.8h, v5.8h // .................e......................................................|................e..................................... + // mls v9.8h, v24.8h, v7.h[0] // ........................e...............................................|.......................e.............................. + // sub v24.8h, v10.8h, v11.8h // ...e....................................................................|..e................................................... + // add v10.8h, v10.8h, v11.8h // .........................e..............................................|........................e............................. + // mul v11.8h, v24.8h, v2.8h // ......e.................................................................|.....e................................................ + // sqrdmulh v24.8h, v24.8h, v6.8h // .....................e..................................................|....................e................................. 
+ // mls v11.8h, v24.8h, v7.h[0] // ..........................e.............................................|.........................e............................ + // sub v24.8h, v8.8h, v10.8h // ............................e...........................................|...........................e.......................... + // add v8.8h, v8.8h, v10.8h // .............................e..........................................|............................e......................... + // mul v10.8h, v24.8h, v0.8h // ...............................e........................................|..............................e....................... + // sqrdmulh v24.8h, v24.8h, v4.8h // ................................e.......................................|...............................e...................... + // mls v10.8h, v24.8h, v7.h[0] // ....................................e...................................|...................................e.................. + // sub v24.8h, v9.8h, v11.8h // ..............................e.........................................|.............................e........................ + // add v9.8h, v9.8h, v11.8h // ...................................e....................................|..................................e................... + // mul v11.8h, v24.8h, v0.8h // .................................e......................................|................................e..................... + // sqrdmulh v24.8h, v24.8h, v4.8h // ..................................e.....................................|.................................e.................... + // mls v11.8h, v24.8h, v7.h[0] // ......................................e.................................|.....................................e................ + // trn1 v25.4s, v8.4s, v9.4s // ..........................................e.............................|.........................................e............ 
+ // trn2 v26.4s, v8.4s, v9.4s // .......................................e................................|......................................e............... + // trn1 v27.4s, v10.4s, v11.4s // ...........................................e............................|..........................................e........... + // trn2 v28.4s, v10.4s, v11.4s // .........................................e..............................|........................................e............. + // trn2 v10.2d, v25.2d, v27.2d // ..............................................e.........................|.............................................e........ + // trn2 v11.2d, v26.2d, v28.2d // ............................................e...........................|...........................................e.......... + // trn1 v8.2d, v25.2d, v27.2d // ...............................................e........................|..............................................e....... + // trn1 v9.2d, v26.2d, v28.2d // .............................................e..........................|............................................e......... + // ldr q0, [x3], #16 // ........................................e...............................|.......................................e.............. + // sub v24.8h, v8.8h, v9.8h // .....................................................e..................|....................................................e. + // add v8.8h, v8.8h, v9.8h // .................................................e......................|................................................e..... + // mul v9.8h, v24.8h, v0.h[2] // .............................................................e..........|...................................................... + // sqrdmulh v24.8h, v24.8h, v0.h[3] // ............................................................e...........|...................................................... 
+ // mls v9.8h, v24.8h, v7.h[0] // ....................................................................e...|...................................................... + // sub v24.8h, v10.8h, v11.8h // .........................................................e..............|...................................................... + // add v10.8h, v10.8h, v11.8h // ................................................e.......................|...............................................e...... + // mul v11.8h, v24.8h, v0.h[4] // ..............................................................e.........|...................................................... + // sqrdmulh v24.8h, v24.8h, v0.h[5] // ................................................................e.......|...................................................... + // mls v11.8h, v24.8h, v7.h[0] // .....................................................................e..|...................................................... + // sqdmulh v25.8h, v8.8h, v7.h[1] // ....................................................e...................|...................................................e.. + // srshr v25.8h, v25.8h, #11 // ........................................................e...............|...................................................... + // mls v8.8h, v25.8h, v7.h[0] // ...........................................................e............|...................................................... + // sqdmulh v25.8h, v9.8h, v7.h[1] // ....*...................................................................|...*.................................................. + // srshr v25.8h, v25.8h, #11 // .........*..............................................................|........*............................................. + // mls v9.8h, v25.8h, v7.h[0] // .............*..........................................................|............*......................................... 
+ // sqdmulh v25.8h, v10.8h, v7.h[1] // ...................................................e....................|..................................................e... + // srshr v25.8h, v25.8h, #11 // .......................................................e................|...................................................... + // mls v10.8h, v25.8h, v7.h[0] // ..........................................................e.............|...................................................... + // sqdmulh v25.8h, v11.8h, v7.h[1] // ..........*.............................................................|.........*............................................ + // srshr v25.8h, v25.8h, #11 // ..............*.........................................................|.............*........................................ + // mls v11.8h, v25.8h, v7.h[0] // ................*.......................................................|...............*...................................... + // sub v24.8h, v8.8h, v10.8h // ...............................................................e........|...................................................... + // add v8.8h, v8.8h, v10.8h // .................................................................e......|...................................................... + // mul v10.8h, v24.8h, v0.h[0] // ..................................................................e.....|...................................................... + // sqrdmulh v24.8h, v24.8h, v0.h[1] // ...................................................................e....|...................................................... + // mls v10.8h, v24.8h, v7.h[0] // .......................................................................e|...................................................... + // sub v24.8h, v9.8h, v11.8h // ...................*....................................................|..................*................................... 
+ // add v9.8h, v9.8h, v11.8h // ......................*.................................................|.....................*................................ + // mul v11.8h, v24.8h, v0.h[0] // .....................................*..................................|....................................*................. + // sqrdmulh v24.8h, v24.8h, v0.h[1] // .......................*................................................|......................*............................... + // mls v11.8h, v24.8h, v7.h[0] // ..................................................*.....................|.................................................*.... + // str q8, [x1], #(64) // ......................................................................e.|...................................................... + // str q9, [x1, #(-64 + 16*1)] // ...........................*............................................|..........................*........................... + // str q10, [x1, #(-64 + 16*2)] // .....*..................................................................|....*................................................. + // str q11, [x1, #(-64 + 16*3)] // ......................................................*.................|.....................................................* sub count, count, #1 cbnz count, layer4567_start - mul v2.8H, v15.8H, v11.H[4] // *................ - // gap // ................. - sqrdmulh v16.8H, v15.8H, v11.H[5] // .*............... - // gap // ................. - mul v23.8H, v0.8H, v11.H[2] // ..*.............. - // gap // ................. - sqrdmulh v0.8H, v0.8H, v11.H[3] // ...*............. - // gap // ................. - sqrdmulh v21.8H, v31.8H, v11.H[1] // .......*......... - // gap // ................. - mul v26.8H, v31.8H, v11.H[0] // .........*....... - // gap // ................. - mls v2.8H, v16.8H, v7.H[0] // ....*............ - // gap // ................. - mls v23.8H, v0.8H, v7.H[0] // .....*........... 
- // gap // ................. - // gap // ................. - // gap // ................. - mls v26.8H, v21.8H, v7.H[0] // ...........*..... - // gap // ................. - // gap // ................. - // gap // ................. - sub v0.8H, v23.8H, v2.8H // ........*........ - // gap // ................. - add v2.8H, v23.8H, v2.8H // ......*.......... - // gap // ................. - str q26, [x1, #-32] // ..............*.. - // gap // ................. - mul v16.8H, v0.8H, v11.H[0] // ............*.... - // gap // ................. - sqrdmulh v0.8H, v0.8H, v11.H[1] // .............*... - // gap // ................. - str q2, [x1, #-48] // ..........*...... - // gap // ................. - // gap // ................. - // gap // ................. - // gap // ................. - // gap // ................. - mls v16.8H, v0.8H, v7.H[0] // ...............*. - // gap // ................. - // gap // ................. - // gap // ................. - // gap // ................. - // gap // ................. - // gap // ................. - // gap // ................. - str q16, [x1, #-16] // ................* - // gap // ................. + sqdmulh v19.8H, v28.8H, v7.H[1] // *............. + // gap // .............. + sqdmulh v23.8H, v8.8H, v7.H[1] // ...*.......... + // gap // .............. + str q27, [x1, #-32] // .*............ + // gap // .............. + // gap // .............. + // gap // .............. + srshr v19.8H, v19.8H, #11 // ..*........... + // gap // .............. + srshr v23.8H, v23.8H, #11 // .....*........ + // gap // .............. + // gap // .............. + // gap // .............. + mls v28.8H, v19.8H, v7.H[0] // ....*......... + // gap // .............. + mls v8.8H, v23.8H, v7.H[0] // ......*....... + // gap // .............. + // gap // .............. + // gap // .............. + // gap // .............. + // gap // .............. + // gap // .............. + // gap // .............. + sub v19.8H, v28.8H, v8.8H // .......*...... 
+ // gap // .............. + add v23.8H, v28.8H, v8.8H // ........*..... + // gap // .............. + // gap // .............. + // gap // .............. + sqrdmulh v22.8H, v19.8H, v11.H[1] // .........*.... + // gap // .............. + mul v19.8H, v19.8H, v11.H[0] // ...........*.. + // gap // .............. + str q23, [x1, #-48] // ..........*... + // gap // .............. + // gap // .............. + // gap // .............. + // gap // .............. + // gap // .............. + mls v19.8H, v22.8H, v7.H[0] // ............*. + // gap // .............. + // gap // .............. + // gap // .............. + // gap // .............. + // gap // .............. + // gap // .............. + // gap // .............. + str q19, [x1, #-16] // .............* + // gap // .............. // original source code - // mul v21.8H, v15.8H, v11.H[4] // *................ - // sqrdmulh v2.8H, v15.8H, v11.H[5] // .*............... - // mul v23.8H, v0.8H, v11.H[2] // ..*.............. - // sqrdmulh v0.8H, v0.8H, v11.H[3] // ...*............. - // mls v21.8H, v2.8H, v7.H[0] // ......*.......... - // mls v23.8H, v0.8H, v7.H[0] // .......*......... - // add v30.8H, v23.8H, v21.8H // ..........*...... - // sqrdmulh v16.8H, v31.8H, v11.H[1] // ....*............ - // sub v26.8H, v23.8H, v21.8H // .........*....... - // mul v22.8H, v31.8H, v11.H[0] // .....*........... - // str q30, [x1, #-48] // ..............*.. - // mls v22.8H, v16.8H, v7.H[0] // ........*........ - // mul v21.8H, v26.8H, v11.H[0] // ............*.... - // sqrdmulh v2.8H, v26.8H, v11.H[1] // .............*... - // str q22, [x1, #-32] // ...........*..... - // mls v21.8H, v2.8H, v7.H[0] // ...............*. - // str q21, [x1, #-16] // ................* + // sqdmulh v23.8H, v28.8H, v7.H[1] // *............. + // str q27, [x1, #-32] // ..*........... + // srshr v23.8H, v23.8H, #11 // ...*.......... + // sqdmulh v24.8H, v8.8H, v7.H[1] // .*............ + // mls v28.8H, v23.8H, v7.H[0] // .....*........ 
+ // srshr v24.8H, v24.8H, #11 // ....*......... + // mls v8.8H, v24.8H, v7.H[0] // ......*....... + // sub v30.8H, v28.8H, v8.8H // .......*...... + // add v28.8H, v28.8H, v8.8H // ........*..... + // sqrdmulh v26.8H, v30.8H, v11.H[1] // .........*.... + // str q28, [x1, #-48] // ...........*.. + // mul v3.8H, v30.8H, v11.H[0] // ..........*... + // mls v3.8H, v26.8H, v7.H[0] // ............*. + // str q3, [x1, #-16] // .............* // --------------------------------------------------------------------- @@ -866,616 +902,580 @@ layer4567_start: .p2align 2 - ldr q20, [x0, #64] // *...... - // gap // ....... - // gap // ....... - // gap // ....... - ldr q23, [x0, #128] // .*..... - // gap // ....... - // gap // ....... - // gap // ....... - ldr q21, [x0, #192] // ..*.... - // gap // ....... - // gap // ....... - // gap // ....... - ldr q17, [x0, #256] // ...*... - // gap // ....... - // gap // ....... - // gap // ....... - ldr q4, [x0, #320] // ....*.. - // gap // ....... - // gap // ....... - // gap // ....... - ldr q13, [x0, #384] // .....*. - // gap // ....... - // gap // ....... - // gap // ....... - ldr q3, [x0, #448] // ......* - // gap // ....... + ldr q3, [x0, #256] // *........... + // gap // ............ + // gap // ............ + // gap // ............ + ldr q11, [x0, #448] // .*.......... + // gap // ............ + // gap // ............ + // gap // ............ + ldr q20, [x0, #384] // ..*......... + // gap // ............ + // gap // ............ + // gap // ............ + ldr q26, [x0, #320] // ...*........ + // gap // ............ + // gap // ............ + // gap // ............ + add v5.8H, v20.8H, v11.8H // .........*.. + // gap // ............ + ldr q28, [x0, #192] // .....*...... + // gap // ............ + // gap // ............ + // gap // ............ + add v25.8H, v3.8H, v26.8H // ........*... + // gap // ............ + ldr q22, [x0, #128] // ......*..... + // gap // ............ + // gap // ............ + // gap // ............ 
+ add v14.8H, v25.8H, v5.8H // ...........* + // gap // ............ + ldr q23, [x0, #64] // ....*....... + // gap // ............ + // gap // ............ + // gap // ............ + add v24.8H, v22.8H, v28.8H // .......*.... + // gap // ............ + ldr q27, [x0, #0] // ..........*. + // gap // ............ // original source code - // ldr q20, [x0, #64] // *...... - // ldr q23, [x0, #128] // .*..... - // ldr q21, [x0, #192] // ..*.... - // ldr q17, [x0, #256] // ...*... - // ldr q4, [x0, #320] // ....*.. - // ldr q13, [x0, #384] // .....*. - // ldr q3, [x0, #448] // ......* + // ldr q3, [x0, #256] // *........... + // ldr q11, [x0, #448] // .*.......... + // ldr q20, [x0, #384] // ..*......... + // ldr q26, [x0, #320] // ...*........ + // ldr q23, [x0, #64] // .........*.. + // ldr q28, [x0, #192] // .....*...... + // ldr q22, [x0, #128] // .......*.... + // add v24.8H, v22.8H, v28.8H // ..........*. + // add v25.8H, v3.8H, v26.8H // ......*..... + // add v5.8H, v20.8H, v11.8H // ....*....... + // ldr q27, [x0, #0] // ...........* + // add v14.8H, v25.8H, v5.8H // ........*... sub count, count, #1 layer123_start: - ldr q16, [x0, #0] // *............................................................................................. - // gap // .............................................................................................. - // gap // .............................................................................................. - // gap // .............................................................................................. - sub v2.8H, v23.8H, v21.8H // .............*................................................................................ - // gap // .............................................................................................. - add v23.8H, v23.8H, v21.8H // ..............*............................................................................... 
- // gap // .............................................................................................. - sub v21.8H, v16.8H, v20.8H // ........*..................................................................................... - // gap // .............................................................................................. - add v16.8H, v16.8H, v20.8H // .........*.................................................................................... - // gap // .............................................................................................. - mul v26.8H, v2.8H, v1.H[0] // ...............*.............................................................................. - // gap // .............................................................................................. - sqrdmulh v2.8H, v2.8H, v1.H[1] // ................*............................................................................. - // gap // .............................................................................................. - sub v20.8H, v16.8H, v23.8H // ............................*................................................................. - // gap // .............................................................................................. - add v16.8H, v16.8H, v23.8H // .............................*................................................................ - // gap // .............................................................................................. - mul v23.8H, v21.8H, v0.H[6] // ..........*................................................................................... - // gap // .............................................................................................. - sqrdmulh v21.8H, v21.8H, v0.H[7] // ...........*.................................................................................. - // gap // .............................................................................................. 
- mls v26.8H, v2.8H, v7.H[0] // .................*............................................................................ - // gap // .............................................................................................. - sub v2.8H, v17.8H, v4.8H // ..................*........................................................................... - // gap // .............................................................................................. - add v17.8H, v17.8H, v4.8H // ...................*.......................................................................... - // gap // .............................................................................................. - mls v23.8H, v21.8H, v7.H[0] // ............*................................................................................. - // gap // .............................................................................................. - mul v21.8H, v2.8H, v1.H[2] // ....................*......................................................................... - // gap // .............................................................................................. - mul v4.8H, v20.8H, v0.H[2] // ..............................*............................................................... - // gap // .............................................................................................. - sqrdmulh v20.8H, v20.8H, v0.H[3] // ...............................*.............................................................. - // gap // .............................................................................................. - sqdmulh v25.8H, v16.8H, v7.H[1] // ................................................*............................................. - // gap // .............................................................................................. 
- sqrdmulh v2.8H, v2.8H, v1.H[3] // .....................*........................................................................ - // gap // .............................................................................................. - sub v11.8H, v13.8H, v3.8H // .......................*...................................................................... - // gap // .............................................................................................. - add v13.8H, v13.8H, v3.8H // ........................*..................................................................... - // gap // .............................................................................................. - srshr v25.8H, v25.8H, #11 // .................................................*............................................ - // gap // .............................................................................................. - mls v21.8H, v2.8H, v7.H[0] // ......................*....................................................................... - // gap // .............................................................................................. - sub v2.8H, v23.8H, v26.8H // .................................*............................................................ - // gap // .............................................................................................. - mls v16.8H, v25.8H, v7.H[0] // ..................................................*........................................... - // gap // .............................................................................................. - add v23.8H, v23.8H, v26.8H // ..................................*........................................................... - // gap // .............................................................................................. 
- mul v26.8H, v11.8H, v1.H[4] // .........................*.................................................................... - // gap // .............................................................................................. - sub v25.8H, v17.8H, v13.8H // ......................................*....................................................... - // gap // .............................................................................................. - add v17.8H, v17.8H, v13.8H // .......................................*...................................................... - // gap // .............................................................................................. - sqrdmulh v11.8H, v11.8H, v1.H[5] // ..........................*................................................................... - // gap // .............................................................................................. - mls v4.8H, v20.8H, v7.H[0] // ................................*............................................................. - // gap // .............................................................................................. - mul v20.8H, v2.8H, v0.H[2] // ...................................*.......................................................... - // gap // .............................................................................................. - sqrdmulh v2.8H, v2.8H, v0.H[3] // ....................................*......................................................... - // gap // .............................................................................................. - mls v26.8H, v11.8H, v7.H[0] // ...........................*.................................................................. - // gap // .............................................................................................. 
- mul v11.8H, v25.8H, v0.H[4] // ........................................*..................................................... - // gap // .............................................................................................. - sqrdmulh v25.8H, v25.8H, v0.H[5] // .........................................*.................................................... - // gap // .............................................................................................. - mls v20.8H, v2.8H, v7.H[0] // .....................................*........................................................ - // gap // .............................................................................................. - sub v2.8H, v21.8H, v26.8H // ...........................................*.................................................. - // gap // .............................................................................................. - add v21.8H, v21.8H, v26.8H // ............................................*................................................. - // gap // .............................................................................................. - mls v11.8H, v25.8H, v7.H[0] // ..........................................*................................................... - // gap // .............................................................................................. - mul v26.8H, v2.8H, v0.H[4] // .............................................*................................................ - // gap // .............................................................................................. - sqrdmulh v2.8H, v2.8H, v0.H[5] // ..............................................*............................................... - // gap // .............................................................................................. 
- sqdmulh v25.8H, v17.8H, v7.H[1] // ...................................................*.......................................... - // gap // .............................................................................................. - sub v13.8H, v23.8H, v21.8H // ...........................................................*.................................. - // gap // .............................................................................................. - add v23.8H, v23.8H, v21.8H // ............................................................*................................. - // gap // .............................................................................................. - mls v26.8H, v2.8H, v7.H[0] // ...............................................*.............................................. - // gap // .............................................................................................. - srshr v2.8H, v25.8H, #11 // ....................................................*......................................... - // gap // .............................................................................................. - mul v21.8H, v13.8H, v0.H[0] // .............................................................*................................ - // gap // .............................................................................................. - sqrdmulh v25.8H, v13.8H, v0.H[1] // ..............................................................*............................... - // gap // .............................................................................................. - mls v17.8H, v2.8H, v7.H[0] // .....................................................*........................................ - // gap // .............................................................................................. 
- sub v2.8H, v4.8H, v11.8H // ................................................................*............................. - // gap // .............................................................................................. - add v4.8H, v4.8H, v11.8H // .................................................................*............................ - // gap // .............................................................................................. - mls v21.8H, v25.8H, v7.H[0] // ...............................................................*.............................. - // gap // .............................................................................................. - sub v25.8H, v16.8H, v17.8H // ......................................................*....................................... - // gap // .............................................................................................. - add v16.8H, v16.8H, v17.8H // .......................................................*...................................... - // gap // .............................................................................................. - mul v17.8H, v2.8H, v0.H[0] // ..................................................................*........................... - // gap // .............................................................................................. - mul v11.8H, v25.8H, v0.H[0] // ........................................................*..................................... - // gap // .............................................................................................. - sqrdmulh v25.8H, v25.8H, v0.H[1] // .........................................................*.................................... - // gap // .............................................................................................. 
- sqrdmulh v2.8H, v2.8H, v0.H[1] // ...................................................................*.......................... - // gap // .............................................................................................. - sub v13.8H, v20.8H, v26.8H // .....................................................................*........................ - // gap // .............................................................................................. - add v26.8H, v20.8H, v26.8H // ......................................................................*....................... - // gap // .............................................................................................. - mls v11.8H, v25.8H, v7.H[0] // ..........................................................*................................... - // gap // .............................................................................................. - mls v17.8H, v2.8H, v7.H[0] // ....................................................................*......................... - // gap // .............................................................................................. - mul v2.8H, v13.8H, v0.H[0] // .......................................................................*...................... - // gap // .............................................................................................. - sqrdmulh v20.8H, v13.8H, v0.H[1] // ........................................................................*..................... - // gap // .............................................................................................. - str q11, [x0, #256] // ..........................................................................*................... - // gap // .............................................................................................. 
- mul v25.8H, v16.8H, v29.8H // ..............................................................................*............... - // gap // .............................................................................................. - str q21, [x0, #320] // ...........................................................................*.................. - // gap // .............................................................................................. - mls v2.8H, v20.8H, v7.H[0] // .........................................................................*.................... - // gap // .............................................................................................. - str q17, [x0, #384] // ............................................................................*................. - // gap // .............................................................................................. - sqrdmulh v16.8H, v16.8H, v30.8H // ...............................................................................*.............. - // gap // .............................................................................................. - mul v21.8H, v23.8H, v29.8H // .................................................................................*............ - // gap // .............................................................................................. - str q2, [x0, #448] // .............................................................................*................ - // gap // .............................................................................................. - sqrdmulh v2.8H, v23.8H, v30.8H // ..................................................................................*........... - // gap // .............................................................................................. - mls v25.8H, v16.8H, v7.H[0] // ................................................................................*............. 
- // gap // .............................................................................................. - mul v16.8H, v4.8H, v29.8H // ....................................................................................*......... - // gap // .............................................................................................. - sqrdmulh v23.8H, v4.8H, v30.8H // .....................................................................................*........ - // gap // .............................................................................................. - mls v21.8H, v2.8H, v7.H[0] // ...................................................................................*.......... - // gap // .............................................................................................. - mul v2.8H, v26.8H, v29.8H // .......................................................................................*...... - // gap // .............................................................................................. - sqrdmulh v26.8H, v26.8H, v30.8H // ........................................................................................*..... - // gap // .............................................................................................. - mls v16.8H, v23.8H, v7.H[0] // ......................................................................................*....... - // gap // .............................................................................................. - str q25, [x0], #(16) // ..........................................................................................*... - // gap // .............................................................................................. - ldr q20, [x0, #64] // .e............................................................................................ - // gap // .............................................................................................. 
- // gap // .............................................................................................. - // gap // .............................................................................................. - mls v2.8H, v26.8H, v7.H[0] // .........................................................................................*.... - // gap // .............................................................................................. - str q21, [x0, #48] // ...........................................................................................*.. - // gap // .............................................................................................. - ldr q23, [x0, #128] // ..e........................................................................................... - // gap // .............................................................................................. - // gap // .............................................................................................. - // gap // .............................................................................................. - str q16, [x0, #112] // ............................................................................................*. - // gap // .............................................................................................. - ldr q21, [x0, #192] // ...e.......................................................................................... - // gap // .............................................................................................. - // gap // .............................................................................................. - // gap // .............................................................................................. 
- str q2, [x0, #176] // .............................................................................................* - // gap // .............................................................................................. - ldr q17, [x0, #256] // ....e......................................................................................... - // gap // .............................................................................................. - // gap // .............................................................................................. - // gap // .............................................................................................. - ldr q4, [x0, #320] // .....e........................................................................................ - // gap // .............................................................................................. - // gap // .............................................................................................. - // gap // .............................................................................................. - ldr q13, [x0, #384] // ......e....................................................................................... - // gap // .............................................................................................. - // gap // .............................................................................................. - // gap // .............................................................................................. - ldr q3, [x0, #448] // .......e...................................................................................... - // gap // .............................................................................................. - // gap // .............................................................................................. - // gap // .............................................................................................. 
+ sub v19.8H, v27.8H, v23.8H // ........*............................................................................... + // gap // ........................................................................................ + add v23.8H, v27.8H, v23.8H // .........*.............................................................................. + // gap // ........................................................................................ + sub v22.8H, v22.8H, v28.8H // .............*.......................................................................... + // gap // ........................................................................................ + mul v28.8H, v19.8H, v0.H[6] // ..........*............................................................................. + // gap // ........................................................................................ + sqrdmulh v19.8H, v19.8H, v0.H[7] // ...........*............................................................................ + // gap // ........................................................................................ + sub v27.8H, v23.8H, v24.8H // ............................*........................................................... + // gap // ........................................................................................ + add v23.8H, v23.8H, v24.8H // .............................*.......................................................... + // gap // ........................................................................................ + mul v24.8H, v22.8H, v1.H[0] // ...............*........................................................................ + // gap // ........................................................................................ + sqrdmulh v22.8H, v22.8H, v1.H[1] // ................*....................................................................... 
+ // gap // ........................................................................................ + mls v28.8H, v19.8H, v7.H[0] // ............*........................................................................... + // gap // ........................................................................................ + sub v19.8H, v3.8H, v26.8H // ..................*..................................................................... + // gap // ........................................................................................ + mul v3.8H, v27.8H, v0.H[2] // ..............................*......................................................... + // gap // ........................................................................................ + sqrdmulh v27.8H, v27.8H, v0.H[3] // ...............................*........................................................ + // gap // ........................................................................................ + sub v26.8H, v23.8H, v14.8H // ................................................*....................................... + // gap // ........................................................................................ + add v23.8H, v23.8H, v14.8H // .................................................*...................................... + // gap // ........................................................................................ + mls v24.8H, v22.8H, v7.H[0] // .................*...................................................................... + // gap // ........................................................................................ + mul v22.8H, v19.8H, v1.H[2] // ....................*................................................................... + // gap // ........................................................................................ 
+ sqrdmulh v19.8H, v19.8H, v1.H[3] // .....................*.................................................................. + // gap // ........................................................................................ + sub v20.8H, v20.8H, v11.8H // .......................*................................................................ + // gap // ........................................................................................ + sub v11.8H, v28.8H, v24.8H // .................................*...................................................... + // gap // ........................................................................................ + add v28.8H, v28.8H, v24.8H // ..................................*..................................................... + // gap // ........................................................................................ + mls v22.8H, v19.8H, v7.H[0] // ......................*................................................................. + // gap // ........................................................................................ + mul v19.8H, v20.8H, v1.H[4] // .........................*.............................................................. + // gap // ........................................................................................ + mls v3.8H, v27.8H, v7.H[0] // ................................*....................................................... + // gap // ........................................................................................ + sqrdmulh v27.8H, v20.8H, v1.H[5] // ..........................*............................................................. + // gap // ........................................................................................ + mul v24.8H, v11.8H, v0.H[2] // ...................................*.................................................... 
+ // gap // ........................................................................................ + sqrdmulh v20.8H, v11.8H, v0.H[3] // ....................................*................................................... + // gap // ........................................................................................ + mul v4.8H, v26.8H, v0.H[0] // ..................................................*..................................... + // gap // ........................................................................................ + sqrdmulh v11.8H, v26.8H, v0.H[1] // ...................................................*.................................... + // gap // ........................................................................................ + mul v26.8H, v23.8H, v29.8H // ........................................................................*............... + // gap // ........................................................................................ + sqrdmulh v6.8H, v23.8H, v30.8H // .........................................................................*.............. + // gap // ........................................................................................ + mls v19.8H, v27.8H, v7.H[0] // ...........................*............................................................ + // gap // ........................................................................................ + mls v24.8H, v20.8H, v7.H[0] // .....................................*.................................................. + // gap // ........................................................................................ + sub v27.8H, v25.8H, v5.8H // ......................................*................................................. + // gap // ........................................................................................ 
+ mls v4.8H, v11.8H, v7.H[0] // ....................................................*................................... + // gap // ........................................................................................ + sub v23.8H, v22.8H, v19.8H // ...........................................*............................................ + // gap // ........................................................................................ + mul v20.8H, v27.8H, v0.H[4] // ........................................*............................................... + // gap // ........................................................................................ + sqrdmulh v11.8H, v27.8H, v0.H[5] // .........................................*.............................................. + // gap // ........................................................................................ + add v27.8H, v22.8H, v19.8H // ............................................*........................................... + // gap // ........................................................................................ + mul v22.8H, v23.8H, v0.H[4] // .............................................*.......................................... + // gap // ........................................................................................ + sqrdmulh v23.8H, v23.8H, v0.H[5] // ..............................................*......................................... + // gap // ........................................................................................ + sub v19.8H, v28.8H, v27.8H // .....................................................*.................................. + // gap // ........................................................................................ + add v10.8H, v28.8H, v27.8H // ......................................................*................................. 
+ // gap // ........................................................................................ + mls v20.8H, v11.8H, v7.H[0] // ..........................................*............................................. + // gap // ........................................................................................ + mls v22.8H, v23.8H, v7.H[0] // ...............................................*........................................ + // gap // ........................................................................................ + mul v28.8H, v19.8H, v0.H[0] // .......................................................*................................ + // gap // ........................................................................................ + sqrdmulh v23.8H, v19.8H, v0.H[1] // ........................................................*............................... + // gap // ........................................................................................ + sub v14.8H, v3.8H, v20.8H // ..........................................................*............................. + // gap // ........................................................................................ + add v27.8H, v3.8H, v20.8H // ...........................................................*............................ + // gap // ........................................................................................ + ldr q3, [x0, #272] // ....e................................................................................... + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + sub v20.8H, v24.8H, v22.8H // ...............................................................*........................ 
+ // gap // ........................................................................................ + add v21.8H, v24.8H, v22.8H // ................................................................*....................... + // gap // ........................................................................................ + mls v26.8H, v6.8H, v7.H[0] // ..........................................................................*............. + // gap // ........................................................................................ + mul v22.8H, v20.8H, v0.H[0] // .................................................................*...................... + // gap // ........................................................................................ + sqrdmulh v19.8H, v20.8H, v0.H[1] // ..................................................................*..................... + // gap // ........................................................................................ + mls v28.8H, v23.8H, v7.H[0] // .........................................................*.............................. + // gap // ........................................................................................ + ldr q11, [x0, #464] // .......e................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + mls v22.8H, v19.8H, v7.H[0] // ...................................................................*.................... + // gap // ........................................................................................ + str q28, [x0, #320] // .....................................................................*.................. 
+ // gap // ........................................................................................ + mul v23.8H, v10.8H, v29.8H // ...........................................................................*............ + // gap // ........................................................................................ + sqrdmulh v19.8H, v10.8H, v30.8H // ............................................................................*........... + // gap // ........................................................................................ + str q22, [x0, #448] // .......................................................................*................ + // gap // ........................................................................................ + mul v28.8H, v27.8H, v29.8H // ..............................................................................*......... + // gap // ........................................................................................ + sqrdmulh v20.8H, v27.8H, v30.8H // ...............................................................................*........ + // gap // ........................................................................................ + str q26, [x0], #(16) // ....................................................................................*... + // gap // ........................................................................................ + mls v23.8H, v19.8H, v7.H[0] // .............................................................................*.......... + // gap // ........................................................................................ + mul v19.8H, v21.8H, v29.8H // .................................................................................*...... + // gap // ........................................................................................ 
+ sqrdmulh v22.8H, v21.8H, v30.8H // ..................................................................................*..... + // gap // ........................................................................................ + mls v28.8H, v20.8H, v7.H[0] // ................................................................................*....... + // gap // ........................................................................................ + ldr q20, [x0, #384] // ......e................................................................................. + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + ldr q26, [x0, #320] // .....e.................................................................................. + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + str q23, [x0, #48] // .....................................................................................*.. + // gap // ........................................................................................ + ldr q23, [x0, #64] // .e...................................................................................... + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ 
+ mls v19.8H, v22.8H, v7.H[0] // ...................................................................................*.... + // gap // ........................................................................................ + str q28, [x0, #112] // ......................................................................................*. + // gap // ........................................................................................ + ldr q28, [x0, #192] // ...e.................................................................................... + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + str q19, [x0, #176] // .......................................................................................* + // gap // ........................................................................................ + ldr q22, [x0, #128] // ..e..................................................................................... + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + mul v8.8H, v14.8H, v0.H[0] // ............................................................*........................... + // gap // ........................................................................................ + sqrdmulh v27.8H, v14.8H, v0.H[1] // .............................................................*.......................... + // gap // ........................................................................................ 
+ add v24.8H, v22.8H, v28.8H // ..............e......................................................................... + // gap // ........................................................................................ + str q4, [x0, #240] // ....................................................................*................... + // gap // ........................................................................................ + add v25.8H, v3.8H, v26.8H // ...................e.................................................................... + // gap // ........................................................................................ + mls v8.8H, v27.8H, v7.H[0] // ..............................................................*......................... + // gap // ........................................................................................ + add v5.8H, v20.8H, v11.8H // ........................e............................................................... + // gap // ........................................................................................ + ldr q27, [x0, #0] // e....................................................................................... + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + str q8, [x0, #368] // ......................................................................*................. + // gap // ........................................................................................ + add v14.8H, v25.8H, v5.8H // .......................................e................................................ + // gap // ........................................................................................ 
// original source code - // ldr q8, [x0, #0] // ...........*......................................................................................... - // ldr q9, [x0, #(1*(512/8))] // e..........|..................................................................................e...... - // ldr q10, [x0, #(2*(512/8))] // ...e.......|.....................................................................................e... - // ldr q11, [x0, #(3*(512/8))] // .....e.....|.......................................................................................e. - // ldr q12, [x0, #(4*(512/8))] // .......e...|......................................................................................... - // ldr q13, [x0, #(5*(512/8))] // ........e..|......................................................................................... - // ldr q14, [x0, #(6*(512/8))] // .........e.|......................................................................................... - // ldr q15, [x0, #(7*(512/8))] // ..........e|......................................................................................... - // sub v24.8h, v8.8h, v9.8h // ...........|..*...................................................................................... - // add v8.8h, v8.8h, v9.8h // ...........|...*..................................................................................... - // mul v9.8h, v24.8h, v0.h[6] // ...........|........*................................................................................ - // sqrdmulh v24.8h, v24.8h, v0.h[7] // ...........|.........*............................................................................... - // mls v9.8h, v24.8h, v7.h[0] // ...........|.............*........................................................................... - // sub v24.8h, v10.8h, v11.8h // ...........|*........................................................................................ 
- // add v10.8h, v10.8h, v11.8h // ...........|.*....................................................................................... - // mul v11.8h, v24.8h, v1.h[0] // ...........|....*.................................................................................... - // sqrdmulh v24.8h, v24.8h, v1.h[1] // ...........|.....*................................................................................... - // mls v11.8h, v24.8h, v7.h[0] // ...........|..........*.............................................................................. - // sub v24.8h, v12.8h, v13.8h // ...........|...........*............................................................................. - // add v12.8h, v12.8h, v13.8h // ...........|............*............................................................................ - // mul v13.8h, v24.8h, v1.h[2] // ...........|..............*.......................................................................... - // sqrdmulh v24.8h, v24.8h, v1.h[3] // ...........|..................*...................................................................... - // mls v13.8h, v24.8h, v7.h[0] // ...........|......................*.................................................................. - // sub v24.8h, v14.8h, v15.8h // ...........|...................*..................................................................... - // add v14.8h, v14.8h, v15.8h // ...........|....................*.................................................................... - // mul v15.8h, v24.8h, v1.h[4] // ...........|..........................*.............................................................. - // sqrdmulh v24.8h, v24.8h, v1.h[5] // ...........|.............................*........................................................... - // mls v15.8h, v24.8h, v7.h[0] // ...........|.................................*....................................................... 
- // sub v24.8h, v8.8h, v10.8h // ...........|......*.................................................................................. - // add v8.8h, v8.8h, v10.8h // ...........|.......*................................................................................. - // mul v10.8h, v24.8h, v0.h[2] // ...........|...............*......................................................................... - // sqrdmulh v24.8h, v24.8h, v0.h[3] // ...........|................*........................................................................ - // mls v10.8h, v24.8h, v7.h[0] // ...........|..............................*.......................................................... - // sub v24.8h, v9.8h, v11.8h // ...........|.......................*................................................................. - // add v9.8h, v9.8h, v11.8h // ...........|.........................*............................................................... - // mul v11.8h, v24.8h, v0.h[2] // ...........|...............................*......................................................... - // sqrdmulh v24.8h, v24.8h, v0.h[3] // ...........|................................*........................................................ - // mls v11.8h, v24.8h, v7.h[0] // ...........|....................................*.................................................... - // sub v24.8h, v12.8h, v14.8h // ...........|...........................*............................................................. - // add v12.8h, v12.8h, v14.8h // ...........|............................*............................................................ - // mul v14.8h, v24.8h, v0.h[4] // ...........|..................................*...................................................... - // sqrdmulh v24.8h, v24.8h, v0.h[5] // ...........|...................................*..................................................... 
- // mls v14.8h, v24.8h, v7.h[0] // ...........|.......................................*................................................. - // sub v24.8h, v13.8h, v15.8h // ...........|.....................................*................................................... - // add v13.8h, v13.8h, v15.8h // ...........|......................................*.................................................. - // mul v15.8h, v24.8h, v0.h[4] // ...........|........................................*................................................ - // sqrdmulh v24.8h, v24.8h, v0.h[5] // ...........|.........................................*............................................... - // mls v15.8h, v24.8h, v7.h[0] // ...........|.............................................*........................................... - // sqdmulh v25.8h, v8.8h, v7.h[1] // ...........|.................*....................................................................... - // srshr v25.8h, v25.8h, #11 // ...........|.....................*................................................................... - // mls v8.8h, v25.8h, v7.h[0] // ...........|........................*................................................................ - // sqdmulh v25.8h, v12.8h, v7.h[1] // ...........|..........................................*.............................................. - // srshr v25.8h, v25.8h, #11 // ...........|..............................................*.......................................... - // mls v12.8h, v25.8h, v7.h[0] // ...........|.................................................*....................................... - // sub v24.8h, v8.8h, v12.8h // ...........|.....................................................*................................... - // add v8.8h, v8.8h, v12.8h // ...........|......................................................*.................................. 
- // mul v12.8h, v24.8h, v0.h[0] // ...........|........................................................*................................ - // sqrdmulh v24.8h, v24.8h, v0.h[1] // ...........|.........................................................*............................... - // mls v12.8h, v24.8h, v7.h[0] // ...........|.............................................................*........................... - // sub v24.8h, v9.8h, v13.8h // ...........|...........................................*............................................. - // add v9.8h, v9.8h, v13.8h // ...........|............................................*............................................ - // mul v13.8h, v24.8h, v0.h[0] // ...........|...............................................*......................................... - // sqrdmulh v24.8h, v24.8h, v0.h[1] // ...........|................................................*........................................ - // mls v13.8h, v24.8h, v7.h[0] // ...........|....................................................*.................................... - // sub v24.8h, v10.8h, v14.8h // ...........|..................................................*...................................... - // add v10.8h, v10.8h, v14.8h // ...........|...................................................*..................................... - // mul v14.8h, v24.8h, v0.h[0] // ...........|.......................................................*................................. - // sqrdmulh v24.8h, v24.8h, v0.h[1] // ...........|..........................................................*.............................. - // mls v14.8h, v24.8h, v7.h[0] // ...........|..............................................................*.......................... - // sub v24.8h, v11.8h, v15.8h // ...........|...........................................................*............................. 
- // add v11.8h, v11.8h, v15.8h // ...........|............................................................*............................ - // mul v15.8h, v24.8h, v0.h[0] // ...........|...............................................................*......................... - // sqrdmulh v24.8h, v24.8h, v0.h[1] // ...........|................................................................*........................ - // mls v15.8h, v24.8h, v7.h[0] // ...........|....................................................................*.................... - // str q12, [x0, #(4*(512/8))] // ...........|.................................................................*....................... - // str q13, [x0, #(5*(512/8))] // ...........|...................................................................*..................... - // str q14, [x0, #(6*(512/8))] // ...........|.....................................................................*................... - // str q15, [x0, #(7*(512/8))] // ...........|........................................................................*................ - // mul v12.8h, v8.8h, v29.8h // ...........|..................................................................*...................... - // sqrdmulh v8.8h, v8.8h, v30.8h // ...........|......................................................................*.................. - // mls v12.8h, v8.8h, v7.h[0] // ...........|..........................................................................*.............. - // mul v13.8h, v9.8h, v29.8h // ...........|.......................................................................*................. - // sqrdmulh v9.8h, v9.8h, v30.8h // ...........|.........................................................................*............... - // mls v13.8h, v9.8h, v7.h[0] // ...........|.............................................................................*........... 
- // mul v14.8h, v10.8h, v29.8h // ...........|...........................................................................*............. - // sqrdmulh v10.8h, v10.8h, v30.8h // ...........|............................................................................*............ - // mls v14.8h, v10.8h, v7.h[0] // ...........|................................................................................*........ - // mul v15.8h, v11.8h, v29.8h // ...........|..............................................................................*.......... - // sqrdmulh v11.8h, v11.8h, v30.8h // ...........|...............................................................................*......... - // mls v15.8h, v11.8h, v7.h[0] // .*.........|...................................................................................*..... - // str q12, [x0], #(16) // ...........|.................................................................................*....... - // str q13, [x0, #(-16 + 1*(512/8))] // ..*........|....................................................................................*.... - // str q14, [x0, #(-16 + 2*(512/8))] // ....*......|......................................................................................*.. - // str q15, [x0, #(-16 + 3*(512/8))] // ......*....|........................................................................................* + // ldr q8, [x0, #0] // ....................................e..|....................................................................................e. + // ldr q9, [x0, #(1*(512/8))] // .......................e...............|.......................................................................e.............. + // ldr q10, [x0, #(2*(512/8))] // ............................e..........|............................................................................e......... 
+ // ldr q11, [x0, #(3*(512/8))] // ..........................e............|..........................................................................e........... + // ldr q12, [x0, #(4*(512/8))] // e......................................|................................................e..................................... + // ldr q13, [x0, #(5*(512/8))] // .....................e.................|.....................................................................e................ + // ldr q14, [x0, #(6*(512/8))] // ....................e..................|....................................................................e................. + // ldr q15, [x0, #(7*(512/8))] // .......e...............................|.......................................................e.............................. + // sub v24.8h, v8.8h, v9.8h // .......................................*...................................................................................... + // add v8.8h, v8.8h, v9.8h // .......................................|*..................................................................................... + // mul v9.8h, v24.8h, v0.h[6] // .......................................|..*................................................................................... + // sqrdmulh v24.8h, v24.8h, v0.h[7] // .......................................|...*.................................................................................. + // mls v9.8h, v24.8h, v7.h[0] // .......................................|........*............................................................................. + // sub v24.8h, v10.8h, v11.8h // .......................................|.*.................................................................................... + // add v10.8h, v10.8h, v11.8h // ...............................e.......|...............................................................................e...... 
+ // mul v11.8h, v24.8h, v1.h[0] // .......................................|......*............................................................................... + // sqrdmulh v24.8h, v24.8h, v1.h[1] // .......................................|.......*.............................................................................. + // mls v11.8h, v24.8h, v7.h[0] // .......................................|..............*....................................................................... + // sub v24.8h, v12.8h, v13.8h // .......................................|.........*............................................................................ + // add v12.8h, v12.8h, v13.8h // .................................e.....|.................................................................................e.... + // mul v13.8h, v24.8h, v1.h[2] // .......................................|...............*...................................................................... + // sqrdmulh v24.8h, v24.8h, v1.h[3] // .......................................|................*..................................................................... + // mls v13.8h, v24.8h, v7.h[0] // .......................................|....................*................................................................. + // sub v24.8h, v14.8h, v15.8h // .......................................|.................*.................................................................... + // add v14.8h, v14.8h, v15.8h // ...................................e...|...................................................................................e.. + // mul v15.8h, v24.8h, v1.h[4] // .......................................|.....................*................................................................ + // sqrdmulh v24.8h, v24.8h, v1.h[5] // .......................................|.......................*.............................................................. 
+ // mls v15.8h, v24.8h, v7.h[0] // .......................................|..............................*....................................................... + // sub v24.8h, v8.8h, v10.8h // .......................................|....*................................................................................. + // add v8.8h, v8.8h, v10.8h // .......................................|.....*................................................................................ + // mul v10.8h, v24.8h, v0.h[2] // .......................................|..........*........................................................................... + // sqrdmulh v24.8h, v24.8h, v0.h[3] // .......................................|...........*.......................................................................... + // mls v10.8h, v24.8h, v7.h[0] // .......................................|......................*............................................................... + // sub v24.8h, v9.8h, v11.8h // .......................................|..................*................................................................... + // add v9.8h, v9.8h, v11.8h // .......................................|...................*.................................................................. + // mul v11.8h, v24.8h, v0.h[2] // .......................................|........................*............................................................. + // sqrdmulh v24.8h, v24.8h, v0.h[3] // .......................................|.........................*............................................................ + // mls v11.8h, v24.8h, v7.h[0] // .......................................|...............................*...................................................... + // sub v24.8h, v12.8h, v14.8h // .......................................|................................*..................................................... 
+ // add v12.8h, v12.8h, v14.8h // ......................................e|...................................................................................... + // mul v14.8h, v24.8h, v0.h[4] // .......................................|...................................*.................................................. + // sqrdmulh v24.8h, v24.8h, v0.h[5] // .......................................|....................................*................................................. + // mls v14.8h, v24.8h, v7.h[0] // .......................................|..........................................*........................................... + // sub v24.8h, v13.8h, v15.8h // .......................................|..................................*................................................... + // add v13.8h, v13.8h, v15.8h // .......................................|.....................................*................................................ + // mul v15.8h, v24.8h, v0.h[4] // .......................................|......................................*............................................... + // sqrdmulh v24.8h, v24.8h, v0.h[5] // .......................................|.......................................*.............................................. + // mls v15.8h, v24.8h, v7.h[0] // .......................................|...........................................*.......................................... + // sub v24.8h, v8.8h, v12.8h // .......................................|............*......................................................................... + // add v8.8h, v8.8h, v12.8h // .......................................|.............*........................................................................ + // mul v12.8h, v24.8h, v0.h[0] // .......................................|..........................*........................................................... 
+ // sqrdmulh v24.8h, v24.8h, v0.h[1] // .......................................|...........................*.......................................................... + // mls v12.8h, v24.8h, v7.h[0] // .......................................|.................................*.................................................... + // sub v24.8h, v9.8h, v13.8h // .......................................|........................................*............................................. + // add v9.8h, v9.8h, v13.8h // .......................................|.........................................*............................................ + // mul v13.8h, v24.8h, v0.h[0] // .......................................|............................................*......................................... + // sqrdmulh v24.8h, v24.8h, v0.h[1] // .......................................|.............................................*........................................ + // mls v13.8h, v24.8h, v7.h[0] // ......*................................|......................................................*............................... + // sub v24.8h, v10.8h, v14.8h // .......................................|..............................................*....................................... + // add v10.8h, v10.8h, v14.8h // .......................................|...............................................*...................................... + // mul v14.8h, v24.8h, v0.h[0] // .............................*.........|.............................................................................*........ + // sqrdmulh v24.8h, v24.8h, v0.h[1] // ..............................*........|..............................................................................*....... + // mls v14.8h, v24.8h, v7.h[0] // ..................................*....|..................................................................................*... 
+ // sub v24.8h, v11.8h, v15.8h // .*.....................................|.................................................*.................................... + // add v11.8h, v11.8h, v15.8h // ..*....................................|..................................................*................................... + // mul v15.8h, v24.8h, v0.h[0] // ....*..................................|....................................................*................................. + // sqrdmulh v24.8h, v24.8h, v0.h[1] // .....*.................................|.....................................................*................................ + // mls v15.8h, v24.8h, v7.h[0] // ........*..............................|........................................................*............................. + // str q12, [x0, #(4*(512/8))] // ................................*......|................................................................................*..... + // str q13, [x0, #(5*(512/8))] // .........*.............................|.........................................................*............................ + // str q14, [x0, #(6*(512/8))] // .....................................*.|.....................................................................................* + // str q15, [x0, #(7*(512/8))] // ............*..........................|............................................................*......................... + // mul v12.8h, v8.8h, v29.8h // .......................................|............................*......................................................... + // sqrdmulh v8.8h, v8.8h, v30.8h // .......................................|.............................*........................................................ + // mls v12.8h, v8.8h, v7.h[0] // ...*...................................|...................................................*.................................. 
+ // mul v13.8h, v9.8h, v29.8h // ..........*............................|..........................................................*........................... + // sqrdmulh v9.8h, v9.8h, v30.8h // ...........*...........................|...........................................................*.......................... + // mls v13.8h, v9.8h, v7.h[0] // ................*......................|................................................................*..................... + // mul v14.8h, v10.8h, v29.8h // .............*.........................|.............................................................*........................ + // sqrdmulh v10.8h, v10.8h, v30.8h // ..............*........................|..............................................................*....................... + // mls v14.8h, v10.8h, v7.h[0] // ...................*...................|...................................................................*.................. + // mul v15.8h, v11.8h, v29.8h // .................*.....................|.................................................................*.................... + // sqrdmulh v11.8h, v11.8h, v30.8h // ..................*....................|..................................................................*................... + // mls v15.8h, v11.8h, v7.h[0] // ........................*..............|........................................................................*............. + // str q12, [x0], #(16) // ...............*.......................|...............................................................*...................... + // str q13, [x0, #(-16 + 1*(512/8))] // ......................*................|......................................................................*............... + // str q14, [x0, #(-16 + 2*(512/8))] // .........................*.............|.........................................................................*............ 
+ // str q15, [x0, #(-16 + 3*(512/8))] // ...........................*...........|...........................................................................*.......... sub count, count, #1 cbnz count, layer123_start - ldr q28, [x0, #0] // *...................................................................................... - // gap // ....................................................................................... - // gap // ....................................................................................... - // gap // ....................................................................................... - sub v5.8H, v23.8H, v21.8H // .*..................................................................................... - // gap // ....................................................................................... - add v6.8H, v23.8H, v21.8H // ..*.................................................................................... - // gap // ....................................................................................... - add v2.8H, v28.8H, v20.8H // ....*.................................................................................. - // gap // ....................................................................................... - mul v23.8H, v5.8H, v1.H[0] // .....*................................................................................. - // gap // ....................................................................................... - sub v11.8H, v28.8H, v20.8H // ...*................................................................................... - // gap // ....................................................................................... - add v8.8H, v2.8H, v6.8H // ........*.............................................................................. - // gap // ....................................................................................... 
- sub v16.8H, v2.8H, v6.8H // .......*............................................................................... - // gap // ....................................................................................... - sqrdmulh v24.8H, v11.8H, v0.H[7] // ..........*............................................................................ - // gap // ....................................................................................... - sqdmulh v18.8H, v8.8H, v7.H[1] // ..................*.................................................................... - // gap // ....................................................................................... - mul v21.8H, v16.8H, v0.H[2] // ................*...................................................................... - // gap // ....................................................................................... - sqrdmulh v25.8H, v16.8H, v0.H[3] // .................*..................................................................... - // gap // ....................................................................................... - sub v6.8H, v17.8H, v4.8H // ............*.......................................................................... - // gap // ....................................................................................... - sqrdmulh v2.8H, v5.8H, v1.H[1] // ......*................................................................................ - // gap // ....................................................................................... - sub v26.8H, v13.8H, v3.8H // ....................*.................................................................. - // gap // ....................................................................................... - mls v21.8H, v25.8H, v7.H[0] // ...............................*....................................................... - // gap // ....................................................................................... 
- mul v20.8H, v6.8H, v1.H[2] // ...............*....................................................................... - // gap // ....................................................................................... - mls v23.8H, v2.8H, v7.H[0] // ...........*........................................................................... - // gap // ....................................................................................... - sqrdmulh v2.8H, v6.8H, v1.H[3] // ...................*................................................................... - // gap // ....................................................................................... - sqrdmulh v16.8H, v26.8H, v1.H[5] // ..............................*........................................................ - // gap // ....................................................................................... - mul v25.8H, v26.8H, v1.H[4] // ...........................*........................................................... - // gap // ....................................................................................... - add v10.8H, v13.8H, v3.8H // .....................*................................................................. - // gap // ....................................................................................... - mls v20.8H, v2.8H, v7.H[0] // .......................*............................................................... - // gap // ....................................................................................... - add v19.8H, v17.8H, v4.8H // .............*......................................................................... - // gap // ....................................................................................... - mls v25.8H, v16.8H, v7.H[0] // ..................................*.................................................... - // gap // ....................................................................................... 
- mul v3.8H, v11.8H, v0.H[6] // .........*............................................................................. - // gap // ....................................................................................... - sub v2.8H, v19.8H, v10.8H // ............................*.......................................................... - // gap // ....................................................................................... - add v10.8H, v19.8H, v10.8H // .............................*......................................................... - // gap // ....................................................................................... - sub v16.8H, v20.8H, v25.8H // ......................................*................................................ - // gap // ....................................................................................... - sqrdmulh v17.8H, v2.8H, v0.H[5] // ....................................*.................................................. - // gap // ....................................................................................... - mul v11.8H, v2.8H, v0.H[4] // ...................................*................................................... - // gap // ....................................................................................... - mul v6.8H, v16.8H, v0.H[4] // .........................................*............................................. - // gap // ....................................................................................... - sqrdmulh v2.8H, v16.8H, v0.H[5] // ..........................................*............................................ - // gap // ....................................................................................... - sqdmulh v26.8H, v10.8H, v7.H[1] // ...........................................*........................................... - // gap // ....................................................................................... 
- mls v11.8H, v17.8H, v7.H[0] // ........................................*.............................................. - // gap // ....................................................................................... - srshr v16.8H, v18.8H, #11 // ......................*................................................................ - // gap // ....................................................................................... - mls v6.8H, v2.8H, v7.H[0] // ..............................................*........................................ - // gap // ....................................................................................... - srshr v2.8H, v26.8H, #11 // ...............................................*....................................... - // gap // ....................................................................................... - sub v17.8H, v21.8H, v11.8H // ...................................................*................................... - // gap // ....................................................................................... - mls v8.8H, v16.8H, v7.H[0] // .........................*............................................................. - // gap // ....................................................................................... - mls v10.8H, v2.8H, v7.H[0] // ..................................................*.................................... - // gap // ....................................................................................... - mls v3.8H, v24.8H, v7.H[0] // ..............*........................................................................ - // gap // ....................................................................................... - add v31.8H, v20.8H, v25.8H // .......................................*............................................... - // gap // ....................................................................................... 
- add v25.8H, v21.8H, v11.8H // ....................................................*.................................. - // gap // ....................................................................................... - add v13.8H, v8.8H, v10.8H // .......................................................*............................... - // gap // ....................................................................................... - sub v16.8H, v3.8H, v23.8H // ........................*.............................................................. - // gap // ....................................................................................... - sqrdmulh v2.8H, v25.8H, v30.8H // .............................................................................*......... - // gap // ....................................................................................... - mul v21.8H, v25.8H, v29.8H // ............................................................................*.......... - // gap // ....................................................................................... - sqrdmulh v26.8H, v16.8H, v0.H[3] // .................................*..................................................... - // gap // ....................................................................................... - mul v19.8H, v16.8H, v0.H[2] // ................................*...................................................... - // gap // ....................................................................................... - sub v16.8H, v8.8H, v10.8H // ......................................................*................................ - // gap // ....................................................................................... - mls v21.8H, v2.8H, v7.H[0] // .................................................................................*..... - // gap // ....................................................................................... 
- add v3.8H, v3.8H, v23.8H // ..........................*............................................................ - // gap // ....................................................................................... - mls v19.8H, v26.8H, v7.H[0] // .....................................*................................................. - // gap // ....................................................................................... - mul v25.8H, v16.8H, v0.H[0] // .........................................................*............................. - // gap // ....................................................................................... - str q21, [x0, #128] // .....................................................................................*. - // gap // ....................................................................................... - sqrdmulh v4.8H, v16.8H, v0.H[1] // ..........................................................*............................ - // gap // ....................................................................................... - sub v26.8H, v19.8H, v6.8H // ............................................................*.......................... - // gap // ....................................................................................... - add v20.8H, v19.8H, v6.8H // .............................................................*......................... - // gap // ....................................................................................... - mul v11.8H, v17.8H, v0.H[0] // ........................................................*.............................. - // gap // ....................................................................................... - sub v15.8H, v3.8H, v31.8H // ............................................*.......................................... - // gap // ....................................................................................... 
- sqrdmulh v2.8H, v20.8H, v30.8H // ................................................................................*...... - // gap // ....................................................................................... - mul v16.8H, v20.8H, v29.8H // ...............................................................................*....... - // gap // ....................................................................................... - sqrdmulh v23.8H, v15.8H, v0.H[1] // .................................................*..................................... - // gap // ....................................................................................... - mul v20.8H, v15.8H, v0.H[0] // ................................................*...................................... - // gap // ....................................................................................... - sqrdmulh v17.8H, v17.8H, v0.H[1] // ...........................................................*........................... - // gap // ....................................................................................... - mls v16.8H, v2.8H, v7.H[0] // ...................................................................................*... - // gap // ....................................................................................... - mul v21.8H, v13.8H, v29.8H // ...................................................................*................... - // gap // ....................................................................................... - mls v20.8H, v23.8H, v7.H[0] // .....................................................*................................. - // gap // ....................................................................................... - sqrdmulh v23.8H, v13.8H, v30.8H // .......................................................................*............... 
- // gap // ....................................................................................... - str q16, [x0, #192] // ......................................................................................* - // gap // ....................................................................................... - add v13.8H, v3.8H, v31.8H // .............................................*......................................... - // gap // ....................................................................................... - str q20, [x0, #320] // ....................................................................*.................. - // gap // ....................................................................................... - mls v21.8H, v23.8H, v7.H[0] // ...........................................................................*........... - // gap // ....................................................................................... - sqrdmulh v2.8H, v13.8H, v30.8H // ..........................................................................*............ - // gap // ....................................................................................... - sqrdmulh v23.8H, v26.8H, v0.H[1] // .................................................................*..................... - // gap // ....................................................................................... - mul v26.8H, v26.8H, v0.H[0] // ................................................................*...................... - // gap // ....................................................................................... - mul v16.8H, v13.8H, v29.8H // ........................................................................*.............. - // gap // ....................................................................................... - mls v11.8H, v17.8H, v7.H[0] // ...............................................................*....................... 
- // gap // ....................................................................................... - str q21, [x0], #(16) // ..................................................................................*.... - // gap // ....................................................................................... - mls v26.8H, v23.8H, v7.H[0] // .....................................................................*................. - // gap // ....................................................................................... - mls v25.8H, v4.8H, v7.H[0] // ..............................................................*........................ - // gap // ....................................................................................... - str q11, [x0, #368] // ......................................................................*................ - // gap // ....................................................................................... - mls v16.8H, v2.8H, v7.H[0] // ..............................................................................*........ - // gap // ....................................................................................... - str q26, [x0, #432] // .........................................................................*............. - // gap // ....................................................................................... - // gap // ....................................................................................... - // gap // ....................................................................................... - str q25, [x0, #240] // ..................................................................*.................... - // gap // ....................................................................................... - // gap // ....................................................................................... 
- // gap // ....................................................................................... - str q16, [x0, #48] // ....................................................................................*.. - // gap // ....................................................................................... + sub v10.8H, v22.8H, v28.8H // ..*......................................................................... + // gap // ............................................................................ + sub v18.8H, v27.8H, v23.8H // *........................................................................... + // gap // ............................................................................ + sub v19.8H, v20.8H, v11.8H // ..................*......................................................... + // gap // ............................................................................ + sqrdmulh v22.8H, v10.8H, v1.H[1] // ........*................................................................... + // gap // ............................................................................ + sqrdmulh v15.8H, v18.8H, v0.H[7] // ....*....................................................................... + // gap // ............................................................................ + mul v28.8H, v18.8H, v0.H[6] // ...*........................................................................ + // gap // ............................................................................ + mul v8.8H, v10.8H, v1.H[0] // .......*.................................................................... + // gap // ............................................................................ + sub v25.8H, v25.8H, v5.8H // .................................*.......................................... + // gap // ............................................................................ 
+ mul v10.8H, v19.8H, v1.H[4] // ......................*..................................................... + // gap // ............................................................................ + mls v28.8H, v15.8H, v7.H[0] // .........*.................................................................. + // gap // ............................................................................ + mls v8.8H, v22.8H, v7.H[0] // ...............*............................................................ + // gap // ............................................................................ + sqrdmulh v22.8H, v19.8H, v1.H[5] // ........................*................................................... + // gap // ............................................................................ + sqrdmulh v19.8H, v25.8H, v0.H[5] // .....................................*...................................... + // gap // ............................................................................ + add v11.8H, v27.8H, v23.8H // .*.......................................................................... + // gap // ............................................................................ + sub v20.8H, v28.8H, v8.8H // ...................*........................................................ + // gap // ............................................................................ + mls v10.8H, v22.8H, v7.H[0] // ...............................*............................................ + // gap // ............................................................................ + add v17.8H, v11.8H, v24.8H // ......*..................................................................... + // gap // ............................................................................ + sqrdmulh v27.8H, v20.8H, v0.H[3] // ..........................*................................................. + // gap // ............................................................................ 
+ mul v5.8H, v20.8H, v0.H[2] // .........................*.................................................. + // gap // ............................................................................ + sub v18.8H, v11.8H, v24.8H // .....*...................................................................... + // gap // ............................................................................ + add v23.8H, v17.8H, v14.8H // ..............*............................................................. + // gap // ............................................................................ + mul v16.8H, v25.8H, v0.H[4] // ....................................*....................................... + // gap // ............................................................................ + mls v5.8H, v27.8H, v7.H[0] // ................................*........................................... + // gap // ............................................................................ + sqrdmulh v20.8H, v23.8H, v30.8H // ..............................*............................................. + // gap // ............................................................................ + sqrdmulh v27.8H, v18.8H, v0.H[3] // ............*............................................................... + // gap // ............................................................................ + mul v22.8H, v18.8H, v0.H[2] // ...........*................................................................ + // gap // ............................................................................ + mul v25.8H, v23.8H, v29.8H // .............................*.............................................. + // gap // ............................................................................ + sub v26.8H, v3.8H, v26.8H // ..........*................................................................. + // gap // ............................................................................ 
+ mls v16.8H, v19.8H, v7.H[0] // ...........................................*................................ + // gap // ............................................................................ + mls v22.8H, v27.8H, v7.H[0] // .......................*.................................................... + // gap // ............................................................................ + sqrdmulh v19.8H, v26.8H, v1.H[3] // .................*.......................................................... + // gap // ............................................................................ + mul v11.8H, v26.8H, v1.H[2] // ................*........................................................... + // gap // ............................................................................ + add v26.8H, v28.8H, v8.8H // ....................*....................................................... + // gap // ............................................................................ + sub v3.8H, v22.8H, v16.8H // ...............................................*............................ + // gap // ............................................................................ + add v4.8H, v22.8H, v16.8H // ................................................*........................... + // gap // ............................................................................ + mls v11.8H, v19.8H, v7.H[0] // .....................*...................................................... + // gap // ............................................................................ + mul v19.8H, v3.8H, v0.H[0] // .......................................................................*.... + // gap // ............................................................................ + sqrdmulh v9.8H, v3.8H, v0.H[1] // ........................................................................*... + // gap // ............................................................................ 
+ sqrdmulh v23.8H, v4.8H, v30.8H // .............................................................*.............. + // gap // ............................................................................ + sub v31.8H, v11.8H, v10.8H // ...................................*........................................ + // gap // ............................................................................ + add v3.8H, v11.8H, v10.8H // ......................................*..................................... + // gap // ............................................................................ + mul v8.8H, v4.8H, v29.8H // ............................................................*............... + // gap // ............................................................................ + mul v24.8H, v31.8H, v0.H[4] // .......................................*.................................... + // gap // ............................................................................ + sub v28.8H, v26.8H, v3.8H // .........................................*.................................. + // gap // ............................................................................ + sqrdmulh v27.8H, v31.8H, v0.H[5] // ........................................*................................... + // gap // ............................................................................ + mls v8.8H, v23.8H, v7.H[0] // ..................................................................*......... + // gap // ............................................................................ + mul v22.8H, v28.8H, v0.H[0] // .............................................*.............................. + // gap // ............................................................................ + sqrdmulh v28.8H, v28.8H, v0.H[1] // ..............................................*............................. + // gap // ............................................................................ 
+ mls v24.8H, v27.8H, v7.H[0] // ............................................*............................... + // gap // ............................................................................ + mls v25.8H, v20.8H, v7.H[0] // ...................................................*........................ + // gap // ............................................................................ + add v11.8H, v26.8H, v3.8H // ..........................................*................................. + // gap // ............................................................................ + mls v22.8H, v28.8H, v7.H[0] // ......................................................*..................... + // gap // ............................................................................ + sub v23.8H, v5.8H, v24.8H // .................................................*.......................... + // gap // ............................................................................ + mul v3.8H, v11.8H, v29.8H // .........................................................*.................. + // gap // ............................................................................ + add v28.8H, v5.8H, v24.8H // ..................................................*......................... + // gap // ............................................................................ + sqrdmulh v24.8H, v23.8H, v0.H[1] // .....................................................*...................... + // gap // ............................................................................ + mul v20.8H, v23.8H, v0.H[0] // ....................................................*....................... + // gap // ............................................................................ + mul v27.8H, v28.8H, v29.8H // ................................................................*........... + // gap // ............................................................................ 
+ sqrdmulh v23.8H, v28.8H, v30.8H // .................................................................*.......... + // gap // ............................................................................ + str q22, [x0, #320] // ........................................................*................... + // gap // ............................................................................ + mls v20.8H, v24.8H, v7.H[0] // .......................................................*.................... + // gap // ............................................................................ + sub v22.8H, v17.8H, v14.8H // .............*.............................................................. + // gap // ............................................................................ + mls v27.8H, v23.8H, v7.H[0] // ....................................................................*....... + // gap // ............................................................................ + str q8, [x0, #128] // .....................................................................*...... + // gap // ............................................................................ + sqrdmulh v26.8H, v11.8H, v30.8H // ..........................................................*................. + // gap // ............................................................................ + str q20, [x0, #448] // ...........................................................*................ + // gap // ............................................................................ + sqrdmulh v28.8H, v22.8H, v0.H[1] // ............................*............................................... + // gap // ............................................................................ + mul v24.8H, v22.8H, v0.H[0] // ...........................*................................................ + // gap // ............................................................................ 
+ str q27, [x0, #192] // ......................................................................*..... + // gap // ............................................................................ + mls v3.8H, v26.8H, v7.H[0] // ...............................................................*............ + // gap // ............................................................................ + str q25, [x0], #(16) // ..............................................................*............. + // gap // ............................................................................ + mls v24.8H, v28.8H, v7.H[0] // ..................................*......................................... + // gap // ............................................................................ + mls v19.8H, v9.8H, v7.H[0] // ..........................................................................*. + // gap // ............................................................................ + str q3, [x0, #48] // ...................................................................*........ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + str q24, [x0, #240] // .........................................................................*.. + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + str q19, [x0, #368] // ...........................................................................* + // gap // ............................................................................ 
// original source code - // ldr q16, [x0, #0] // *...................................................................................... - // sub v2.8H, v23.8H, v21.8H // .*..................................................................................... - // add v23.8H, v23.8H, v21.8H // ..*.................................................................................... - // sub v21.8H, v16.8H, v20.8H // .....*................................................................................. - // add v16.8H, v16.8H, v20.8H // ...*................................................................................... - // mul v26.8H, v2.8H, v1.H[0] // ....*.................................................................................. - // sqrdmulh v2.8H, v2.8H, v1.H[1] // .............*......................................................................... - // sub v20.8H, v16.8H, v23.8H // .......*............................................................................... - // add v16.8H, v16.8H, v23.8H // ......*................................................................................ - // mul v23.8H, v21.8H, v0.H[6] // .........................*............................................................. - // sqrdmulh v21.8H, v21.8H, v0.H[7] // ........*.............................................................................. - // mls v26.8H, v2.8H, v7.H[0] // .................*..................................................................... - // sub v2.8H, v17.8H, v4.8H // ............*.......................................................................... - // add v17.8H, v17.8H, v4.8H // .......................*............................................................... - // mls v23.8H, v21.8H, v7.H[0] // .........................................*............................................. - // mul v21.8H, v2.8H, v1.H[2] // ................*...................................................................... 
- // mul v4.8H, v20.8H, v0.H[2] // ..........*............................................................................ - // sqrdmulh v20.8H, v20.8H, v0.H[3] // ...........*........................................................................... - // sqdmulh v25.8H, v16.8H, v7.H[1] // .........*............................................................................. - // sqrdmulh v2.8H, v2.8H, v1.H[3] // ..................*.................................................................... - // sub v11.8H, v13.8H, v3.8H // ..............*........................................................................ - // add v13.8H, v13.8H, v3.8H // .....................*................................................................. - // srshr v25.8H, v25.8H, #11 // ...................................*................................................... - // mls v21.8H, v2.8H, v7.H[0] // ......................*................................................................ - // sub v2.8H, v23.8H, v26.8H // .............................................*......................................... - // mls v16.8H, v25.8H, v7.H[0] // .......................................*............................................... - // add v23.8H, v23.8H, v26.8H // ....................................................*.................................. - // mul v26.8H, v11.8H, v1.H[4] // ....................*.................................................................. - // sub v25.8H, v17.8H, v13.8H // ..........................*............................................................ - // add v17.8H, v17.8H, v13.8H // ...........................*........................................................... - // sqrdmulh v11.8H, v11.8H, v1.H[5] // ...................*................................................................... - // mls v4.8H, v20.8H, v7.H[0] // ...............*....................................................................... 
- // mul v20.8H, v2.8H, v0.H[2] // .................................................*..................................... - // sqrdmulh v2.8H, v2.8H, v0.H[3] // ................................................*...................................... - // mls v26.8H, v11.8H, v7.H[0] // ........................*.............................................................. - // mul v11.8H, v25.8H, v0.H[4] // ..............................*........................................................ - // sqrdmulh v25.8H, v25.8H, v0.H[5] // .............................*......................................................... - // mls v20.8H, v2.8H, v7.H[0] // .....................................................*................................. - // sub v2.8H, v21.8H, v26.8H // ............................*.......................................................... - // add v21.8H, v21.8H, v26.8H // ..........................................*............................................ - // mls v11.8H, v25.8H, v7.H[0] // ..................................*.................................................... - // mul v26.8H, v2.8H, v0.H[4] // ...............................*....................................................... - // sqrdmulh v2.8H, v2.8H, v0.H[5] // ................................*...................................................... - // sqdmulh v25.8H, v17.8H, v7.H[1] // .................................*..................................................... - // sub v13.8H, v23.8H, v21.8H // ............................................................*.......................... - // add v23.8H, v23.8H, v21.8H // .......................................................................*............... - // mls v26.8H, v2.8H, v7.H[0] // ....................................*.................................................. - // srshr v2.8H, v25.8H, #11 // .....................................*................................................. 
- // mul v21.8H, v13.8H, v0.H[0] // ................................................................*...................... - // sqrdmulh v25.8H, v13.8H, v0.H[1] // ...............................................................*....................... - // mls v17.8H, v2.8H, v7.H[0] // ........................................*.............................................. - // sub v2.8H, v4.8H, v11.8H // ......................................*................................................ - // add v4.8H, v4.8H, v11.8H // ...........................................*........................................... - // mls v21.8H, v25.8H, v7.H[0] // ....................................................................*.................. - // sub v25.8H, v16.8H, v17.8H // ..................................................*.................................... - // add v16.8H, v16.8H, v17.8H // ............................................*.......................................... - // mul v17.8H, v2.8H, v0.H[0] // ...........................................................*........................... - // mul v11.8H, v25.8H, v0.H[0] // ......................................................*................................ - // sqrdmulh v25.8H, v25.8H, v0.H[1] // ........................................................*.............................. - // sqrdmulh v2.8H, v2.8H, v0.H[1] // .................................................................*..................... - // sub v13.8H, v20.8H, v26.8H // .........................................................*............................. - // add v26.8H, v20.8H, v26.8H // ..........................................................*............................ - // mls v11.8H, v25.8H, v7.H[0] // .................................................................................*..... - // mls v17.8H, v2.8H, v7.H[0] // ..............................................................................*........ 
- // mul v2.8H, v13.8H, v0.H[0] // ............................................................................*.......... - // sqrdmulh v20.8H, v13.8H, v0.H[1] // ...........................................................................*........... - // str q11, [x0, #256] // .....................................................................................*. - // mul v25.8H, v16.8H, v29.8H // ...................................................................*................... - // str q21, [x0, #320] // ........................................................................*.............. - // mls v2.8H, v20.8H, v7.H[0] // ................................................................................*...... - // str q17, [x0, #384] // ..................................................................................*.... - // sqrdmulh v16.8H, v16.8H, v30.8H // .....................................................................*................. - // mul v21.8H, v23.8H, v29.8H // .............................................................................*......... - // str q2, [x0, #448] // ....................................................................................*.. - // sqrdmulh v2.8H, v23.8H, v30.8H // ..........................................................................*............ - // mls v25.8H, v16.8H, v7.H[0] // .........................................................................*............. - // mul v16.8H, v4.8H, v29.8H // ...............................................*....................................... - // sqrdmulh v23.8H, v4.8H, v30.8H // ..............................................*........................................ - // mls v21.8H, v2.8H, v7.H[0] // ...................................................................................*... - // mul v2.8H, v26.8H, v29.8H // ..............................................................*........................ 
- // sqrdmulh v26.8H, v26.8H, v30.8H // .............................................................*......................... - // mls v16.8H, v23.8H, v7.H[0] // ...................................................*................................... - // str q25, [x0], #(16) // ...............................................................................*....... - // mls v2.8H, v26.8H, v7.H[0] // ..................................................................*.................... - // str q21, [x0, #48] // ......................................................................................* - // str q16, [x0, #112] // .......................................................*............................... - // str q2, [x0, #176] // ......................................................................*................ + // sub v19.8H, v27.8H, v23.8H // .*.......................................................................... + // add v23.8H, v27.8H, v23.8H // .............*.............................................................. + // sub v22.8H, v22.8H, v28.8H // *........................................................................... + // mul v28.8H, v19.8H, v0.H[6] // .....*...................................................................... + // sqrdmulh v19.8H, v19.8H, v0.H[7] // ....*....................................................................... + // sub v27.8H, v23.8H, v24.8H // ...................*........................................................ + // add v23.8H, v23.8H, v24.8H // ................*........................................................... + // mul v24.8H, v22.8H, v1.H[0] // ......*..................................................................... + // sqrdmulh v22.8H, v22.8H, v1.H[1] // ...*........................................................................ + // mls v28.8H, v19.8H, v7.H[0] // .........*.................................................................. 
+ // sub v19.8H, v3.8H, v26.8H // ...........................*................................................ + // mul v3.8H, v27.8H, v0.H[2] // .........................*.................................................. + // sqrdmulh v27.8H, v27.8H, v0.H[3] // ........................*................................................... + // sub v26.8H, v23.8H, v14.8H // .............................................................*.............. + // add v23.8H, v23.8H, v14.8H // ....................*....................................................... + // mls v24.8H, v22.8H, v7.H[0] // ..........*................................................................. + // mul v22.8H, v19.8H, v1.H[2] // ...............................*............................................ + // sqrdmulh v19.8H, v19.8H, v1.H[3] // ..............................*............................................. + // sub v20.8H, v20.8H, v11.8H // ..*......................................................................... + // sub v11.8H, v28.8H, v24.8H // ..............*............................................................. + // add v28.8H, v28.8H, v24.8H // ................................*........................................... + // mls v22.8H, v19.8H, v7.H[0] // ...................................*........................................ + // mul v19.8H, v20.8H, v1.H[4] // ........*................................................................... + // mls v3.8H, v27.8H, v7.H[0] // .............................*.............................................. + // sqrdmulh v27.8H, v20.8H, v1.H[5] // ...........*................................................................ + // mul v24.8H, v11.8H, v0.H[2] // ..................*......................................................... + // sqrdmulh v20.8H, v11.8H, v0.H[3] // .................*.......................................................... 
+ // mul v4.8H, v26.8H, v0.H[0] // ...................................................................*........ + // sqrdmulh v11.8H, v26.8H, v0.H[1] // ..................................................................*......... + // mul v26.8H, v23.8H, v29.8H // ..........................*................................................. + // sqrdmulh v6.8H, v23.8H, v30.8H // .......................*.................................................... + // mls v19.8H, v27.8H, v7.H[0] // ...............*............................................................ + // mls v24.8H, v20.8H, v7.H[0] // ......................*..................................................... + // sub v27.8H, v25.8H, v5.8H // .......*.................................................................... + // mls v4.8H, v11.8H, v7.H[0] // .......................................................................*.... + // sub v23.8H, v22.8H, v19.8H // .......................................*.................................... + // mul v20.8H, v27.8H, v0.H[4] // .....................*...................................................... + // sqrdmulh v11.8H, v27.8H, v0.H[5] // ............*............................................................... + // add v27.8H, v22.8H, v19.8H // ........................................*................................... + // mul v22.8H, v23.8H, v0.H[4] // ..........................................*................................. + // sqrdmulh v23.8H, v23.8H, v0.H[5] // ............................................*............................... + // sub v19.8H, v28.8H, v27.8H // ...........................................*................................ + // add v10.8H, v28.8H, v27.8H // ..................................................*......................... + // mls v20.8H, v11.8H, v7.H[0] // ............................*............................................... 
+ // mls v22.8H, v23.8H, v7.H[0] // ................................................*........................... + // mul v28.8H, v19.8H, v0.H[0] // ..............................................*............................. + // sqrdmulh v23.8H, v19.8H, v0.H[1] // ...............................................*............................ + // sub v14.8H, v3.8H, v20.8H // .................................*.......................................... + // add v27.8H, v3.8H, v20.8H // ..................................*......................................... + // sub v20.8H, v24.8H, v22.8H // ....................................................*....................... + // add v21.8H, v24.8H, v22.8H // ......................................................*..................... + // mls v26.8H, v6.8H, v7.H[0] // .................................................*.......................... + // mul v22.8H, v20.8H, v0.H[0] // ........................................................*................... + // sqrdmulh v19.8H, v20.8H, v0.H[1] // .......................................................*.................... + // mls v28.8H, v23.8H, v7.H[0] // ...................................................*........................ + // mls v22.8H, v19.8H, v7.H[0] // ............................................................*............... + // str q28, [x0, #320] // ...........................................................*................ + // mul v23.8H, v10.8H, v29.8H // .....................................................*...................... + // sqrdmulh v19.8H, v10.8H, v30.8H // ................................................................*........... + // str q22, [x0, #448] // .................................................................*.......... + // mul v28.8H, v27.8H, v29.8H // .........................................*.................................. 
+ // sqrdmulh v20.8H, v27.8H, v30.8H // ......................................*..................................... + // str q26, [x0], #(16) // ......................................................................*..... + // mls v23.8H, v19.8H, v7.H[0] // .....................................................................*...... + // mul v19.8H, v21.8H, v29.8H // .........................................................*.................. + // sqrdmulh v22.8H, v21.8H, v30.8H // ..........................................................*................. + // mls v28.8H, v20.8H, v7.H[0] // .............................................*.............................. + // str q23, [x0, #48] // .........................................................................*.. + // mls v19.8H, v22.8H, v7.H[0] // ..............................................................*............. + // str q28, [x0, #112] // ...............................................................*............ + // str q19, [x0, #176] // ....................................................................*....... + // mul v8.8H, v14.8H, v0.H[0] // ....................................*....................................... + // sqrdmulh v27.8H, v14.8H, v0.H[1] // .....................................*...................................... + // str q4, [x0, #240] // ..........................................................................*. + // mls v8.8H, v27.8H, v7.H[0] // ........................................................................*... 
+ // str q8, [x0, #368] // ...........................................................................* pop_stack diff --git a/tests/ntt_kyber/manual/intt_kyber_123_4567_manual_ld4_opt_a72.s b/tests/ntt_kyber/manual/intt_kyber_123_4567_manual_ld4_opt_a72.s index 820a734..669bcee 100644 --- a/tests/ntt_kyber/manual/intt_kyber_123_4567_manual_ld4_opt_a72.s +++ b/tests/ntt_kyber/manual/intt_kyber_123_4567_manual_ld4_opt_a72.s @@ -354,594 +354,657 @@ _intt_kyber_123_4567_manual_ld4_opt_a72: mov count, #8 .p2align 2 - ld4 {v25.4S, v26.4S, v27.4S, v28.4S}, [x1] // *.................................................... - ldr q10, [x4, #64] // .*................................................... - // gap // ..................................................... - ldr q6, [x4], #(6*16) // ...*................................................. - ldr q22, [x4, #-80] // ....*................................................ - // gap // ..................................................... - ldr q15, [x4, #-16] // .........*........................................... - // gap // ..................................................... - // gap // ..................................................... - ldr q4, [x3], #16 // ...............................*..................... - // gap // ..................................................... - // gap // ..................................................... - add v13.8H, v25.8H, v26.8H // ........*............................................ - ldr q1, [x4, #-48] // ..*.................................................. - sub v12.8H, v27.8H, v28.8H // ......*.............................................. - sub v8.8H, v25.8H, v26.8H // .....*............................................... - // gap // ..................................................... - // gap // ..................................................... - add v25.8H, v27.8H, v28.8H // ..........*.......................................... 
- // gap // ..................................................... - // gap // ..................................................... - sqrdmulh v21.8H, v12.8H, v15.8H // ...............*..................................... - ldr q15, [x4, #-64] // .......*............................................. - // gap // ..................................................... - // gap // ..................................................... - // gap // ..................................................... - // gap // ..................................................... - sqrdmulh v24.8H, v8.8H, v1.8H // ...........*......................................... - sub v18.8H, v13.8H, v25.8H // ..............*...................................... - // gap // ..................................................... - // gap // ..................................................... - // gap // ..................................................... - // gap // ..................................................... - mul v11.8H, v12.8H, v10.8H // .................*................................... - // gap // ..................................................... - // gap // ..................................................... - // gap // ..................................................... - // gap // ..................................................... - // gap // ..................................................... - mul v0.8H, v8.8H, v15.8H // ............*........................................ - // gap // ..................................................... - // gap // ..................................................... - // gap // ..................................................... - // gap // ..................................................... - // gap // ..................................................... - mls v0.8H, v24.8H, v7.H[0] // ................*.................................... - // gap // ..................................................... 
- // gap // ..................................................... - // gap // ..................................................... - // gap // ..................................................... - // gap // ..................................................... - mls v11.8H, v21.8H, v7.H[0] // ..................*.................................. - // gap // ..................................................... - // gap // ..................................................... - // gap // ..................................................... - // gap // ..................................................... - // gap // ..................................................... - sqrdmulh v17.8H, v18.8H, v22.8H // ...................*................................. - add v15.8H, v13.8H, v25.8H // .............*....................................... - // gap // ..................................................... - // gap // ..................................................... - // gap // ..................................................... - // gap // ..................................................... - mul v5.8H, v18.8H, v6.8H // ....................*................................ - // gap // ..................................................... - // gap // ..................................................... - sub v29.8H, v0.8H, v11.8H // .....................*............................... - // gap // ..................................................... - // gap // ..................................................... - // gap // ..................................................... - // gap // ..................................................... - // gap // ..................................................... - // gap // ..................................................... - // gap // ..................................................... - // gap // ..................................................... 
- sqrdmulh v18.8H, v29.8H, v22.8H // ........................*............................ - // gap // ..................................................... - // gap // ..................................................... - // gap // ..................................................... - // gap // ..................................................... - // gap // ..................................................... - mul v29.8H, v29.8H, v6.8H // ...........................*......................... - // gap // ..................................................... - // gap // ..................................................... - // gap // ..................................................... - // gap // ..................................................... - // gap // ..................................................... - mls v5.8H, v17.8H, v7.H[0] // .......................*............................. - // gap // ..................................................... - // gap // ..................................................... - // gap // ..................................................... - // gap // ..................................................... - // gap // ..................................................... - mls v29.8H, v18.8H, v7.H[0] // ............................*........................ - // gap // ..................................................... - // gap // ..................................................... - add v19.8H, v0.8H, v11.8H // ......................*.............................. - // gap // ..................................................... - // gap // ..................................................... - // gap // ..................................................... - // gap // ..................................................... - // gap // ..................................................... - // gap // ..................................................... 
- // gap // ..................................................... - // gap // ..................................................... - trn2 v20.4S, v15.4S, v19.4S // ..........................*.......................... - trn1 v25.4S, v15.4S, v19.4S // .........................*........................... - // gap // ..................................................... - trn2 v17.4S, v5.4S, v29.4S // ..............................*...................... - trn1 v15.4S, v5.4S, v29.4S // .............................*....................... - // gap // ..................................................... - // gap // ..................................................... - // gap // ..................................................... - // gap // ..................................................... - // gap // ..................................................... - // gap // ..................................................... - // gap // ..................................................... - trn1 v31.2D, v20.2D, v17.2D // ...................................*................. - trn1 v30.2D, v25.2D, v15.2D // ..................................*.................. - // gap // ..................................................... - trn2 v17.2D, v20.2D, v17.2D // .................................*................... - trn2 v12.2D, v25.2D, v15.2D // ................................*.................... - // gap // ..................................................... - // gap // ..................................................... - // gap // ..................................................... - // gap // ..................................................... - add v20.8H, v30.8H, v31.8H // .....................................*............... - sub v27.8H, v30.8H, v31.8H // ......................................*.............. - // gap // ..................................................... 
- add v21.8H, v12.8H, v17.8H // ....................................*................ - // gap // ..................................................... - // gap // ..................................................... - // gap // ..................................................... - // gap // ..................................................... - // gap // ..................................................... - sqdmulh v18.8H, v20.8H, v7.H[1] // ........................................*............ - // gap // ..................................................... - // gap // ..................................................... - // gap // ..................................................... - // gap // ..................................................... - // gap // ..................................................... - sqdmulh v2.8H, v21.8H, v7.H[1] // .......................................*............. - // gap // ..................................................... - // gap // ..................................................... - // gap // ..................................................... - // gap // ..................................................... - // gap // ..................................................... - sqrdmulh v13.8H, v27.8H, v4.H[3] // ............................................*........ - // gap // ..................................................... - // gap // ..................................................... - srshr v15.8H, v18.8H, #11 // .............................................*....... - // gap // ..................................................... - // gap // ..................................................... - // gap // ..................................................... - // gap // ..................................................... - // gap // ..................................................... - srshr v22.8H, v2.8H, #11 // ...........................................*......... 
- // gap // ..................................................... - // gap // ..................................................... - mls v20.8H, v15.8H, v7.H[0] // ...............................................*..... - // gap // ..................................................... - // gap // ..................................................... - sub v23.8H, v12.8H, v17.8H // .........................................*........... - // gap // ..................................................... - // gap // ..................................................... - mls v21.8H, v22.8H, v7.H[0] // ..............................................*...... - // gap // ..................................................... - // gap // ..................................................... - // gap // ..................................................... - // gap // ..................................................... - // gap // ..................................................... - mul v15.8H, v23.8H, v4.H[4] // ................................................*.... - // gap // ..................................................... - // gap // ..................................................... - // gap // ..................................................... - // gap // ..................................................... - // gap // ..................................................... - // gap // ..................................................... - mul v18.8H, v27.8H, v4.H[2] // ..........................................*.......... - // gap // ..................................................... - add v30.8H, v20.8H, v21.8H // ..................................................*.. - // gap // ..................................................... - // gap // ..................................................... - sqrdmulh v0.8H, v23.8H, v4.H[5] // .................................................*... - // gap // ..................................................... 
- // gap // ..................................................... - // gap // ..................................................... - // gap // ..................................................... - // gap // ..................................................... - mls v18.8H, v13.8H, v7.H[0] // ...................................................*. - str q30, [x1], #(64) // ....................................................* - // gap // ..................................................... + ldr q2, [x4, #64] // ..........*......................................... + ld4 {v17.4S, v18.4S, v19.4S, v20.4S}, [x1] // *................................................... + ldr q31, [x4, #32] // ............*....................................... + ldr q21, [x4, #16] // ....*............................................... + ldr q23, [x4, #48] // .*.................................................. + // gap // .................................................... + ldr q10, [x4, #80] // ..*................................................. + ldr q27, [x4], #(6*16) // ...*................................................ + // gap // .................................................... + // gap // .................................................... + // gap // .................................................... + // gap // .................................................... + sub v12.8H, v17.8H, v18.8H // .......*............................................ + add v9.8H, v19.8H, v20.8H // ......*............................................. + // gap // .................................................... + sub v3.8H, v19.8H, v20.8H // .....*.............................................. + add v20.8H, v17.8H, v18.8H // ........*........................................... + // gap // .................................................... + // gap // .................................................... + // gap // .................................................... 
+ // gap // .................................................... + mul v29.8H, v12.8H, v31.8H // ................*................................... + // gap // .................................................... + // gap // .................................................... + add v4.8H, v20.8H, v9.8H // ..............*..................................... + // gap // .................................................... + // gap // .................................................... + sqrdmulh v15.8H, v12.8H, v23.8H // ...........*........................................ + sub v1.8H, v20.8H, v9.8H // .............*...................................... + // gap // .................................................... + // gap // .................................................... + // gap // .................................................... + // gap // .................................................... + sqrdmulh v13.8H, v3.8H, v10.8H // .........*.......................................... + // gap // .................................................... + // gap // .................................................... + // gap // .................................................... + // gap // .................................................... + // gap // .................................................... + mul v28.8H, v3.8H, v2.8H // ...............*.................................... + // gap // .................................................... + // gap // .................................................... + // gap // .................................................... + // gap // .................................................... + // gap // .................................................... + mls v29.8H, v15.8H, v7.H[0] // .................*.................................. + // gap // .................................................... + // gap // .................................................... 
+ // gap // .................................................... + // gap // .................................................... + // gap // .................................................... + mls v28.8H, v13.8H, v7.H[0] // ..................*................................. + ldr q13, [x3], #16 // .........................................*.......... + // gap // .................................................... + // gap // .................................................... + // gap // .................................................... + // gap // .................................................... + mul v10.8H, v1.8H, v27.8H // ......................*............................. + // gap // .................................................... + // gap // .................................................... + // gap // .................................................... + // gap // .................................................... + // gap // .................................................... + // gap // .................................................... + // gap // .................................................... + // gap // .................................................... + sub v17.8H, v29.8H, v28.8H // ....................*............................... + add v23.8H, v29.8H, v28.8H // .....................*.............................. + // gap // .................................................... + // gap // .................................................... + // gap // .................................................... + sqrdmulh v29.8H, v1.8H, v21.8H // ...................*................................ + // gap // .................................................... + // gap // .................................................... + // gap // .................................................... + trn1 v22.4S, v4.4S, v23.4S // ........................*........................... 
+ sqrdmulh v18.8H, v17.8H, v21.8H // .......................*............................ + // gap // .................................................... + trn2 v1.4S, v4.4S, v23.4S // ..........................*......................... + // gap // .................................................... + // gap // .................................................... + mul v16.8H, v17.8H, v27.8H // .........................*.......................... + // gap // .................................................... + // gap // .................................................... + // gap // .................................................... + // gap // .................................................... + // gap // .................................................... + mls v10.8H, v29.8H, v7.H[0] // ...........................*........................ + // gap // .................................................... + // gap // .................................................... + // gap // .................................................... + // gap // .................................................... + // gap // .................................................... + mls v16.8H, v18.8H, v7.H[0] // ............................*....................... + // gap // .................................................... + // gap // .................................................... + // gap // .................................................... + // gap // .................................................... + // gap // .................................................... + // gap // .................................................... + // gap // .................................................... + // gap // .................................................... + // gap // .................................................... + // gap // .................................................... + // gap // .................................................... 
+ // gap // .................................................... + // gap // .................................................... + // gap // .................................................... + trn2 v21.4S, v10.4S, v16.4S // .............................*...................... + trn1 v10.4S, v10.4S, v16.4S // ..............................*..................... + // gap // .................................................... + // gap // .................................................... + // gap // .................................................... + // gap // .................................................... + // gap // .................................................... + // gap // .................................................... + // gap // .................................................... + trn2 v14.2D, v1.2D, v21.2D // ..................................*................. + trn2 v6.2D, v22.2D, v10.2D // .................................*.................. + // gap // .................................................... + trn1 v17.2D, v22.2D, v10.2D // ................................*................... + trn1 v24.2D, v1.2D, v21.2D // ...............................*.................... + // gap // .................................................... + // gap // .................................................... + // gap // .................................................... + // gap // .................................................... + add v18.8H, v6.8H, v14.8H // .....................................*.............. + // gap // .................................................... + // gap // .................................................... + add v23.8H, v17.8H, v24.8H // ...................................*................ + sub v30.8H, v17.8H, v24.8H // ....................................*............... + // gap // .................................................... 
+ sub v8.8H, v6.8H, v14.8H // .............................................*...... + // gap // .................................................... + // gap // .................................................... + // gap // .................................................... + // gap // .................................................... + sqdmulh v4.8H, v18.8H, v7.H[1] // .......................................*............ + // gap // .................................................... + // gap // .................................................... + // gap // .................................................... + // gap // .................................................... + // gap // .................................................... + sqdmulh v19.8H, v23.8H, v7.H[1] // ......................................*............. + // gap // .................................................... + // gap // .................................................... + // gap // .................................................... + // gap // .................................................... + // gap // .................................................... + // gap // .................................................... + srshr v11.8H, v4.8H, #11 // ..........................................*......... + // gap // .................................................... + // gap // .................................................... + // gap // .................................................... + // gap // .................................................... + // gap // .................................................... + srshr v17.8H, v19.8H, #11 // ........................................*........... + // gap // .................................................... + // gap // .................................................... + mls v18.8H, v11.8H, v7.H[0] // ............................................*....... 
+ // gap // .................................................... + // gap // .................................................... + // gap // .................................................... + // gap // .................................................... + // gap // .................................................... + mls v23.8H, v17.8H, v7.H[0] // ...........................................*........ + // gap // .................................................... + // gap // .................................................... + // gap // .................................................... + // gap // .................................................... + // gap // .................................................... + sqrdmulh v11.8H, v8.8H, v13.H[5] // .................................................*.. + // gap // .................................................... + // gap // .................................................... + // gap // .................................................... + // gap // .................................................... + // gap // .................................................... + // gap // .................................................... + // gap // .................................................... + // gap // .................................................... + sub v26.8H, v23.8H, v18.8H // ................................................*... + add v16.8H, v23.8H, v18.8H // ...............................................*.... + // gap // .................................................... + // gap // .................................................... + // gap // .................................................... + sqrdmulh v23.8H, v30.8H, v13.H[3] // ...................................................* + // gap // .................................................... + // gap // .................................................... + // gap // .................................................... 
+ str q16, [x1], #(64) // ..................................................*. + mul v12.8H, v8.8H, v13.H[4] // ..............................................*..... + // gap // .................................................... // original source code - // ld4 {v9.4S, v10.4S, v11.4S, v12.4S}, [x1] // *.................................................... - // ldr q3, [x4, #64] // .*................................................... - // ldr q1, [x4, #48] // .......*............................................. - // ldr q29, [x4], #(6*16) // ..*.................................................. - // ldr q19, [x4, #-80] // ...*................................................. - // sub v5.8H, v9.8H, v10.8H // .........*........................................... - // sub v27.8H, v11.8H, v12.8H // ........*............................................ - // ldr q23, [x4, #-64] // ............*........................................ - // add v25.8H, v9.8H, v10.8H // ......*.............................................. - // ldr q24, [x4, #-16] // ....*................................................ - // add v0.8H, v11.8H, v12.8H // ..........*.......................................... - // sqrdmulh v17.8H, v5.8H, v1.8H // .............*....................................... - // mul v14.8H, v5.8H, v23.8H // ................*.................................... - // add v22.8H, v25.8H, v0.8H // ....................*................................ - // sub v30.8H, v25.8H, v0.8H // ..............*...................................... - // sqrdmulh v9.8H, v27.8H, v24.8H // ...........*......................................... - // mls v14.8H, v17.8H, v7.H[0] // .................*................................... - // mul v0.8H, v27.8H, v3.8H // ...............*..................................... - // mls v0.8H, v9.8H, v7.H[0] // ..................*.................................. 
- // sqrdmulh v31.8H, v30.8H, v19.8H // ...................*................................. - // mul v3.8H, v30.8H, v29.8H // .....................*............................... - // sub v8.8H, v14.8H, v0.8H // ......................*.............................. - // add v9.8H, v14.8H, v0.8H // ...........................*......................... - // mls v3.8H, v31.8H, v7.H[0] // .........................*........................... - // sqrdmulh v0.8H, v8.8H, v19.8H // .......................*............................. - // trn1 v11.4S, v22.4S, v9.4S // .............................*....................... - // trn2 v27.4S, v22.4S, v9.4S // ............................*........................ - // mul v28.8H, v8.8H, v29.8H // ........................*............................ - // mls v28.8H, v0.8H, v7.H[0] // ..........................*.......................... - // trn1 v21.4S, v3.4S, v28.4S // ...............................*..................... - // trn2 v15.4S, v3.4S, v28.4S // ..............................*...................... - // ldr q4, [x3], #16 // .....*............................................... - // trn2 v26.2D, v11.2D, v21.2D // ...................................*................. - // trn2 v25.2D, v27.2D, v15.2D // ..................................*.................. - // trn1 v30.2D, v11.2D, v21.2D // .................................*................... - // trn1 v17.2D, v27.2D, v15.2D // ................................*.................... - // add v21.8H, v26.8H, v25.8H // ......................................*.............. - // add v20.8H, v30.8H, v17.8H // ....................................*................ - // sub v16.8H, v30.8H, v17.8H // .....................................*............... - // sqdmulh v27.8H, v21.8H, v7.H[1] // ........................................*............ - // sqdmulh v23.8H, v20.8H, v7.H[1] // .......................................*............. 
- // sub v26.8H, v26.8H, v25.8H // .............................................*....... - // mul v18.8H, v16.8H, v4.H[2] // ................................................*.... - // srshr v2.8H, v27.8H, #11 // ...........................................*......... - // sqrdmulh v14.8H, v16.8H, v4.H[3] // .........................................*........... - // srshr v16.8H, v23.8H, #11 // ..........................................*.......... - // mls v21.8H, v2.8H, v7.H[0] // ..............................................*...... - // mls v20.8H, v16.8H, v7.H[0] // ............................................*........ - // mul v15.8H, v26.8H, v4.H[4] // ...............................................*..... - // sqrdmulh v0.8H, v26.8H, v4.H[5] // ..................................................*.. - // add v2.8H, v20.8H, v21.8H // .................................................*... - // mls v18.8H, v14.8H, v7.H[0] // ...................................................*. - // str q2, [x1], #(64) // ....................................................* + // ld4 {v3.4S, v4.4S, v5.4S, v6.4S}, [x1] // .*.................................................. + // ldr q27, [x4, #48] // ....*............................................... + // ldr q24, [x4, #80] // .....*.............................................. + // ldr q11, [x4], #(6*16) // ......*............................................. + // ldr q8, [x4, #-80] // ...*................................................ + // sub v28.8H, v5.8H, v6.8H // .........*.......................................... + // add v29.8H, v5.8H, v6.8H // ........*........................................... + // sub v6.8H, v3.8H, v4.8H // .......*............................................ + // add v0.8H, v3.8H, v4.8H // ..........*......................................... + // sqrdmulh v3.8H, v28.8H, v24.8H // ...............*.................................... 
+ // ldr q18, [x4, #-32] // *................................................... + // sqrdmulh v25.8H, v6.8H, v27.8H // .............*...................................... + // ldr q27, [x4, #-64] // ..*................................................. + // sub v19.8H, v0.8H, v29.8H // ..............*..................................... + // add v15.8H, v0.8H, v29.8H // ............*....................................... + // mul v29.8H, v28.8H, v18.8H // ................*................................... + // mul v27.8H, v6.8H, v27.8H // ...........*........................................ + // mls v27.8H, v25.8H, v7.H[0] // .................*.................................. + // mls v29.8H, v3.8H, v7.H[0] // ..................*................................. + // sqrdmulh v18.8H, v19.8H, v8.8H // .......................*............................ + // sub v10.8H, v27.8H, v29.8H // .....................*.............................. + // add v24.8H, v27.8H, v29.8H // ......................*............................. + // mul v0.8H, v19.8H, v11.8H // ....................*............................... + // sqrdmulh v4.8H, v10.8H, v8.8H // .........................*.......................... + // trn1 v9.4S, v15.4S, v24.4S // ........................*........................... + // mul v25.8H, v10.8H, v11.8H // ...........................*........................ + // trn2 v24.4S, v15.4S, v24.4S // ..........................*......................... + // mls v0.8H, v18.8H, v7.H[0] // ............................*....................... + // mls v25.8H, v4.8H, v7.H[0] // .............................*...................... + // trn2 v11.4S, v0.4S, v25.4S // ..............................*..................... + // trn1 v27.4S, v0.4S, v25.4S // ...............................*.................... + // trn1 v20.2D, v24.2D, v11.2D // ...................................*................ 
+ // trn1 v30.2D, v9.2D, v27.2D // ..................................*................. + // trn2 v15.2D, v9.2D, v27.2D // .................................*.................. + // trn2 v16.2D, v24.2D, v11.2D // ................................*................... + // add v29.8H, v30.8H, v20.8H // .....................................*.............. + // sub v30.8H, v30.8H, v20.8H // ......................................*............. + // add v25.8H, v15.8H, v16.8H // ....................................*............... + // sqdmulh v24.8H, v29.8H, v7.H[1] // .........................................*.......... + // sqdmulh v26.8H, v25.8H, v7.H[1] // ........................................*........... + // srshr v24.8H, v24.8H, #11 // ...........................................*........ + // ldr q13, [x3], #16 // ...................*................................ + // srshr v23.8H, v26.8H, #11 // ..........................................*......... + // mls v29.8H, v24.8H, v7.H[0] // .............................................*...... + // mls v25.8H, v23.8H, v7.H[0] // ............................................*....... + // sub v14.8H, v15.8H, v16.8H // .......................................*............ + // mul v12.8H, v14.8H, v13.H[4] // ...................................................* + // add v28.8H, v29.8H, v25.8H // ................................................*... + // sub v26.8H, v29.8H, v25.8H // ...............................................*.... + // sqrdmulh v11.8H, v14.8H, v13.H[5] // ..............................................*..... + // str q28, [x1], #(64) // ..................................................*. + // sqrdmulh v23.8H, v30.8H, v13.H[3] // .................................................*.. sub count, count, #1 layer4567_start: - ld4 {v9.4S, v10.4S, v11.4S, v12.4S}, [x1] // e................................................................. 
- sub v2.8H, v20.8H, v21.8H // ....................................................*............. - // gap // .................................................................. - ldr q3, [x4, #64] // .....e............................................................ - mls v15.8H, v0.8H, v7.H[0] // .............................................*.................... - // gap // .................................................................. - // gap // .................................................................. - ldr q1, [x4, #48] // ....e............................................................. - // gap // .................................................................. - ldr q29, [x4], #(6*16) // .e................................................................ - sqrdmulh v16.8H, v2.8H, v4.H[1] // .......................................................*.......... - // gap // .................................................................. - ldr q19, [x4, #-80] // ..e............................................................... - sub v5.8H, v9.8H, v10.8H // .......e.......................................................... - // gap // .................................................................. - sub v27.8H, v11.8H, v12.8H // ............e..................................................... - ldr q23, [x4, #-64] // ...e.............................................................. - mul v2.8H, v2.8H, v4.H[0] // ......................................................*........... - add v25.8H, v9.8H, v10.8H // ........e......................................................... - ldr q24, [x4, #-16] // ......e........................................................... - // gap // .................................................................. - add v0.8H, v11.8H, v12.8H // .............e.................................................... - sqrdmulh v17.8H, v5.8H, v1.8H // ..........e....................................................... 
- // gap // .................................................................. - // gap // .................................................................. - // gap // .................................................................. - // gap // .................................................................. - mul v14.8H, v5.8H, v23.8H // .........e........................................................ - // gap // .................................................................. - // gap // .................................................................. - add v22.8H, v25.8H, v0.8H // ..................e............................................... - // gap // .................................................................. - // gap // .................................................................. - sub v30.8H, v25.8H, v0.8H // .................e................................................ - sqrdmulh v9.8H, v27.8H, v24.8H // ...............e.................................................. - // gap // .................................................................. - sub v23.8H, v18.8H, v15.8H // .........................................................*........ - // gap // .................................................................. - // gap // .................................................................. - mls v14.8H, v17.8H, v7.H[0] // ...........e...................................................... - // gap // .................................................................. - // gap // .................................................................. - // gap // .................................................................. - // gap // .................................................................. - // gap // .................................................................. - mul v0.8H, v27.8H, v3.8H // ..............e................................................... 
- // gap // .................................................................. - // gap // .................................................................. - // gap // .................................................................. - // gap // .................................................................. - // gap // .................................................................. - mls v0.8H, v9.8H, v7.H[0] // ................e................................................. - // gap // .................................................................. - // gap // .................................................................. - // gap // .................................................................. - // gap // .................................................................. - // gap // .................................................................. - sqrdmulh v31.8H, v30.8H, v19.8H // ....................e............................................. - // gap // .................................................................. - // gap // .................................................................. - // gap // .................................................................. - // gap // .................................................................. - // gap // .................................................................. - mul v3.8H, v30.8H, v29.8H // ...................e.............................................. - // gap // .................................................................. - // gap // .................................................................. - sub v8.8H, v14.8H, v0.8H // ......................e........................................... - // gap // .................................................................. - // gap // .................................................................. - add v9.8H, v14.8H, v0.8H // .......................e.......................................... 
- // gap // .................................................................. - // gap // .................................................................. - mls v3.8H, v31.8H, v7.H[0] // .....................e............................................ - // gap // .................................................................. - // gap // .................................................................. - // gap // .................................................................. - // gap // .................................................................. - // gap // .................................................................. - sqrdmulh v0.8H, v8.8H, v19.8H // .........................e........................................ - trn1 v11.4S, v22.4S, v9.4S // ...........................e...................................... - // gap // .................................................................. - trn2 v27.4S, v22.4S, v9.4S // ............................e..................................... - // gap // .................................................................. - // gap // .................................................................. - mul v28.8H, v8.8H, v29.8H // ........................e......................................... - // gap // .................................................................. - // gap // .................................................................. - // gap // .................................................................. - // gap // .................................................................. - // gap // .................................................................. - // gap // .................................................................. - // gap // .................................................................. - // gap // .................................................................. 
- mls v28.8H, v0.8H, v7.H[0] // ..........................e....................................... - add v0.8H, v18.8H, v15.8H // ..........................................................*....... - // gap // .................................................................. - // gap // .................................................................. - // gap // .................................................................. - // gap // .................................................................. - sqrdmulh v10.8H, v23.8H, v4.H[1] // ............................................................*..... - // gap // .................................................................. - // gap // .................................................................. - str q0, [x1, #-48] // ...............................................................*.. - // gap // .................................................................. - // gap // .................................................................. - // gap // .................................................................. - // gap // .................................................................. - // gap // .................................................................. - trn1 v21.4S, v3.4S, v28.4S // .............................e.................................... - trn2 v15.4S, v3.4S, v28.4S // ..............................e................................... - // gap // .................................................................. - mul v0.8H, v23.8H, v4.H[0] // ...........................................................*...... - ldr q4, [x3], #16 // ...................................e.............................. - // gap // .................................................................. - // gap // .................................................................. - // gap // .................................................................. 
- // gap // .................................................................. - trn2 v26.2D, v11.2D, v21.2D // ...............................e.................................. - trn2 v25.2D, v27.2D, v15.2D // ................................e................................. - // gap // .................................................................. - trn1 v30.2D, v11.2D, v21.2D // .................................e................................ - mls v0.8H, v10.8H, v7.H[0] // .............................................................*.... - // gap // .................................................................. - trn1 v17.2D, v27.2D, v15.2D // ..................................e............................... - // gap // .................................................................. - // gap // .................................................................. - mls v2.8H, v16.8H, v7.H[0] // ........................................................*......... - add v21.8H, v26.8H, v25.8H // ..........................................e....................... - // gap // .................................................................. - // gap // .................................................................. - // gap // .................................................................. - // gap // .................................................................. - add v20.8H, v30.8H, v17.8H // .....................................e............................ - // gap // .................................................................. - // gap // .................................................................. - sub v16.8H, v30.8H, v17.8H // ....................................e............................. - sqdmulh v27.8H, v21.8H, v7.H[1] // .................................................e................ 
- str q0, [x1, #-16] // .................................................................* - // gap // .................................................................. - // gap // .................................................................. - // gap // .................................................................. - // gap // .................................................................. - sqdmulh v23.8H, v20.8H, v7.H[1] // ..............................................e................... - sub v26.8H, v26.8H, v25.8H // .........................................e........................ - str q2, [x1, #-32] // ................................................................*. - // gap // .................................................................. - // gap // .................................................................. - mul v18.8H, v16.8H, v4.H[2] // ......................................e........................... - // gap // .................................................................. - // gap // .................................................................. - srshr v2.8H, v27.8H, #11 // ..................................................e............... - // gap // .................................................................. - // gap // .................................................................. - sqrdmulh v14.8H, v16.8H, v4.H[3] // .......................................e.......................... - // gap // .................................................................. - // gap // .................................................................. - srshr v16.8H, v23.8H, #11 // ...............................................e.................. - // gap // .................................................................. - // gap // .................................................................. - mls v21.8H, v2.8H, v7.H[0] // ...................................................e.............. 
- // gap // .................................................................. - // gap // .................................................................. - // gap // .................................................................. - // gap // .................................................................. - // gap // .................................................................. - mls v20.8H, v16.8H, v7.H[0] // ................................................e................. - // gap // .................................................................. - // gap // .................................................................. - // gap // .................................................................. - // gap // .................................................................. - // gap // .................................................................. - mul v15.8H, v26.8H, v4.H[4] // ...........................................e...................... - // gap // .................................................................. - // gap // .................................................................. - // gap // .................................................................. - // gap // .................................................................. - // gap // .................................................................. - // gap // .................................................................. - // gap // .................................................................. - // gap // .................................................................. - // gap // .................................................................. - sqrdmulh v0.8H, v26.8H, v4.H[5] // ............................................e..................... - add v2.8H, v20.8H, v21.8H // .....................................................e............ - // gap // .................................................................. 
- // gap // .................................................................. - // gap // .................................................................. - mls v18.8H, v14.8H, v7.H[0] // ........................................e......................... - // gap // .................................................................. - // gap // .................................................................. - str q2, [x1], #(64) // ..............................................................e... - // gap // .................................................................. - // gap // .................................................................. + ld4 {v3.4S, v4.4S, v5.4S, v6.4S}, [x1] // e....................................................................... + ldr q27, [x4, #48] // ....e................................................................... + // gap // ........................................................................ + ldr q24, [x4, #80] // ......e................................................................. + mul v22.8H, v30.8H, v13.H[2] // ......................................*................................. + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + mls v12.8H, v11.8H, v7.H[0] // .............................................*.......................... + ldr q11, [x4], #(6*16) // .e...................................................................... + // gap // ........................................................................ + ldr q8, [x4, #-80] // ..e..................................................................... 
+ sub v28.8H, v5.8H, v6.8H // ............e........................................................... + // gap // ........................................................................ + add v29.8H, v5.8H, v6.8H // .............e.......................................................... + mls v22.8H, v23.8H, v7.H[0] // ........................................*............................... + // gap // ........................................................................ + sub v6.8H, v3.8H, v4.8H // .......e................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + add v0.8H, v3.8H, v4.8H // ........e............................................................... + sqrdmulh v3.8H, v28.8H, v24.8H // ...............e........................................................ + ldr q18, [x4, #-32] // .....e.................................................................. + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + sqrdmulh v25.8H, v6.8H, v27.8H // ..........e............................................................. + ldr q27, [x4, #-64] // ...e.................................................................... + // gap // ........................................................................ + sub v19.8H, v0.8H, v29.8H // .................e...................................................... + // gap // ........................................................................ + // gap // ........................................................................ + add v15.8H, v0.8H, v29.8H // ..................e..................................................... 
+ mul v29.8H, v28.8H, v18.8H // ..............e......................................................... + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + mul v27.8H, v6.8H, v27.8H // .........e.............................................................. + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + mls v27.8H, v25.8H, v7.H[0] // ...........e............................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + mls v29.8H, v3.8H, v7.H[0] // ................e....................................................... + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ 
+ // gap // ........................................................................ + sqdmulh v28.8H, v12.8H, v7.H[1] // .......................................................*................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + sqrdmulh v18.8H, v19.8H, v8.8H // ....................e................................................... + // gap // ........................................................................ + // gap // ........................................................................ + sub v10.8H, v27.8H, v29.8H // ......................e................................................. + // gap // ........................................................................ + // gap // ........................................................................ + add v24.8H, v27.8H, v29.8H // .......................e................................................ + // gap // ........................................................................ + mul v0.8H, v19.8H, v11.8H // ...................e.................................................... + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + sqrdmulh v4.8H, v10.8H, v8.8H // .........................e.............................................. + // gap // ........................................................................ 
+ // gap // ........................................................................ + trn1 v9.4S, v15.4S, v24.4S // ...........................e............................................ + // gap // ........................................................................ + // gap // ........................................................................ + mul v25.8H, v10.8H, v11.8H // ........................e............................................... + trn2 v24.4S, v15.4S, v24.4S // ............................e........................................... + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + mls v0.8H, v18.8H, v7.H[0] // .....................e.................................................. + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + mls v25.8H, v4.8H, v7.H[0] // ..........................e............................................. + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ 
+ sqdmulh v3.8H, v22.8H, v7.H[1] // .................................................*...................... + // gap // ........................................................................ + // gap // ........................................................................ + srshr v27.8H, v28.8H, #11 // ........................................................*............... + // gap // ........................................................................ + // gap // ........................................................................ + mul v14.8H, v26.8H, v13.H[0] // ............................................................*........... + // gap // ........................................................................ + // gap // ........................................................................ + trn2 v11.4S, v0.4S, v25.4S // ..............................e......................................... + // gap // ........................................................................ + // gap // ........................................................................ + mls v12.8H, v27.8H, v7.H[0] // .........................................................*.............. + trn1 v27.4S, v0.4S, v25.4S // .............................e.......................................... + // gap // ........................................................................ + srshr v3.8H, v3.8H, #11 // ..................................................*..................... + // gap // ........................................................................ + // gap // ........................................................................ + sqrdmulh v2.8H, v26.8H, v13.H[1] // .............................................................*.......... + trn1 v20.2D, v24.2D, v11.2D // ..................................e..................................... + // gap // ........................................................................ 
+ // gap // ........................................................................ + // gap // ........................................................................ + trn1 v30.2D, v9.2D, v27.2D // .................................e...................................... + trn2 v15.2D, v9.2D, v27.2D // ...............................e........................................ + trn2 v16.2D, v24.2D, v11.2D // ................................e....................................... + // gap // ........................................................................ + mls v22.8H, v3.8H, v7.H[0] // ...................................................*.................... + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + add v29.8H, v30.8H, v20.8H // .....................................e.................................. + mls v14.8H, v2.8H, v7.H[0] // ..............................................................*......... + sub v30.8H, v30.8H, v20.8H // ....................................e................................... + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + add v25.8H, v15.8H, v16.8H // ..........................................e............................. + sqdmulh v24.8H, v29.8H, v7.H[1] // ..............................................e......................... + // gap // ........................................................................ + // gap // ........................................................................ 
+ sub v28.8H, v22.8H, v12.8H // ...............................................................*........ + // gap // ........................................................................ + // gap // ........................................................................ + add v11.8H, v22.8H, v12.8H // ................................................................*....... + // gap // ........................................................................ + sqdmulh v26.8H, v25.8H, v7.H[1] // ....................................................e................... + str q14, [x1, #-32] // ......................................................................*. + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + sqrdmulh v0.8H, v28.8H, v13.H[1] // ..................................................................*..... + srshr v24.8H, v24.8H, #11 // ...............................................e........................ + str q11, [x1, #-48] // .....................................................................*.. + // gap // ........................................................................ + mul v10.8H, v28.8H, v13.H[0] // .................................................................*...... + ldr q13, [x3], #16 // ...................................e.................................... + // gap // ........................................................................ + srshr v23.8H, v26.8H, #11 // .....................................................e.................. + // gap // ........................................................................ + // gap // ........................................................................ 
+ mls v29.8H, v24.8H, v7.H[0] // ................................................e....................... + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + mls v25.8H, v23.8H, v7.H[0] // ......................................................e................. + // gap // ........................................................................ + // gap // ........................................................................ + sub v14.8H, v15.8H, v16.8H // .........................................e.............................. + // gap // ........................................................................ + // gap // ........................................................................ + mls v10.8H, v0.8H, v7.H[0] // ...................................................................*.... + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + mul v12.8H, v14.8H, v13.H[4] // ...........................................e............................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ 
+ // gap // ........................................................................ + add v28.8H, v29.8H, v25.8H // ...........................................................e............ + sub v26.8H, v29.8H, v25.8H // ..........................................................e............. + sqrdmulh v11.8H, v14.8H, v13.H[5] // ............................................e........................... + // gap // ........................................................................ + str q10, [x1, #-16] // .......................................................................* + // gap // ........................................................................ + // gap // ........................................................................ + str q28, [x1], #(64) // ....................................................................e... + // gap // ........................................................................ + sqrdmulh v23.8H, v30.8H, v13.H[3] // .......................................e................................ // original source code - // ld4 {v8.4S, v9.4S, v10.4S, v11.4S}, [x1] // e.................................................................e...................................................... - // ldr q0, [x4], #(6*16) // .....e............................................................|....e................................................. - // ldr q4, [x4, #(-6*16 + 1*16)] // .......e..........................................................|......e............................................... - // ldr q1, [x4, #(-6*16 + 2*16)] // ..........e.......................................................|.........e............................................ - // ldr q5, [x4, #(-6*16 + 3*16)] // ....e.............................................................|...e.................................................. 
- // ldr q2, [x4, #(-6*16 + 4*16)] // ..e...............................................................|.e.................................................... - // ldr q6, [x4, #(-6*16 + 5*16)] // .............e....................................................|............e......................................... - // sub v24.8h, v8.8h, v9.8h // ........e.........................................................|.......e.............................................. - // add v8.8h, v8.8h, v9.8h // ............e.....................................................|...........e.......................................... - // mul v9.8h, v24.8h, v1.8h // ................e.................................................|...............e...................................... - // sqrdmulh v24.8h, v24.8h, v5.8h // ...............e..................................................|..............e....................................... - // mls v9.8h, v24.8h, v7.h[0] // .....................e............................................|....................e................................. - // sub v24.8h, v10.8h, v11.8h // .........e........................................................|........e............................................. - // add v10.8h, v10.8h, v11.8h // ..............e...................................................|.............e........................................ - // mul v11.8h, v24.8h, v2.8h // ......................e...........................................|.....................e................................ - // sqrdmulh v24.8h, v24.8h, v6.8h // ...................e..............................................|..................e................................... - // mls v11.8h, v24.8h, v7.h[0] // .......................e..........................................|......................e............................... 
- // sub v24.8h, v8.8h, v10.8h // ..................e...............................................|.................e.................................... - // add v8.8h, v8.8h, v10.8h // .................e................................................|................e..................................... - // mul v10.8h, v24.8h, v0.8h // .........................e........................................|........................e............................. - // sqrdmulh v24.8h, v24.8h, v4.8h // ........................e.........................................|.......................e.............................. - // mls v10.8h, v24.8h, v7.h[0] // ............................e.....................................|...........................e.......................... - // sub v24.8h, v9.8h, v11.8h // ..........................e.......................................|.........................e............................ - // add v9.8h, v9.8h, v11.8h // ...........................e......................................|..........................e........................... - // mul v11.8h, v24.8h, v0.8h // ................................e.................................|...............................e...................... - // sqrdmulh v24.8h, v24.8h, v4.8h // .............................e....................................|............................e......................... - // mls v11.8h, v24.8h, v7.h[0] // .................................e................................|................................e..................... - // trn1 v25.4s, v8.4s, v9.4s // ..............................e...................................|.............................e........................ - // trn2 v26.4s, v8.4s, v9.4s // ...............................e..................................|..............................e....................... 
- // trn1 v27.4s, v10.4s, v11.4s // .....................................e............................|....................................e................. - // trn2 v28.4s, v10.4s, v11.4s // ......................................e...........................|.....................................e................ - // trn2 v10.2d, v25.2d, v27.2d // .........................................e........................|........................................e............. - // trn2 v11.2d, v26.2d, v28.2d // ..........................................e.......................|.........................................e............ - // trn1 v8.2d, v25.2d, v27.2d // ...........................................e......................|..........................................e........... - // trn1 v9.2d, v26.2d, v28.2d // .............................................e....................|............................................e......... - // ldr q0, [x3], #16 // ........................................e.........................|.......................................e.............. - // sub v24.8h, v8.8h, v9.8h // .................................................e................|................................................e..... - // add v8.8h, v8.8h, v9.8h // ................................................e.................|...............................................e...... - // mul v9.8h, v24.8h, v0.h[2] // .......................................................e..........|...................................................... - // sqrdmulh v24.8h, v24.8h, v0.h[3] // .........................................................e........|...................................................... - // mls v9.8h, v24.8h, v7.h[0] // ................................................................e.|...................................................... 
- // sub v24.8h, v10.8h, v11.8h // .....................................................e............|....................................................e. - // add v10.8h, v10.8h, v11.8h // ...............................................e..................|..............................................e....... - // mul v11.8h, v24.8h, v0.h[4] // .............................................................e....|...................................................... - // sqrdmulh v24.8h, v24.8h, v0.h[5] // ..............................................................e...|...................................................... - // mls v11.8h, v24.8h, v7.h[0] // ...*..............................................................|..*................................................... - // sqdmulh v25.8h, v8.8h, v7.h[1] // ....................................................e.............|...................................................e.. - // srshr v25.8h, v25.8h, #11 // ..........................................................e.......|...................................................... - // mls v8.8h, v25.8h, v7.h[0] // ............................................................e.....|...................................................... - // sqdmulh v25.8h, v10.8h, v7.h[1] // ..................................................e...............|.................................................e.... - // srshr v25.8h, v25.8h, #11 // ........................................................e.........|...................................................... - // mls v10.8h, v25.8h, v7.h[0] // ...........................................................e......|...................................................... - // sub v24.8h, v8.8h, v10.8h // .*................................................................|*..................................................... 
- // add v8.8h, v8.8h, v10.8h // ...............................................................e..|...................................................... - // mul v10.8h, v24.8h, v0.h[0] // ...........*......................................................|..........*........................................... - // sqrdmulh v24.8h, v24.8h, v0.h[1] // ......*...........................................................|.....*................................................ - // mls v10.8h, v24.8h, v7.h[0] // ..............................................*...................|.............................................*........ - // sub v24.8h, v9.8h, v11.8h // ....................*.............................................|...................*.................................. - // add v9.8h, v9.8h, v11.8h // ..................................*...............................|.................................*.................... - // mul v11.8h, v24.8h, v0.h[0] // .......................................*..........................|......................................*............... - // sqrdmulh v24.8h, v24.8h, v0.h[1] // ...................................*..............................|..................................*................... - // mls v11.8h, v24.8h, v7.h[0] // ............................................*.....................|...........................................*.......... - // str q8, [x1], #(64) // .................................................................e|...................................................... - // str q9, [x1, #(-64 + 16*1)] // ....................................*.............................|...................................*.................. 
- // str q10, [x1, #(-64 + 16*2)] // ......................................................*...........|.....................................................* - // str q11, [x1, #(-64 + 16*3)] // ...................................................*..............|..................................................*... + // ld4 {v8.4S, v9.4S, v10.4S, v11.4S}, [x1] // e.......................................................................e..................................................................... + // ldr q0, [x4], #(6*16) // .....e..................................................................|....e................................................................ + // ldr q4, [x4, #(-6*16 + 1*16)] // ......e.................................................................|.....e............................................................... + // ldr q1, [x4, #(-6*16 + 2*16)] // ...............e........................................................|..............e...................................................... + // ldr q5, [x4, #(-6*16 + 3*16)] // .e......................................................................|e.................................................................... + // ldr q2, [x4, #(-6*16 + 4*16)] // .............e..........................................................|............e........................................................ + // ldr q6, [x4, #(-6*16 + 5*16)] // ..e.....................................................................|.e................................................................... + // sub v24.8h, v8.8h, v9.8h // ..........e.............................................................|.........e........................................................... + // add v8.8h, v8.8h, v9.8h // ...........e............................................................|..........e.......................................................... 
+ // mul v9.8h, v24.8h, v1.8h // ...................e....................................................|..................e.................................................. + // sqrdmulh v24.8h, v24.8h, v5.8h // ..............e.........................................................|.............e....................................................... + // mls v9.8h, v24.8h, v7.h[0] // ....................e...................................................|...................e................................................. + // sub v24.8h, v10.8h, v11.8h // .......e................................................................|......e.............................................................. + // add v10.8h, v10.8h, v11.8h // ........e...............................................................|.......e............................................................. + // mul v11.8h, v24.8h, v2.8h // ..................e.....................................................|.................e................................................... + // sqrdmulh v24.8h, v24.8h, v6.8h // ............e...........................................................|...........e......................................................... + // mls v11.8h, v24.8h, v7.h[0] // .....................e..................................................|....................e................................................ + // sub v24.8h, v8.8h, v10.8h // ................e.......................................................|...............e..................................................... + // add v8.8h, v8.8h, v10.8h // .................e......................................................|................e.................................................... + // mul v10.8h, v24.8h, v0.8h // ..........................e.............................................|.........................e........................................... 
+ // sqrdmulh v24.8h, v24.8h, v4.8h // .......................e................................................|......................e.............................................. + // mls v10.8h, v24.8h, v7.h[0] // ...............................e........................................|..............................e...................................... + // sub v24.8h, v9.8h, v11.8h // ........................e...............................................|.......................e............................................. + // add v9.8h, v9.8h, v11.8h // .........................e..............................................|........................e............................................ + // mul v11.8h, v24.8h, v0.8h // .............................e..........................................|............................e........................................ + // sqrdmulh v24.8h, v24.8h, v4.8h // ...........................e............................................|..........................e.......................................... + // mls v11.8h, v24.8h, v7.h[0] // ................................e.......................................|...............................e..................................... + // trn1 v25.4s, v8.4s, v9.4s // ............................e...........................................|...........................e......................................... + // trn2 v26.4s, v8.4s, v9.4s // ..............................e.........................................|.............................e....................................... + // trn1 v27.4s, v10.4s, v11.4s // ......................................e.................................|.....................................e............................... + // trn2 v28.4s, v10.4s, v11.4s // ....................................e...................................|...................................e................................. 
+ // trn2 v10.2d, v25.2d, v27.2d // ...........................................e............................|..........................................e.......................... + // trn2 v11.2d, v26.2d, v28.2d // ............................................e...........................|...........................................e......................... + // trn1 v8.2d, v25.2d, v27.2d // ..........................................e.............................|.........................................e........................... + // trn1 v9.2d, v26.2d, v28.2d // .........................................e..............................|........................................e............................ + // ldr q0, [x3], #16 // ...........................................................e............|..........................................................e.......... + // sub v24.8h, v8.8h, v9.8h // ................................................e.......................|...............................................e..................... + // add v8.8h, v8.8h, v9.8h // ..............................................e.........................|.............................................e....................... + // mul v9.8h, v24.8h, v0.h[2] // ...*....................................................................|..*.................................................................. + // sqrdmulh v24.8h, v24.8h, v0.h[3] // .......................................................................e|..................................................................... + // mls v9.8h, v24.8h, v7.h[0] // .........*..............................................................|........*............................................................ + // sub v24.8h, v10.8h, v11.8h // ...............................................................e........|..............................................................e...... 
+ // add v10.8h, v10.8h, v11.8h // .................................................e......................|................................................e.................... + // mul v11.8h, v24.8h, v0.h[4] // .................................................................e......|................................................................e.... + // sqrdmulh v24.8h, v24.8h, v0.h[5] // ....................................................................e...|...................................................................e. + // mls v11.8h, v24.8h, v7.h[0] // ....*...................................................................|...*................................................................. + // sqdmulh v25.8h, v8.8h, v7.h[1] // ..................................................e.....................|.................................................e................... + // srshr v25.8h, v25.8h, #11 // ........................................................e...............|.......................................................e............. + // mls v8.8h, v25.8h, v7.h[0] // .............................................................e..........|............................................................e........ + // sqdmulh v25.8h, v9.8h, v7.h[1] // .................................*......................................|................................*.................................... + // srshr v25.8h, v25.8h, #11 // .......................................*................................|......................................*.............................. + // mls v9.8h, v25.8h, v7.h[0] // .............................................*..........................|............................................*........................ + // sqdmulh v25.8h, v10.8h, v7.h[1] // .....................................................e..................|....................................................e................ 
+ // srshr v25.8h, v25.8h, #11 // ............................................................e...........|...........................................................e......... + // mls v10.8h, v25.8h, v7.h[0] // ..............................................................e.........|.............................................................e....... + // sqdmulh v25.8h, v11.8h, v7.h[1] // ......................*.................................................|.....................*............................................... + // srshr v25.8h, v25.8h, #11 // ..................................*.....................................|.................................*................................... + // mls v11.8h, v25.8h, v7.h[0] // .....................................*..................................|....................................*................................ + // sub v24.8h, v8.8h, v10.8h // ...................................................................e....|..................................................................e.. + // add v8.8h, v8.8h, v10.8h // ..................................................................e.....|.................................................................e... + // mul v10.8h, v24.8h, v0.h[0] // ...................................*....................................|..................................*.................................. + // sqrdmulh v24.8h, v24.8h, v0.h[1] // ........................................*...............................|.......................................*............................. + // mls v10.8h, v24.8h, v7.h[0] // ...............................................*........................|..............................................*...................... + // sub v24.8h, v9.8h, v11.8h // ...................................................*....................|..................................................*.................. 
+ // add v9.8h, v9.8h, v11.8h // ....................................................*...................|...................................................*................. + // mul v11.8h, v24.8h, v0.h[0] // ..........................................................*.............|.........................................................*........... + // sqrdmulh v24.8h, v24.8h, v0.h[1] // .......................................................*................|......................................................*.............. + // mls v11.8h, v24.8h, v7.h[0] // ................................................................*.......|...............................................................*..... + // str q8, [x1], #(64) // ......................................................................e.|..................................................................... + // str q9, [x1, #(-64 + 16*1)] // .........................................................*..............|........................................................*............ + // str q10, [x1, #(-64 + 16*2)] // ......................................................*.................|.....................................................*............... + // str q11, [x1, #(-64 + 16*3)] // .....................................................................*..|....................................................................* sub count, count, #1 cbnz count, layer4567_start - // gap // ............. - // gap // ............. - mls v15.8H, v0.8H, v7.H[0] // .*........... - sub v25.8H, v20.8H, v21.8H // *............ - // gap // ............. - // gap // ............. - // gap // ............. - // gap // ............. - // gap // ............. - // gap // ............. - // gap // ............. - // gap // ............. - sqrdmulh v17.8H, v25.8H, v4.H[1] // ..*.......... - // gap // ............. - // gap // ............. - sub v26.8H, v18.8H, v15.8H // ....*........ 
- // gap // ............. - // gap // ............. - add v19.8H, v18.8H, v15.8H // .....*....... - mul v23.8H, v25.8H, v4.H[0] // ...*......... - // gap // ............. - // gap // ............. - // gap // ............. - // gap // ............. - sqrdmulh v8.8H, v26.8H, v4.H[1] // ......*...... - // gap // ............. - // gap // ............. - str q19, [x1, #-48] // .......*..... - // gap // ............. - // gap // ............. - // gap // ............. - mul v4.8H, v26.8H, v4.H[0] // ........*.... - // gap // ............. - // gap // ............. - // gap // ............. - // gap // ............. - mls v23.8H, v17.8H, v7.H[0] // ..........*.. - // gap // ............. - // gap // ............. - // gap // ............. - // gap // ............. - // gap // ............. - mls v4.8H, v8.8H, v7.H[0] // .........*... - // gap // ............. - // gap // ............. - // gap // ............. - // gap // ............. - // gap // ............. - // gap // ............. - // gap // ............. - // gap // ............. - str q23, [x1, #-32] // ............* - // gap // ............. - // gap // ............. - // gap // ............. - // gap // ............. - // gap // ............. - str q4, [x1, #-16] // ...........*. - // gap // ............. - // gap // ............. + mul v19.8H, v30.8H, v13.H[2] // *................... + // gap // .................... + // gap // .................... + // gap // .................... + // gap // .................... + // gap // .................... + mls v12.8H, v11.8H, v7.H[0] // .*.................. + // gap // .................... + // gap // .................... + // gap // .................... + // gap // .................... + // gap // .................... + mls v19.8H, v23.8H, v7.H[0] // ..*................. + // gap // .................... + // gap // .................... + // gap // .................... + // gap // .................... + // gap // .................... + // gap // .................... 
+ // gap // .................... + // gap // .................... + sqdmulh v4.8H, v12.8H, v7.H[1] // ...*................ + // gap // .................... + // gap // .................... + // gap // .................... + // gap // .................... + // gap // .................... + sqdmulh v28.8H, v19.8H, v7.H[1] // ....*............... + // gap // .................... + // gap // .................... + // gap // .................... + // gap // .................... + // gap // .................... + sqrdmulh v8.8H, v26.8H, v13.H[1] // .........*.......... + // gap // .................... + // gap // .................... + srshr v21.8H, v4.8H, #11 // .....*.............. + // gap // .................... + // gap // .................... + // gap // .................... + // gap // .................... + // gap // .................... + srshr v22.8H, v28.8H, #11 // ........*........... + // gap // .................... + // gap // .................... + mls v12.8H, v21.8H, v7.H[0] // .......*............ + // gap // .................... + // gap // .................... + // gap // .................... + // gap // .................... + // gap // .................... + mls v19.8H, v22.8H, v7.H[0] // ..........*......... + // gap // .................... + // gap // .................... + // gap // .................... + // gap // .................... + // gap // .................... + mul v23.8H, v26.8H, v13.H[0] // ......*............. + // gap // .................... + // gap // .................... + // gap // .................... + // gap // .................... + // gap // .................... + mls v23.8H, v8.8H, v7.H[0] // ...........*........ + // gap // .................... + // gap // .................... + sub v24.8H, v19.8H, v12.8H // ............*....... + // gap // .................... + // gap // .................... + add v25.8H, v19.8H, v12.8H // .............*...... + // gap // .................... + // gap // .................... 
+ // gap // .................... + // gap // .................... + // gap // .................... + sqrdmulh v26.8H, v24.8H, v13.H[1] // ...............*.... + // gap // .................... + // gap // .................... + str q25, [x1, #-48] // ................*... + // gap // .................... + // gap // .................... + mul v5.8H, v24.8H, v13.H[0] // .................*.. + str q23, [x1, #-32] // ..............*..... + // gap // .................... + // gap // .................... + // gap // .................... + // gap // .................... + // gap // .................... + // gap // .................... + // gap // .................... + mls v5.8H, v26.8H, v7.H[0] // ..................*. + // gap // .................... + // gap // .................... + // gap // .................... + // gap // .................... + // gap // .................... + // gap // .................... + // gap // .................... + // gap // .................... + // gap // .................... + // gap // .................... + // gap // .................... + // gap // .................... + // gap // .................... + // gap // .................... + str q5, [x1, #-16] // ...................* + // gap // .................... + // gap // .................... // original source code - // sub v2.8H, v20.8H, v21.8H // .*........... - // mls v15.8H, v0.8H, v7.H[0] // *............ - // sqrdmulh v16.8H, v2.8H, v4.H[1] // ..*.......... - // mul v2.8H, v2.8H, v4.H[0] // .....*....... - // sub v23.8H, v18.8H, v15.8H // ...*......... - // add v0.8H, v18.8H, v15.8H // ....*........ - // sqrdmulh v10.8H, v23.8H, v4.H[1] // ......*...... - // str q0, [x1, #-48] // .......*..... - // mul v0.8H, v23.8H, v4.H[0] // ........*.... - // mls v0.8H, v10.8H, v7.H[0] // ..........*.. - // mls v2.8H, v16.8H, v7.H[0] // .........*... - // str q0, [x1, #-16] // ............* - // str q2, [x1, #-32] // ...........*. + // mul v22.8H, v30.8H, v13.H[2] // *................... 
+ // mls v12.8H, v11.8H, v7.H[0] // .*.................. + // mls v22.8H, v23.8H, v7.H[0] // ..*................. + // sqdmulh v28.8H, v12.8H, v7.H[1] // ...*................ + // sqdmulh v3.8H, v22.8H, v7.H[1] // ....*............... + // srshr v27.8H, v28.8H, #11 // ......*............. + // mul v14.8H, v26.8H, v13.H[0] // ..........*......... + // mls v12.8H, v27.8H, v7.H[0] // ........*........... + // srshr v3.8H, v3.8H, #11 // .......*............ + // sqrdmulh v2.8H, v26.8H, v13.H[1] // .....*.............. + // mls v22.8H, v3.8H, v7.H[0] // .........*.......... + // mls v14.8H, v2.8H, v7.H[0] // ...........*........ + // sub v28.8H, v22.8H, v12.8H // ............*....... + // add v11.8H, v22.8H, v12.8H // .............*...... + // str q14, [x1, #-32] // .................*.. + // sqrdmulh v0.8H, v28.8H, v13.H[1] // ..............*..... + // str q11, [x1, #-48] // ...............*.... + // mul v10.8H, v28.8H, v13.H[0] // ................*... + // mls v10.8H, v0.8H, v7.H[0] // ..................*. + // str q10, [x1, #-16] // ...................* // --------------------------------------------------------------------- @@ -960,863 +1023,800 @@ layer4567_start: .p2align 2 - ldr q25, [x0, #192] // ...*........... - ldr q17, [x0, #128] // ....*.......... - // gap // ............... - ldr q3, [x0, #448] // *.............. - // gap // ............... - // gap // ............... - ldr q6, [x0, #384] // .*............. - // gap // ............... - // gap // ............... - // gap // ............... - // gap // ............... - // gap // ............... - sub v21.8H, v17.8H, v25.8H // .....*......... - // gap // ............... - // gap // ............... - ldr q22, [x0, #320] // ..*............ - // gap // ............... - // gap // ............... - ldr q4, [x0, #64] // ......*........ - sub v2.8H, v6.8H, v3.8H // .......*....... - // gap // ............... - sqrdmulh v23.8H, v21.8H, v1.H[1] // .........*..... - ldr q16, [x0, #0] // ........*...... 
- // gap // ............... - // gap // ............... - // gap // ............... - // gap // ............... - mul v13.8H, v21.8H, v1.H[0] // ..........*.... - // gap // ............... - // gap // ............... - // gap // ............... - // gap // ............... - // gap // ............... - mul v15.8H, v2.8H, v1.H[4] // ............*.. - add v28.8H, v16.8H, v4.8H // ..............* - // gap // ............... - // gap // ............... - // gap // ............... - // gap // ............... - mls v13.8H, v23.8H, v7.H[0] // .............*. - ldr q23, [x0, #256] // ...........*... - // gap // ............... + ldr q19, [x0, #448] // *......... + ldr q23, [x0, #384] // .*........ + // gap // .......... + ldr q2, [x0, #128] // ..*....... + ldr q21, [x0, #192] // .....*.... + // gap // .......... + ldr q24, [x0, #320] // .........* + // gap // .......... + // gap // .......... + // gap // .......... + // gap // .......... + // gap // .......... + sub v27.8H, v23.8H, v19.8H // ...*...... + // gap // .......... + // gap // .......... + add v15.8H, v23.8H, v19.8H // ....*..... + // gap // .......... + // gap // .......... + sub v10.8H, v2.8H, v21.8H // ......*... + // gap // .......... + // gap // .......... + sqrdmulh v13.8H, v27.8H, v1.H[5] // .......*.. + // gap // .......... + // gap // .......... + // gap // .......... + // gap // .......... + // gap // .......... + mul v27.8H, v27.8H, v1.H[4] // ........*. + // gap // .......... + // gap // .......... // original source code - // ldr q3, [x0, #448] // ..*............ - // ldr q6, [x0, #384] // ...*........... - // ldr q22, [x0, #320] // .....*......... - // ldr q25, [x0, #192] // *.............. - // ldr q17, [x0, #128] // .*............. - // sub v31.8H, v17.8H, v25.8H // ....*.......... - // ldr q4, [x0, #64] // ......*........ - // sub v2.8H, v6.8H, v3.8H // .......*....... - // ldr q16, [x0, #0] // .........*..... - // sqrdmulh v26.8H, v31.8H, v1.H[1] // ........*...... 
- // mul v13.8H, v31.8H, v1.H[0] // ..........*.... - // ldr q23, [x0, #256] // ..............* - // mul v15.8H, v2.8H, v1.H[4] // ...........*... - // mls v13.8H, v26.8H, v7.H[0] // .............*. - // add v28.8H, v16.8H, v4.8H // ............*.. + // ldr q26, [x0, #448] // *......... + // ldr q14, [x0, #384] // .*........ + // ldr q2, [x0, #128] // ..*....... + // sub v18.8H, v14.8H, v26.8H // .....*.... + // add v15.8H, v14.8H, v26.8H // ......*... + // ldr q21, [x0, #192] // ...*...... + // sub v10.8H, v2.8H, v21.8H // .......*.. + // sqrdmulh v13.8H, v18.8H, v1.H[5] // ........*. + // mul v27.8H, v18.8H, v1.H[4] // .........* + // ldr q24, [x0, #320] // ....*..... sub count, count, #1 layer123_start: - sub v16.8H, v16.8H, v4.8H // ........*..................................................................................... - // gap // .............................................................................................. - // gap // .............................................................................................. - sub v4.8H, v23.8H, v22.8H // ..................*........................................................................... - // gap // .............................................................................................. - sqrdmulh v2.8H, v2.8H, v1.H[5] // ..........................*................................................................... - add v23.8H, v23.8H, v22.8H // ...................*.......................................................................... - // gap // .............................................................................................. - // gap // .............................................................................................. - mul v19.8H, v16.8H, v0.H[6] // ..........*................................................................................... 
- add v20.8H, v6.8H, v3.8H // ........................*..................................................................... - ldr q3, [x0, #464] // .......e...................................................................................... - add v17.8H, v17.8H, v25.8H // ..............*............................................................................... - // gap // .............................................................................................. - // gap // .............................................................................................. - sqrdmulh v27.8H, v16.8H, v0.H[7] // ...........*.................................................................................. - ldr q6, [x0, #400] // ......e....................................................................................... - // gap // .............................................................................................. - sub v25.8H, v23.8H, v20.8H // ......................................*....................................................... - // gap // .............................................................................................. - // gap // .............................................................................................. - add v24.8H, v23.8H, v20.8H // .......................................*...................................................... - mul v16.8H, v4.8H, v1.H[2] // ....................*......................................................................... - // gap // .............................................................................................. - add v20.8H, v28.8H, v17.8H // .............................*................................................................ - // gap // .............................................................................................. - // gap // .............................................................................................. 
- sqrdmulh v21.8H, v4.8H, v1.H[3] // .....................*........................................................................ - // gap // .............................................................................................. - // gap // .............................................................................................. - // gap // .............................................................................................. - // gap // .............................................................................................. - // gap // .............................................................................................. - mls v15.8H, v2.8H, v7.H[0] // ...........................*.................................................................. - sub v2.8H, v28.8H, v17.8H // ............................*................................................................. - // gap // .............................................................................................. - // gap // .............................................................................................. - // gap // .............................................................................................. - // gap // .............................................................................................. - mls v19.8H, v27.8H, v7.H[0] // ............*................................................................................. - // gap // .............................................................................................. - // gap // .............................................................................................. - // gap // .............................................................................................. - // gap // .............................................................................................. 
- // gap // .............................................................................................. - mls v16.8H, v21.8H, v7.H[0] // ......................*....................................................................... - // gap // .............................................................................................. - // gap // .............................................................................................. - // gap // .............................................................................................. - // gap // .............................................................................................. - // gap // .............................................................................................. - mul v4.8H, v2.8H, v0.H[2] // ..............................*............................................................... - // gap // .............................................................................................. - // gap // .............................................................................................. - sub v26.8H, v19.8H, v13.8H // .................................*............................................................ - // gap // .............................................................................................. - // gap // .............................................................................................. - add v9.8H, v19.8H, v13.8H // ..................................*........................................................... - mul v18.8H, v25.8H, v0.H[4] // ........................................*..................................................... - // gap // .............................................................................................. - sub v10.8H, v16.8H, v15.8H // ...........................................*.................................................. 
- // gap // .............................................................................................. - // gap // .............................................................................................. - sqrdmulh v31.8H, v25.8H, v0.H[5] // .........................................*.................................................... - add v16.8H, v16.8H, v15.8H // ............................................*................................................. - // gap // .............................................................................................. - // gap // .............................................................................................. - // gap // .............................................................................................. - // gap // .............................................................................................. - sqrdmulh v2.8H, v2.8H, v0.H[3] // ...............................*.............................................................. - // gap // .............................................................................................. - // gap // .............................................................................................. - sub v25.8H, v9.8H, v16.8H // ...........................................................*.................................. - // gap // .............................................................................................. - // gap // .............................................................................................. - add v16.8H, v9.8H, v16.8H // ............................................................*................................. - sqdmulh v21.8H, v20.8H, v7.H[1] // ................................................*............................................. - // gap // .............................................................................................. 
- // gap // .............................................................................................. - // gap // .............................................................................................. - // gap // .............................................................................................. - sqdmulh v23.8H, v24.8H, v7.H[1] // ...................................................*.......................................... - // gap // .............................................................................................. - // gap // .............................................................................................. - // gap // .............................................................................................. - // gap // .............................................................................................. - // gap // .............................................................................................. - mls v4.8H, v2.8H, v7.H[0] // ................................*............................................................. - // gap // .............................................................................................. - // gap // .............................................................................................. - srshr v2.8H, v21.8H, #11 // .................................................*............................................ - // gap // .............................................................................................. - // gap // .............................................................................................. - mul v17.8H, v26.8H, v0.H[2] // ...................................*.......................................................... - // gap // .............................................................................................. 
- // gap // .............................................................................................. - srshr v23.8H, v23.8H, #11 // ....................................................*......................................... - // gap // .............................................................................................. - // gap // .............................................................................................. - mls v20.8H, v2.8H, v7.H[0] // ..................................................*........................................... - // gap // .............................................................................................. - // gap // .............................................................................................. - // gap // .............................................................................................. - // gap // .............................................................................................. - // gap // .............................................................................................. - mls v24.8H, v23.8H, v7.H[0] // .....................................................*........................................ - // gap // .............................................................................................. - // gap // .............................................................................................. - // gap // .............................................................................................. - // gap // .............................................................................................. - // gap // .............................................................................................. - sqrdmulh v22.8H, v26.8H, v0.H[3] // ....................................*......................................................... 
- // gap // .............................................................................................. - // gap // .............................................................................................. - // gap // .............................................................................................. - // gap // .............................................................................................. - // gap // .............................................................................................. - mls v18.8H, v31.8H, v7.H[0] // ..........................................*................................................... - // gap // .............................................................................................. - // gap // .............................................................................................. - sub v2.8H, v20.8H, v24.8H // ......................................................*....................................... - // gap // .............................................................................................. - // gap // .............................................................................................. - add v5.8H, v20.8H, v24.8H // .......................................................*...................................... - mul v21.8H, v10.8H, v0.H[4] // .............................................*................................................ - // gap // .............................................................................................. - // gap // .............................................................................................. - // gap // .............................................................................................. - // gap // .............................................................................................. 
- mls v17.8H, v22.8H, v7.H[0] // .....................................*........................................................ - ldr q22, [x0, #336] // .....e........................................................................................ - // gap // .............................................................................................. - sub v20.8H, v4.8H, v18.8H // ................................................................*............................. - // gap // .............................................................................................. - // gap // .............................................................................................. - sqrdmulh v31.8H, v10.8H, v0.H[5] // ..............................................*............................................... - add v26.8H, v4.8H, v18.8H // .................................................................*............................ - // gap // .............................................................................................. - // gap // .............................................................................................. - // gap // .............................................................................................. - // gap // .............................................................................................. - mul v11.8H, v2.8H, v0.H[0] // ........................................................*..................................... - // gap // .............................................................................................. - // gap // .............................................................................................. - // gap // .............................................................................................. - // gap // .............................................................................................. 
- // gap // .............................................................................................. - sqrdmulh v2.8H, v2.8H, v0.H[1] // .........................................................*.................................... - // gap // .............................................................................................. - // gap // .............................................................................................. - // gap // .............................................................................................. - // gap // .............................................................................................. - // gap // .............................................................................................. - mls v21.8H, v31.8H, v7.H[0] // ...............................................*.............................................. - // gap // .............................................................................................. - // gap // .............................................................................................. - // gap // .............................................................................................. - // gap // .............................................................................................. - // gap // .............................................................................................. - sqrdmulh v4.8H, v25.8H, v0.H[1] // ..............................................................*............................... - // gap // .............................................................................................. - // gap // .............................................................................................. - // gap // .............................................................................................. - // gap // .............................................................................................. 
- // gap // .............................................................................................. - mul v25.8H, v25.8H, v0.H[0] // .............................................................*................................ - // gap // .............................................................................................. - // gap // .............................................................................................. - // gap // .............................................................................................. - // gap // .............................................................................................. - // gap // .............................................................................................. - mls v11.8H, v2.8H, v7.H[0] // ..........................................................*................................... - sub v2.8H, v17.8H, v21.8H // .....................................................................*........................ - // gap // .............................................................................................. - add v21.8H, v17.8H, v21.8H // ......................................................................*....................... - // gap // .............................................................................................. - // gap // .............................................................................................. - mls v25.8H, v4.8H, v7.H[0] // ...............................................................*.............................. - // gap // .............................................................................................. - // gap // .............................................................................................. - // gap // .............................................................................................. 
- // gap // .............................................................................................. - // gap // .............................................................................................. - mul v17.8H, v20.8H, v0.H[0] // ..................................................................*........................... - // gap // .............................................................................................. - // gap // .............................................................................................. - str q11, [x0, #256] // ..........................................................................*................... - // gap // .............................................................................................. - // gap // .............................................................................................. - sqrdmulh v10.8H, v20.8H, v0.H[1] // ...................................................................*.......................... - // gap // .............................................................................................. - // gap // .............................................................................................. - str q25, [x0, #320] // ...........................................................................*.................. - // gap // .............................................................................................. - ldr q25, [x0, #208] // ...e.......................................................................................... - mul v4.8H, v2.8H, v0.H[0] // .......................................................................*...................... - // gap // .............................................................................................. - // gap // .............................................................................................. 
- // gap // .............................................................................................. - // gap // .............................................................................................. - // gap // .............................................................................................. - sqrdmulh v2.8H, v2.8H, v0.H[1] // ........................................................................*..................... - // gap // .............................................................................................. - // gap // .............................................................................................. - // gap // .............................................................................................. - // gap // .............................................................................................. - // gap // .............................................................................................. - mls v17.8H, v10.8H, v7.H[0] // ....................................................................*......................... - // gap // .............................................................................................. - // gap // .............................................................................................. - // gap // .............................................................................................. - // gap // .............................................................................................. - // gap // .............................................................................................. - mul v20.8H, v5.8H, v29.8H // ..............................................................................*............... - // gap // .............................................................................................. - // gap // .............................................................................................. 
- // gap // .............................................................................................. - // gap // .............................................................................................. - // gap // .............................................................................................. - mls v4.8H, v2.8H, v7.H[0] // .........................................................................*.................... - // gap // .............................................................................................. - // gap // .............................................................................................. - str q17, [x0, #384] // ............................................................................*................. - ldr q17, [x0, #144] // ..e........................................................................................... - // gap // .............................................................................................. - sqrdmulh v28.8H, v5.8H, v30.8H // ...............................................................................*.............. - // gap // .............................................................................................. - // gap // .............................................................................................. - // gap // .............................................................................................. - // gap // .............................................................................................. - // gap // .............................................................................................. - mul v2.8H, v16.8H, v29.8H // .................................................................................*............ - // gap // .............................................................................................. 
- // gap // .............................................................................................. - str q4, [x0, #448] // .............................................................................*................ - sub v31.8H, v17.8H, v25.8H // .............e................................................................................ - ldr q4, [x0, #80] // .e............................................................................................ - sqrdmulh v16.8H, v16.8H, v30.8H // ..................................................................................*........... - // gap // .............................................................................................. - // gap // .............................................................................................. - // gap // .............................................................................................. - // gap // .............................................................................................. - // gap // .............................................................................................. - mls v20.8H, v28.8H, v7.H[0] // ................................................................................*............. - // gap // .............................................................................................. - // gap // .............................................................................................. - // gap // .............................................................................................. - // gap // .............................................................................................. - // gap // .............................................................................................. - mul v23.8H, v26.8H, v29.8H // ....................................................................................*......... 
- // gap // .............................................................................................. - // gap // .............................................................................................. - // gap // .............................................................................................. - // gap // .............................................................................................. - // gap // .............................................................................................. - mls v2.8H, v16.8H, v7.H[0] // ...................................................................................*.......... - // gap // .............................................................................................. - // gap // .............................................................................................. - str q20, [x0], #(16) // ..........................................................................................*... - // gap // .............................................................................................. - // gap // .............................................................................................. - sqrdmulh v26.8H, v26.8H, v30.8H // .....................................................................................*........ - // gap // .............................................................................................. - // gap // .............................................................................................. - // gap // .............................................................................................. - // gap // .............................................................................................. - // gap // .............................................................................................. 
- sqrdmulh v16.8H, v21.8H, v30.8H // ........................................................................................*..... - // gap // .............................................................................................. - // gap // .............................................................................................. - str q2, [x0, #48] // ...........................................................................................*.. - sub v2.8H, v6.8H, v3.8H // .......................e...................................................................... - // gap // .............................................................................................. - mul v21.8H, v21.8H, v29.8H // .......................................................................................*...... - // gap // .............................................................................................. - // gap // .............................................................................................. - // gap // .............................................................................................. - // gap // .............................................................................................. - // gap // .............................................................................................. - mls v23.8H, v26.8H, v7.H[0] // ......................................................................................*....... - // gap // .............................................................................................. - // gap // .............................................................................................. - // gap // .............................................................................................. - // gap // .............................................................................................. 
- // gap // .............................................................................................. - mls v21.8H, v16.8H, v7.H[0] // .........................................................................................*.... - ldr q16, [x0, #0] // e............................................................................................. - // gap // .............................................................................................. - // gap // .............................................................................................. - // gap // .............................................................................................. - // gap // .............................................................................................. - // gap // .............................................................................................. - // gap // .............................................................................................. - sqrdmulh v26.8H, v31.8H, v1.H[1] // ................e............................................................................. - // gap // .............................................................................................. - // gap // .............................................................................................. - // gap // .............................................................................................. - mul v13.8H, v31.8H, v1.H[0] // ...............e.............................................................................. - // gap // .............................................................................................. - // gap // .............................................................................................. 
- str q21, [x0, #176] // .............................................................................................* - // gap // .............................................................................................. - // gap // .............................................................................................. - str q23, [x0, #112] // ............................................................................................*. - ldr q23, [x0, #256] // ....e......................................................................................... - mul v15.8H, v2.8H, v1.H[4] // .........................e.................................................................... - // gap // .............................................................................................. - // gap // .............................................................................................. - // gap // .............................................................................................. - mls v13.8H, v26.8H, v7.H[0] // .................e............................................................................ - // gap // .............................................................................................. - add v28.8H, v16.8H, v4.8H // .........e.................................................................................... + ldr q28, [x0, #64] // .*...................................................................................... + sqrdmulh v9.8H, v10.8H, v1.H[1] // ................*....................................................................... + ldr q23, [x0, #0] // *....................................................................................... + ldr q26, [x0, #464] // .......e................................................................................ + ldr q14, [x0, #400] // ......e................................................................................. 
+ // gap // ........................................................................................ + mul v3.8H, v10.8H, v1.H[0] // ...............*........................................................................ + add v8.8H, v2.8H, v21.8H // ..............*......................................................................... + ldr q22, [x0, #256] // ....*................................................................................... + ldr q2, [x0, #144] // ..e..................................................................................... + // gap // ........................................................................................ + // gap // ........................................................................................ + add v25.8H, v23.8H, v28.8H // .........*.............................................................................. + mls v27.8H, v13.8H, v7.H[0] // ...........................*............................................................ + // gap // ........................................................................................ + sub v20.8H, v23.8H, v28.8H // ........*............................................................................... + // gap // ........................................................................................ + // gap // ........................................................................................ + mls v3.8H, v9.8H, v7.H[0] // .................*...................................................................... + add v23.8H, v22.8H, v24.8H // ...................*.................................................................... + // gap // ........................................................................................ + sub v28.8H, v22.8H, v24.8H // ..................*..................................................................... 
+ // gap // ........................................................................................ + // gap // ........................................................................................ + sqrdmulh v17.8H, v20.8H, v0.H[7] // ...........*............................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + sub v19.8H, v23.8H, v15.8H // ......................................*................................................. + // gap // ........................................................................................ + // gap // ........................................................................................ + sub v12.8H, v25.8H, v8.8H // ............................*........................................................... + mul v21.8H, v20.8H, v0.H[6] // ..........*............................................................................. + // gap // ........................................................................................ + sub v18.8H, v14.8H, v26.8H // .......................e................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + add v31.8H, v23.8H, v15.8H // .......................................*................................................ + // gap // ........................................................................................ + sqrdmulh v23.8H, v28.8H, v1.H[3] // .....................*.................................................................. + add v15.8H, v14.8H, v26.8H // ........................e............................................................... 
+ // gap // ........................................................................................ + // gap // ........................................................................................ + mls v21.8H, v17.8H, v7.H[0] // ............*........................................................................... + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + mul v14.8H, v28.8H, v1.H[2] // ....................*................................................................... + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + mls v14.8H, v23.8H, v7.H[0] // ......................*................................................................. + add v23.8H, v25.8H, v8.8H // .............................*.......................................................... + // gap // ........................................................................................ + sub v28.8H, v21.8H, v3.8H // .................................*...................................................... 
+ // gap // ........................................................................................ + // gap // ........................................................................................ + add v3.8H, v21.8H, v3.8H // ..................................*..................................................... + mul v21.8H, v19.8H, v0.H[4] // ........................................*............................................... + // gap // ........................................................................................ + add v24.8H, v23.8H, v31.8H // .................................................*...................................... + // gap // ........................................................................................ + // gap // ........................................................................................ + sqrdmulh v13.8H, v19.8H, v0.H[5] // .........................................*.............................................. + sub v23.8H, v23.8H, v31.8H // ................................................*....................................... + // gap // ........................................................................................ + add v22.8H, v14.8H, v27.8H // ............................................*........................................... + // gap // ........................................................................................ + // gap // ........................................................................................ + mul v6.8H, v12.8H, v0.H[2] // ..............................*......................................................... + sub v17.8H, v14.8H, v27.8H // ...........................................*............................................ + // gap // ........................................................................................ + // gap // ........................................................................................ 
+ // gap // ........................................................................................ + // gap // ........................................................................................ + mul v20.8H, v23.8H, v0.H[0] // ..................................................*..................................... + sub v10.8H, v3.8H, v22.8H // .....................................................*.................................. + // gap // ........................................................................................ + add v9.8H, v3.8H, v22.8H // ......................................................*................................. + // gap // ........................................................................................ + // gap // ........................................................................................ + sqrdmulh v14.8H, v17.8H, v0.H[5] // ..............................................*......................................... + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + sqrdmulh v19.8H, v23.8H, v0.H[1] // ...................................................*.................................... + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ 
+ // gap // ........................................................................................ + // gap // ........................................................................................ + mul v26.8H, v24.8H, v29.8H // ........................................................................*............... + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + sqrdmulh v31.8H, v28.8H, v0.H[3] // ....................................*................................................... + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + mls v20.8H, v19.8H, v7.H[0] // ....................................................*................................... + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ 
+ // gap // ........................................................................................ + mul v4.8H, v28.8H, v0.H[2] // ...................................*.................................................... + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + sqrdmulh v25.8H, v12.8H, v0.H[3] // ...............................*........................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + str q20, [x0, #256] // ....................................................................*................... + // gap // ........................................................................................ + // gap // ........................................................................................ + mls v21.8H, v13.8H, v7.H[0] // ..........................................*............................................. + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ 
+ mul v22.8H, v17.8H, v0.H[4] // .............................................*.......................................... + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + mls v6.8H, v25.8H, v7.H[0] // ................................*....................................................... + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + sqrdmulh v19.8H, v24.8H, v30.8H // .........................................................................*.............. + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ 
+ mul v11.8H, v10.8H, v0.H[0] // .......................................................*................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + sub v8.8H, v6.8H, v21.8H // ..........................................................*............................. + // gap // ........................................................................................ + // gap // ........................................................................................ + add v25.8H, v6.8H, v21.8H // ...........................................................*............................ + ldr q21, [x0, #208] // ...e.................................................................................... + mul v24.8H, v9.8H, v29.8H // ...........................................................................*............ + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + sqrdmulh v3.8H, v8.8H, v0.H[1] // .............................................................*.......................... + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ 
+ mul v20.8H, v8.8H, v0.H[0] // ............................................................*........................... + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + mls v4.8H, v31.8H, v7.H[0] // .....................................*.................................................. + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + mls v22.8H, v14.8H, v7.H[0] // ...............................................*........................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ 
+ mls v26.8H, v19.8H, v7.H[0] // ..........................................................................*............. + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + sqrdmulh v19.8H, v10.8H, v0.H[1] // ........................................................*............................... + // gap // ........................................................................................ + // gap // ........................................................................................ + sub v23.8H, v4.8H, v22.8H // ...............................................................*........................ + // gap // ........................................................................................ + // gap // ........................................................................................ + sub v10.8H, v2.8H, v21.8H // .............e.......................................................................... + mls v20.8H, v3.8H, v7.H[0] // ..............................................................*......................... + // gap // ........................................................................................ + add v14.8H, v4.8H, v22.8H // ................................................................*....................... + str q26, [x0], #(16) // ....................................................................................*... + // gap // ........................................................................................ 
+ // gap // ........................................................................................ + // gap // ........................................................................................ + mul v28.8H, v25.8H, v29.8H // ..............................................................................*......... + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + mls v11.8H, v19.8H, v7.H[0] // .........................................................*.............................. + // gap // ........................................................................................ + // gap // ........................................................................................ + str q20, [x0, #368] // ......................................................................*................. + // gap // ........................................................................................ + // gap // ........................................................................................ + sqrdmulh v12.8H, v25.8H, v30.8H // ...............................................................................*........ + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ 
+ // gap // ........................................................................................ + mul v22.8H, v14.8H, v29.8H // .................................................................................*...... + str q11, [x0, #304] // .....................................................................*.................. + // gap // ........................................................................................ + // gap // ........................................................................................ + mul v19.8H, v23.8H, v0.H[0] // .................................................................*...................... + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + sqrdmulh v6.8H, v23.8H, v0.H[1] // ..................................................................*..................... + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + sqrdmulh v11.8H, v9.8H, v30.8H // ............................................................................*........... 
+ // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + sqrdmulh v8.8H, v14.8H, v30.8H // ..................................................................................*..... + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + mls v19.8H, v6.8H, v7.H[0] // ...................................................................*.................... + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + mls v24.8H, v11.8H, v7.H[0] // .............................................................................*.......... + // gap // ........................................................................................ 
+ // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + mls v28.8H, v12.8H, v7.H[0] // ................................................................................*....... + // gap // ........................................................................................ + // gap // ........................................................................................ + str q19, [x0, #432] // .......................................................................*................ + // gap // ........................................................................................ + // gap // ........................................................................................ + mls v22.8H, v8.8H, v7.H[0] // ...................................................................................*.... + // gap // ........................................................................................ + // gap // ........................................................................................ + str q24, [x0, #48] // .....................................................................................*.. + // gap // ........................................................................................ + // gap // ........................................................................................ + sqrdmulh v13.8H, v18.8H, v1.H[5] // ..........................e............................................................. + // gap // ........................................................................................ 
+ // gap // ........................................................................................ + str q28, [x0, #112] // ......................................................................................*. + // gap // ........................................................................................ + // gap // ........................................................................................ + mul v27.8H, v18.8H, v1.H[4] // .........................e.............................................................. + // gap // ........................................................................................ + // gap // ........................................................................................ + str q22, [x0, #176] // .......................................................................................* + // gap // ........................................................................................ + ldr q24, [x0, #320] // .....e.................................................................................. // original source code - // ldr q8, [x0, #0] // ...............................................................................e........|....................................................................................e.... - // ldr q9, [x0, #(1*(512/8))] // ..................................................................e.....................|.......................................................................e................. - // ldr q10, [x0, #(2*(512/8))] // .............................................................e..........................|..................................................................e...................... - // ldr q11, [x0, #(3*(512/8))] // ......................................................e.................................|...........................................................e............................. 
- // ldr q12, [x0, #(4*(512/8))] // ....................................................................................e...|......................................................................................... - // ldr q13, [x0, #(5*(512/8))] // .....................................e..................................................|..........................................e.............................................. - // ldr q14, [x0, #(6*(512/8))] // ...e....................................................................................|........e................................................................................ - // ldr q15, [x0, #(7*(512/8))] // e.......................................................................................|.....e................................................................................... - // sub v24.8h, v8.8h, v9.8h // ........................................................................................*......................................................................................... - // add v8.8h, v8.8h, v9.8h // .......................................................................................e|......................................................................................... - // mul v9.8h, v24.8h, v0.h[6] // ........................................................................................|...*..................................................................................... - // sqrdmulh v24.8h, v24.8h, v0.h[7] // ..*.....................................................................................|.......*................................................................................. - // mls v9.8h, v24.8h, v7.h[0] // ...........*............................................................................|................*........................................................................ 
- // sub v24.8h, v10.8h, v11.8h // .................................................................e......................|......................................................................e.................. - // add v10.8h, v10.8h, v11.8h // .*......................................................................................|......*.................................................................................. - // mul v11.8h, v24.8h, v1.h[0] // .................................................................................e......|......................................................................................e.. - // sqrdmulh v24.8h, v24.8h, v1.h[1] // ................................................................................e.......|.....................................................................................e... - // mls v11.8h, v24.8h, v7.h[0] // ......................................................................................e.|......................................................................................... - // sub v24.8h, v12.8h, v13.8h // ........................................................................................|*........................................................................................ - // add v12.8h, v12.8h, v13.8h // ........................................................................................|..*...................................................................................... - // mul v13.8h, v24.8h, v1.h[2] // ......*.................................................................................|...........*............................................................................. - // sqrdmulh v24.8h, v24.8h, v1.h[3] // ........*...............................................................................|.............*........................................................................... 
- // mls v13.8h, v24.8h, v7.h[0] // ............*...........................................................................|.................*....................................................................... - // sub v24.8h, v14.8h, v15.8h // ...........................................................................e............|................................................................................e........ - // add v14.8h, v14.8h, v15.8h // ........................................................................................|....*.................................................................................... - // mul v15.8h, v24.8h, v1.h[4] // .....................................................................................e..|......................................................................................... - // sqrdmulh v24.8h, v24.8h, v1.h[5] // ........................................................................................|.*....................................................................................... - // mls v15.8h, v24.8h, v7.h[0] // .........*..............................................................................|..............*.......................................................................... - // sub v24.8h, v8.8h, v10.8h // ..........*.............................................................................|...............*......................................................................... - // add v8.8h, v8.8h, v10.8h // .......*................................................................................|............*............................................................................ - // mul v10.8h, v24.8h, v0.h[2] // .............*..........................................................................|..................*...................................................................... 
- // sqrdmulh v24.8h, v24.8h, v0.h[3] // ....................*...................................................................|.........................*............................................................... - // mls v10.8h, v24.8h, v7.h[0] // .........................*..............................................................|..............................*.......................................................... - // sub v24.8h, v9.8h, v11.8h // ..............*.........................................................................|...................*..................................................................... - // add v9.8h, v9.8h, v11.8h // ...............*........................................................................|....................*.................................................................... - // mul v11.8h, v24.8h, v0.h[2] // ...........................*............................................................|................................*........................................................ - // sqrdmulh v24.8h, v24.8h, v0.h[3] // ...............................*........................................................|....................................*.................................................... - // mls v11.8h, v24.8h, v7.h[0] // ....................................*...................................................|.........................................*............................................... - // sub v24.8h, v12.8h, v14.8h // ....*...................................................................................|.........*............................................................................... - // add v12.8h, v12.8h, v14.8h // .....*..................................................................................|..........*.............................................................................. 
- // mul v14.8h, v24.8h, v0.h[4] // ................*.......................................................................|.....................*................................................................... - // sqrdmulh v24.8h, v24.8h, v0.h[5] // ..................*.....................................................................|.......................*................................................................. - // mls v14.8h, v24.8h, v7.h[0] // ................................*.......................................................|.....................................*................................................... - // sub v24.8h, v13.8h, v15.8h // .................*......................................................................|......................*.................................................................. - // add v13.8h, v13.8h, v15.8h // ...................*....................................................................|........................*................................................................ - // mul v15.8h, v24.8h, v0.h[4] // ...................................*....................................................|........................................*................................................ - // sqrdmulh v24.8h, v24.8h, v0.h[5] // .......................................*................................................|............................................*............................................ - // mls v15.8h, v24.8h, v7.h[0] // ...........................................*............................................|................................................*........................................ - // sqdmulh v25.8h, v8.8h, v7.h[1] // .......................*................................................................|............................*............................................................ 
- // srshr v25.8h, v25.8h, #11 // ..........................*.............................................................|...............................*......................................................... - // mls v8.8h, v25.8h, v7.h[0] // .............................*..........................................................|..................................*...................................................... - // sqdmulh v25.8h, v12.8h, v7.h[1] // ........................*...............................................................|.............................*........................................................... - // srshr v25.8h, v25.8h, #11 // ............................*...........................................................|.................................*....................................................... - // mls v12.8h, v25.8h, v7.h[0] // ..............................*.........................................................|...................................*..................................................... - // sub v24.8h, v8.8h, v12.8h // .................................*......................................................|......................................*.................................................. - // add v8.8h, v8.8h, v12.8h // ..................................*.....................................................|.......................................*................................................. - // mul v12.8h, v24.8h, v0.h[0] // .........................................*..............................................|..............................................*.......................................... - // sqrdmulh v24.8h, v24.8h, v0.h[1] // ..........................................*.............................................|...............................................*......................................... 
- // mls v12.8h, v24.8h, v7.h[0] // ..............................................*.........................................|...................................................*..................................... - // sub v24.8h, v9.8h, v13.8h // .....................*..................................................................|..........................*.............................................................. - // add v9.8h, v9.8h, v13.8h // ......................*.................................................................|...........................*............................................................. - // mul v13.8h, v24.8h, v0.h[0] // .............................................*..........................................|..................................................*...................................... - // sqrdmulh v24.8h, v24.8h, v0.h[1] // ............................................*...........................................|.................................................*....................................... - // mls v13.8h, v24.8h, v7.h[0] // .................................................*......................................|......................................................*.................................. - // sub v24.8h, v10.8h, v14.8h // ......................................*.................................................|...........................................*............................................. - // add v10.8h, v10.8h, v14.8h // ........................................*...............................................|.............................................*........................................... - // mul v14.8h, v24.8h, v0.h[0] // ..................................................*.....................................|.......................................................*................................. 
- // sqrdmulh v24.8h, v24.8h, v0.h[1] // ....................................................*...................................|.........................................................*............................... - // mls v14.8h, v24.8h, v7.h[0] // .........................................................*..............................|..............................................................*.......................... - // sub v24.8h, v11.8h, v15.8h // ...............................................*........................................|....................................................*.................................... - // add v11.8h, v11.8h, v15.8h // ................................................*.......................................|.....................................................*................................... - // mul v15.8h, v24.8h, v0.h[0] // .......................................................*................................|............................................................*............................ - // sqrdmulh v24.8h, v24.8h, v0.h[1] // ........................................................*...............................|.............................................................*........................... - // mls v15.8h, v24.8h, v7.h[0] // ...........................................................*............................|................................................................*........................ - // str q12, [x0, #(4*(512/8))] // ...................................................*....................................|........................................................*................................ - // str q13, [x0, #(5*(512/8))] // .....................................................*..................................|..........................................................*.............................. 
- // str q14, [x0, #(6*(512/8))] // ............................................................*...........................|.................................................................*....................... - // str q15, [x0, #(7*(512/8))] // ................................................................*.......................|.....................................................................*................... - // mul v12.8h, v8.8h, v29.8h // ..........................................................*.............................|...............................................................*......................... - // sqrdmulh v8.8h, v8.8h, v30.8h // ..............................................................*.........................|...................................................................*..................... - // mls v12.8h, v8.8h, v7.h[0] // ....................................................................*...................|.........................................................................*............... - // mul v13.8h, v9.8h, v29.8h // ...............................................................*........................|....................................................................*.................... - // sqrdmulh v9.8h, v9.8h, v30.8h // ...................................................................*....................|........................................................................*................ - // mls v13.8h, v9.8h, v7.h[0] // ......................................................................*.................|...........................................................................*............. - // mul v14.8h, v10.8h, v29.8h // .....................................................................*..................|..........................................................................*.............. 
- // sqrdmulh v10.8h, v10.8h, v30.8h // ........................................................................*...............|.............................................................................*........... - // mls v14.8h, v10.8h, v7.h[0] // .............................................................................*..........|..................................................................................*...... - // mul v15.8h, v11.8h, v29.8h // ............................................................................*...........|.................................................................................*....... - // sqrdmulh v11.8h, v11.8h, v30.8h // .........................................................................*..............|..............................................................................*.......... - // mls v15.8h, v11.8h, v7.h[0] // ..............................................................................*.........|...................................................................................*..... - // str q12, [x0], #(16) // .......................................................................*................|............................................................................*............ - // str q13, [x0, #(-16 + 1*(512/8))] // ..........................................................................*.............|...............................................................................*......... - // str q14, [x0, #(-16 + 2*(512/8))] // ...................................................................................*....|........................................................................................* - // str q15, [x0, #(-16 + 3*(512/8))] // ..................................................................................*.....|.......................................................................................*. 
+ // ldr q8, [x0, #0] // .....................................................................................|.*.................................................................................... + // ldr q9, [x0, #(1*(512/8))] // .....................................................................................*...................................................................................... + // ldr q10, [x0, #(2*(512/8))] // .....e...............................................................................|.......e.............................................................................. + // ldr q11, [x0, #(3*(512/8))] // ...................................................e.................................|.....................................................e................................ + // ldr q12, [x0, #(4*(512/8))] // ....*................................................................................|......*............................................................................... + // ldr q13, [x0, #(5*(512/8))] // ....................................................................................e|...................................................................................... + // ldr q14, [x0, #(6*(512/8))] // .e...................................................................................|...e.................................................................................. + // ldr q15, [x0, #(7*(512/8))] // e....................................................................................|..e................................................................................... + // sub v24.8h, v8.8h, v9.8h // ........*............................................................................|..........*........................................................................... 
+ // add v8.8h, v8.8h, v9.8h // ......*..............................................................................|........*............................................................................. + // mul v9.8h, v24.8h, v0.h[6] // ...............*.....................................................................|.................*.................................................................... + // sqrdmulh v24.8h, v24.8h, v0.h[7] // ............*........................................................................|..............*....................................................................... + // mls v9.8h, v24.8h, v7.h[0] // ....................*................................................................|......................*............................................................... + // sub v24.8h, v10.8h, v11.8h // ............................................................e........................|..............................................................e....................... + // add v10.8h, v10.8h, v11.8h // ...*.................................................................................|.....*................................................................................ + // mul v11.8h, v24.8h, v1.h[0] // ..*..................................................................................|....*................................................................................. + // sqrdmulh v24.8h, v24.8h, v1.h[1] // .....................................................................................|*..................................................................................... + // mls v11.8h, v24.8h, v7.h[0] // .........*...........................................................................|...........*.......................................................................... 
+ // sub v24.8h, v12.8h, v13.8h // ...........*.........................................................................|.............*........................................................................ + // add v12.8h, v12.8h, v13.8h // ..........*..........................................................................|............*......................................................................... + // mul v13.8h, v24.8h, v1.h[2] // .....................*...............................................................|.......................*.............................................................. + // sqrdmulh v24.8h, v24.8h, v1.h[3] // ..................*..................................................................|....................*................................................................. + // mls v13.8h, v24.8h, v7.h[0] // ......................*..............................................................|........................*............................................................. + // sub v24.8h, v14.8h, v15.8h // ................e....................................................................|..................e................................................................... + // add v14.8h, v14.8h, v15.8h // ...................e.................................................................|.....................e................................................................ + // mul v15.8h, v24.8h, v1.h[4] // ..................................................................................e..|....................................................................................e. + // sqrdmulh v24.8h, v24.8h, v1.h[5] // ................................................................................e....|..................................................................................e... 
+ // mls v15.8h, v24.8h, v7.h[0] // .......*.............................................................................|.........*............................................................................ + // sub v24.8h, v8.8h, v10.8h // ..............*......................................................................|................*..................................................................... + // add v8.8h, v8.8h, v10.8h // .......................*.............................................................|.........................*............................................................ + // mul v10.8h, v24.8h, v0.h[2] // ...............................*.....................................................|.................................*.................................................... + // sqrdmulh v24.8h, v24.8h, v0.h[3] // ..........................................*..........................................|............................................*......................................... + // mls v10.8h, v24.8h, v7.h[0] // ..............................................*......................................|................................................*..................................... + // sub v24.8h, v9.8h, v11.8h // ........................*............................................................|..........................*........................................................... + // add v9.8h, v9.8h, v11.8h // .........................*...........................................................|...........................*.......................................................... + // mul v11.8h, v24.8h, v0.h[2] // .........................................*...........................................|...........................................*.......................................... 
+ // sqrdmulh v24.8h, v24.8h, v0.h[3] // .......................................*.............................................|.........................................*............................................ + // mls v11.8h, v24.8h, v7.h[0] // .......................................................*.............................|.........................................................*............................ + // sub v24.8h, v12.8h, v14.8h // .............*.......................................................................|...............*...................................................................... + // add v12.8h, v12.8h, v14.8h // .................*...................................................................|...................*.................................................................. + // mul v14.8h, v24.8h, v0.h[4] // ..........................*..........................................................|............................*......................................................... + // sqrdmulh v24.8h, v24.8h, v0.h[5] // ............................*........................................................|..............................*....................................................... + // mls v14.8h, v24.8h, v7.h[0] // ............................................*........................................|..............................................*....................................... + // sub v24.8h, v13.8h, v15.8h // ................................*....................................................|..................................*................................................... + // add v13.8h, v13.8h, v15.8h // ..............................*......................................................|................................*..................................................... 
+ // mul v15.8h, v24.8h, v0.h[4] // .............................................*.......................................|...............................................*...................................... + // sqrdmulh v24.8h, v24.8h, v0.h[5] // ....................................*................................................|......................................*............................................... + // mls v15.8h, v24.8h, v7.h[0] // ........................................................*............................|..........................................................*........................... + // sub v24.8h, v8.8h, v12.8h // .............................*.......................................................|...............................*...................................................... + // add v8.8h, v8.8h, v12.8h // ...........................*.........................................................|.............................*........................................................ + // mul v12.8h, v24.8h, v0.h[0] // .................................*...................................................|...................................*.................................................. + // sqrdmulh v24.8h, v24.8h, v0.h[1] // .....................................*...............................................|.......................................*.............................................. + // mls v12.8h, v24.8h, v7.h[0] // ........................................*............................................|..........................................*........................................... + // sub v24.8h, v9.8h, v13.8h // ..................................*..................................................|....................................*................................................. 
+ // add v9.8h, v9.8h, v13.8h // ...................................*.................................................|.....................................*................................................ + // mul v13.8h, v24.8h, v0.h[0] // ................................................*....................................|..................................................*................................... + // sqrdmulh v24.8h, v24.8h, v0.h[1] // ..........................................................*..........................|............................................................*......................... + // mls v13.8h, v24.8h, v7.h[0] // .................................................................*...................|...................................................................*.................. + // sub v24.8h, v10.8h, v14.8h // .................................................*...................................|...................................................*.................................. + // add v10.8h, v10.8h, v14.8h // ..................................................*..................................|....................................................*................................. + // mul v14.8h, v24.8h, v0.h[0] // ......................................................*..............................|........................................................*............................. + // sqrdmulh v24.8h, v24.8h, v0.h[1] // .....................................................*...............................|.......................................................*.............................. + // mls v14.8h, v24.8h, v7.h[0] // .............................................................*.......................|...............................................................*...................... 
+ // sub v24.8h, v11.8h, v15.8h // ...........................................................*.........................|.............................................................*........................ + // add v11.8h, v11.8h, v15.8h // ..............................................................*......................|................................................................*..................... + // mul v15.8h, v24.8h, v0.h[0] // ......................................................................*..............|........................................................................*............. + // sqrdmulh v24.8h, v24.8h, v0.h[1] // .......................................................................*.............|.........................................................................*............ + // mls v15.8h, v24.8h, v7.h[0] // ..........................................................................*..........|............................................................................*......... + // str q12, [x0, #(4*(512/8))] // ...........................................*.........................................|.............................................*........................................ + // str q13, [x0, #(5*(512/8))] // .....................................................................*...............|.......................................................................*.............. + // str q14, [x0, #(6*(512/8))] // ..................................................................*..................|....................................................................*................. + // str q15, [x0, #(7*(512/8))] // .............................................................................*.......|...............................................................................*...... 
+ // mul v12.8h, v8.8h, v29.8h // ......................................*..............................................|........................................*............................................. + // sqrdmulh v8.8h, v8.8h, v30.8h // ...............................................*.....................................|.................................................*.................................... + // mls v12.8h, v8.8h, v7.h[0] // .........................................................*...........................|...........................................................*.......................... + // mul v13.8h, v9.8h, v29.8h // ....................................................*................................|......................................................*............................... + // sqrdmulh v9.8h, v9.8h, v30.8h // ........................................................................*............|..........................................................................*........... + // mls v13.8h, v9.8h, v7.h[0] // ...........................................................................*.........|.............................................................................*........ + // mul v14.8h, v10.8h, v29.8h // ................................................................*....................|..................................................................*................... + // sqrdmulh v10.8h, v10.8h, v30.8h // ...................................................................*.................|.....................................................................*................ + // mls v14.8h, v10.8h, v7.h[0] // ............................................................................*........|..............................................................................*....... 
+ // mul v15.8h, v11.8h, v29.8h // ....................................................................*................|......................................................................*............... + // sqrdmulh v11.8h, v11.8h, v30.8h // .........................................................................*...........|...........................................................................*.......... + // mls v15.8h, v11.8h, v7.h[0] // ..............................................................................*......|................................................................................*..... + // str q12, [x0], #(16) // ...............................................................*.....................|.................................................................*.................... + // str q13, [x0, #(-16 + 1*(512/8))] // ...............................................................................*.....|.................................................................................*.... + // str q14, [x0, #(-16 + 2*(512/8))] // .................................................................................*...|...................................................................................*.. + // str q15, [x0, #(-16 + 3*(512/8))] // ...................................................................................*.|.....................................................................................* sub count, count, #1 cbnz count, layer123_start - sub v16.8H, v16.8H, v4.8H // *.............................................................................. - sqrdmulh v2.8H, v2.8H, v1.H[5] // ..*............................................................................ - // gap // ............................................................................... - add v26.8H, v17.8H, v25.8H // ......*........................................................................ 
- // gap // ............................................................................... - // gap // ............................................................................... - add v21.8H, v6.8H, v3.8H // .....*......................................................................... - sub v27.8H, v23.8H, v22.8H // .*............................................................................. - // gap // ............................................................................... - mul v25.8H, v16.8H, v0.H[6] // ....*.......................................................................... - add v23.8H, v23.8H, v22.8H // ...*........................................................................... - // gap // ............................................................................... - sub v31.8H, v28.8H, v26.8H // ..............*................................................................ - // gap // ............................................................................... - // gap // ............................................................................... - sqrdmulh v17.8H, v16.8H, v0.H[7] // .......*....................................................................... - add v26.8H, v28.8H, v26.8H // ...........*................................................................... - // gap // ............................................................................... - sub v5.8H, v23.8H, v21.8H // ........*...................................................................... - // gap // ............................................................................... - // gap // ............................................................................... - add v23.8H, v23.8H, v21.8H // .........*..................................................................... - mls v15.8H, v2.8H, v7.H[0] // .............*................................................................. 
- // gap // ............................................................................... - // gap // ............................................................................... - // gap // ............................................................................... - // gap // ............................................................................... - sqdmulh v11.8H, v26.8H, v7.H[1] // ...........................*................................................... - // gap // ............................................................................... - // gap // ............................................................................... - // gap // ............................................................................... - // gap // ............................................................................... - // gap // ............................................................................... - sqrdmulh v20.8H, v27.8H, v1.H[3] // ............*.................................................................. - // gap // ............................................................................... - // gap // ............................................................................... - // gap // ............................................................................... - // gap // ............................................................................... - // gap // ............................................................................... - sqdmulh v3.8H, v23.8H, v7.H[1] // ............................*.................................................. - // gap // ............................................................................... - // gap // ............................................................................... - // gap // ............................................................................... 
- // gap // ............................................................................... - // gap // ............................................................................... - mul v21.8H, v27.8H, v1.H[2] // ..........*.................................................................... - // gap // ............................................................................... - // gap // ............................................................................... - // gap // ............................................................................... - // gap // ............................................................................... - // gap // ............................................................................... - mls v21.8H, v20.8H, v7.H[0] // ................*.............................................................. - // gap // ............................................................................... - // gap // ............................................................................... - srshr v20.8H, v3.8H, #11 // ................................*.............................................. - // gap // ............................................................................... - // gap // ............................................................................... - mls v25.8H, v17.8H, v7.H[0] // ...............*............................................................... - // gap // ............................................................................... - // gap // ............................................................................... - // gap // ............................................................................... - // gap // ............................................................................... - srshr v11.8H, v11.8H, #11 // ..............................*................................................ 
- sqrdmulh v4.8H, v31.8H, v0.H[3] // ........................*...................................................... - // gap // ............................................................................... - // gap // ............................................................................... - // gap // ............................................................................... - // gap // ............................................................................... - // gap // ............................................................................... - mls v26.8H, v11.8H, v7.H[0] // .................................*............................................. - // gap // ............................................................................... - // gap // ............................................................................... - add v16.8H, v25.8H, v13.8H // ...................*........................................................... - // gap // ............................................................................... - // gap // ............................................................................... - mul v17.8H, v31.8H, v0.H[2] // .................*............................................................. - sub v2.8H, v25.8H, v13.8H // ..................*............................................................ - // gap // ............................................................................... - // gap // ............................................................................... - // gap // ............................................................................... - // gap // ............................................................................... - mls v23.8H, v20.8H, v7.H[0] // ..................................*............................................ - sub v20.8H, v21.8H, v15.8H // .....................*......................................................... 
- // gap // ............................................................................... - add v21.8H, v21.8H, v15.8H // .......................*....................................................... - // gap // ............................................................................... - // gap // ............................................................................... - mls v17.8H, v4.8H, v7.H[0] // .............................*................................................. - // gap // ............................................................................... - // gap // ............................................................................... - // gap // ............................................................................... - // gap // ............................................................................... - // gap // ............................................................................... - mul v4.8H, v2.8H, v0.H[2] // ...............................*............................................... - sub v11.8H, v16.8H, v21.8H // .........................*..................................................... - // gap // ............................................................................... - add v16.8H, v16.8H, v21.8H // ..........................*.................................................... - // gap // ............................................................................... - // gap // ............................................................................... - mul v13.8H, v5.8H, v0.H[4] // ....................*.......................................................... - sub v21.8H, v26.8H, v23.8H // .....................................*......................................... - // gap // ............................................................................... - add v23.8H, v26.8H, v23.8H // ......................................*........................................ 
- // gap // ............................................................................... - // gap // ............................................................................... - sqrdmulh v25.8H, v5.8H, v0.H[5] // ......................*........................................................ - // gap // ............................................................................... - // gap // ............................................................................... - // gap // ............................................................................... - // gap // ............................................................................... - // gap // ............................................................................... - sqrdmulh v2.8H, v2.8H, v0.H[3] // ...................................*........................................... - // gap // ............................................................................... - // gap // ............................................................................... - // gap // ............................................................................... - // gap // ............................................................................... - // gap // ............................................................................... - mul v26.8H, v20.8H, v0.H[4] // .......................................*....................................... - // gap // ............................................................................... - // gap // ............................................................................... - // gap // ............................................................................... - // gap // ............................................................................... - // gap // ............................................................................... 
- sqrdmulh v20.8H, v20.8H, v0.H[5] // ..........................................*.................................... - // gap // ............................................................................... - // gap // ............................................................................... - // gap // ............................................................................... - // gap // ............................................................................... - // gap // ............................................................................... - mls v4.8H, v2.8H, v7.H[0] // ........................................*...................................... - // gap // ............................................................................... - // gap // ............................................................................... - // gap // ............................................................................... - // gap // ............................................................................... - // gap // ............................................................................... - mls v13.8H, v25.8H, v7.H[0] // ....................................*.......................................... - // gap // ............................................................................... - // gap // ............................................................................... - // gap // ............................................................................... - // gap // ............................................................................... - // gap // ............................................................................... - mls v26.8H, v20.8H, v7.H[0] // ..............................................*................................ - // gap // ............................................................................... 
- // gap // ............................................................................... - // gap // ............................................................................... - // gap // ............................................................................... - // gap // ............................................................................... - sqrdmulh v20.8H, v11.8H, v0.H[1] // ...............................................*............................... - // gap // ............................................................................... - // gap // ............................................................................... - sub v2.8H, v17.8H, v13.8H // .........................................*..................................... - // gap // ............................................................................... - // gap // ............................................................................... - mul v25.8H, v21.8H, v0.H[0] // ............................................*.................................. - add v17.8H, v17.8H, v13.8H // ...........................................*................................... - // gap // ............................................................................... - // gap // ............................................................................... - // gap // ............................................................................... - // gap // ............................................................................... - sqrdmulh v21.8H, v21.8H, v0.H[1] // .............................................*................................. - // gap // ............................................................................... - // gap // ............................................................................... - // gap // ............................................................................... 
- // gap // ............................................................................... - // gap // ............................................................................... - // gap // ............................................................................... - mul v11.8H, v11.8H, v0.H[0] // ................................................*.............................. - // gap // ............................................................................... - // gap // ............................................................................... - // gap // ............................................................................... - // gap // ............................................................................... - mls v11.8H, v20.8H, v7.H[0] // ....................................................*.......................... - // gap // ............................................................................... - sub v13.8H, v4.8H, v26.8H // ..................................................*............................ - add v26.8H, v4.8H, v26.8H // ...................................................*........................... - // gap // ............................................................................... - // gap // ............................................................................... - mls v25.8H, v21.8H, v7.H[0] // .................................................*............................. - // gap // ............................................................................... - // gap // ............................................................................... - // gap // ............................................................................... - // gap // ............................................................................... - // gap // ............................................................................... 
- // gap // ............................................................................... - // gap // ............................................................................... - sqrdmulh v4.8H, v13.8H, v0.H[1] // ..........................................................*.................... - // gap // ............................................................................... - // gap // ............................................................................... - // gap // ............................................................................... - // gap // ............................................................................... - // gap // ............................................................................... - mul v21.8H, v2.8H, v0.H[0] // .....................................................*......................... - // gap // ............................................................................... - // gap // ............................................................................... - // gap // ............................................................................... - // gap // ............................................................................... - // gap // ............................................................................... - sqrdmulh v2.8H, v2.8H, v0.H[1] // .......................................................*....................... - // gap // ............................................................................... - // gap // ............................................................................... - // gap // ............................................................................... - // gap // ............................................................................... - // gap // ............................................................................... 
- mul v20.8H, v13.8H, v0.H[0] // .........................................................*..................... - // gap // ............................................................................... - // gap // ............................................................................... - // gap // ............................................................................... - mls v20.8H, v4.8H, v7.H[0] // .............................................................*................. - str q25, [x0, #256] // ......................................................*........................ - // gap // ............................................................................... - // gap // ............................................................................... - // gap // ............................................................................... - // gap // ............................................................................... - // gap // ............................................................................... - str q11, [x0, #320] // ........................................................*...................... - mls v21.8H, v2.8H, v7.H[0] // ...........................................................*................... - // gap // ............................................................................... - // gap // ............................................................................... - // gap // ............................................................................... - // gap // ............................................................................... - // gap // ............................................................................... - mul v2.8H, v23.8H, v29.8H // ............................................................*.................. - // gap // ............................................................................... 
- // gap // ............................................................................... - // gap // ............................................................................... - sqrdmulh v23.8H, v23.8H, v30.8H // ...............................................................*............... - // gap // ............................................................................... - // gap // ............................................................................... - // gap // ............................................................................... - // gap // ............................................................................... - // gap // ............................................................................... - // gap // ............................................................................... - str q21, [x0, #384] // ..............................................................*................ - mul v21.8H, v16.8H, v29.8H // ................................................................*.............. - // gap // ............................................................................... - // gap // ............................................................................... - // gap // ............................................................................... - // gap // ............................................................................... - // gap // ............................................................................... - sqrdmulh v16.8H, v16.8H, v30.8H // ..................................................................*............ - // gap // ............................................................................... - // gap // ............................................................................... - // gap // ............................................................................... 
- mls v2.8H, v23.8H, v7.H[0] // ...................................................................*........... - // gap // ............................................................................... - // gap // ............................................................................... - // gap // ............................................................................... - // gap // ............................................................................... - // gap // ............................................................................... - mul v23.8H, v17.8H, v29.8H // ....................................................................*.......... - // gap // ............................................................................... - // gap // ............................................................................... - // gap // ............................................................................... - // gap // ............................................................................... - // gap // ............................................................................... - str q20, [x0, #448] // .................................................................*............. - mls v21.8H, v16.8H, v7.H[0] // .....................................................................*......... - // gap // ............................................................................... - str q2, [x0], #(16) // ......................................................................*........ - // gap // ............................................................................... - // gap // ............................................................................... - sqrdmulh v2.8H, v17.8H, v30.8H // .......................................................................*....... - // gap // ............................................................................... 
- // gap // ............................................................................... - // gap // ............................................................................... - // gap // ............................................................................... - // gap // ............................................................................... - // gap // ............................................................................... - // gap // ............................................................................... - sqrdmulh v16.8H, v26.8H, v30.8H // ........................................................................*...... - str q21, [x0, #48] // .........................................................................*..... - // gap // ............................................................................... - // gap // ............................................................................... - // gap // ............................................................................... - // gap // ............................................................................... - mul v21.8H, v26.8H, v29.8H // ..........................................................................*.... - // gap // ............................................................................... - // gap // ............................................................................... - // gap // ............................................................................... - mls v23.8H, v2.8H, v7.H[0] // ...........................................................................*... - // gap // ............................................................................... - // gap // ............................................................................... - // gap // ............................................................................... 
- // gap // ............................................................................... - // gap // ............................................................................... - mls v21.8H, v16.8H, v7.H[0] // ............................................................................*.. - // gap // ............................................................................... - // gap // ............................................................................... - // gap // ............................................................................... - // gap // ............................................................................... - // gap // ............................................................................... - // gap // ............................................................................... - // gap // ............................................................................... - // gap // ............................................................................... - str q23, [x0, #112] // ..............................................................................* - // gap // ............................................................................... - // gap // ............................................................................... - // gap // ............................................................................... - // gap // ............................................................................... - // gap // ............................................................................... - str q21, [x0, #176] // .............................................................................*. - // gap // ............................................................................... - // gap // ............................................................................... + add v19.8H, v2.8H, v21.8H // ....*......................................................................... 
+ ldr q23, [x0, #256] // .....*........................................................................ + sqrdmulh v22.8H, v10.8H, v1.H[1] // .*............................................................................ + ldr q28, [x0, #64] // *............................................................................. + ldr q3, [x0, #0] // ..*........................................................................... + // gap // .............................................................................. + mul v26.8H, v10.8H, v1.H[0] // ...*.......................................................................... + // gap // .............................................................................. + // gap // .............................................................................. + // gap // .............................................................................. + // gap // .............................................................................. + // gap // .............................................................................. + mls v27.8H, v13.8H, v7.H[0] // .......*...................................................................... + add v20.8H, v23.8H, v24.8H // ..........*................................................................... + // gap // .............................................................................. + sub v23.8H, v23.8H, v24.8H // ...........*.................................................................. + // gap // .............................................................................. + // gap // .............................................................................. + mls v26.8H, v22.8H, v7.H[0] // .........*.................................................................... + add v22.8H, v3.8H, v28.8H // ......*....................................................................... 
+ // gap // .............................................................................. + sub v24.8H, v20.8H, v15.8H // .............*................................................................ + // gap // .............................................................................. + // gap // .............................................................................. + add v20.8H, v20.8H, v15.8H // ................*............................................................. + sqrdmulh v11.8H, v23.8H, v1.H[3] // .................*............................................................ + // gap // .............................................................................. + sub v14.8H, v22.8H, v19.8H // ..............*............................................................... + // gap // .............................................................................. + // gap // .............................................................................. + add v19.8H, v22.8H, v19.8H // .....................*........................................................ + mul v23.8H, v23.8H, v1.H[2] // ...................*.......................................................... + // gap // .............................................................................. + sub v22.8H, v3.8H, v28.8H // ........*..................................................................... + // gap // .............................................................................. + // gap // .............................................................................. + mul v28.8H, v24.8H, v0.H[4] // ........................*..................................................... + // gap // .............................................................................. + // gap // .............................................................................. 
+ add v3.8H, v19.8H, v20.8H // .........................*.................................................... + // gap // .............................................................................. + // gap // .............................................................................. + sub v19.8H, v19.8H, v20.8H // ...........................*.................................................. + mls v23.8H, v11.8H, v7.H[0] // ....................*......................................................... + // gap // .............................................................................. + // gap // .............................................................................. + // gap // .............................................................................. + // gap // .............................................................................. + sqrdmulh v20.8H, v22.8H, v0.H[7] // ............*................................................................. + // gap // .............................................................................. + // gap // .............................................................................. + // gap // .............................................................................. + // gap // .............................................................................. + // gap // .............................................................................. + mul v22.8H, v22.8H, v0.H[6] // ...............*.............................................................. + // gap // .............................................................................. + // gap // .............................................................................. + add v11.8H, v23.8H, v27.8H // ............................*................................................. + // gap // .............................................................................. 
+ // gap // .............................................................................. + sub v23.8H, v23.8H, v27.8H // ..............................*............................................... + sqrdmulh v27.8H, v24.8H, v0.H[5] // ..........................*................................................... + // gap // .............................................................................. + // gap // .............................................................................. + // gap // .............................................................................. + // gap // .............................................................................. + mls v22.8H, v20.8H, v7.H[0] // ..................*........................................................... + // gap // .............................................................................. + // gap // .............................................................................. + // gap // .............................................................................. + // gap // .............................................................................. + // gap // .............................................................................. + mul v24.8H, v14.8H, v0.H[2] // .............................*................................................ + // gap // .............................................................................. + // gap // .............................................................................. + // gap // .............................................................................. + // gap // .............................................................................. + // gap // .............................................................................. + mul v20.8H, v19.8H, v0.H[0] // ...............................*.............................................. 
+ // gap // .............................................................................. + // gap // .............................................................................. + sub v25.8H, v22.8H, v26.8H // ......................*....................................................... + // gap // .............................................................................. + // gap // .............................................................................. + add v22.8H, v22.8H, v26.8H // .......................*...................................................... + sqrdmulh v26.8H, v14.8H, v0.H[3] // ........................................*..................................... + // gap // .............................................................................. + // gap // .............................................................................. + // gap // .............................................................................. + // gap // .............................................................................. + sqrdmulh v14.8H, v23.8H, v0.H[5] // ..................................*........................................... + // gap // .............................................................................. + // gap // .............................................................................. + sub v5.8H, v22.8H, v11.8H // ................................*............................................. + // gap // .............................................................................. + // gap // .............................................................................. + sqrdmulh v19.8H, v19.8H, v0.H[1] // ...................................*.......................................... + add v22.8H, v22.8H, v11.8H // .................................*............................................ + // gap // .............................................................................. 
+ // gap // .............................................................................. + // gap // .............................................................................. + // gap // .............................................................................. + mul v11.8H, v3.8H, v29.8H // ....................................*......................................... + // gap // .............................................................................. + // gap // .............................................................................. + // gap // .............................................................................. + // gap // .............................................................................. + // gap // .............................................................................. + sqrdmulh v10.8H, v25.8H, v0.H[3] // .....................................*........................................ + // gap // .............................................................................. + // gap // .............................................................................. + // gap // .............................................................................. + // gap // .............................................................................. + // gap // .............................................................................. + mls v28.8H, v27.8H, v7.H[0] // ..........................................*................................... + // gap // .............................................................................. + // gap // .............................................................................. + // gap // .............................................................................. + // gap // .............................................................................. + // gap // .............................................................................. 
+ sqrdmulh v27.8H, v3.8H, v30.8H // .............................................*................................ + // gap // .............................................................................. + // gap // .............................................................................. + // gap // .............................................................................. + // gap // .............................................................................. + // gap // .............................................................................. + mls v20.8H, v19.8H, v7.H[0] // ......................................*....................................... + // gap // .............................................................................. + // gap // .............................................................................. + // gap // .............................................................................. + // gap // .............................................................................. + // gap // .............................................................................. + mul v19.8H, v25.8H, v0.H[2] // .......................................*...................................... + // gap // .............................................................................. + // gap // .............................................................................. + // gap // .............................................................................. + // gap // .............................................................................. + // gap // .............................................................................. + mul v23.8H, v23.8H, v0.H[4] // ...........................................*.................................. + // gap // .............................................................................. + // gap // .............................................................................. 
+ str q20, [x0, #256] // .........................................*.................................... + // gap // .............................................................................. + // gap // .............................................................................. + mls v24.8H, v26.8H, v7.H[0] // ............................................*................................. + // gap // .............................................................................. + // gap // .............................................................................. + // gap // .............................................................................. + // gap // .............................................................................. + // gap // .............................................................................. + mul v3.8H, v5.8H, v0.H[0] // ..............................................*............................... + // gap // .............................................................................. + // gap // .............................................................................. + // gap // .............................................................................. + // gap // .............................................................................. + // gap // .............................................................................. + mul v26.8H, v22.8H, v29.8H // .................................................*............................ + // gap // .............................................................................. + // gap // .............................................................................. + sub v20.8H, v24.8H, v28.8H // ...............................................*.............................. + // gap // .............................................................................. 
+ // gap // .............................................................................. + mls v19.8H, v10.8H, v7.H[0] // ....................................................*......................... + add v28.8H, v24.8H, v28.8H // ................................................*............................. + // gap // .............................................................................. + // gap // .............................................................................. + // gap // .............................................................................. + // gap // .............................................................................. + sqrdmulh v24.8H, v20.8H, v0.H[1] // ..................................................*........................... + // gap // .............................................................................. + // gap // .............................................................................. + // gap // .............................................................................. + // gap // .............................................................................. + // gap // .............................................................................. + mul v20.8H, v20.8H, v0.H[0] // ...................................................*.......................... + // gap // .............................................................................. + // gap // .............................................................................. + // gap // .............................................................................. + // gap // .............................................................................. + // gap // .............................................................................. + mls v23.8H, v14.8H, v7.H[0] // .....................................................*........................ 
+ // gap // .............................................................................. + // gap // .............................................................................. + // gap // .............................................................................. + // gap // .............................................................................. + // gap // .............................................................................. + mls v11.8H, v27.8H, v7.H[0] // ......................................................*....................... + // gap // .............................................................................. + // gap // .............................................................................. + // gap // .............................................................................. + // gap // .............................................................................. + // gap // .............................................................................. + sqrdmulh v27.8H, v5.8H, v0.H[1] // .......................................................*...................... + // gap // .............................................................................. + // gap // .............................................................................. + sub v14.8H, v19.8H, v23.8H // ........................................................*..................... + // gap // .............................................................................. + // gap // .............................................................................. + add v19.8H, v19.8H, v23.8H // ..........................................................*................... + mls v20.8H, v24.8H, v7.H[0] // .........................................................*.................... + // gap // .............................................................................. 
+ str q11, [x0], #(16) // ...........................................................*.................. + // gap // .............................................................................. + // gap // .............................................................................. + mul v23.8H, v28.8H, v29.8H // ............................................................*................. + // gap // .............................................................................. + // gap // .............................................................................. + // gap // .............................................................................. + // gap // .............................................................................. + // gap // .............................................................................. + mls v3.8H, v27.8H, v7.H[0] // .............................................................*................ + // gap // .............................................................................. + // gap // .............................................................................. + str q20, [x0, #368] // ..............................................................*............... + // gap // .............................................................................. + // gap // .............................................................................. + sqrdmulh v28.8H, v28.8H, v30.8H // ...............................................................*.............. + // gap // .............................................................................. + // gap // .............................................................................. + // gap // .............................................................................. + // gap // .............................................................................. 
+ // gap // .............................................................................. + mul v27.8H, v19.8H, v29.8H // ................................................................*............. + // gap // .............................................................................. + // gap // .............................................................................. + str q3, [x0, #304] // .................................................................*............ + // gap // .............................................................................. + // gap // .............................................................................. + mul v24.8H, v14.8H, v0.H[0] // ..................................................................*........... + // gap // .............................................................................. + // gap // .............................................................................. + // gap // .............................................................................. + // gap // .............................................................................. + // gap // .............................................................................. + sqrdmulh v3.8H, v14.8H, v0.H[1] // ...................................................................*.......... + // gap // .............................................................................. + // gap // .............................................................................. + // gap // .............................................................................. + // gap // .............................................................................. + // gap // .............................................................................. + sqrdmulh v22.8H, v22.8H, v30.8H // ....................................................................*......... 
+ // gap // .............................................................................. + // gap // .............................................................................. + // gap // .............................................................................. + // gap // .............................................................................. + // gap // .............................................................................. + sqrdmulh v19.8H, v19.8H, v30.8H // .....................................................................*........ + // gap // .............................................................................. + // gap // .............................................................................. + // gap // .............................................................................. + // gap // .............................................................................. + // gap // .............................................................................. + mls v24.8H, v3.8H, v7.H[0] // ......................................................................*....... + // gap // .............................................................................. + // gap // .............................................................................. + // gap // .............................................................................. + // gap // .............................................................................. + // gap // .............................................................................. + mls v26.8H, v22.8H, v7.H[0] // .......................................................................*...... + // gap // .............................................................................. + // gap // .............................................................................. + // gap // .............................................................................. 
+ // gap // .............................................................................. + // gap // .............................................................................. + mls v23.8H, v28.8H, v7.H[0] // ........................................................................*..... + // gap // .............................................................................. + // gap // .............................................................................. + str q24, [x0, #432] // .........................................................................*.... + // gap // .............................................................................. + // gap // .............................................................................. + mls v27.8H, v19.8H, v7.H[0] // ..........................................................................*... + // gap // .............................................................................. + // gap // .............................................................................. + str q26, [x0, #48] // ...........................................................................*.. + // gap // .............................................................................. + // gap // .............................................................................. + // gap // .............................................................................. + // gap // .............................................................................. + // gap // .............................................................................. + str q23, [x0, #112] // ............................................................................*. + // gap // .............................................................................. + // gap // .............................................................................. + // gap // .............................................................................. 
+ // gap // .............................................................................. + // gap // .............................................................................. + str q27, [x0, #176] // .............................................................................* + // gap // .............................................................................. + // gap // .............................................................................. // original source code - // sub v16.8H, v16.8H, v4.8H // *.............................................................................. - // sub v4.8H, v23.8H, v22.8H // ....*.......................................................................... - // sqrdmulh v2.8H, v2.8H, v1.H[5] // .*............................................................................. - // add v23.8H, v23.8H, v22.8H // ......*........................................................................ - // mul v19.8H, v16.8H, v0.H[6] // .....*......................................................................... - // add v20.8H, v6.8H, v3.8H // ...*........................................................................... - // add v17.8H, v17.8H, v25.8H // ..*............................................................................ - // sqrdmulh v27.8H, v16.8H, v0.H[7] // ........*...................................................................... - // sub v25.8H, v23.8H, v20.8H // ..........*.................................................................... - // add v24.8H, v23.8H, v20.8H // ...........*................................................................... - // mul v16.8H, v4.8H, v1.H[2] // ................*.............................................................. - // add v20.8H, v28.8H, v17.8H // .........*..................................................................... - // sqrdmulh v21.8H, v4.8H, v1.H[3] // ..............*................................................................ 
- // mls v15.8H, v2.8H, v7.H[0] // ............*.................................................................. - // sub v2.8H, v28.8H, v17.8H // .......*....................................................................... - // mls v19.8H, v27.8H, v7.H[0] // ...................*........................................................... - // mls v16.8H, v21.8H, v7.H[0] // .................*............................................................. - // mul v4.8H, v2.8H, v0.H[2] // ........................*...................................................... - // sub v26.8H, v19.8H, v13.8H // .........................*..................................................... - // add v9.8H, v19.8H, v13.8H // .......................*....................................................... - // mul v18.8H, v25.8H, v0.H[4] // .................................*............................................. - // sub v10.8H, v16.8H, v15.8H // ...........................*................................................... - // sqrdmulh v31.8H, v25.8H, v0.H[5] // ....................................*.......................................... - // add v16.8H, v16.8H, v15.8H // ............................*.................................................. - // sqrdmulh v2.8H, v2.8H, v0.H[3] // .....................*......................................................... - // sub v25.8H, v9.8H, v16.8H // ...............................*............................................... - // add v16.8H, v9.8H, v16.8H // ................................*.............................................. - // sqdmulh v21.8H, v20.8H, v7.H[1] // .............*................................................................. - // sqdmulh v23.8H, v24.8H, v7.H[1] // ...............*............................................................... - // mls v4.8H, v2.8H, v7.H[0] // .............................*................................................. 
- // srshr v2.8H, v21.8H, #11 // ....................*.......................................................... - // mul v17.8H, v26.8H, v0.H[2] // ..............................*................................................ - // srshr v23.8H, v23.8H, #11 // ..................*............................................................ - // mls v20.8H, v2.8H, v7.H[0] // ......................*........................................................ - // mls v24.8H, v23.8H, v7.H[0] // ..........................*.................................................... - // sqrdmulh v22.8H, v26.8H, v0.H[3] // .....................................*......................................... - // mls v18.8H, v31.8H, v7.H[0] // .........................................*..................................... - // sub v2.8H, v20.8H, v24.8H // ..................................*............................................ - // add v5.8H, v20.8H, v24.8H // ...................................*........................................... - // mul v21.8H, v10.8H, v0.H[4] // ......................................*........................................ - // mls v17.8H, v22.8H, v7.H[0] // ........................................*...................................... - // sub v20.8H, v4.8H, v18.8H // ............................................*.................................. - // sqrdmulh v31.8H, v10.8H, v0.H[5] // .......................................*....................................... - // add v26.8H, v4.8H, v18.8H // ..............................................*................................ - // mul v11.8H, v2.8H, v0.H[0] // .............................................*................................. - // sqrdmulh v2.8H, v2.8H, v0.H[1] // ...............................................*............................... - // mls v21.8H, v31.8H, v7.H[0] // ..........................................*.................................... 
- // sqrdmulh v4.8H, v25.8H, v0.H[1] // ...........................................*................................... - // mul v25.8H, v25.8H, v0.H[0] // ................................................*.............................. - // mls v11.8H, v2.8H, v7.H[0] // ....................................................*.......................... - // sub v2.8H, v17.8H, v21.8H // ..................................................*............................ - // add v21.8H, v17.8H, v21.8H // ...................................................*........................... - // mls v25.8H, v4.8H, v7.H[0] // .................................................*............................. - // mul v17.8H, v20.8H, v0.H[0] // ......................................................*........................ - // str q11, [x0, #256] // ..........................................................*.................... - // sqrdmulh v10.8H, v20.8H, v0.H[1] // .......................................................*....................... - // str q25, [x0, #320] // ...........................................................*................... - // mul v4.8H, v2.8H, v0.H[0] // ........................................................*...................... - // sqrdmulh v2.8H, v2.8H, v0.H[1] // .....................................................*......................... - // mls v17.8H, v10.8H, v7.H[0] // ............................................................*.................. - // mul v20.8H, v5.8H, v29.8H // .............................................................*................. - // mls v4.8H, v2.8H, v7.H[0] // .........................................................*..................... - // str q17, [x0, #384] // ...............................................................*............... - // sqrdmulh v28.8H, v5.8H, v30.8H // ..............................................................*................ 
- // mul v2.8H, v16.8H, v29.8H // ................................................................*.............. - // str q4, [x0, #448] // ....................................................................*.......... - // sqrdmulh v16.8H, v16.8H, v30.8H // .................................................................*............. - // mls v20.8H, v28.8H, v7.H[0] // ..................................................................*............ - // mul v23.8H, v26.8H, v29.8H // ...................................................................*........... - // mls v2.8H, v16.8H, v7.H[0] // .....................................................................*......... - // str q20, [x0], #(16) // ......................................................................*........ - // sqrdmulh v26.8H, v26.8H, v30.8H // .......................................................................*....... - // sqrdmulh v16.8H, v21.8H, v30.8H // ........................................................................*...... - // str q2, [x0, #48] // .........................................................................*..... - // mul v21.8H, v21.8H, v29.8H // ..........................................................................*.... - // mls v23.8H, v26.8H, v7.H[0] // ...........................................................................*... - // mls v21.8H, v16.8H, v7.H[0] // ............................................................................*.. - // str q21, [x0, #176] // ..............................................................................* - // str q23, [x0, #112] // .............................................................................*. + // ldr q28, [x0, #64] // ...*.......................................................................... + // sqrdmulh v9.8H, v10.8H, v1.H[1] // ..*........................................................................... 
+ // ldr q23, [x0, #0] // ....*......................................................................... + // mul v3.8H, v10.8H, v1.H[0] // .....*........................................................................ + // add v8.8H, v2.8H, v21.8H // *............................................................................. + // ldr q22, [x0, #256] // .*............................................................................ + // add v25.8H, v23.8H, v28.8H // ..........*................................................................... + // mls v27.8H, v13.8H, v7.H[0] // ......*....................................................................... + // sub v20.8H, v23.8H, v28.8H // .................*............................................................ + // mls v3.8H, v9.8H, v7.H[0] // .........*.................................................................... + // add v23.8H, v22.8H, v24.8H // .......*...................................................................... + // sub v28.8H, v22.8H, v24.8H // ........*..................................................................... + // sqrdmulh v17.8H, v20.8H, v0.H[7] // ......................*....................................................... + // sub v19.8H, v23.8H, v15.8H // ...........*.................................................................. + // sub v12.8H, v25.8H, v8.8H // ..............*............................................................... + // mul v21.8H, v20.8H, v0.H[6] // .......................*...................................................... + // add v31.8H, v23.8H, v15.8H // ............*................................................................. + // sqrdmulh v23.8H, v28.8H, v1.H[3] // .............*................................................................ + // mls v21.8H, v17.8H, v7.H[0] // ...........................*.................................................. 
+ // mul v14.8H, v28.8H, v1.H[2] // ................*............................................................. + // mls v14.8H, v23.8H, v7.H[0] // .....................*........................................................ + // add v23.8H, v25.8H, v8.8H // ...............*.............................................................. + // sub v28.8H, v21.8H, v3.8H // ..............................*............................................... + // add v3.8H, v21.8H, v3.8H // ...............................*.............................................. + // mul v21.8H, v19.8H, v0.H[4] // ..................*........................................................... + // add v24.8H, v23.8H, v31.8H // ...................*.......................................................... + // sqrdmulh v13.8H, v19.8H, v0.H[5] // ..........................*................................................... + // sub v23.8H, v23.8H, v31.8H // ....................*......................................................... + // add v22.8H, v14.8H, v27.8H // ........................*..................................................... + // mul v6.8H, v12.8H, v0.H[2] // ............................*................................................. + // sub v17.8H, v14.8H, v27.8H // .........................*.................................................... + // mul v20.8H, v23.8H, v0.H[0] // .............................*................................................ + // sub v10.8H, v3.8H, v22.8H // ..................................*........................................... + // add v9.8H, v3.8H, v22.8H // ....................................*......................................... + // sqrdmulh v14.8H, v17.8H, v0.H[5] // .................................*............................................ + // sqrdmulh v19.8H, v23.8H, v0.H[1] // ...................................*.......................................... 
+ // mul v26.8H, v24.8H, v29.8H // .....................................*........................................ + // sqrdmulh v31.8H, v28.8H, v0.H[3] // ......................................*....................................... + // mls v20.8H, v19.8H, v7.H[0] // .........................................*.................................... + // mul v4.8H, v28.8H, v0.H[2] // ..........................................*................................... + // sqrdmulh v25.8H, v12.8H, v0.H[3] // ................................*............................................. + // str q20, [x0, #256] // ............................................*................................. + // mls v21.8H, v13.8H, v7.H[0] // .......................................*...................................... + // mul v22.8H, v17.8H, v0.H[4] // ...........................................*.................................. + // mls v6.8H, v25.8H, v7.H[0] // .............................................*................................ + // sqrdmulh v19.8H, v24.8H, v30.8H // ........................................*..................................... + // mul v11.8H, v10.8H, v0.H[0] // ..............................................*............................... + // sub v8.8H, v6.8H, v21.8H // ................................................*............................. + // add v25.8H, v6.8H, v21.8H // ..................................................*........................... + // mul v24.8H, v9.8H, v29.8H // ...............................................*.............................. + // sqrdmulh v3.8H, v8.8H, v0.H[1] // ...................................................*.......................... + // mul v20.8H, v8.8H, v0.H[0] // ....................................................*......................... + // mls v4.8H, v31.8H, v7.H[0] // .................................................*............................ 
+ // mls v22.8H, v14.8H, v7.H[0] // .....................................................*........................ + // mls v26.8H, v19.8H, v7.H[0] // ......................................................*....................... + // sqrdmulh v19.8H, v10.8H, v0.H[1] // .......................................................*...................... + // sub v23.8H, v4.8H, v22.8H // ........................................................*..................... + // mls v20.8H, v3.8H, v7.H[0] // ..........................................................*................... + // add v14.8H, v4.8H, v22.8H // .........................................................*.................... + // str q26, [x0], #(16) // ...........................................................*.................. + // mul v28.8H, v25.8H, v29.8H // ............................................................*................. + // mls v11.8H, v19.8H, v7.H[0] // .............................................................*................ + // str q20, [x0, #368] // ..............................................................*............... + // sqrdmulh v12.8H, v25.8H, v30.8H // ...............................................................*.............. + // mul v22.8H, v14.8H, v29.8H // ................................................................*............. + // str q11, [x0, #304] // .................................................................*............ + // mul v19.8H, v23.8H, v0.H[0] // ..................................................................*........... + // sqrdmulh v6.8H, v23.8H, v0.H[1] // ...................................................................*.......... + // sqrdmulh v11.8H, v9.8H, v30.8H // ....................................................................*......... + // sqrdmulh v8.8H, v14.8H, v30.8H // .....................................................................*........ 
+ // mls v19.8H, v6.8H, v7.H[0] // ......................................................................*....... + // mls v24.8H, v11.8H, v7.H[0] // .......................................................................*...... + // mls v28.8H, v12.8H, v7.H[0] // ........................................................................*..... + // str q19, [x0, #432] // .........................................................................*.... + // mls v22.8H, v8.8H, v7.H[0] // ..........................................................................*... + // str q24, [x0, #48] // ...........................................................................*.. + // str q28, [x0, #112] // ............................................................................*. + // str q22, [x0, #176] // .............................................................................* pop_stack diff --git a/tests/ntt_kyber/manual/intt_kyber_123_4567_manual_ld4_opt_m1_firestorm.s b/tests/ntt_kyber/manual/intt_kyber_123_4567_manual_ld4_opt_m1_firestorm.s index a8f3f08..ea0a6c4 100644 --- a/tests/ntt_kyber/manual/intt_kyber_123_4567_manual_ld4_opt_m1_firestorm.s +++ b/tests/ntt_kyber/manual/intt_kyber_123_4567_manual_ld4_opt_m1_firestorm.s @@ -354,23 +354,20 @@ _intt_kyber_123_4567_manual_ld4_opt_m1_firestorm: mov count, #8 .p2align 2 - ldr q0, [x4, #64] // ....*............................................. - ldr q14, [x4, #16] // ........*......................................... - ldr q6, [x4, #80] // ...*.............................................. - ld4 {v16.4S, v17.4S, v18.4S, v19.4S}, [x1] // .*................................................ + ldr q13, [x3], #16 // ...................................*.............. + ldr q28, [x4, #48] // ...*.............................................. + ldr q3, [x4, #16] // ..*............................................... // gap // .................................................. 
// gap // .................................................. // gap // .................................................. // gap // .................................................. - ldr q9, [x4, #48] // *................................................. - ldr q2, [x4], #(6*16) // .....*............................................ - ldr q3, [x3], #16 // ...................................*.............. + ld4 {v14.4S, v15.4S, v16.4S, v17.4S}, [x1] // *................................................. + ldr q12, [x4, #80] // ....*............................................. // gap // .................................................. // gap // .................................................. // gap // .................................................. // gap // .................................................. // gap // .................................................. - ldr q11, [x4, #-64] // ..*............................................... // gap // .................................................. // gap // .................................................. // gap // .................................................. @@ -395,6 +392,8 @@ _intt_kyber_123_4567_manual_ld4_opt_m1_firestorm: // gap // .................................................. // gap // .................................................. // gap // .................................................. + ldr q22, [x4, #32] // .....*............................................ + ldr q26, [x4, #64] // .*................................................ // gap // .................................................. // gap // .................................................. // gap // .................................................. @@ -402,11 +401,11 @@ _intt_kyber_123_4567_manual_ld4_opt_m1_firestorm: // gap // .................................................. // gap // .................................................. // gap // .................................................. 
- add v22.8H, v18.8H, v19.8H // .......*.......................................... - sub v15.8H, v18.8H, v19.8H // ......*........................................... - add v31.8H, v16.8H, v17.8H // .........*........................................ - sub v5.8H, v16.8H, v17.8H // ..........*....................................... // gap // .................................................. + sub v8.8H, v14.8H, v15.8H // .........*........................................ + add v0.8H, v14.8H, v15.8H // ..........*....................................... + sub v23.8H, v16.8H, v17.8H // .......*.......................................... + add v27.8H, v16.8H, v17.8H // ........*......................................... // gap // .................................................. // gap // .................................................. // gap // .................................................. @@ -418,15 +417,17 @@ _intt_kyber_123_4567_manual_ld4_opt_m1_firestorm: // gap // .................................................. // gap // .................................................. // gap // .................................................. - mul v18.8H, v15.8H, v0.8H // ...........*...................................... - sqrdmulh v21.8H, v15.8H, v6.8H // ..............*................................... - mul v27.8H, v5.8H, v11.8H // .............*.................................... - sqrdmulh v11.8H, v5.8H, v9.8H // ............*..................................... + ldr q24, [x4], #(6*16) // ......*........................................... + sqrdmulh v19.8H, v8.8H, v28.8H // .............*.................................... + sqrdmulh v10.8H, v23.8H, v12.8H // ...........*...................................... + mul v25.8H, v23.8H, v26.8H // ............*..................................... + mul v21.8H, v8.8H, v22.8H // ..............*................................... // gap // .................................................. 
// gap // .................................................. // gap // .................................................. // gap // .................................................. - sub v16.8H, v31.8H, v22.8H // ...............*.................................. + add v6.8H, v0.8H, v27.8H // ................*................................. + sub v9.8H, v0.8H, v27.8H // ...............*.................................. // gap // .................................................. // gap // .................................................. // gap // .................................................. @@ -434,7 +435,6 @@ _intt_kyber_123_4567_manual_ld4_opt_m1_firestorm: // gap // .................................................. // gap // .................................................. // gap // .................................................. - add v0.8H, v31.8H, v22.8H // ....................*............................. // gap // .................................................. // gap // .................................................. // gap // .................................................. @@ -442,15 +442,15 @@ _intt_kyber_123_4567_manual_ld4_opt_m1_firestorm: // gap // .................................................. // gap // .................................................. // gap // .................................................. - mls v27.8H, v11.8H, v7.H[0] // ...................*.............................. - mls v18.8H, v21.8H, v7.H[0] // ..................*............................... + mul v22.8H, v9.8H, v24.8H // .................*................................ + sqrdmulh v30.8H, v9.8H, v3.8H // ..................*............................... + mls v25.8H, v10.8H, v7.H[0] // ....................*............................. + mls v21.8H, v19.8H, v7.H[0] // ...................*.............................. // gap // .................................................. 
// gap // .................................................. // gap // .................................................. // gap // .................................................. // gap // .................................................. - mul v8.8H, v16.8H, v2.8H // .................*................................ - sqrdmulh v9.8H, v16.8H, v14.8H // ................*................................. // gap // .................................................. // gap // .................................................. // gap // .................................................. @@ -466,15 +466,15 @@ _intt_kyber_123_4567_manual_ld4_opt_m1_firestorm: // gap // .................................................. // gap // .................................................. // gap // .................................................. - add v25.8H, v27.8H, v18.8H // ......................*........................... - sub v28.8H, v27.8H, v18.8H // .....................*............................ + add v28.8H, v21.8H, v25.8H // ......................*........................... + sub v17.8H, v21.8H, v25.8H // .....................*............................ + mls v22.8H, v30.8H, v7.H[0] // .......................*.......................... // gap // .................................................. // gap // .................................................. // gap // .................................................. // gap // .................................................. // gap // .................................................. // gap // .................................................. - mls v8.8H, v9.8H, v7.H[0] // .......................*.......................... // gap // .................................................. // gap // .................................................. // gap // .................................................. 
@@ -482,10 +482,10 @@ _intt_kyber_123_4567_manual_ld4_opt_m1_firestorm: // gap // .................................................. // gap // .................................................. // gap // .................................................. - trn1 v22.4S, v0.4S, v25.4S // .........................*........................ - trn2 v20.4S, v0.4S, v25.4S // ........................*......................... - sqrdmulh v18.8H, v28.8H, v14.8H // ...........................*...................... - mul v29.8H, v28.8H, v2.8H // ..........................*....................... + trn2 v23.4S, v6.4S, v28.4S // ........................*......................... + trn1 v0.4S, v6.4S, v28.4S // ...........................*...................... + sqrdmulh v19.8H, v17.8H, v3.8H // ..........................*....................... + mul v29.8H, v17.8H, v24.8H // .........................*........................ // gap // .................................................. // gap // .................................................. // gap // .................................................. @@ -506,7 +506,7 @@ _intt_kyber_123_4567_manual_ld4_opt_m1_firestorm: // gap // .................................................. // gap // .................................................. // gap // .................................................. - mls v29.8H, v18.8H, v7.H[0] // ............................*..................... + mls v29.8H, v19.8H, v7.H[0] // ............................*..................... // gap // .................................................. // gap // .................................................. // gap // .................................................. @@ -530,8 +530,8 @@ _intt_kyber_123_4567_manual_ld4_opt_m1_firestorm: // gap // .................................................. // gap // .................................................. // gap // .................................................. 
- trn1 v4.4S, v8.4S, v29.4S // ..............................*................... - trn2 v29.4S, v8.4S, v29.4S // .............................*.................... + trn1 v8.4S, v22.4S, v29.4S // ..............................*................... + trn2 v31.4S, v22.4S, v29.4S // .............................*.................... // gap // .................................................. // gap // .................................................. // gap // .................................................. @@ -546,14 +546,14 @@ _intt_kyber_123_4567_manual_ld4_opt_m1_firestorm: // gap // .................................................. // gap // .................................................. // gap // .................................................. - trn2 v8.2D, v22.2D, v4.2D // ................................*................. - trn1 v1.2D, v22.2D, v4.2D // ...............................*.................. - trn2 v12.2D, v20.2D, v29.2D // ..................................*............... - trn1 v19.2D, v20.2D, v29.2D // .................................*................ + trn2 v26.2D, v0.2D, v8.2D // .................................*................ + trn1 v8.2D, v0.2D, v8.2D // ..................................*............... + trn2 v5.2D, v23.2D, v31.2D // ...............................*.................. // gap // .................................................. // gap // .................................................. // gap // .................................................. // gap // .................................................. + trn1 v22.2D, v23.2D, v31.2D // ................................*................. // gap // .................................................. // gap // .................................................. // gap // .................................................. @@ -562,10 +562,9 @@ _intt_kyber_123_4567_manual_ld4_opt_m1_firestorm: // gap // .................................................. 
// gap // .................................................. // gap // .................................................. - sub v6.8H, v8.8H, v12.8H // ......................................*........... - add v26.8H, v8.8H, v12.8H // .....................................*............ - sub v23.8H, v1.8H, v19.8H // ...............................................*.. - add v17.8H, v1.8H, v19.8H // ....................................*............. + add v19.8H, v8.8H, v22.8H // .....................................*............ + add v28.8H, v26.8H, v5.8H // ....................................*............. + sub v23.8H, v8.8H, v22.8H // ........................................*......... // gap // .................................................. // gap // .................................................. // gap // .................................................. @@ -578,15 +577,15 @@ _intt_kyber_123_4567_manual_ld4_opt_m1_firestorm: // gap // .................................................. // gap // .................................................. // gap // .................................................. - mul v27.8H, v6.8H, v3.H[4] // .........................................*........ - sqrdmulh v5.8H, v6.8H, v3.H[5] // ..........................................*....... // gap // .................................................. + mul v31.8H, v23.8H, v13.H[2] // ...........................................*...... + sqrdmulh v10.8H, v23.8H, v13.H[3] // ..............................................*... // gap // .................................................. // gap // .................................................. // gap // .................................................. - sqdmulh v24.8H, v26.8H, v7.H[1] // ........................................*......... - sqdmulh v8.8H, v17.8H, v7.H[1] // .......................................*.......... // gap // .................................................. 
+ sqdmulh v22.8H, v28.8H, v7.H[1] // ......................................*........... + sqdmulh v23.8H, v19.8H, v7.H[1] // .......................................*.......... // gap // .................................................. // gap // .................................................. // gap // .................................................. @@ -602,9 +601,9 @@ _intt_kyber_123_4567_manual_ld4_opt_m1_firestorm: // gap // .................................................. // gap // .................................................. // gap // .................................................. - srshr v14.8H, v8.8H, #11 // ...........................................*...... - srshr v6.8H, v24.8H, #11 // ............................................*..... // gap // .................................................. + srshr v21.8H, v23.8H, #11 // .........................................*........ + srshr v24.8H, v22.8H, #11 // ..........................................*....... // gap // .................................................. // gap // .................................................. // gap // .................................................. @@ -626,9 +625,9 @@ _intt_kyber_123_4567_manual_ld4_opt_m1_firestorm: // gap // .................................................. // gap // .................................................. // gap // .................................................. - mls v26.8H, v6.8H, v7.H[0] // ..............................................*... - mls v17.8H, v14.8H, v7.H[0] // .............................................*.... // gap // .................................................. + mls v19.8H, v21.8H, v7.H[0] // ............................................*..... + mls v28.8H, v24.8H, v7.H[0] // .............................................*.... // gap // .................................................. // gap // .................................................. 
// gap // .................................................. @@ -650,8 +649,9 @@ _intt_kyber_123_4567_manual_ld4_opt_m1_firestorm: // gap // .................................................. // gap // .................................................. // gap // .................................................. - add v11.8H, v17.8H, v26.8H // ................................................*. // gap // .................................................. + sub v0.8H, v19.8H, v28.8H // ................................................*. + add v9.8H, v19.8H, v28.8H // ...............................................*.. // gap // .................................................. // gap // .................................................. // gap // .................................................. @@ -666,7 +666,7 @@ _intt_kyber_123_4567_manual_ld4_opt_m1_firestorm: // gap // .................................................. // gap // .................................................. // gap // .................................................. - str q11, [x1], #(64) // .................................................* + str q9, [x1], #(64) // .................................................* // gap // .................................................. // gap // .................................................. // gap // .................................................. @@ -676,588 +676,688 @@ _intt_kyber_123_4567_manual_ld4_opt_m1_firestorm: // gap // .................................................. // original source code - // ldr q22, [x4, #48] // ....*............................................. - // ld4 {v11.4S, v12.4S, v13.4S, v14.4S}, [x1] // ...*.............................................. - // ldr q6, [x4, #32] // .......*.......................................... - // ldr q25, [x4, #80] // ..*............................................... - // ldr q19, [x4, #64] // *................................................. 
- // ldr q15, [x4], #(6*16) // .....*............................................ - // sub v8.8H, v13.8H, v14.8H // .........*........................................ - // add v17.8H, v13.8H, v14.8H // ........*......................................... - // ldr q18, [x4, #-80] // .*................................................ - // add v16.8H, v11.8H, v12.8H // ..........*....................................... - // sub v1.8H, v11.8H, v12.8H // ...........*...................................... - // mul v21.8H, v8.8H, v19.8H // ............*..................................... - // sqrdmulh v0.8H, v1.8H, v22.8H // ...............*.................................. - // mul v13.8H, v1.8H, v6.8H // ..............*................................... - // sqrdmulh v23.8H, v8.8H, v25.8H // .............*.................................... - // sub v24.8H, v16.8H, v17.8H // ................*................................. - // sqrdmulh v6.8H, v24.8H, v18.8H // .....................*............................ - // mul v1.8H, v24.8H, v15.8H // ....................*............................. - // mls v21.8H, v23.8H, v7.H[0] // ...................*.............................. - // mls v13.8H, v0.8H, v7.H[0] // ..................*............................... - // add v0.8H, v16.8H, v17.8H // .................*................................ - // sub v9.8H, v13.8H, v21.8H // .......................*.......................... - // add v14.8H, v13.8H, v21.8H // ......................*........................... - // mls v1.8H, v6.8H, v7.H[0] // ........................*......................... - // trn2 v11.4S, v0.4S, v14.4S // ..........................*....................... - // trn1 v2.4S, v0.4S, v14.4S // .........................*........................ - // mul v25.8H, v9.8H, v15.8H // ............................*..................... - // sqrdmulh v23.8H, v9.8H, v18.8H // ...........................*...................... 
- // mls v25.8H, v23.8H, v7.H[0] // .............................*.................... - // trn2 v3.4S, v1.4S, v25.4S // ...............................*.................. - // trn1 v6.4S, v1.4S, v25.4S // ..............................*................... - // trn1 v4.2D, v2.2D, v6.2D // .................................*................ - // trn2 v13.2D, v2.2D, v6.2D // ................................*................. - // trn1 v30.2D, v11.2D, v3.2D // ...................................*.............. - // trn2 v25.2D, v11.2D, v3.2D // ..................................*............... - // ldr q3, [x3], #16 // ......*........................................... - // add v17.8H, v4.8H, v30.8H // .......................................*.......... - // add v26.8H, v13.8H, v25.8H // .....................................*............ - // sub v16.8H, v13.8H, v25.8H // ....................................*............. - // sqdmulh v23.8H, v17.8H, v7.H[1] // ...........................................*...... - // sqdmulh v20.8H, v26.8H, v7.H[1] // ..........................................*....... - // mul v27.8H, v16.8H, v3.H[4] // ........................................*......... - // sqrdmulh v5.8H, v16.8H, v3.H[5] // .........................................*........ - // srshr v23.8H, v23.8H, #11 // ............................................*..... - // srshr v16.8H, v20.8H, #11 // .............................................*.... - // mls v17.8H, v23.8H, v7.H[0] // ...............................................*.. - // mls v26.8H, v16.8H, v7.H[0] // ..............................................*... - // sub v23.8H, v4.8H, v30.8H // ......................................*........... - // add v2.8H, v17.8H, v26.8H // ................................................*. - // str q2, [x1], #(64) // .................................................* + // ld4 {v14.4S, v15.4S, v16.4S, v17.4S}, [x1] // ...*.............................................. 
+ // ldr q28, [x4, #64] // ......*........................................... + // ldr q6, [x4, #16] // ..*............................................... + // ldr q24, [x4, #48] // .*................................................ + // ldr q25, [x4, #80] // ....*............................................. + // ldr q10, [x4, #32] // .....*............................................ + // ldr q2, [x4], #(6*16) // ...........*...................................... + // sub v5.8H, v16.8H, v17.8H // .........*........................................ + // add v21.8H, v16.8H, v17.8H // ..........*....................................... + // sub v27.8H, v14.8H, v15.8H // .......*.......................................... + // add v16.8H, v14.8H, v15.8H // ........*......................................... + // sqrdmulh v23.8H, v5.8H, v25.8H // .............*.................................... + // mul v11.8H, v5.8H, v28.8H // ..............*................................... + // sqrdmulh v20.8H, v27.8H, v24.8H // ............*..................................... + // mul v9.8H, v27.8H, v10.8H // ...............*.................................. + // sub v25.8H, v16.8H, v21.8H // .................*................................ + // add v18.8H, v16.8H, v21.8H // ................*................................. + // mul v29.8H, v25.8H, v2.8H // ..................*............................... + // sqrdmulh v14.8H, v25.8H, v6.8H // ...................*.............................. + // mls v9.8H, v20.8H, v7.H[0] // .....................*............................ + // mls v11.8H, v23.8H, v7.H[0] // ....................*............................. + // sub v30.8H, v9.8H, v11.8H // .......................*.......................... + // add v5.8H, v9.8H, v11.8H // ......................*........................... + // mls v29.8H, v14.8H, v7.H[0] // ........................*......................... 
+ // trn2 v21.4S, v18.4S, v5.4S // .........................*........................ + // mul v12.8H, v30.8H, v2.8H // ............................*..................... + // sqrdmulh v22.8H, v30.8H, v6.8H // ...........................*...................... + // trn1 v28.4S, v18.4S, v5.4S // ..........................*....................... + // mls v12.8H, v22.8H, v7.H[0] // .............................*.................... + // trn2 v22.4S, v29.4S, v12.4S // ...............................*.................. + // trn1 v25.4S, v29.4S, v12.4S // ..............................*................... + // trn2 v5.2D, v21.2D, v22.2D // ..................................*............... + // trn1 v24.2D, v21.2D, v22.2D // ...................................*.............. + // trn2 v26.2D, v28.2D, v25.2D // ................................*................. + // trn1 v25.2D, v28.2D, v25.2D // .................................*................ + // ldr q13, [x3], #16 // *................................................. + // add v20.8H, v26.8H, v5.8H // .....................................*............ + // add v8.8H, v25.8H, v24.8H // ....................................*............. + // sqdmulh v10.8H, v20.8H, v7.H[1] // .........................................*........ + // sqdmulh v11.8H, v8.8H, v7.H[1] // ..........................................*....... + // sub v23.8H, v25.8H, v24.8H // ......................................*........... + // srshr v27.8H, v11.8H, #11 // ...........................................*...... + // srshr v10.8H, v10.8H, #11 // ............................................*..... + // mul v31.8H, v23.8H, v13.H[2] // .......................................*.......... + // mls v8.8H, v27.8H, v7.H[0] // .............................................*.... + // mls v20.8H, v10.8H, v7.H[0] // ..............................................*... + // sqrdmulh v10.8H, v23.8H, v13.H[3] // ........................................*......... 
+ // add v28.8H, v8.8H, v20.8H // ................................................*. + // sub v0.8H, v8.8H, v20.8H // ...............................................*.. + // str q28, [x1], #(64) // .................................................* sub count, count, #1 layer4567_start: - ldr q22, [x4, #48] // ....e............................................................. - ld4 {v11.4S, v12.4S, v13.4S, v14.4S}, [x1] // e................................................................. - mls v27.8H, v5.8H, v7.H[0] // .............................................*.................... - sqrdmulh v5.8H, v23.8H, v3.H[3] // .......................................*.......................... - // gap // .................................................................. - // gap // .................................................................. - // gap // .................................................................. - sub v2.8H, v17.8H, v26.8H // ....................................................*............. - ldr q6, [x4, #32] // ...e.............................................................. - // gap // .................................................................. - // gap // .................................................................. - // gap // .................................................................. - // gap // .................................................................. - // gap // .................................................................. - // gap // .................................................................. - mul v20.8H, v23.8H, v3.H[2] // ......................................*........................... - ldr q25, [x4, #80] // ......e........................................................... - mul v31.8H, v2.8H, v3.H[0] // ......................................................*........... - // gap // .................................................................. 
- // gap // .................................................................. - // gap // .................................................................. - // gap // .................................................................. - // gap // .................................................................. - // gap // .................................................................. - ldr q19, [x4, #64] // .....e............................................................ - sqrdmulh v9.8H, v2.8H, v3.H[1] // .......................................................*.......... - // gap // .................................................................. - // gap // .................................................................. - // gap // .................................................................. - // gap // .................................................................. - // gap // .................................................................. - // gap // .................................................................. - mls v20.8H, v5.8H, v7.H[0] // ........................................*......................... - // gap // .................................................................. - // gap // .................................................................. - // gap // .................................................................. - // gap // .................................................................. - // gap // .................................................................. - // gap // .................................................................. - // gap // .................................................................. - ldr q15, [x4], #(6*16) // .e................................................................ - // gap // .................................................................. - // gap // .................................................................. 
- // gap // .................................................................. - // gap // .................................................................. - // gap // .................................................................. - // gap // .................................................................. - // gap // .................................................................. - sub v8.8H, v13.8H, v14.8H // ............e..................................................... - add v17.8H, v13.8H, v14.8H // .............e.................................................... - ldr q18, [x4, #-80] // ..e............................................................... - add v16.8H, v11.8H, v12.8H // ........e......................................................... - sub v1.8H, v11.8H, v12.8H // .......e.......................................................... - // gap // .................................................................. - // gap // .................................................................. - // gap // .................................................................. - mls v31.8H, v9.8H, v7.H[0] // ........................................................*......... - // gap // .................................................................. - // gap // .................................................................. - // gap // .................................................................. - // gap // .................................................................. - // gap // .................................................................. - // gap // .................................................................. - sub v2.8H, v20.8H, v27.8H // .........................................................*........ - mul v21.8H, v8.8H, v19.8H // ..............e................................................... - sqrdmulh v0.8H, v1.8H, v22.8H // ..........e....................................................... 
- mul v13.8H, v1.8H, v6.8H // .........e........................................................ - sqrdmulh v23.8H, v8.8H, v25.8H // ...............e.................................................. - // gap // .................................................................. - // gap // .................................................................. - // gap // .................................................................. - // gap // .................................................................. - sqrdmulh v29.8H, v2.8H, v3.H[1] // ............................................................*..... - mul v8.8H, v2.8H, v3.H[0] // ...........................................................*...... - sub v24.8H, v16.8H, v17.8H // .................e................................................ - // gap // .................................................................. - // gap // .................................................................. - // gap // .................................................................. - // gap // .................................................................. - // gap // .................................................................. - // gap // .................................................................. - // gap // .................................................................. - // gap // .................................................................. - // gap // .................................................................. - // gap // .................................................................. - // gap // .................................................................. - // gap // .................................................................. - // gap // .................................................................. - sqrdmulh v6.8H, v24.8H, v18.8H // ....................e............................................. 
- mul v1.8H, v24.8H, v15.8H // ...................e.............................................. - mls v21.8H, v23.8H, v7.H[0] // ................e................................................. - mls v13.8H, v0.8H, v7.H[0] // ...........e...................................................... - // gap // .................................................................. - // gap // .................................................................. - // gap // .................................................................. - // gap // .................................................................. - add v0.8H, v16.8H, v17.8H // ..................e............................................... - // gap // .................................................................. - // gap // .................................................................. - // gap // .................................................................. - // gap // .................................................................. - // gap // .................................................................. - // gap // .................................................................. - // gap // .................................................................. - mls v8.8H, v29.8H, v7.H[0] // .............................................................*.... - // gap // .................................................................. - // gap // .................................................................. - // gap // .................................................................. - // gap // .................................................................. - // gap // .................................................................. - // gap // .................................................................. - // gap // .................................................................. 
- sub v9.8H, v13.8H, v21.8H // ......................e........................................... - add v14.8H, v13.8H, v21.8H // .......................e.......................................... - mls v1.8H, v6.8H, v7.H[0] // .....................e............................................ - // gap // .................................................................. - // gap // .................................................................. - // gap // .................................................................. - // gap // .................................................................. - // gap // .................................................................. - // gap // .................................................................. - // gap // .................................................................. - // gap // .................................................................. - // gap // .................................................................. - // gap // .................................................................. - // gap // .................................................................. - // gap // .................................................................. - // gap // .................................................................. - trn2 v11.4S, v0.4S, v14.4S // ............................e..................................... - trn1 v2.4S, v0.4S, v14.4S // ...........................e...................................... - mul v25.8H, v9.8H, v15.8H // ........................e......................................... - sqrdmulh v23.8H, v9.8H, v18.8H // .........................e........................................ - // gap // .................................................................. - // gap // .................................................................. - // gap // .................................................................. 
- // gap // .................................................................. - add v0.8H, v20.8H, v27.8H // ..........................................................*....... - // gap // .................................................................. - // gap // .................................................................. - // gap // .................................................................. - // gap // .................................................................. - // gap // .................................................................. - // gap // .................................................................. - // gap // .................................................................. - // gap // .................................................................. - // gap // .................................................................. - // gap // .................................................................. - // gap // .................................................................. - // gap // .................................................................. - // gap // .................................................................. - // gap // .................................................................. - // gap // .................................................................. - mls v25.8H, v23.8H, v7.H[0] // ..........................e....................................... - str q0, [x1, #-48] // ...............................................................*.. - // gap // .................................................................. - // gap // .................................................................. - // gap // .................................................................. - // gap // .................................................................. - // gap // .................................................................. 
- // gap // .................................................................. - str q31, [x1, #-32] // ................................................................*. - // gap // .................................................................. - // gap // .................................................................. - // gap // .................................................................. - // gap // .................................................................. - // gap // .................................................................. - // gap // .................................................................. - // gap // .................................................................. - str q8, [x1, #-16] // .................................................................* - // gap // .................................................................. - // gap // .................................................................. - // gap // .................................................................. - // gap // .................................................................. - // gap // .................................................................. - // gap // .................................................................. - // gap // .................................................................. - trn2 v3.4S, v1.4S, v25.4S // ..............................e................................... - trn1 v6.4S, v1.4S, v25.4S // .............................e.................................... - // gap // .................................................................. - // gap // .................................................................. - // gap // .................................................................. - // gap // .................................................................. - // gap // .................................................................. 
- // gap // .................................................................. - // gap // .................................................................. - // gap // .................................................................. - // gap // .................................................................. - // gap // .................................................................. - // gap // .................................................................. - // gap // .................................................................. - // gap // .................................................................. - // gap // .................................................................. - trn1 v4.2D, v2.2D, v6.2D // .................................e................................ - trn2 v13.2D, v2.2D, v6.2D // ...............................e.................................. - // gap // .................................................................. - // gap // .................................................................. - // gap // .................................................................. - trn1 v30.2D, v11.2D, v3.2D // ..................................e............................... - trn2 v25.2D, v11.2D, v3.2D // ................................e................................. - ldr q3, [x3], #16 // ...................................e.............................. - // gap // .................................................................. - // gap // .................................................................. - // gap // .................................................................. - // gap // .................................................................. - // gap // .................................................................. - // gap // .................................................................. - // gap // .................................................................. 
- // gap // .................................................................. - add v17.8H, v4.8H, v30.8H // .....................................e............................ - // gap // .................................................................. - // gap // .................................................................. - // gap // .................................................................. - // gap // .................................................................. - // gap // .................................................................. - // gap // .................................................................. - add v26.8H, v13.8H, v25.8H // ..........................................e....................... - sub v16.8H, v13.8H, v25.8H // .........................................e........................ - // gap // .................................................................. - // gap // .................................................................. - // gap // .................................................................. - // gap // .................................................................. - // gap // .................................................................. - // gap // .................................................................. - // gap // .................................................................. - sqdmulh v23.8H, v17.8H, v7.H[1] // ..............................................e................... - // gap // .................................................................. - // gap // .................................................................. - // gap // .................................................................. - // gap // .................................................................. - // gap // .................................................................. - // gap // .................................................................. 
- sqdmulh v20.8H, v26.8H, v7.H[1] // .................................................e................ - mul v27.8H, v16.8H, v3.H[4] // ...........................................e...................... - // gap // .................................................................. - // gap // .................................................................. - // gap // .................................................................. - // gap // .................................................................. - // gap // .................................................................. - // gap // .................................................................. - // gap // .................................................................. - sqrdmulh v5.8H, v16.8H, v3.H[5] // ............................................e..................... - // gap // .................................................................. - // gap // .................................................................. - // gap // .................................................................. - // gap // .................................................................. - // gap // .................................................................. - // gap // .................................................................. - // gap // .................................................................. - srshr v23.8H, v23.8H, #11 // ...............................................e.................. - srshr v16.8H, v20.8H, #11 // ..................................................e............... - // gap // .................................................................. - // gap // .................................................................. - // gap // .................................................................. - // gap // .................................................................. 
- // gap // .................................................................. - // gap // .................................................................. - // gap // .................................................................. - // gap // .................................................................. - // gap // .................................................................. - // gap // .................................................................. - // gap // .................................................................. - // gap // .................................................................. - // gap // .................................................................. - // gap // .................................................................. - // gap // .................................................................. - // gap // .................................................................. - // gap // .................................................................. - // gap // .................................................................. - // gap // .................................................................. - // gap // .................................................................. - // gap // .................................................................. - // gap // .................................................................. - mls v17.8H, v23.8H, v7.H[0] // ................................................e................. - mls v26.8H, v16.8H, v7.H[0] // ...................................................e.............. - // gap // .................................................................. - // gap // .................................................................. - // gap // .................................................................. - // gap // .................................................................. 
- // gap // .................................................................. - // gap // .................................................................. - sub v23.8H, v4.8H, v30.8H // ....................................e............................. - // gap // .................................................................. - // gap // .................................................................. - // gap // .................................................................. - // gap // .................................................................. - // gap // .................................................................. - // gap // .................................................................. - // gap // .................................................................. - // gap // .................................................................. - // gap // .................................................................. - // gap // .................................................................. - // gap // .................................................................. - // gap // .................................................................. - // gap // .................................................................. - // gap // .................................................................. - // gap // .................................................................. - // gap // .................................................................. - // gap // .................................................................. - // gap // .................................................................. - // gap // .................................................................. - // gap // .................................................................. - // gap // .................................................................. - // gap // .................................................................. 
- add v2.8H, v17.8H, v26.8H // .....................................................e............ - // gap // .................................................................. - // gap // .................................................................. - // gap // .................................................................. - // gap // .................................................................. - // gap // .................................................................. - // gap // .................................................................. - // gap // .................................................................. - // gap // .................................................................. - str q2, [x1], #(64) // ..............................................................e... - // gap // .................................................................. - // gap // .................................................................. - // gap // .................................................................. - // gap // .................................................................. - // gap // .................................................................. - // gap // .................................................................. - // gap // .................................................................. + ld4 {v14.4S, v15.4S, v16.4S, v17.4S}, [x1] // e....................................................................... + ldr q28, [x4, #64] // .....e.................................................................. + sqrdmulh v25.8H, v0.8H, v13.H[1] // .............................................................*.......... + mul v19.8H, v0.8H, v13.H[0] // ............................................................*........... + // gap // ........................................................................ + // gap // ........................................................................ 
+ // gap // ........................................................................ + mls v31.8H, v10.8H, v7.H[0] // ........................................*............................... + ldr q6, [x4, #16] // ..e..................................................................... + sub v8.8H, v26.8H, v5.8H // .........................................*.............................. + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + ldr q24, [x4, #48] // ....e................................................................... + mls v19.8H, v25.8H, v7.H[0] // ..............................................................*......... + sqdmulh v26.8H, v31.8H, v7.H[1] // .................................................*...................... + ldr q25, [x4, #80] // ......e................................................................. + // gap // ........................................................................ 
+ // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + mul v3.8H, v8.8H, v13.H[4] // ...........................................*............................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + sqrdmulh v30.8H, v8.8H, v13.H[5] // ............................................*........................... + ldr q10, [x4, #32] // ...e.................................................................... + ldr q2, [x4], #(6*16) // .e...................................................................... + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + sub v5.8H, v16.8H, v17.8H // ............e........................................................... + add v21.8H, v16.8H, v17.8H // .............e.......................................................... 
+ srshr v1.8H, v26.8H, #11 // ..................................................*..................... + sub v27.8H, v14.8H, v15.8H // .......e................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + add v16.8H, v14.8H, v15.8H // ........e............................................................... + mls v3.8H, v30.8H, v7.H[0] // .............................................*.......................... + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + sqrdmulh v23.8H, v5.8H, v25.8H // ...............e........................................................ + mul v11.8H, v5.8H, v28.8H // ..............e......................................................... + sqrdmulh v20.8H, v27.8H, v24.8H // ..........e............................................................. + mul v9.8H, v27.8H, v10.8H // .........e.............................................................. + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ 
+ // gap // ........................................................................ + mls v31.8H, v1.8H, v7.H[0] // ...................................................*.................... + sub v25.8H, v16.8H, v21.8H // .................e...................................................... + add v18.8H, v16.8H, v21.8H // ..................e..................................................... + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + sqdmulh v17.8H, v3.8H, v7.H[1] // .......................................................*................ + mul v29.8H, v25.8H, v2.8H // ...................e.................................................... + sqrdmulh v14.8H, v25.8H, v6.8H // ....................e................................................... + mls v9.8H, v20.8H, v7.H[0] // ...........e............................................................ + mls v11.8H, v23.8H, v7.H[0] // ................e....................................................... 
+ // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + srshr v23.8H, v17.8H, #11 // ........................................................*............... + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + sub v30.8H, v9.8H, v11.8H // ......................e................................................. + add v5.8H, v9.8H, v11.8H // .......................e................................................ 
+ // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + mls v29.8H, v14.8H, v7.H[0] // .....................e.................................................. + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + trn2 v21.4S, v18.4S, v5.4S // ............................e........................................... + mul v12.8H, v30.8H, v2.8H // ........................e............................................... + sqrdmulh v22.8H, v30.8H, v6.8H // .........................e.............................................. + mls v3.8H, v23.8H, v7.H[0] // .........................................................*.............. + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ 
+ trn1 v28.4S, v18.4S, v5.4S // ...........................e............................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + mls v12.8H, v22.8H, v7.H[0] // ..........................e............................................. + add v23.8H, v31.8H, v3.8H // ................................................................*....... + sub v8.8H, v31.8H, v3.8H // ...............................................................*........ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ 
+ // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + str q23, [x1, #-48] // .....................................................................*.. + sqrdmulh v23.8H, v8.8H, v13.H[1] // ..................................................................*..... + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + trn2 v22.4S, v29.4S, v12.4S // ..............................e......................................... + trn1 v25.4S, v29.4S, v12.4S // .............................e.......................................... + str q19, [x1, #-32] // ......................................................................*. + // gap // ........................................................................ 
+ // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + trn2 v5.2D, v21.2D, v22.2D // ................................e....................................... + trn1 v24.2D, v21.2D, v22.2D // ..................................e..................................... + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + trn2 v26.2D, v28.2D, v25.2D // ...............................e........................................ + trn1 v25.2D, v28.2D, v25.2D // .................................e...................................... + mul v22.8H, v8.8H, v13.H[0] // .................................................................*...... + ldr q13, [x3], #16 // ...................................e.................................... 
+ // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + add v20.8H, v26.8H, v5.8H // ..........................................e............................. + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + add v8.8H, v25.8H, v24.8H // .....................................e.................................. + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ 
+ // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + sqdmulh v10.8H, v20.8H, v7.H[1] // ....................................................e................... + sqdmulh v11.8H, v8.8H, v7.H[1] // ..............................................e......................... + mls v22.8H, v23.8H, v7.H[0] // ...................................................................*.... + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + sub v23.8H, v25.8H, v24.8H // ....................................e................................... + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ 
+ // gap // ........................................................................ + srshr v27.8H, v11.8H, #11 // ...............................................e........................ + srshr v10.8H, v10.8H, #11 // .....................................................e.................. + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + str q22, [x1, #-16] // .......................................................................* + mul v31.8H, v23.8H, v13.H[2] // ......................................e................................. + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ 
+ // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + mls v8.8H, v27.8H, v7.H[0] // ................................................e....................... + mls v20.8H, v10.8H, v7.H[0] // ......................................................e................. + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + sqrdmulh v10.8H, v23.8H, v13.H[3] // .......................................e................................ + // gap // ........................................................................ + // gap // ........................................................................ 
+ // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + add v28.8H, v8.8H, v20.8H // ...........................................................e............ + sub v0.8H, v8.8H, v20.8H // ..........................................................e............. + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + str q28, [x1], #(64) // ....................................................................e... 
+ // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ // original source code - // ld4 {v8.4S, v9.4S, v10.4S, v11.4S}, [x1] // .e................................................................|e........................................... - // ldr q0, [x4], #(6*16) // ............e.....................................................|...........e................................ - // ldr q4, [x4, #(-6*16 + 1*16)] // ...............e..................................................|..............e............................. - // ldr q1, [x4, #(-6*16 + 2*16)] // .....e............................................................|....e....................................... - // ldr q5, [x4, #(-6*16 + 3*16)] // e.................................................................e............................................ - // ldr q2, [x4, #(-6*16 + 4*16)] // .........e........................................................|........e................................... - // ldr q6, [x4, #(-6*16 + 5*16)] // .......e..........................................................|......e..................................... - // sub v24.8h, v8.8h, v9.8h // .................e................................................|................e........................... - // add v8.8h, v8.8h, v9.8h // ................e.................................................|...............e............................ 
- // mul v9.8h, v24.8h, v1.8h // ......................e...........................................|.....................e...................... - // sqrdmulh v24.8h, v24.8h, v5.8h // .....................e............................................|....................e....................... - // mls v9.8h, v24.8h, v7.h[0] // ..............................e...................................|.............................e.............. - // sub v24.8h, v10.8h, v11.8h // .............e....................................................|............e............................... - // add v10.8h, v10.8h, v11.8h // ..............e...................................................|.............e.............................. - // mul v11.8h, v24.8h, v2.8h // ....................e.............................................|...................e........................ - // sqrdmulh v24.8h, v24.8h, v6.8h // .......................e..........................................|......................e..................... - // mls v11.8h, v24.8h, v7.h[0] // .............................e....................................|............................e............... - // sub v24.8h, v8.8h, v10.8h // ..........................e.......................................|.........................e.................. - // add v8.8h, v8.8h, v10.8h // ...............................e..................................|..............................e............. - // mul v10.8h, v24.8h, v0.8h // ............................e.....................................|...........................e................ - // sqrdmulh v24.8h, v24.8h, v4.8h // ...........................e......................................|..........................e................. - // mls v10.8h, v24.8h, v7.h[0] // ...................................e..............................|..................................e......... 
- // sub v24.8h, v9.8h, v11.8h // .................................e................................|................................e........... - // add v9.8h, v9.8h, v11.8h // ..................................e...............................|.................................e.......... - // mul v11.8h, v24.8h, v0.8h // ......................................e...........................|.....................................e...... - // sqrdmulh v24.8h, v24.8h, v4.8h // .......................................e..........................|......................................e..... - // mls v11.8h, v24.8h, v7.h[0] // .........................................e........................|........................................e... - // trn1 v25.4s, v8.4s, v9.4s // .....................................e............................|....................................e....... - // trn2 v26.4s, v8.4s, v9.4s // ....................................e.............................|...................................e........ - // trn1 v27.4s, v10.4s, v11.4s // ..............................................e...................|............................................ - // trn2 v28.4s, v10.4s, v11.4s // .............................................e....................|............................................ - // trn2 v10.2d, v25.2d, v27.2d // ................................................e.................|............................................ - // trn2 v11.2d, v26.2d, v28.2d // ..................................................e...............|............................................ - // trn1 v8.2d, v25.2d, v27.2d // ...............................................e..................|............................................ - // trn1 v9.2d, v26.2d, v28.2d // .................................................e................|............................................ 
- // ldr q0, [x3], #16 // ...................................................e..............|............................................ - // sub v24.8h, v8.8h, v9.8h // ...............................................................e..|............................................ - // add v8.8h, v8.8h, v9.8h // ....................................................e.............|............................................ - // mul v9.8h, v24.8h, v0.h[2] // ......*...........................................................|.....*...................................... - // sqrdmulh v24.8h, v24.8h, v0.h[3] // ...*..............................................................|..*......................................... - // mls v9.8h, v24.8h, v7.h[0] // ...........*......................................................|..........*................................. - // sub v24.8h, v10.8h, v11.8h // ......................................................e...........|............................................ - // add v10.8h, v10.8h, v11.8h // .....................................................e............|............................................ - // mul v11.8h, v24.8h, v0.h[4] // .........................................................e........|............................................ - // sqrdmulh v24.8h, v24.8h, v0.h[5] // ..........................................................e.......|............................................ - // mls v11.8h, v24.8h, v7.h[0] // ..*...............................................................|.*.......................................... - // sqdmulh v25.8h, v8.8h, v7.h[1] // .......................................................e..........|............................................ - // srshr v25.8h, v25.8h, #11 // ...........................................................e......|............................................ 
- // mls v8.8h, v25.8h, v7.h[0] // .............................................................e....|............................................ - // sqdmulh v25.8h, v10.8h, v7.h[1] // ........................................................e.........|............................................ - // srshr v25.8h, v25.8h, #11 // ............................................................e.....|............................................ - // mls v10.8h, v25.8h, v7.h[0] // ..............................................................e...|............................................ - // sub v24.8h, v8.8h, v10.8h // ....*.............................................................|...*........................................ - // add v8.8h, v8.8h, v10.8h // ................................................................e.|............................................ - // mul v10.8h, v24.8h, v0.h[0] // ........*.........................................................|.......*.................................... - // sqrdmulh v24.8h, v24.8h, v0.h[1] // ..........*.......................................................|.........*.................................. - // mls v10.8h, v24.8h, v7.h[0] // ..................*...............................................|.................*.......................... - // sub v24.8h, v9.8h, v11.8h // ...................*..............................................|..................*......................... - // add v9.8h, v9.8h, v11.8h // ........................................*.........................|.......................................*.... - // mul v11.8h, v24.8h, v0.h[0] // .........................*........................................|........................*................... - // sqrdmulh v24.8h, v24.8h, v0.h[1] // ........................*.........................................|.......................*.................... 
- // mls v11.8h, v24.8h, v7.h[0] // ................................*.................................|...............................*............ - // str q8, [x1], #(64) // .................................................................e|............................................ - // str q9, [x1, #(-64 + 16*1)] // ..........................................*.......................|.........................................*.. - // str q10, [x1, #(-64 + 16*2)] // ...........................................*......................|..........................................*. - // str q11, [x1, #(-64 + 16*3)] // ............................................*.....................|...........................................* + // ld4 {v8.4S, v9.4S, v10.4S, v11.4S}, [x1] // e.......................................................................e................................................................ + // ldr q0, [x4], #(6*16) // ..............e.........................................................|.............e.................................................. + // ldr q4, [x4, #(-6*16 + 1*16)] // .....e..................................................................|....e........................................................... + // ldr q1, [x4, #(-6*16 + 2*16)] // .............e..........................................................|............e................................................... + // ldr q5, [x4, #(-6*16 + 3*16)] // .......e................................................................|......e......................................................... + // ldr q2, [x4, #(-6*16 + 4*16)] // .e......................................................................|e............................................................... + // ldr q6, [x4, #(-6*16 + 5*16)] // ..........e.............................................................|.........e...................................................... 
+ // sub v24.8h, v8.8h, v9.8h // ..................e.....................................................|.................e.............................................. + // add v8.8h, v8.8h, v9.8h // ...................e....................................................|..................e............................................. + // mul v9.8h, v24.8h, v1.8h // ........................e...............................................|.......................e........................................ + // sqrdmulh v24.8h, v24.8h, v5.8h // .......................e................................................|......................e......................................... + // mls v9.8h, v24.8h, v7.h[0] // ...............................e........................................|..............................e................................. + // sub v24.8h, v10.8h, v11.8h // ...............e........................................................|..............e................................................. + // add v10.8h, v10.8h, v11.8h // ................e.......................................................|...............e................................................ + // mul v11.8h, v24.8h, v2.8h // ......................e.................................................|.....................e.......................................... + // sqrdmulh v24.8h, v24.8h, v6.8h // .....................e..................................................|....................e........................................... + // mls v11.8h, v24.8h, v7.h[0] // ................................e.......................................|...............................e................................ + // sub v24.8h, v8.8h, v10.8h // ..........................e.............................................|.........................e...................................... 
+ // add v8.8h, v8.8h, v10.8h // ...........................e............................................|..........................e..................................... + // mul v10.8h, v24.8h, v0.8h // .............................e..........................................|............................e................................... + // sqrdmulh v24.8h, v24.8h, v4.8h // ..............................e.........................................|.............................e.................................. + // mls v10.8h, v24.8h, v7.h[0] // ....................................e...................................|...................................e............................ + // sub v24.8h, v9.8h, v11.8h // ..................................e.....................................|.................................e.............................. + // add v9.8h, v9.8h, v11.8h // ...................................e....................................|..................................e............................. + // mul v11.8h, v24.8h, v0.8h // ......................................e.................................|.....................................e.......................... + // sqrdmulh v24.8h, v24.8h, v4.8h // .......................................e................................|......................................e......................... + // mls v11.8h, v24.8h, v7.h[0] // ..........................................e.............................|.........................................e...................... + // trn1 v25.4s, v8.4s, v9.4s // .........................................e..............................|........................................e....................... + // trn2 v26.4s, v8.4s, v9.4s // .....................................e..................................|....................................e........................... 
+ // trn1 v27.4s, v10.4s, v11.4s // ................................................e.......................|...............................................e................ + // trn2 v28.4s, v10.4s, v11.4s // ...............................................e........................|..............................................e................. + // trn2 v10.2d, v25.2d, v27.2d // ....................................................e...................|...................................................e............ + // trn2 v11.2d, v26.2d, v28.2d // ..................................................e.....................|.................................................e.............. + // trn1 v8.2d, v25.2d, v27.2d // .....................................................e..................|....................................................e........... + // trn1 v9.2d, v26.2d, v28.2d // ...................................................e....................|..................................................e............. + // ldr q0, [x3], #16 // .......................................................e................|......................................................e......... + // sub v24.8h, v8.8h, v9.8h // .............................................................e..........|............................................................e... + // add v8.8h, v8.8h, v9.8h // .........................................................e..............|........................................................e....... + // mul v9.8h, v24.8h, v0.h[2] // .................................................................e......|................................................................ + // sqrdmulh v24.8h, v24.8h, v0.h[3] // ....................................................................e...|................................................................ 
+ // mls v9.8h, v24.8h, v7.h[0] // ....*...................................................................|...*............................................................ + // sub v24.8h, v10.8h, v11.8h // ......*.................................................................|.....*.......................................................... + // add v10.8h, v10.8h, v11.8h // ........................................................e...............|.......................................................e........ + // mul v11.8h, v24.8h, v0.h[4] // ...........*............................................................|..........*..................................................... + // sqrdmulh v24.8h, v24.8h, v0.h[5] // ............*...........................................................|...........*.................................................... + // mls v11.8h, v24.8h, v7.h[0] // ....................*...................................................|...................*............................................ + // sqdmulh v25.8h, v8.8h, v7.h[1] // ...........................................................e............|..........................................................e..... + // srshr v25.8h, v25.8h, #11 // ..............................................................e.........|.............................................................e.. + // mls v8.8h, v25.8h, v7.h[0] // ..................................................................e.....|................................................................ + // sqdmulh v25.8h, v9.8h, v7.h[1] // .........*..............................................................|........*....................................................... + // srshr v25.8h, v25.8h, #11 // .................*......................................................|................*............................................... 
+ // mls v9.8h, v25.8h, v7.h[0] // .........................*..............................................|........................*....................................... + // sqdmulh v25.8h, v10.8h, v7.h[1] // ..........................................................e.............|.........................................................e...... + // srshr v25.8h, v25.8h, #11 // ...............................................................e........|..............................................................e. + // mls v10.8h, v25.8h, v7.h[0] // ...................................................................e....|................................................................ + // sqdmulh v25.8h, v11.8h, v7.h[1] // ............................*...........................................|...........................*.................................... + // srshr v25.8h, v25.8h, #11 // .................................*......................................|................................*............................... + // mls v11.8h, v25.8h, v7.h[0] // ........................................*...............................|.......................................*........................ + // sub v24.8h, v8.8h, v10.8h // ......................................................................e.|................................................................ + // add v8.8h, v8.8h, v10.8h // .....................................................................e..|................................................................ + // mul v10.8h, v24.8h, v0.h[0] // ...*....................................................................|..*............................................................. + // sqrdmulh v24.8h, v24.8h, v0.h[1] // ..*.....................................................................|.*.............................................................. 
+ // mls v10.8h, v24.8h, v7.h[0] // ........*...............................................................|.......*........................................................ + // sub v24.8h, v9.8h, v11.8h // ............................................*...........................|...........................................*.................... + // add v9.8h, v9.8h, v11.8h // ...........................................*............................|..........................................*..................... + // mul v11.8h, v24.8h, v0.h[0] // ......................................................*.................|.....................................................*.......... + // sqrdmulh v24.8h, v24.8h, v0.h[1] // ..............................................*.........................|.............................................*.................. + // mls v11.8h, v24.8h, v7.h[0] // ............................................................*...........|...........................................................*.... + // str q8, [x1], #(64) // .......................................................................e|................................................................ + // str q9, [x1, #(-64 + 16*1)] // .............................................*..........................|............................................*................... + // str q10, [x1, #(-64 + 16*2)] // .................................................*......................|................................................*............... + // str q11, [x1, #(-64 + 16*3)] // ................................................................*.......|...............................................................* sub count, count, #1 cbnz count, layer4567_start - mul v20.8H, v23.8H, v3.H[2] // ...*............ - // gap // ................ - // gap // ................ - sqrdmulh v10.8H, v23.8H, v3.H[3] // .*.............. - // gap // ................ 
- // gap // ................ - // gap // ................ - // gap // ................ - mls v27.8H, v5.8H, v7.H[0] // *............... - // gap // ................ - // gap // ................ - // gap // ................ - // gap // ................ - // gap // ................ - // gap // ................ - // gap // ................ - sub v23.8H, v17.8H, v26.8H // ..*............. - // gap // ................ - // gap // ................ - // gap // ................ - // gap // ................ - // gap // ................ - // gap // ................ - // gap // ................ - // gap // ................ - mls v20.8H, v10.8H, v7.H[0] // ......*......... - // gap // ................ - // gap // ................ - // gap // ................ - // gap // ................ - // gap // ................ - // gap // ................ - // gap // ................ - // gap // ................ - // gap // ................ - // gap // ................ - // gap // ................ - // gap // ................ - // gap // ................ - // gap // ................ - mul v15.8H, v23.8H, v3.H[0] // ....*........... - // gap // ................ - // gap // ................ - // gap // ................ - // gap // ................ - // gap // ................ - // gap // ................ - // gap // ................ - add v13.8H, v20.8H, v27.8H // ............*... - sub v20.8H, v20.8H, v27.8H // ........*....... - // gap // ................ - // gap // ................ - // gap // ................ - // gap // ................ - // gap // ................ - // gap // ................ - // gap // ................ - sqrdmulh v2.8H, v23.8H, v3.H[1] // .....*.......... - // gap // ................ - // gap // ................ - // gap // ................ - // gap // ................ - // gap // ................ - // gap // ................ - mul v29.8H, v20.8H, v3.H[0] // ..........*..... - sqrdmulh v22.8H, v20.8H, v3.H[1] // .........*...... - // gap // ................ 
- // gap // ................ - // gap // ................ - // gap // ................ - // gap // ................ - // gap // ................ - // gap // ................ - // gap // ................ - // gap // ................ - // gap // ................ - // gap // ................ - // gap // ................ - // gap // ................ - // gap // ................ - mls v15.8H, v2.8H, v7.H[0] // .......*........ - // gap // ................ - // gap // ................ - // gap // ................ - // gap // ................ - // gap // ................ - // gap // ................ - // gap // ................ - str q13, [x1, #-48] // .............*.. - mls v29.8H, v22.8H, v7.H[0] // ...........*.... - // gap // ................ - // gap // ................ - // gap // ................ - // gap // ................ - // gap // ................ - // gap // ................ - // gap // ................ - // gap // ................ - // gap // ................ - // gap // ................ - // gap // ................ - // gap // ................ - // gap // ................ - // gap // ................ - str q15, [x1, #-32] // ..............*. - // gap // ................ - // gap // ................ - // gap // ................ - // gap // ................ - // gap // ................ - // gap // ................ - // gap // ................ - str q29, [x1, #-16] // ...............* - // gap // ................ - // gap // ................ - // gap // ................ - // gap // ................ - // gap // ................ - // gap // ................ - // gap // ................ + mls v31.8H, v10.8H, v7.H[0] // ..*................... + sub v4.8H, v26.8H, v5.8H // ...*.................. + mul v9.8H, v0.8H, v13.H[0] // .*.................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... 
+ sqrdmulh v25.8H, v0.8H, v13.H[1] // *..................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + mul v28.8H, v4.8H, v13.H[4] // ......*............... + sqrdmulh v16.8H, v4.8H, v13.H[5] // .......*.............. + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + sqdmulh v5.8H, v31.8H, v7.H[1] // .....*................ + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + mls v9.8H, v25.8H, v7.H[0] // ....*................. + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + mls v28.8H, v16.8H, v7.H[0] // .........*............ + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + srshr v21.8H, v5.8H, #11 // ........*............. + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + str q9, [x1, #-32] // ..................*... + // gap // ...................... + // gap // ...................... + // gap // ...................... 
+ // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + sqdmulh v4.8H, v28.8H, v7.H[1] // ...........*.......... + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + mls v31.8H, v21.8H, v7.H[0] // ..........*........... + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + srshr v14.8H, v4.8H, #11 // ............*......... + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + mls v28.8H, v14.8H, v7.H[0] // .............*........ + // gap // ...................... 
+ // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + sub v8.8H, v31.8H, v28.8H // ...............*...... + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + add v26.8H, v31.8H, v28.8H // ..............*....... + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + sqrdmulh v20.8H, v8.8H, v13.H[1] // .................*.... + mul v13.8H, v8.8H, v13.H[0] // ...................*.. + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + str q26, [x1, #-48] // ................*..... + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... 
+ // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + mls v13.8H, v20.8H, v7.H[0] // ....................*. + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + str q13, [x1, #-16] // .....................* + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... // original source code - // mls v27.8H, v5.8H, v7.H[0] // ..*............. - // sqrdmulh v5.8H, v23.8H, v3.H[3] // .*.............. - // sub v2.8H, v17.8H, v26.8H // ...*............ - // mul v20.8H, v23.8H, v3.H[2] // *............... - // mul v31.8H, v2.8H, v3.H[0] // .....*.......... - // sqrdmulh v9.8H, v2.8H, v3.H[1] // ........*....... - // mls v20.8H, v5.8H, v7.H[0] // ....*........... - // mls v31.8H, v9.8H, v7.H[0] // ...........*.... - // sub v2.8H, v20.8H, v27.8H // .......*........ - // sqrdmulh v29.8H, v2.8H, v3.H[1] // ..........*..... 
- // mul v8.8H, v2.8H, v3.H[0] // .........*...... - // mls v8.8H, v29.8H, v7.H[0] // .............*.. - // add v0.8H, v20.8H, v27.8H // ......*......... - // str q0, [x1, #-48] // ............*... - // str q31, [x1, #-32] // ..............*. - // str q8, [x1, #-16] // ...............* + // sqrdmulh v25.8H, v0.8H, v13.H[1] // ...*.................. + // mul v19.8H, v0.8H, v13.H[0] // ..*................... + // mls v31.8H, v10.8H, v7.H[0] // *..................... + // sub v8.8H, v26.8H, v5.8H // .*.................... + // mls v19.8H, v25.8H, v7.H[0] // .......*.............. + // sqdmulh v26.8H, v31.8H, v7.H[1] // ......*............... + // mul v3.8H, v8.8H, v13.H[4] // ....*................. + // sqrdmulh v30.8H, v8.8H, v13.H[5] // .....*................ + // srshr v1.8H, v26.8H, #11 // .........*............ + // mls v3.8H, v30.8H, v7.H[0] // ........*............. + // mls v31.8H, v1.8H, v7.H[0] // ............*......... + // sqdmulh v17.8H, v3.8H, v7.H[1] // ...........*.......... + // srshr v23.8H, v17.8H, #11 // .............*........ + // mls v3.8H, v23.8H, v7.H[0] // ..............*....... + // add v23.8H, v31.8H, v3.8H // ................*..... + // sub v8.8H, v31.8H, v3.8H // ...............*...... + // str q23, [x1, #-48] // ...................*.. + // sqrdmulh v23.8H, v8.8H, v13.H[1] // .................*.... + // str q19, [x1, #-32] // ..........*........... + // mul v22.8H, v8.8H, v13.H[0] // ..................*... + // mls v22.8H, v23.8H, v7.H[0] // ....................*. + // str q22, [x1, #-16] // .....................* // --------------------------------------------------------------------- @@ -1276,646 +1376,594 @@ layer4567_start: .p2align 2 - ldr q2, [x0, #320] // .*........................................ - ldr q16, [x0, #256] // ..*....................................... - ldr q23, [x0, #384] // *......................................... - // gap // .......................................... 
- // gap // .......................................... - // gap // .......................................... - // gap // .......................................... - // gap // .......................................... - ldr q21, [x0, #448] // .....*.................................... - ldr q26, [x0, #192] // ...*...................................... - ldr q20, [x0, #128] // ....*..................................... - // gap // .......................................... - // gap // .......................................... - // gap // .......................................... - // gap // .......................................... - // gap // .......................................... - ldr q17, [x0, #0] // ......*................................... - ldr q4, [x0, #64] // .......*.................................. - // gap // .......................................... - // gap // .......................................... - // gap // .......................................... - // gap // .......................................... - // gap // .......................................... - // gap // .......................................... - // gap // .......................................... - // gap // .......................................... - // gap // .......................................... - // gap // .......................................... - // gap // .......................................... - // gap // .......................................... - // gap // .......................................... - // gap // .......................................... - sub v25.8H, v16.8H, v2.8H // .........*................................ - add v2.8H, v16.8H, v2.8H // ........*................................. - // gap // .......................................... - // gap // .......................................... - // gap // .......................................... - // gap // .......................................... 
- // gap // .......................................... - // gap // .......................................... - sub v16.8H, v23.8H, v21.8H // ..........*............................... - add v23.8H, v23.8H, v21.8H // ............*............................. - sub v21.8H, v20.8H, v26.8H // ...........*.............................. - add v26.8H, v20.8H, v26.8H // ...........................*.............. - // gap // .......................................... - // gap // .......................................... - // gap // .......................................... - // gap // .......................................... - sub v20.8H, v17.8H, v4.8H // .............*............................ - add v17.8H, v17.8H, v4.8H // ..............*........................... - sqrdmulh v4.8H, v25.8H, v1.H[3] // ...............*.......................... - mul v13.8H, v25.8H, v1.H[2] // ...................*...................... - // gap // .......................................... - // gap // .......................................... - // gap // .......................................... - // gap // .......................................... - mul v3.8H, v16.8H, v1.H[4] // .................*........................ - sqrdmulh v16.8H, v16.8H, v1.H[5] // ..................*....................... - sqrdmulh v11.8H, v21.8H, v1.H[1] // ................*......................... - mul v21.8H, v21.8H, v1.H[0] // .......................*.................. - // gap // .......................................... - // gap // .......................................... - // gap // .......................................... - // gap // .......................................... - sub v10.8H, v2.8H, v23.8H // ............................*............. - add v25.8H, v2.8H, v23.8H // .............................*............ - mul v19.8H, v20.8H, v0.H[6] // ....................*..................... - sqrdmulh v23.8H, v20.8H, v0.H[7] // .....................*.................... 
- // gap // .......................................... - // gap // .......................................... - // gap // .......................................... - // gap // .......................................... - add v28.8H, v17.8H, v26.8H // ...............................*.......... - sub v9.8H, v17.8H, v26.8H // ....................................*..... - mls v13.8H, v4.8H, v7.H[0] // ......................*................... - // gap // .......................................... - // gap // .......................................... - // gap // .......................................... - // gap // .......................................... - // gap // .......................................... - mls v3.8H, v16.8H, v7.H[0] // ........................*................. - mls v21.8H, v11.8H, v7.H[0] // ..........................*............... - sqdmulh v2.8H, v25.8H, v7.H[1] // ...................................*...... - sqrdmulh v22.8H, v10.8H, v0.H[5] // .....................................*.... - // gap // .......................................... - // gap // .......................................... - // gap // .......................................... - // gap // .......................................... - mul v11.8H, v10.8H, v0.H[4] // .........................................* - mls v19.8H, v23.8H, v7.H[0] // .........................*................ - // gap // .......................................... - // gap // .......................................... - // gap // .......................................... - // gap // .......................................... - // gap // .......................................... - // gap // .......................................... - // gap // .......................................... - // gap // .......................................... - // gap // .......................................... - // gap // .......................................... 
- // gap // .......................................... - // gap // .......................................... - // gap // .......................................... - // gap // .......................................... - add v23.8H, v13.8H, v3.8H // ..............................*........... - sub v26.8H, v13.8H, v3.8H // .................................*........ - // gap // .......................................... - // gap // .......................................... - // gap // .......................................... - // gap // .......................................... - // gap // .......................................... - // gap // .......................................... - sub v16.8H, v19.8H, v21.8H // ..................................*....... - add v12.8H, v19.8H, v21.8H // ................................*......... - // gap // .......................................... - // gap // .......................................... - // gap // .......................................... - // gap // .......................................... - // gap // .......................................... - // gap // .......................................... - // gap // .......................................... - // gap // .......................................... - // gap // .......................................... - // gap // .......................................... - // gap // .......................................... - // gap // .......................................... - // gap // .......................................... - // gap // .......................................... - sqrdmulh v18.8H, v16.8H, v0.H[3] // ......................................*... - mul v3.8H, v16.8H, v0.H[2] // .......................................*.. - add v20.8H, v12.8H, v23.8H // ........................................*. - // gap // .......................................... - // gap // .......................................... 
- // gap // .......................................... - // gap // .......................................... - // gap // .......................................... + ldr q19, [x0, #256] // *................................................. + ldr q23, [x0, #192] // .*................................................ + ldr q22, [x0, #128] // ..*............................................... + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + ldr q28, [x0, #320] // ...*.............................................. + ldr q27, [x0, #64] // ....*............................................. + ldr q24, [x0, #0] // .....*............................................ + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + ldr q3, [x0, #384] // ......*........................................... + ldr q26, [x0, #448] // .......*.......................................... + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. 
+ // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + sub v20.8H, v22.8H, v23.8H // ........*......................................... + add v23.8H, v22.8H, v23.8H // .......................*.......................... + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + sub v22.8H, v19.8H, v28.8H // ............*..................................... + add v19.8H, v19.8H, v28.8H // .........*........................................ + sub v28.8H, v24.8H, v27.8H // ...........*...................................... + add v27.8H, v24.8H, v27.8H // ..........*....................................... + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + add v24.8H, v3.8H, v26.8H // .................*................................ + sub v3.8H, v3.8H, v26.8H // .............*.................................... + sqrdmulh v26.8H, v20.8H, v1.H[1] // ..............*................................... + mul v20.8H, v20.8H, v1.H[0] // ................*................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. 
+ // gap // .................................................. + sqrdmulh v11.8H, v22.8H, v1.H[3] // ....................*............................. + mul v22.8H, v22.8H, v1.H[2] // .....................*............................ + sqrdmulh v14.8H, v28.8H, v0.H[7] // ...............*.................................. + mul v25.8H, v28.8H, v0.H[6] // ......................*........................... + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + sub v28.8H, v19.8H, v24.8H // .........................*........................ + add v19.8H, v19.8H, v24.8H // ........................*......................... + sqrdmulh v24.8H, v3.8H, v1.H[5] // ..................*............................... + mul v3.8H, v3.8H, v1.H[4] // ...................*.............................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + sub v5.8H, v27.8H, v23.8H // .............................*.................... + add v23.8H, v27.8H, v23.8H // ..............................*................... + mls v20.8H, v26.8H, v7.H[0] // ..........................*....................... + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + mls v25.8H, v14.8H, v7.H[0] // .................................*................ + mls v22.8H, v11.8H, v7.H[0] // ............................*..................... 
+ mul v14.8H, v28.8H, v0.H[4] // ...............................*.................. + sqrdmulh v27.8H, v28.8H, v0.H[5] // ................................*................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + mls v3.8H, v24.8H, v7.H[0] // ...........................*...................... + sqrdmulh v24.8H, v5.8H, v0.H[3] // ..................................*............... + mul v28.8H, v5.8H, v0.H[2] // ...................................*.............. + add v26.8H, v23.8H, v19.8H // .....................................*............ + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + sub v19.8H, v23.8H, v19.8H // ........................................*......... + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + sub v8.8H, v25.8H, v20.8H // .......................................*.......... + add v21.8H, v25.8H, v20.8H // .........................................*........ + mls v14.8H, v27.8H, v7.H[0] // ......................................*........... + mul v12.8H, v26.8H, v29.8H // ...........................................*...... + // gap // .................................................. + // gap // .................................................. 
+ // gap // .................................................. + // gap // .................................................. + add v10.8H, v22.8H, v3.8H // ....................................*............. + sub v11.8H, v22.8H, v3.8H // ..........................................*....... + mls v28.8H, v24.8H, v7.H[0] // .............................................*.... + sqrdmulh v13.8H, v26.8H, v30.8H // ............................................*..... + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + mul v22.8H, v19.8H, v0.H[0] // ..............................................*... + sqrdmulh v15.8H, v19.8H, v0.H[1] // ...............................................*.. + mul v17.8H, v8.8H, v0.H[2] // ................................................*. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + sub v4.8H, v21.8H, v10.8H // .................................................* + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. // original source code - // ldr q6, [x0, #384] // ..*....................................... - // ldr q9, [x0, #320] // *......................................... 
- // ldr q21, [x0, #256] // .*........................................ - // ldr q8, [x0, #192] // ....*..................................... - // ldr q17, [x0, #128] // .....*.................................... - // ldr q13, [x0, #448] // ...*...................................... - // ldr q5, [x0, #0] // ......*................................... - // ldr q22, [x0, #64] // .......*.................................. - // add v24.8H, v21.8H, v9.8H // .........*................................ - // sub v26.8H, v21.8H, v9.8H // ........*................................. - // sub v9.8H, v6.8H, v13.8H // ..........*............................... - // sub v14.8H, v17.8H, v8.8H // ............*............................. - // add v27.8H, v6.8H, v13.8H // ...........*.............................. - // sub v31.8H, v5.8H, v22.8H // ..............*........................... - // add v6.8H, v5.8H, v22.8H // ...............*.......................... - // sqrdmulh v12.8H, v26.8H, v1.H[3] // ................*......................... - // sqrdmulh v13.8H, v14.8H, v1.H[1] // ....................*..................... - // mul v5.8H, v9.8H, v1.H[4] // ..................*....................... - // sqrdmulh v21.8H, v9.8H, v1.H[5] // ...................*...................... - // mul v9.8H, v26.8H, v1.H[2] // .................*........................ - // mul v2.8H, v31.8H, v0.H[6] // ........................*................. - // sqrdmulh v26.8H, v31.8H, v0.H[7] // .........................*................ - // mls v9.8H, v12.8H, v7.H[0] // ............................*............. - // mul v14.8H, v14.8H, v1.H[0] // .....................*.................... - // mls v5.8H, v21.8H, v7.H[0] // .............................*............ - // mls v2.8H, v26.8H, v7.H[0] // ..................................*....... - // mls v14.8H, v13.8H, v7.H[0] // ..............................*........... - // add v8.8H, v17.8H, v8.8H // .............*............................ 
- // sub v13.8H, v24.8H, v27.8H // ......................*................... - // add v25.8H, v24.8H, v27.8H // .......................*.................. - // add v23.8H, v9.8H, v5.8H // ...................................*...... - // add v28.8H, v6.8H, v8.8H // ..........................*............... - // add v12.8H, v2.8H, v14.8H // ......................................*... - // sub v26.8H, v9.8H, v5.8H // ....................................*..... - // sub v19.8H, v2.8H, v14.8H // .....................................*.... - // sqdmulh v2.8H, v25.8H, v7.H[1] // ...............................*.......... - // sub v9.8H, v6.8H, v8.8H // ...........................*.............. - // sqrdmulh v22.8H, v13.8H, v0.H[5] // ................................*......... - // sqrdmulh v18.8H, v19.8H, v0.H[3] // .......................................*.. - // mul v3.8H, v19.8H, v0.H[2] // ........................................*. - // add v20.8H, v12.8H, v23.8H // .........................................* - // mul v11.8H, v13.8H, v0.H[4] // .................................*........ + // ldr q16, [x0, #256] // *................................................. + // ldr q5, [x0, #192] // .*................................................ + // ldr q31, [x0, #128] // ..*............................................... + // ldr q18, [x0, #320] // ...*.............................................. + // ldr q20, [x0, #64] // ....*............................................. + // ldr q11, [x0, #0] // .....*............................................ + // ldr q8, [x0, #384] // ......*........................................... + // ldr q10, [x0, #448] // .......*.......................................... + // sub v14.8H, v31.8H, v5.8H // ........*......................................... + // add v24.8H, v16.8H, v18.8H // ...........*...................................... + // add v21.8H, v11.8H, v20.8H // .............*.................................... 
+ // sub v20.8H, v11.8H, v20.8H // ............*..................................... + // sub v19.8H, v16.8H, v18.8H // ..........*....................................... + // sub v18.8H, v8.8H, v10.8H // ...............*.................................. + // sqrdmulh v16.8H, v14.8H, v1.H[1] // ................*................................. + // sqrdmulh v9.8H, v20.8H, v0.H[7] // ....................*............................. + // mul v13.8H, v14.8H, v1.H[0] // .................*................................ + // add v15.8H, v8.8H, v10.8H // ..............*................................... + // sqrdmulh v14.8H, v18.8H, v1.H[5] // ........................*......................... + // mul v12.8H, v18.8H, v1.H[4] // .........................*........................ + // sqrdmulh v10.8H, v19.8H, v1.H[3] // ..................*............................... + // mul v25.8H, v19.8H, v1.H[2] // ...................*.............................. + // mul v19.8H, v20.8H, v0.H[6] // .....................*............................ + // add v20.8H, v31.8H, v5.8H // .........*........................................ + // add v2.8H, v24.8H, v15.8H // .......................*.......................... + // sub v28.8H, v24.8H, v15.8H // ......................*........................... + // mls v13.8H, v16.8H, v7.H[0] // ............................*..................... + // mls v12.8H, v14.8H, v7.H[0] // .................................*................ + // mls v25.8H, v10.8H, v7.H[0] // ..............................*................... + // sub v18.8H, v21.8H, v20.8H // ..........................*....................... + // add v11.8H, v21.8H, v20.8H // ...........................*...................... + // mul v14.8H, v28.8H, v0.H[4] // ...............................*.................. + // sqrdmulh v20.8H, v28.8H, v0.H[5] // ................................*................. 
+ // mls v19.8H, v9.8H, v7.H[0] // .............................*.................... + // sqrdmulh v17.8H, v18.8H, v0.H[3] // ..................................*............... + // mul v28.8H, v18.8H, v0.H[2] // ...................................*.............. + // add v10.8H, v25.8H, v12.8H // ..........................................*....... + // add v24.8H, v11.8H, v2.8H // ....................................*............. + // mls v14.8H, v20.8H, v7.H[0] // ........................................*......... + // sub v8.8H, v19.8H, v13.8H // ......................................*........... + // sub v27.8H, v11.8H, v2.8H // .....................................*............ + // add v21.8H, v19.8H, v13.8H // .......................................*.......... + // sub v11.8H, v25.8H, v12.8H // ...........................................*...... + // mul v12.8H, v24.8H, v29.8H // .........................................*........ + // sqrdmulh v13.8H, v24.8H, v30.8H // .............................................*.... + // mls v28.8H, v17.8H, v7.H[0] // ............................................*..... + // mul v22.8H, v27.8H, v0.H[0] // ..............................................*... + // sqrdmulh v15.8H, v27.8H, v0.H[1] // ...............................................*.. + // mul v17.8H, v8.8H, v0.H[2] // ................................................*. + // sub v4.8H, v21.8H, v10.8H // .................................................* sub count, count, #1 layer123_start: - sub v27.8H, v12.8H, v23.8H // ...........................................................*.................................. - sqrdmulh v16.8H, v9.8H, v0.H[3] // ...............................*.............................................................. - sqdmulh v24.8H, v28.8H, v7.H[1] // ................................................*............................................. 
- mul v4.8H, v9.8H, v0.H[2] // ..............................*............................................................... - ldr q6, [x0, #400] // ......e....................................................................................... - ldr q9, [x0, #336] // .....e........................................................................................ - // gap // .............................................................................................. - ldr q21, [x0, #272] // ....e......................................................................................... - mul v10.8H, v20.8H, v29.8H // .................................................................................*............ - sqrdmulh v23.8H, v20.8H, v30.8H // ..................................................................................*........... - mul v20.8H, v26.8H, v0.H[4] // .............................................*................................................ - ldr q8, [x0, #208] // ...e.......................................................................................... - ldr q17, [x0, #144] // ..e........................................................................................... - ldr q13, [x0, #464] // .......e...................................................................................... - // gap // .............................................................................................. - srshr v12.8H, v2.8H, #11 // ....................................................*......................................... - sqrdmulh v19.8H, v27.8H, v0.H[1] // ..............................................................*............................... - ldr q5, [x0, #16] // e............................................................................................. - sqrdmulh v31.8H, v26.8H, v0.H[5] // ..............................................*............................................... 
- mls v11.8H, v22.8H, v7.H[0] // ..........................................*................................................... - ldr q22, [x0, #80] // .e............................................................................................ - // gap // .............................................................................................. - // gap // .............................................................................................. - // gap // .............................................................................................. - mul v15.8H, v27.8H, v0.H[0] // .............................................................*................................ - mls v4.8H, v16.8H, v7.H[0] // ................................*............................................................. - mls v3.8H, v18.8H, v7.H[0] // .....................................*........................................................ - // gap // .............................................................................................. - srshr v16.8H, v24.8H, #11 // .................................................*............................................ - // gap // .............................................................................................. - // gap // .............................................................................................. - // gap // .............................................................................................. - add v24.8H, v21.8H, v9.8H // ...................e.......................................................................... - sub v26.8H, v21.8H, v9.8H // ..................e........................................................................... - mls v25.8H, v12.8H, v7.H[0] // .....................................................*........................................ - // gap // .............................................................................................. 
- // gap // .............................................................................................. - // gap // .............................................................................................. - // gap // .............................................................................................. - mls v10.8H, v23.8H, v7.H[0] // ...................................................................................*.......... - sub v9.8H, v6.8H, v13.8H // .......................e...................................................................... - mls v20.8H, v31.8H, v7.H[0] // ...............................................*.............................................. - sub v14.8H, v17.8H, v8.8H // .............e................................................................................ - add v27.8H, v6.8H, v13.8H // ........................e..................................................................... - // gap // .............................................................................................. - // gap // .............................................................................................. - // gap // .............................................................................................. - // gap // .............................................................................................. - add v18.8H, v4.8H, v11.8H // .................................................................*............................ - sub v31.8H, v5.8H, v22.8H // ........e..................................................................................... - add v6.8H, v5.8H, v22.8H // .........e.................................................................................... - sqrdmulh v12.8H, v26.8H, v1.H[3] // .....................e........................................................................ - // gap // .............................................................................................. 
- // gap // .............................................................................................. - // gap // .............................................................................................. - // gap // .............................................................................................. - sqrdmulh v13.8H, v14.8H, v1.H[1] // ................e............................................................................. - mul v5.8H, v9.8H, v1.H[4] // .........................e.................................................................... - str q10, [x0, #64] // ...........................................................................................*.. - sqrdmulh v21.8H, v9.8H, v1.H[5] // ..........................e................................................................... - mul v9.8H, v26.8H, v1.H[2] // ....................e......................................................................... - // gap // .............................................................................................. - // gap // .............................................................................................. - // gap // .............................................................................................. - sub v22.8H, v3.8H, v20.8H // .....................................................................*........................ - add v20.8H, v3.8H, v20.8H // ......................................................................*....................... - sub v11.8H, v4.8H, v11.8H // ................................................................*............................. - // gap // .............................................................................................. - // gap // .............................................................................................. - mls v28.8H, v16.8H, v7.H[0] // ..................................................*........................................... 
- // gap // .............................................................................................. - // gap // .............................................................................................. - mul v2.8H, v31.8H, v0.H[6] // ..........e................................................................................... - mul v23.8H, v18.8H, v29.8H // ....................................................................................*......... - sqrdmulh v16.8H, v18.8H, v30.8H // .....................................................................................*........ - sqrdmulh v26.8H, v31.8H, v0.H[7] // ...........e.................................................................................. - // gap // .............................................................................................. - // gap // .............................................................................................. - // gap // .............................................................................................. - // gap // .............................................................................................. - sqrdmulh v10.8H, v20.8H, v30.8H // ........................................................................................*..... - mul v3.8H, v20.8H, v29.8H // .......................................................................................*...... - sqrdmulh v4.8H, v11.8H, v0.H[1] // ...................................................................*.......................... - mls v9.8H, v12.8H, v7.H[0] // ......................e....................................................................... - // gap // .............................................................................................. - // gap // .............................................................................................. - // gap // .............................................................................................. 
- // gap // .............................................................................................. - sqrdmulh v18.8H, v22.8H, v0.H[1] // ........................................................................*..................... - add v12.8H, v28.8H, v25.8H // .......................................................*...................................... - sub v25.8H, v28.8H, v25.8H // ......................................................*....................................... - // gap // .............................................................................................. - // gap // .............................................................................................. - // gap // .............................................................................................. - // gap // .............................................................................................. - mul v14.8H, v14.8H, v1.H[0] // ...............e.............................................................................. - mls v5.8H, v21.8H, v7.H[0] // ...........................e.................................................................. - // gap // .............................................................................................. - mul v20.8H, v22.8H, v0.H[0] // .......................................................................*...................... - // gap // .............................................................................................. - // gap // .............................................................................................. - // gap // .............................................................................................. - mul v22.8H, v11.8H, v0.H[0] // ..................................................................*........................... - mls v23.8H, v16.8H, v7.H[0] // ......................................................................................*....... 
- mul v11.8H, v12.8H, v29.8H // ..............................................................................*............... - sqrdmulh v31.8H, v25.8H, v0.H[1] // .........................................................*.................................... - mul v16.8H, v25.8H, v0.H[0] // ........................................................*..................................... - sqrdmulh v12.8H, v12.8H, v30.8H // ...............................................................................*.............. - // gap // .............................................................................................. - // gap // .............................................................................................. - // gap // .............................................................................................. - // gap // .............................................................................................. - mls v15.8H, v19.8H, v7.H[0] // ...............................................................*.............................. - mls v3.8H, v10.8H, v7.H[0] // .........................................................................................*.... - mls v2.8H, v26.8H, v7.H[0] // ............e................................................................................. - mls v14.8H, v13.8H, v7.H[0] // .................e............................................................................ - // gap // .............................................................................................. - // gap // .............................................................................................. - // gap // .............................................................................................. - // gap // .............................................................................................. 
- mls v22.8H, v4.8H, v7.H[0] // ....................................................................*......................... - add v8.8H, v17.8H, v8.8H // ..............e............................................................................... - str q23, [x0, #128] // ............................................................................................*. - mls v20.8H, v18.8H, v7.H[0] // .........................................................................*.................... - sub v13.8H, v24.8H, v27.8H // ......................................e....................................................... - // gap // .............................................................................................. - // gap // .............................................................................................. - // gap // .............................................................................................. - mls v16.8H, v31.8H, v7.H[0] // ..........................................................*................................... - mls v11.8H, v12.8H, v7.H[0] // ................................................................................*............. - add v25.8H, v24.8H, v27.8H // .......................................e...................................................... - add v23.8H, v9.8H, v5.8H // ............................................e................................................. - // gap // .............................................................................................. - // gap // .............................................................................................. - // gap // .............................................................................................. - // gap // .............................................................................................. 
- add v28.8H, v6.8H, v8.8H // .............................e................................................................ - str q3, [x0, #192] // .............................................................................................* - add v12.8H, v2.8H, v14.8H // ..................................e........................................................... - sub v26.8H, v9.8H, v5.8H // ...........................................e.................................................. - str q15, [x0, #320] // ...........................................................................*.................. - sub v19.8H, v2.8H, v14.8H // .................................e............................................................ - // gap // .............................................................................................. - // gap // .............................................................................................. - str q20, [x0, #448] // .............................................................................*................ - sqdmulh v2.8H, v25.8H, v7.H[1] // ...................................................e.......................................... - sub v9.8H, v6.8H, v8.8H // ............................e................................................................. - str q22, [x0, #384] // ............................................................................*................. - sqrdmulh v22.8H, v13.8H, v0.H[5] // .........................................e.................................................... - // gap // .............................................................................................. - // gap // .............................................................................................. - // gap // .............................................................................................. 
- sqrdmulh v18.8H, v19.8H, v0.H[3] // ....................................e......................................................... - mul v3.8H, v19.8H, v0.H[2] // ...................................e.......................................................... - add v20.8H, v12.8H, v23.8H // ............................................................e................................. - str q16, [x0, #256] // ..........................................................................*................... - str q11, [x0], #(16) // ..........................................................................................*... - mul v11.8H, v13.8H, v0.H[4] // ........................................e..................................................... - // gap // .............................................................................................. - // gap // .............................................................................................. + ldr q16, [x0, #272] // ....e................................................................................... + // gap // ........................................................................................ + mul v25.8H, v11.8H, v0.H[4] // .............................................*.......................................... + sqrdmulh v3.8H, v11.8H, v0.H[5] // ..............................................*......................................... + ldr q5, [x0, #208] // ...e.................................................................................... + sqrdmulh v19.8H, v8.8H, v0.H[3] // ....................................*................................................... + ldr q31, [x0, #144] // ..e..................................................................................... + add v27.8H, v21.8H, v10.8H // ......................................................*................................. 
+ mul v9.8H, v4.8H, v0.H[0] // .......................................................*................................ + sqrdmulh v24.8H, v4.8H, v0.H[1] // ........................................................*............................... + add v23.8H, v28.8H, v14.8H // ...........................................................*............................ + ldr q18, [x0, #336] // .....e.................................................................................. + ldr q20, [x0, #80] // .e...................................................................................... + ldr q11, [x0, #16] // e....................................................................................... + // gap // ........................................................................................ + mls v12.8H, v13.8H, v7.H[0] // ..........................................................................*............. + sub v4.8H, v28.8H, v14.8H // ..........................................................*............................. + mls v22.8H, v15.8H, v7.H[0] // ....................................................*................................... + ldr q8, [x0, #400] // ......e................................................................................. + // gap // ........................................................................................ + // gap // ........................................................................................ + sqrdmulh v2.8H, v27.8H, v30.8H // ............................................................................*........... + ldr q10, [x0, #464] // .......e................................................................................ + mul v27.8H, v27.8H, v29.8H // ...........................................................................*............ + // gap // ........................................................................................ 
+ mls v25.8H, v3.8H, v7.H[0] // ...............................................*........................................ + mul v3.8H, v23.8H, v29.8H // ..............................................................................*......... + // gap // ........................................................................................ + // gap // ........................................................................................ + sqrdmulh v6.8H, v23.8H, v30.8H // ...............................................................................*........ + // gap // ........................................................................................ + mls v17.8H, v19.8H, v7.H[0] // .....................................*.................................................. + mls v9.8H, v24.8H, v7.H[0] // .........................................................*.............................. + // gap // ........................................................................................ + str q12, [x0], #(16) // ....................................................................................*... + sqrdmulh v26.8H, v4.8H, v0.H[1] // .............................................................*.......................... + sub v14.8H, v31.8H, v5.8H // .............e.......................................................................... + // gap // ........................................................................................ + // gap // ........................................................................................ + mul v4.8H, v4.8H, v0.H[0] // ............................................................*........................... + add v24.8H, v16.8H, v18.8H // ...................e.................................................................... + str q22, [x0, #240] // ....................................................................*................... 
+ add v21.8H, v11.8H, v20.8H // .........e.............................................................................. + sub v20.8H, v11.8H, v20.8H // ........e............................................................................... + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + mls v27.8H, v2.8H, v7.H[0] // .............................................................................*.......... + sub v19.8H, v16.8H, v18.8H // ..................e..................................................................... + sub v11.8H, v17.8H, v25.8H // ...............................................................*........................ + sub v18.8H, v8.8H, v10.8H // .......................e................................................................ + add v28.8H, v17.8H, v25.8H // ................................................................*....................... + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + sqrdmulh v16.8H, v14.8H, v1.H[1] // ................e....................................................................... + str q9, [x0, #304] // .....................................................................*.................. + sqrdmulh v9.8H, v20.8H, v0.H[7] // ...........e............................................................................ 
+ mul v13.8H, v14.8H, v1.H[0] // ...............e........................................................................ + add v15.8H, v8.8H, v10.8H // ........................e............................................................... + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + sqrdmulh v14.8H, v18.8H, v1.H[5] // ..........................e............................................................. + mul v12.8H, v18.8H, v1.H[4] // .........................e.............................................................. + sqrdmulh v10.8H, v19.8H, v1.H[3] // .....................e.................................................................. + mul v25.8H, v19.8H, v1.H[2] // ....................e................................................................... + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + mul v23.8H, v28.8H, v29.8H // .................................................................................*...... + mul v19.8H, v20.8H, v0.H[6] // ..........e............................................................................. + sqrdmulh v22.8H, v11.8H, v0.H[1] // ..................................................................*..................... 
+ // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + add v20.8H, v31.8H, v5.8H // ..............e......................................................................... + add v2.8H, v24.8H, v15.8H // .......................................e................................................ + sqrdmulh v31.8H, v28.8H, v30.8H // ..................................................................................*..... + sub v28.8H, v24.8H, v15.8H // ......................................e................................................. + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + mls v13.8H, v16.8H, v7.H[0] // .................e...................................................................... + mls v12.8H, v14.8H, v7.H[0] // ...........................e............................................................ + mls v25.8H, v10.8H, v7.H[0] // ......................e................................................................. + sub v18.8H, v21.8H, v20.8H // ............................e........................................................... + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ 
+ // gap // ........................................................................................ + mul v5.8H, v11.8H, v0.H[0] // .................................................................*...................... + add v11.8H, v21.8H, v20.8H // .............................e.......................................................... + mul v14.8H, v28.8H, v0.H[4] // ........................................e............................................... + sqrdmulh v20.8H, v28.8H, v0.H[5] // .........................................e.............................................. + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + mls v19.8H, v9.8H, v7.H[0] // ............e........................................................................... + mls v23.8H, v31.8H, v7.H[0] // ...................................................................................*.... + // gap // ........................................................................................ + sqrdmulh v17.8H, v18.8H, v0.H[3] // ...............................e........................................................ + mul v28.8H, v18.8H, v0.H[2] // ..............................e......................................................... + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ 
+ mls v4.8H, v26.8H, v7.H[0] // ..............................................................*......................... + // gap // ........................................................................................ + mls v3.8H, v6.8H, v7.H[0] // ................................................................................*....... + add v10.8H, v25.8H, v12.8H // ............................................e........................................... + add v24.8H, v11.8H, v2.8H // .................................................e...................................... + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + mls v5.8H, v22.8H, v7.H[0] // ...................................................................*.................... + mls v14.8H, v20.8H, v7.H[0] // ..........................................e............................................. + // gap // ........................................................................................ + // gap // ........................................................................................ + str q27, [x0, #48] // .....................................................................................*.. + // gap // ........................................................................................ + sub v8.8H, v19.8H, v13.8H // .................................e...................................................... + sub v27.8H, v11.8H, v2.8H // ................................................e....................................... + add v21.8H, v19.8H, v13.8H // ..................................e..................................................... 
+ sub v11.8H, v25.8H, v12.8H // ...........................................e............................................ + mul v12.8H, v24.8H, v29.8H // ........................................................................e............... + // gap // ........................................................................................ + sqrdmulh v13.8H, v24.8H, v30.8H // .........................................................................e.............. + mls v28.8H, v17.8H, v7.H[0] // ................................e....................................................... + str q4, [x0, #368] // ......................................................................*................. + // gap // ........................................................................................ + str q23, [x0, #176] // .......................................................................................* + str q3, [x0, #112] // ......................................................................................*. + // gap // ........................................................................................ + mul v22.8H, v27.8H, v0.H[0] // ..................................................e..................................... + sqrdmulh v15.8H, v27.8H, v0.H[1] // ...................................................e.................................... + mul v17.8H, v8.8H, v0.H[2] // ...................................e.................................................... + // gap // ........................................................................................ + str q5, [x0, #432] // .......................................................................*................ + sub v4.8H, v21.8H, v10.8H // .....................................................e.................................. 
// original source code - // ldr q8, [x0, #0] // ...........e..............................................................................|..............e............................................................................. - // ldr q9, [x0, #(1*(512/8))] // ..............e...........................................................................|.................e.......................................................................... - // ldr q10, [x0, #(2*(512/8))] // .......e..................................................................................|..........e................................................................................. - // ldr q11, [x0, #(3*(512/8))] // ......e...................................................................................|.........e.................................................................................. - // ldr q12, [x0, #(4*(512/8))] // ..e.......................................................................................|.....e...................................................................................... - // ldr q13, [x0, #(5*(512/8))] // .e........................................................................................|....e....................................................................................... - // ldr q14, [x0, #(6*(512/8))] // e.........................................................................................|...e........................................................................................ - // ldr q15, [x0, #(7*(512/8))] // ........e.................................................................................|...........e................................................................................ - // sub v24.8h, v8.8h, v9.8h // ............................e.............................................................|...............................e............................................................ 
- // add v8.8h, v8.8h, v9.8h // .............................e............................................................|................................e........................................................... - // mul v9.8h, v24.8h, v0.h[6] // ........................................e.................................................|...........................................e................................................ - // sqrdmulh v24.8h, v24.8h, v0.h[7] // ...........................................e..............................................|..............................................e............................................. - // mls v9.8h, v24.8h, v7.h[0] // ..............................................................e...........................|.................................................................e.......................... - // sub v24.8h, v10.8h, v11.8h // .........................e................................................................|............................e............................................................... - // add v10.8h, v10.8h, v11.8h // .................................................................e........................|....................................................................e....................... - // mul v11.8h, v24.8h, v1.h[0] // ...................................................e......................................|......................................................e..................................... - // sqrdmulh v24.8h, v24.8h, v1.h[1] // ...............................e..........................................................|..................................e......................................................... - // mls v11.8h, v24.8h, v7.h[0] // ...............................................................e..........................|..................................................................e......................... 
- // sub v24.8h, v12.8h, v13.8h // ....................e.....................................................................|.......................e.................................................................... - // add v12.8h, v12.8h, v13.8h // ...................e......................................................................|......................e..................................................................... - // mul v13.8h, v24.8h, v1.h[2] // ...................................e......................................................|......................................e..................................................... - // sqrdmulh v24.8h, v24.8h, v1.h[3] // ..............................e...........................................................|.................................e.......................................................... - // mls v13.8h, v24.8h, v7.h[0] // ...............................................e..........................................|..................................................e......................................... - // sub v24.8h, v14.8h, v15.8h // .......................e..................................................................|..........................e................................................................. - // add v14.8h, v14.8h, v15.8h // ..........................e...............................................................|.............................e.............................................................. - // mul v15.8h, v24.8h, v1.h[4] // ................................e.........................................................|...................................e........................................................ - // sqrdmulh v24.8h, v24.8h, v1.h[5] // ..................................e.......................................................|.....................................e...................................................... 
- // mls v15.8h, v24.8h, v7.h[0] // ....................................................e.....................................|.......................................................e.................................... - // sub v24.8h, v8.8h, v10.8h // .................................................................................e........|....................................................................................e....... - // add v8.8h, v8.8h, v10.8h // .........................................................................e................|............................................................................e............... - // mul v10.8h, v24.8h, v0.h[2] // ..........................................................................................|..*......................................................................................... - // sqrdmulh v24.8h, v24.8h, v0.h[3] // ..........................................................................................|*........................................................................................... - // mls v10.8h, v24.8h, v7.h[0] // ................*.........................................................................|...................*........................................................................ - // sub v24.8h, v9.8h, v11.8h // ..............................................................................e...........|.................................................................................e.......... - // add v9.8h, v9.8h, v11.8h // ...........................................................................e..............|..............................................................................e............. - // mul v11.8h, v24.8h, v0.h[2] // .....................................................................................e....|........................................................................................e... 
- // sqrdmulh v24.8h, v24.8h, v0.h[3] // ....................................................................................e.....|.......................................................................................e.... - // mls v11.8h, v24.8h, v7.h[0] // .................*........................................................................|....................*....................................................................... - // sub v24.8h, v12.8h, v14.8h // ....................................................................e.....................|.......................................................................e.................... - // add v12.8h, v12.8h, v14.8h // .......................................................................e..................|..........................................................................e................. - // mul v14.8h, v24.8h, v0.h[4] // .........................................................................................e|............................................................................................ - // sqrdmulh v24.8h, v24.8h, v0.h[5] // ...................................................................................e......|......................................................................................e..... - // mls v14.8h, v24.8h, v7.h[0] // .............*............................................................................|................*........................................................................... - // sub v24.8h, v13.8h, v15.8h // ............................................................................e.............|...............................................................................e............ - // add v13.8h, v13.8h, v15.8h // ........................................................................e.................|...........................................................................e................ 
- // mul v15.8h, v24.8h, v0.h[4] // .....*....................................................................................|........*................................................................................... - // sqrdmulh v24.8h, v24.8h, v0.h[5] // ............*.............................................................................|...............*............................................................................ - // mls v15.8h, v24.8h, v7.h[0] // ........................*.................................................................|...........................*................................................................ - // sqdmulh v25.8h, v8.8h, v7.h[1] // ..........................................................................................|.*.......................................................................................... - // srshr v25.8h, v25.8h, #11 // ..................*.......................................................................|.....................*...................................................................... - // mls v8.8h, v25.8h, v7.h[0] // .......................................*..................................................|..........................................*................................................. - // sqdmulh v25.8h, v12.8h, v7.h[1] // ................................................................................e.........|...................................................................................e........ - // srshr v25.8h, v25.8h, #11 // .........*................................................................................|............*............................................................................... - // mls v12.8h, v25.8h, v7.h[0] // .....................*....................................................................|........................*................................................................... 
- // sub v24.8h, v8.8h, v12.8h // ..................................................*.......................................|.....................................................*...................................... - // add v8.8h, v8.8h, v12.8h // .................................................*........................................|....................................................*....................................... - // mul v12.8h, v24.8h, v0.h[0] // ..........................................................*...............................|.............................................................*.............................. - // sqrdmulh v24.8h, v24.8h, v0.h[1] // .........................................................*................................|............................................................*............................... - // mls v12.8h, v24.8h, v7.h[0] // .....................................................................*....................|........................................................................*................... - // sub v24.8h, v9.8h, v13.8h // ..........................................................................................*............................................................................................ - // add v9.8h, v9.8h, v13.8h // ......................................................................................e...|.........................................................................................e.. - // mul v13.8h, v24.8h, v0.h[0] // ...............*..........................................................................|..................*......................................................................... - // sqrdmulh v24.8h, v24.8h, v0.h[1] // ..........*...............................................................................|.............*.............................................................................. 
- // mls v13.8h, v24.8h, v7.h[0] // ............................................................*.............................|...............................................................*............................ - // sub v24.8h, v10.8h, v14.8h // ......................................*...................................................|.........................................*.................................................. - // add v10.8h, v10.8h, v14.8h // ...........................*..............................................................|..............................*............................................................. - // mul v14.8h, v24.8h, v0.h[0] // ......................................................*...................................|.........................................................*.................................. - // sqrdmulh v24.8h, v24.8h, v0.h[1] // ..............................................*...........................................|.................................................*.......................................... - // mls v14.8h, v24.8h, v7.h[0] // ................................................................*.........................|...................................................................*........................ - // sub v24.8h, v11.8h, v15.8h // ....................................*.....................................................|.......................................*.................................................... - // add v11.8h, v11.8h, v15.8h // .....................................*....................................................|........................................*................................................... - // mul v15.8h, v24.8h, v0.h[0] // .....................................................*....................................|........................................................*................................... 
- // sqrdmulh v24.8h, v24.8h, v0.h[1] // ................................................*.........................................|...................................................*........................................ - // mls v15.8h, v24.8h, v7.h[0] // ...................................................................*......................|......................................................................*..................... - // str q12, [x0, #(4*(512/8))] // .......................................................................................*..|..........................................................................................*. - // str q13, [x0, #(5*(512/8))] // .............................................................................*............|................................................................................*........... - // str q14, [x0, #(6*(512/8))] // ..................................................................................*.......|.....................................................................................*...... - // str q15, [x0, #(7*(512/8))] // ...............................................................................*..........|..................................................................................*......... - // mul v12.8h, v8.8h, v29.8h // ........................................................*.................................|...........................................................*................................ - // sqrdmulh v8.8h, v8.8h, v30.8h // ...........................................................*..............................|..............................................................*............................. - // mls v12.8h, v8.8h, v7.h[0] // ......................................................................*...................|.........................................................................*.................. 
- // mul v13.8h, v9.8h, v29.8h // ...*......................................................................................|......*..................................................................................... - // sqrdmulh v9.8h, v9.8h, v30.8h // ....*.....................................................................................|.......*.................................................................................... - // mls v13.8h, v9.8h, v7.h[0] // ......................*...................................................................|.........................*.................................................................. - // mul v14.8h, v10.8h, v29.8h // .........................................*................................................|............................................*............................................... - // sqrdmulh v10.8h, v10.8h, v30.8h // ..........................................*...............................................|.............................................*.............................................. - // mls v14.8h, v10.8h, v7.h[0] // .......................................................*..................................|..........................................................*................................. - // mul v15.8h, v11.8h, v29.8h // .............................................*............................................|................................................*........................................... - // sqrdmulh v11.8h, v11.8h, v30.8h // ............................................*.............................................|...............................................*............................................ - // mls v15.8h, v11.8h, v7.h[0] // .............................................................*............................|................................................................*........................... 
- // str q12, [x0], #(16) // ........................................................................................*.|...........................................................................................* - // str q13, [x0, #(-16 + 1*(512/8))] // .................................*........................................................|....................................*....................................................... - // str q14, [x0, #(-16 + 2*(512/8))] // ..................................................................*.......................|.....................................................................*...................... - // str q15, [x0, #(-16 + 3*(512/8))] // ..........................................................................*...............|.............................................................................*.............. + // ldr q8, [x0, #0] // ............e...........................................................................|...........e.......................................................................... + // ldr q9, [x0, #(1*(512/8))] // ...........e............................................................................|..........e........................................................................... + // ldr q10, [x0, #(2*(512/8))] // .....e..................................................................................|....e................................................................................. + // ldr q11, [x0, #(3*(512/8))] // ...e....................................................................................|..e................................................................................... + // ldr q12, [x0, #(4*(512/8))] // e.......................................................................................e...................................................................................... 
+ // ldr q13, [x0, #(5*(512/8))] // ..........e.............................................................................|.........e............................................................................ + // ldr q14, [x0, #(6*(512/8))] // ................e.......................................................................|...............e...................................................................... + // ldr q15, [x0, #(7*(512/8))] // ..................e.....................................................................|.................e.................................................................... + // sub v24.8h, v8.8h, v9.8h // ................................e.......................................................|...............................e...................................................... + // add v8.8h, v8.8h, v9.8h // ...............................e........................................................|..............................e....................................................... + // mul v9.8h, v24.8h, v0.h[6] // ................................................e.......................................|...............................................e...................................... + // sqrdmulh v24.8h, v24.8h, v0.h[7] // ........................................e...............................................|.......................................e.............................................. + // mls v9.8h, v24.8h, v7.h[0] // ..............................................................e.........................|.............................................................e........................ + // sub v24.8h, v10.8h, v11.8h // ...........................e............................................................|..........................e........................................................... 
+ // add v10.8h, v10.8h, v11.8h // ..................................................e.....................................|.................................................e.................................... + // mul v11.8h, v24.8h, v1.h[0] // .........................................e..............................................|........................................e............................................. + // sqrdmulh v24.8h, v24.8h, v1.h[1] // ......................................e.................................................|.....................................e................................................ + // mls v11.8h, v24.8h, v7.h[0] // ......................................................e.................................|.....................................................e................................ + // sub v24.8h, v12.8h, v13.8h // ..................................e.....................................................|.................................e.................................................... + // add v12.8h, v12.8h, v13.8h // .............................e..........................................................|............................e......................................................... + // mul v13.8h, v24.8h, v1.h[2] // ..............................................e.........................................|.............................................e........................................ + // sqrdmulh v24.8h, v24.8h, v1.h[3] // .............................................e..........................................|............................................e......................................... + // mls v13.8h, v24.8h, v7.h[0] // ........................................................e...............................|.......................................................e.............................. 
+ // sub v24.8h, v14.8h, v15.8h // ....................................e...................................................|...................................e.................................................. + // add v14.8h, v14.8h, v15.8h // ..........................................e.............................................|.........................................e............................................ + // mul v15.8h, v24.8h, v1.h[4] // ............................................e...........................................|...........................................e.......................................... + // sqrdmulh v24.8h, v24.8h, v1.h[5] // ...........................................e............................................|..........................................e........................................... + // mls v15.8h, v24.8h, v7.h[0] // .......................................................e................................|......................................................e............................... + // sub v24.8h, v8.8h, v10.8h // .........................................................e..............................|........................................................e............................. + // add v8.8h, v8.8h, v10.8h // ...........................................................e............................|..........................................................e........................... + // mul v10.8h, v24.8h, v0.h[2] // .................................................................e......................|................................................................e..................... + // sqrdmulh v24.8h, v24.8h, v0.h[3] // ................................................................e.......................|...............................................................e...................... 
+ // mls v10.8h, v24.8h, v7.h[0] // ...............................................................................e........|..............................................................................e....... + // sub v24.8h, v9.8h, v11.8h // .........................................................................e..............|........................................................................e............. + // add v9.8h, v9.8h, v11.8h // ...........................................................................e............|..........................................................................e........... + // mul v11.8h, v24.8h, v0.h[2] // .....................................................................................e..|....................................................................................e. + // sqrdmulh v24.8h, v24.8h, v0.h[3] // ....*...................................................................................|...*.................................................................................. + // mls v11.8h, v24.8h, v7.h[0] // .......................*................................................................|......................*............................................................... + // sub v24.8h, v12.8h, v14.8h // .....................................................e..................................|....................................................e................................. + // add v12.8h, v12.8h, v14.8h // ...................................................e....................................|..................................................e................................... + // mul v14.8h, v24.8h, v0.h[4] // ............................................................e...........................|...........................................................e.......................... 
+ // sqrdmulh v24.8h, v24.8h, v0.h[5] // .............................................................e..........................|............................................................e......................... + // mls v14.8h, v24.8h, v7.h[0] // .......................................................................e................|......................................................................e............... + // sub v24.8h, v13.8h, v15.8h // ............................................................................e...........|...........................................................................e.......... + // add v13.8h, v13.8h, v15.8h // ....................................................................e...................|...................................................................e.................. + // mul v15.8h, v24.8h, v0.h[4] // .*......................................................................................|*..................................................................................... + // sqrdmulh v24.8h, v24.8h, v0.h[5] // ..*.....................................................................................|.*.................................................................................... + // mls v15.8h, v24.8h, v7.h[0] // ....................*...................................................................|...................*.................................................................. + // sub v24.8h, v8.8h, v12.8h // ..........................................................................e.............|.........................................................................e............ + // add v8.8h, v8.8h, v12.8h // .....................................................................e..................|....................................................................e................. 
+ // mul v12.8h, v24.8h, v0.h[0] // ...................................................................................e....|..................................................................................e... + // sqrdmulh v24.8h, v24.8h, v0.h[1] // ....................................................................................e...|...................................................................................e.. + // mls v12.8h, v24.8h, v7.h[0] // ...............*........................................................................|..............*....................................................................... + // sub v24.8h, v9.8h, v13.8h // .......................................................................................e|...................................................................................... + // add v9.8h, v9.8h, v13.8h // ......*.................................................................................|.....*................................................................................ + // mul v13.8h, v24.8h, v0.h[0] // .......*................................................................................|......*............................................................................... + // sqrdmulh v24.8h, v24.8h, v0.h[1] // ........*...............................................................................|.......*.............................................................................. + // mls v13.8h, v24.8h, v7.h[0] // ........................*...............................................................|.......................*.............................................................. + // sub v24.8h, v10.8h, v14.8h // ..............*.........................................................................|.............*........................................................................ 
+ // add v10.8h, v10.8h, v14.8h // .........*..............................................................................|........*............................................................................. + // mul v14.8h, v24.8h, v0.h[0] // ............................*...........................................................|...........................*.......................................................... + // sqrdmulh v24.8h, v24.8h, v0.h[1] // ..........................*.............................................................|.........................*............................................................ + // mls v14.8h, v24.8h, v7.h[0] // ..................................................................*.....................|.................................................................*.................... + // sub v24.8h, v11.8h, v15.8h // ...................................*....................................................|..................................*................................................... + // add v11.8h, v11.8h, v15.8h // .....................................*..................................................|....................................*................................................. + // mul v15.8h, v24.8h, v0.h[0] // ..........................................................*.............................|.........................................................*............................ + // sqrdmulh v24.8h, v24.8h, v0.h[1] // .................................................*......................................|................................................*..................................... + // mls v15.8h, v24.8h, v7.h[0] // ......................................................................*.................|.....................................................................*................ 
+ // str q12, [x0, #(4*(512/8))] // ..............................*.........................................................|.............................*........................................................ + // str q13, [x0, #(5*(512/8))] // .......................................*................................................|......................................*............................................... + // str q14, [x0, #(6*(512/8))] // ................................................................................*.......|...............................................................................*...... + // str q15, [x0, #(7*(512/8))] // ......................................................................................*.|.....................................................................................* + // mul v12.8h, v8.8h, v29.8h // .............................................................................e..........|............................................................................e......... + // sqrdmulh v8.8h, v8.8h, v30.8h // ..............................................................................e.........|.............................................................................e........ + // mls v12.8h, v8.8h, v7.h[0] // .............*..........................................................................|............*......................................................................... + // mul v13.8h, v9.8h, v29.8h // ...................*....................................................................|..................*................................................................... + // sqrdmulh v9.8h, v9.8h, v30.8h // .................*......................................................................|................*..................................................................... 
+ // mls v13.8h, v9.8h, v7.h[0] // .................................*......................................................|................................*..................................................... + // mul v14.8h, v10.8h, v29.8h // .....................*..................................................................|....................*................................................................. + // sqrdmulh v10.8h, v10.8h, v30.8h // ......................*.................................................................|.....................*................................................................ + // mls v14.8h, v10.8h, v7.h[0] // ...................................................................*....................|..................................................................*................... + // mul v15.8h, v11.8h, v29.8h // ...............................................*........................................|..............................................*....................................... + // sqrdmulh v11.8h, v11.8h, v30.8h // ....................................................*...................................|...................................................*.................................. + // mls v15.8h, v11.8h, v7.h[0] // ...............................................................*........................|..............................................................*....................... + // str q12, [x0], #(16) // .........................*..............................................................|........................*............................................................. + // str q13, [x0, #(-16 + 1*(512/8))] // ........................................................................*...............|.......................................................................*.............. 
+ // str q14, [x0, #(-16 + 2*(512/8))] // ..................................................................................*.....|.................................................................................*.... + // str q15, [x0, #(-16 + 3*(512/8))] // .................................................................................*......|................................................................................*..... sub count, count, #1 cbnz count, layer123_start - mul v24.8H, v20.8H, v29.8H // ....*............................................... - sqrdmulh v19.8H, v26.8H, v0.H[5] // .........*.......................................... - mul v27.8H, v26.8H, v0.H[4] // ......*............................................. - sqdmulh v31.8H, v28.8H, v7.H[1] // ..*................................................. - // gap // .................................................... - // gap // .................................................... - // gap // .................................................... - // gap // .................................................... - sqrdmulh v17.8H, v20.8H, v30.8H // .....*.............................................. - srshr v20.8H, v2.8H, #11 // .......*............................................ - sqrdmulh v16.8H, v9.8H, v0.H[3] // .*.................................................. - mul v8.8H, v9.8H, v0.H[2] // ...*................................................ - // gap // .................................................... - // gap // .................................................... - // gap // .................................................... - // gap // .................................................... - mls v3.8H, v18.8H, v7.H[0] // .............*...................................... - sub v23.8H, v12.8H, v23.8H // *................................................... - // gap // .................................................... 
- // gap // .................................................... - // gap // .................................................... - // gap // .................................................... - // gap // .................................................... - // gap // .................................................... - srshr v9.8H, v31.8H, #11 // ..............*..................................... - mls v11.8H, v22.8H, v7.H[0] // ..........*......................................... - mls v27.8H, v19.8H, v7.H[0] // .................*.................................. - // gap // .................................................... - // gap // .................................................... - // gap // .................................................... - // gap // .................................................... - // gap // .................................................... - mls v25.8H, v20.8H, v7.H[0] // ...............*.................................... - mls v8.8H, v16.8H, v7.H[0] // ............*....................................... - sqrdmulh v14.8H, v23.8H, v0.H[1] // ........*........................................... - mul v10.8H, v23.8H, v0.H[0] // ...........*........................................ - // gap // .................................................... - // gap // .................................................... - // gap // .................................................... - // gap // .................................................... - mls v24.8H, v17.8H, v7.H[0] // ................*................................... - // gap // .................................................... - // gap // .................................................... - // gap // .................................................... - // gap // .................................................... - // gap // .................................................... - // gap // .................................................... 
- // gap // .................................................... - sub v23.8H, v3.8H, v27.8H // ....................*............................... - add v31.8H, v3.8H, v27.8H // .....................*.............................. - mls v28.8H, v9.8H, v7.H[0] // .......................*............................ - // gap // .................................................... - // gap // .................................................... - // gap // .................................................... - // gap // .................................................... - // gap // .................................................... - mls v10.8H, v14.8H, v7.H[0] // .......................................*............ - sub v2.8H, v8.8H, v11.8H // ......................*............................. - // gap // .................................................... - // gap // .................................................... - // gap // .................................................... - // gap // .................................................... - // gap // .................................................... - // gap // .................................................... - sqrdmulh v13.8H, v31.8H, v30.8H // ..........................*......................... - mul v20.8H, v31.8H, v29.8H // ...........................*........................ - add v31.8H, v8.8H, v11.8H // ..................*................................. - str q24, [x0, #64] // ...................*................................ - mul v16.8H, v23.8H, v0.H[0] // ................................*................... - // gap // .................................................... - // gap // .................................................... - // gap // .................................................... - sub v12.8H, v28.8H, v25.8H // ...............................*.................... - add v21.8H, v28.8H, v25.8H // ..............................*..................... 
- sqrdmulh v3.8H, v2.8H, v0.H[1] // ............................*....................... - sqrdmulh v11.8H, v23.8H, v0.H[1] // .............................*...................... - // gap // .................................................... - // gap // .................................................... - // gap // .................................................... - // gap // .................................................... - sqrdmulh v24.8H, v31.8H, v30.8H // .........................*.......................... - mul v17.8H, v31.8H, v29.8H // ........................*........................... - str q10, [x0, #320] // ...............................................*.... - mul v10.8H, v2.8H, v0.H[0] // .................................*.................. - // gap // .................................................... - // gap // .................................................... - // gap // .................................................... - // gap // .................................................... - mul v5.8H, v21.8H, v29.8H // ...................................*................ - sqrdmulh v2.8H, v21.8H, v30.8H // ......................................*............. - sqrdmulh v23.8H, v12.8H, v0.H[1] // ....................................*............... - mul v12.8H, v12.8H, v0.H[0] // .....................................*.............. - // gap // .................................................... - // gap // .................................................... - // gap // .................................................... - // gap // .................................................... - mls v20.8H, v13.8H, v7.H[0] // ........................................*........... - mls v16.8H, v11.8H, v7.H[0] // ...........................................*........ - // gap // .................................................... - // gap // .................................................... 
- // gap // .................................................... - // gap // .................................................... - // gap // .................................................... - // gap // .................................................... - mls v10.8H, v3.8H, v7.H[0] // .........................................*.......... - mls v17.8H, v24.8H, v7.H[0] // ..................................*................. - // gap // .................................................... - // gap // .................................................... - // gap // .................................................... - // gap // .................................................... - // gap // .................................................... - // gap // .................................................... - // gap // .................................................... - // gap // .................................................... - mls v5.8H, v2.8H, v7.H[0] // .............................................*...... - mls v12.8H, v23.8H, v7.H[0] // ............................................*....... - // gap // .................................................... - // gap // .................................................... - // gap // .................................................... - // gap // .................................................... - str q20, [x0, #192] // ..............................................*..... - str q16, [x0, #448] // ................................................*... - // gap // .................................................... - // gap // .................................................... - // gap // .................................................... - // gap // .................................................... - // gap // .................................................... - // gap // .................................................... 
- str q17, [x0, #128] // ..........................................*......... - str q10, [x0, #384] // .................................................*.. - // gap // .................................................... - // gap // .................................................... - // gap // .................................................... - // gap // .................................................... - // gap // .................................................... - // gap // .................................................... - str q5, [x0], #(16) // ...................................................* - str q12, [x0, #240] // ..................................................*. - // gap // .................................................... - // gap // .................................................... - // gap // .................................................... - // gap // .................................................... - // gap // .................................................... - // gap // .................................................... + mul v6.8H, v11.8H, v0.H[4] // *..................................... + sqrdmulh v27.8H, v11.8H, v0.H[5] // .*.................................... + add v11.8H, v21.8H, v10.8H // ...*.................................. + sqrdmulh v3.8H, v8.8H, v0.H[3] // ..*................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + mul v25.8H, v4.8H, v0.H[0] // ....*................................. + sqrdmulh v24.8H, v4.8H, v0.H[1] // .....*................................ + add v26.8H, v28.8H, v14.8H // ......*............................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... 
+ // gap // ...................................... + // gap // ...................................... + sub v16.8H, v28.8H, v14.8H // ........*............................. + sqrdmulh v23.8H, v11.8H, v30.8H // ..........*........................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + mls v6.8H, v27.8H, v7.H[0] // ............*......................... + mls v17.8H, v3.8H, v7.H[0] // ...............*...................... + mul v18.8H, v26.8H, v29.8H // .............*........................ + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + mls v22.8H, v15.8H, v7.H[0] // .........*............................ + sqrdmulh v26.8H, v26.8H, v30.8H // ..............*....................... + mul v4.8H, v11.8H, v29.8H // ...........*.......................... + mls v25.8H, v24.8H, v7.H[0] // ................*..................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + mls v12.8H, v13.8H, v7.H[0] // .......*.............................. + sqrdmulh v9.8H, v16.8H, v0.H[1] // ..................*................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... 
+ sub v19.8H, v17.8H, v6.8H // ......................*............... + add v21.8H, v17.8H, v6.8H // .......................*.............. + mul v3.8H, v16.8H, v0.H[0] // ...................*.................. + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + str q22, [x0, #256] // ....................*................. + mls v18.8H, v26.8H, v7.H[0] // ...............................*...... + mls v4.8H, v23.8H, v7.H[0] // .....................*................ + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + mul v11.8H, v19.8H, v0.H[0] // ............................*......... + sqrdmulh v19.8H, v19.8H, v0.H[1] // ..........................*........... + mul v23.8H, v21.8H, v29.8H // .........................*............ + sqrdmulh v10.8H, v21.8H, v30.8H // ...........................*.......... + str q25, [x0, #320] // ........................*............. + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + mls v3.8H, v9.8H, v7.H[0] // ..............................*....... + str q12, [x0], #(16) // .................*.................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + str q4, [x0, #48] // .................................*.... + // gap // ...................................... 
+ // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + str q18, [x0, #112] // ....................................*. + mls v11.8H, v19.8H, v7.H[0] // ................................*..... + mls v23.8H, v10.8H, v7.H[0] // .............................*........ + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + str q3, [x0, #368] // ..................................*... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + str q11, [x0, #432] // .....................................* + str q23, [x0, #176] // ...................................*.. + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... 
+ // gap // ...................................... // original source code - // sub v27.8H, v12.8H, v23.8H // .........*.......................................... - // sqrdmulh v16.8H, v9.8H, v0.H[3] // ......*............................................. - // sqdmulh v24.8H, v28.8H, v7.H[1] // ...*................................................ - // mul v4.8H, v9.8H, v0.H[2] // .......*............................................ - // mul v10.8H, v20.8H, v29.8H // *................................................... - // sqrdmulh v23.8H, v20.8H, v30.8H // ....*............................................... - // mul v20.8H, v26.8H, v0.H[4] // ..*................................................. - // srshr v12.8H, v2.8H, #11 // .....*.............................................. - // sqrdmulh v19.8H, v27.8H, v0.H[1] // ...............*.................................... - // sqrdmulh v31.8H, v26.8H, v0.H[5] // .*.................................................. - // mls v11.8H, v22.8H, v7.H[0] // ...........*........................................ - // mul v15.8H, v27.8H, v0.H[0] // ................*................................... - // mls v4.8H, v16.8H, v7.H[0] // ..............*..................................... - // mls v3.8H, v18.8H, v7.H[0] // ........*........................................... - // srshr v16.8H, v24.8H, #11 // ..........*......................................... - // mls v25.8H, v12.8H, v7.H[0] // .............*...................................... - // mls v10.8H, v23.8H, v7.H[0] // .................*.................................. - // mls v20.8H, v31.8H, v7.H[0] // ............*....................................... - // add v18.8H, v4.8H, v11.8H // .........................*.......................... - // str q10, [x0, #64] // ..........................*......................... - // sub v22.8H, v3.8H, v20.8H // ..................*................................. 
- // add v20.8H, v3.8H, v20.8H // ...................*................................ - // sub v11.8H, v4.8H, v11.8H // ......................*............................. - // mls v28.8H, v16.8H, v7.H[0] // ....................*............................... - // mul v23.8H, v18.8H, v29.8H // .................................*.................. - // sqrdmulh v16.8H, v18.8H, v30.8H // ................................*................... - // sqrdmulh v10.8H, v20.8H, v30.8H // .......................*............................ - // mul v3.8H, v20.8H, v29.8H // ........................*........................... - // sqrdmulh v4.8H, v11.8H, v0.H[1] // ..............................*..................... - // sqrdmulh v18.8H, v22.8H, v0.H[1] // ...............................*.................... - // add v12.8H, v28.8H, v25.8H // .............................*...................... - // sub v25.8H, v28.8H, v25.8H // ............................*....................... - // mul v20.8H, v22.8H, v0.H[0] // ...........................*........................ - // mul v22.8H, v11.8H, v0.H[0] // ...................................*................ - // mls v23.8H, v16.8H, v7.H[0] // ...........................................*........ - // mul v11.8H, v12.8H, v29.8H // ....................................*............... - // sqrdmulh v31.8H, v25.8H, v0.H[1] // ......................................*............. - // mul v16.8H, v25.8H, v0.H[0] // .......................................*............ - // sqrdmulh v12.8H, v12.8H, v30.8H // .....................................*.............. - // mls v15.8H, v19.8H, v7.H[0] // .....................*.............................. - // mls v3.8H, v10.8H, v7.H[0] // ........................................*........... - // mls v22.8H, v4.8H, v7.H[0] // ..........................................*......... - // str q23, [x0, #128] // ................................................*... 
- // mls v20.8H, v18.8H, v7.H[0] // .........................................*.......... - // mls v16.8H, v31.8H, v7.H[0] // .............................................*...... - // mls v11.8H, v12.8H, v7.H[0] // ............................................*....... - // str q3, [x0, #192] // ..............................................*..... - // str q15, [x0, #320] // ..................................*................. - // str q20, [x0, #448] // ...............................................*.... - // str q22, [x0, #384] // .................................................*.. - // str q16, [x0, #256] // ...................................................* - // str q11, [x0], #(16) // ..................................................*. + // mul v25.8H, v11.8H, v0.H[4] // *..................................... + // sqrdmulh v3.8H, v11.8H, v0.H[5] // .*.................................... + // sqrdmulh v19.8H, v8.8H, v0.H[3] // ...*.................................. + // add v27.8H, v21.8H, v10.8H // ..*................................... + // mul v9.8H, v4.8H, v0.H[0] // ....*................................. + // sqrdmulh v24.8H, v4.8H, v0.H[1] // .....*................................ + // add v23.8H, v28.8H, v14.8H // ......*............................... + // mls v12.8H, v13.8H, v7.H[0] // ................*..................... + // sub v4.8H, v28.8H, v14.8H // .......*.............................. + // mls v22.8H, v15.8H, v7.H[0] // ............*......................... + // sqrdmulh v2.8H, v27.8H, v30.8H // ........*............................. + // mul v27.8H, v27.8H, v29.8H // ..............*....................... + // mls v25.8H, v3.8H, v7.H[0] // .........*............................ + // mul v3.8H, v23.8H, v29.8H // ...........*.......................... + // sqrdmulh v6.8H, v23.8H, v30.8H // .............*........................ + // mls v17.8H, v19.8H, v7.H[0] // ..........*........................... 
+ // mls v9.8H, v24.8H, v7.H[0] // ...............*...................... + // str q12, [x0], #(16) // ..............................*....... + // sqrdmulh v26.8H, v4.8H, v0.H[1] // .................*.................... + // mul v4.8H, v4.8H, v0.H[0] // ....................*................. + // str q22, [x0, #240] // .....................*................ + // mls v27.8H, v2.8H, v7.H[0] // .......................*.............. + // sub v11.8H, v17.8H, v25.8H // ..................*................... + // add v28.8H, v17.8H, v25.8H // ...................*.................. + // str q9, [x0, #304] // ............................*......... + // mul v23.8H, v28.8H, v29.8H // ..........................*........... + // sqrdmulh v22.8H, v11.8H, v0.H[1] // .........................*............ + // sqrdmulh v31.8H, v28.8H, v30.8H // ...........................*.......... + // mul v5.8H, v11.8H, v0.H[0] // ........................*............. + // mls v23.8H, v31.8H, v7.H[0] // ..................................*... + // mls v4.8H, v26.8H, v7.H[0] // .............................*........ + // mls v3.8H, v6.8H, v7.H[0] // ......................*............... + // mls v5.8H, v22.8H, v7.H[0] // .................................*.... + // str q27, [x0, #48] // ...............................*...... + // str q4, [x0, #368] // ...................................*.. + // str q23, [x0, #176] // .....................................* + // str q3, [x0, #112] // ................................*..... + // str q5, [x0, #432] // ....................................*. 
pop_stack diff --git a/tests/ntt_kyber/manual/intt_kyber_123_4567_manual_ld4_opt_m1_icestorm.s b/tests/ntt_kyber/manual/intt_kyber_123_4567_manual_ld4_opt_m1_icestorm.s index 9129d2f..7973747 100644 --- a/tests/ntt_kyber/manual/intt_kyber_123_4567_manual_ld4_opt_m1_icestorm.s +++ b/tests/ntt_kyber/manual/intt_kyber_123_4567_manual_ld4_opt_m1_icestorm.s @@ -354,550 +354,582 @@ _intt_kyber_123_4567_manual_ld4_opt_m1_icestorm: mov count, #8 .p2align 2 - ldr q27, [x4, #16] // ........*....................................... - ldr q6, [x3], #16 // .............................................*.. - ld4 {v1.4S, v2.4S, v3.4S, v4.4S}, [x1] // *............................................... - // gap // ................................................ - ldr q9, [x4], #(6*16) // .*.............................................. - ldr q10, [x4, #-32] // ...*............................................ - // gap // ................................................ - // gap // ................................................ - ldr q24, [x4, #-48] // ..*............................................. - ldr q16, [x4, #-64] // ....*........................................... - // gap // ................................................ - // gap // ................................................ - ldr q13, [x4, #-16] // .....*.......................................... - // gap // ................................................ - // gap // ................................................ - // gap // ................................................ - // gap // ................................................ - // gap // ................................................ - // gap // ................................................ - // gap // ................................................ - // gap // ................................................ - // gap // ................................................ - // gap // ................................................ 
- // gap // ................................................ - sub v21.8H, v1.8H, v2.8H // .......*........................................ - // gap // ................................................ - // gap // ................................................ - // gap // ................................................ - sub v26.8H, v3.8H, v4.8H // .........*...................................... - // gap // ................................................ - // gap // ................................................ - // gap // ................................................ - mul v8.8H, v21.8H, v16.8H // ...........*.................................... - sqrdmulh v22.8H, v21.8H, v24.8H // ..........*..................................... - // gap // ................................................ - // gap // ................................................ - sqrdmulh v13.8H, v26.8H, v13.8H // .............*.................................. - mul v14.8H, v26.8H, v10.8H // ............*................................... - // gap // ................................................ - // gap // ................................................ - add v26.8H, v1.8H, v2.8H // ......*......................................... - // gap // ................................................ - // gap // ................................................ - // gap // ................................................ - mls v8.8H, v22.8H, v7.H[0] // ...............*................................ - add v20.8H, v3.8H, v4.8H // ..............*................................. - // gap // ................................................ - // gap // ................................................ - // gap // ................................................ - // gap // ................................................ - // gap // ................................................ - mls v14.8H, v13.8H, v7.H[0] // ................*............................... 
- add v30.8H, v26.8H, v20.8H // ..................*............................. - sub v18.8H, v26.8H, v20.8H // .................*.............................. - // gap // ................................................ - // gap // ................................................ - // gap // ................................................ - // gap // ................................................ - // gap // ................................................ - // gap // ................................................ - add v16.8H, v8.8H, v14.8H // ........................*....................... - sub v8.8H, v8.8H, v14.8H // ....................*........................... - // gap // ................................................ - // gap // ................................................ - mul v11.8H, v18.8H, v9.8H // .....................*.......................... - sqrdmulh v31.8H, v18.8H, v27.8H // ...................*............................ - // gap // ................................................ - // gap // ................................................ - mul v19.8H, v8.8H, v9.8H // ......................*......................... - sqrdmulh v29.8H, v8.8H, v27.8H // .......................*........................ - // gap // ................................................ - // gap // ................................................ - trn1 v9.4S, v30.4S, v16.4S // ...........................*.................... - trn2 v18.4S, v30.4S, v16.4S // ............................*................... - // gap // ................................................ - // gap // ................................................ - mls v11.8H, v31.8H, v7.H[0] // .........................*...................... - // gap // ................................................ - // gap // ................................................ - // gap // ................................................ 
- mls v19.8H, v29.8H, v7.H[0] // ..........................*..................... - // gap // ................................................ - // gap // ................................................ - // gap // ................................................ - // gap // ................................................ - // gap // ................................................ - // gap // ................................................ - // gap // ................................................ - // gap // ................................................ - // gap // ................................................ - // gap // ................................................ - // gap // ................................................ - trn2 v21.4S, v11.4S, v19.4S // .............................*.................. - trn1 v15.4S, v11.4S, v19.4S // ..............................*................. - // gap // ................................................ - // gap // ................................................ - // gap // ................................................ - // gap // ................................................ - // gap // ................................................ - // gap // ................................................ - trn1 v25.2D, v18.2D, v21.2D // .................................*.............. - trn1 v27.2D, v9.2D, v15.2D // ...............................*................ - // gap // ................................................ - // gap // ................................................ - trn2 v1.2D, v18.2D, v21.2D // ..................................*............. - trn2 v14.2D, v9.2D, v15.2D // ................................*............... - // gap // ................................................ - // gap // ................................................ - add v29.8H, v27.8H, v25.8H // ...................................*............ - // gap // ................................................ 
- // gap // ................................................ - // gap // ................................................ - add v23.8H, v14.8H, v1.8H // ....................................*........... - sub v12.8H, v14.8H, v1.8H // ..........................................*..... - // gap // ................................................ - // gap // ................................................ - sqdmulh v28.8H, v29.8H, v7.H[1] // ......................................*......... - // gap // ................................................ - // gap // ................................................ - // gap // ................................................ - sqdmulh v30.8H, v23.8H, v7.H[1] // .......................................*........ - // gap // ................................................ - // gap // ................................................ - // gap // ................................................ - // gap // ................................................ - // gap // ................................................ - // gap // ................................................ - // gap // ................................................ - srshr v5.8H, v28.8H, #11 // .........................................*...... - // gap // ................................................ - // gap // ................................................ - // gap // ................................................ - srshr v9.8H, v30.8H, #11 // ........................................*....... - // gap // ................................................ - // gap // ................................................ - // gap // ................................................ - // gap // ................................................ - // gap // ................................................ - // gap // ................................................ - // gap // ................................................ 
- mls v29.8H, v5.8H, v7.H[0] // ...........................................*.... - // gap // ................................................ - // gap // ................................................ - // gap // ................................................ - mls v23.8H, v9.8H, v7.H[0] // ............................................*... - // gap // ................................................ - // gap // ................................................ - // gap // ................................................ - // gap // ................................................ - // gap // ................................................ - // gap // ................................................ - // gap // ................................................ - // gap // ................................................ - // gap // ................................................ - // gap // ................................................ - // gap // ................................................ - add v26.8H, v29.8H, v23.8H // ..............................................*. - // gap // ................................................ - // gap // ................................................ - // gap // ................................................ - // gap // ................................................ - // gap // ................................................ - // gap // ................................................ - // gap // ................................................ - str q26, [x1], #(64) // ...............................................* - sub v26.8H, v27.8H, v25.8H // .....................................*.......... - // gap // ................................................ - // gap // ................................................ + ld4 {v27.4S, v28.4S, v29.4S, v30.4S}, [x1] // *..................................................... + ldr q5, [x3], #16 // .........................................*............ 
+ // gap // ...................................................... + // gap // ...................................................... + ldr q17, [x4, #64] // ..*................................................... + // gap // ...................................................... + // gap // ...................................................... + // gap // ...................................................... + ldr q26, [x4, #80] // ...*.................................................. + // gap // ...................................................... + // gap // ...................................................... + // gap // ...................................................... + ldr q1, [x4, #32] // .....*................................................ + // gap // ...................................................... + // gap // ...................................................... + // gap // ...................................................... + ldr q15, [x4, #48] // ....*................................................. + // gap // ...................................................... + // gap // ...................................................... + // gap // ...................................................... + ldr q23, [x4], #(6*16) // ........*............................................. + // gap // ...................................................... + // gap // ...................................................... + // gap // ...................................................... + add v16.8H, v29.8H, v30.8H // ......*............................................... + sub v25.8H, v29.8H, v30.8H // .......*.............................................. + ldr q19, [x4, #-80] // .*.................................................... + // gap // ...................................................... + sub v21.8H, v27.8H, v28.8H // .........*............................................ 
+ // gap // ...................................................... + // gap // ...................................................... + // gap // ...................................................... + mul v17.8H, v25.8H, v17.8H // ...........*.......................................... + sqrdmulh v26.8H, v25.8H, v26.8H // ..........*........................................... + // gap // ...................................................... + // gap // ...................................................... + mul v6.8H, v21.8H, v1.8H // .............*........................................ + sqrdmulh v14.8H, v21.8H, v15.8H // ............*......................................... + // gap // ...................................................... + // gap // ...................................................... + add v11.8H, v27.8H, v28.8H // ..............*....................................... + // gap // ...................................................... + // gap // ...................................................... + // gap // ...................................................... + mls v17.8H, v26.8H, v7.H[0] // ...............*...................................... + // gap // ...................................................... + // gap // ...................................................... + // gap // ...................................................... + mls v6.8H, v14.8H, v7.H[0] // ................*..................................... + sub v30.8H, v11.8H, v16.8H // .................*.................................... + // gap // ...................................................... + // gap // ...................................................... + add v29.8H, v11.8H, v16.8H // ..................*................................... + // gap // ...................................................... + // gap // ...................................................... + // gap // ...................................................... 
+ mul v4.8H, v30.8H, v23.8H // ....................*................................. + // gap // ...................................................... + // gap // ...................................................... + // gap // ...................................................... + sub v15.8H, v6.8H, v17.8H // .....................*................................ + // gap // ...................................................... + // gap // ...................................................... + // gap // ...................................................... + sqrdmulh v9.8H, v30.8H, v19.8H // ...................*.................................. + // gap // ...................................................... + // gap // ...................................................... + // gap // ...................................................... + mul v8.8H, v15.8H, v23.8H // .......................*.............................. + sqrdmulh v24.8H, v15.8H, v19.8H // ......................*............................... + // gap // ...................................................... + // gap // ...................................................... + add v26.8H, v6.8H, v17.8H // ........................*............................. + // gap // ...................................................... + // gap // ...................................................... + // gap // ...................................................... + mls v4.8H, v9.8H, v7.H[0] // .........................*............................ + // gap // ...................................................... + // gap // ...................................................... + // gap // ...................................................... + mls v8.8H, v24.8H, v7.H[0] // ..........................*........................... + // gap // ...................................................... + // gap // ...................................................... 
+ // gap // ...................................................... + trn1 v16.4S, v29.4S, v26.4S // ...........................*.......................... + // gap // ...................................................... + // gap // ...................................................... + // gap // ...................................................... + trn2 v17.4S, v29.4S, v26.4S // ............................*......................... + // gap // ...................................................... + // gap // ...................................................... + // gap // ...................................................... + trn1 v14.4S, v4.4S, v8.4S // .............................*........................ + trn2 v21.4S, v4.4S, v8.4S // ..............................*....................... + // gap // ...................................................... + // gap // ...................................................... + // gap // ...................................................... + // gap // ...................................................... + // gap // ...................................................... + // gap // ...................................................... + trn2 v28.2D, v17.2D, v21.2D // ...............................*...................... + trn2 v31.2D, v16.2D, v14.2D // ................................*..................... + // gap // ...................................................... + // gap // ...................................................... + trn1 v27.2D, v16.2D, v14.2D // .................................*.................... + trn1 v11.2D, v17.2D, v21.2D // ..................................*................... + // gap // ...................................................... + // gap // ...................................................... + add v4.8H, v31.8H, v28.8H // ...................................*.................. + // gap // ...................................................... 
+ // gap // ...................................................... + // gap // ...................................................... + add v24.8H, v27.8H, v11.8H // ....................................*................. + // gap // ...................................................... + // gap // ...................................................... + // gap // ...................................................... + sqdmulh v20.8H, v4.8H, v7.H[1] // .......................................*.............. + // gap // ...................................................... + // gap // ...................................................... + // gap // ...................................................... + sub v3.8H, v31.8H, v28.8H // .....................................*................ + sqdmulh v8.8H, v24.8H, v7.H[1] // ........................................*............. + // gap // ...................................................... + // gap // ...................................................... + sub v13.8H, v27.8H, v11.8H // ......................................*............... + // gap // ...................................................... + // gap // ...................................................... + // gap // ...................................................... + srshr v2.8H, v20.8H, #11 // ..........................................*........... + // gap // ...................................................... + // gap // ...................................................... + // gap // ...................................................... + srshr v19.8H, v8.8H, #11 // ...........................................*.......... + // gap // ...................................................... + // gap // ...................................................... + // gap // ...................................................... + sqrdmulh v10.8H, v13.8H, v5.H[3] // ..............................................*....... 
+ // gap // ...................................................... + // gap // ...................................................... + // gap // ...................................................... + mls v4.8H, v2.8H, v7.H[0] // ............................................*......... + // gap // ...................................................... + // gap // ...................................................... + // gap // ...................................................... + mls v24.8H, v19.8H, v7.H[0] // .............................................*........ + // gap // ...................................................... + // gap // ...................................................... + // gap // ...................................................... + mul v28.8H, v3.8H, v5.H[4] // ...............................................*...... + // gap // ...................................................... + // gap // ...................................................... + // gap // ...................................................... + mul v0.8H, v13.8H, v5.H[2] // .................................................*.... + // gap // ...................................................... + // gap // ...................................................... + // gap // ...................................................... + add v9.8H, v24.8H, v4.8H // ..................................................*... + sub v21.8H, v24.8H, v4.8H // ...................................................*.. + // gap // ...................................................... + // gap // ...................................................... + sqrdmulh v19.8H, v3.8H, v5.H[5] // ................................................*..... + // gap // ...................................................... + // gap // ...................................................... + // gap // ...................................................... 
+ mls v0.8H, v10.8H, v7.H[0] // ....................................................*. + str q9, [x1], #(64) // .....................................................* + // gap // ...................................................... + // gap // ...................................................... // original source code - // ld4 {v0.4S, v1.4S, v2.4S, v3.4S}, [x1] // ..*............................................. - // ldr q5, [x4], #(6*16) // ...*............................................ - // ldr q23, [x4, #-48] // .....*.......................................... - // ldr q20, [x4, #-32] // ....*........................................... - // ldr q8, [x4, #-64] // ......*......................................... - // ldr q10, [x4, #-16] // .......*........................................ - // add v21.8H, v0.8H, v1.8H // ..............*................................. - // sub v0.8H, v0.8H, v1.8H // ........*....................................... - // ldr q15, [x4, #-80] // *............................................... - // sub v4.8H, v2.8H, v3.8H // .........*...................................... - // sqrdmulh v13.8H, v0.8H, v23.8H // ...........*.................................... - // mul v1.8H, v0.8H, v8.8H // ..........*..................................... - // mul v28.8H, v4.8H, v20.8H // .............*.................................. - // sqrdmulh v19.8H, v4.8H, v10.8H // ............*................................... - // add v0.8H, v2.8H, v3.8H // ................*............................... - // mls v1.8H, v13.8H, v7.H[0] // ...............*................................ - // mls v28.8H, v19.8H, v7.H[0] // .................*.............................. - // sub v8.8H, v21.8H, v0.8H // ...................*............................ - // add v25.8H, v21.8H, v0.8H // ..................*............................. - // sqrdmulh v16.8H, v8.8H, v15.8H // .......................*........................ 
- // sub v22.8H, v1.8H, v28.8H // .....................*.......................... - // mul v4.8H, v8.8H, v5.8H // ......................*......................... - // mul v21.8H, v22.8H, v5.8H // ........................*....................... - // sqrdmulh v24.8H, v22.8H, v15.8H // .........................*...................... - // add v15.8H, v1.8H, v28.8H // ....................*........................... - // mls v4.8H, v16.8H, v7.H[0] // ............................*................... - // mls v21.8H, v24.8H, v7.H[0] // .............................*.................. - // trn1 v30.4S, v25.4S, v15.4S // ..........................*..................... - // trn2 v9.4S, v25.4S, v15.4S // ...........................*.................... - // trn2 v16.4S, v4.4S, v21.4S // ..............................*................. - // trn1 v21.4S, v4.4S, v21.4S // ...............................*................ - // trn1 v8.2D, v30.2D, v21.2D // .................................*.............. - // trn2 v30.2D, v30.2D, v21.2D // ...................................*............ - // trn1 v25.2D, v9.2D, v16.2D // ................................*............... - // trn2 v9.2D, v9.2D, v16.2D // ..................................*............. - // add v29.8H, v8.8H, v25.8H // ....................................*........... - // add v23.8H, v30.8H, v9.8H // .....................................*.......... - // sub v26.8H, v8.8H, v25.8H // ...............................................* - // sqdmulh v8.8H, v29.8H, v7.H[1] // .......................................*........ - // sqdmulh v21.8H, v23.8H, v7.H[1] // ........................................*....... - // srshr v4.8H, v21.8H, #11 // ..........................................*..... - // srshr v21.8H, v8.8H, #11 // .........................................*...... - // sub v12.8H, v30.8H, v9.8H // ......................................*......... 
- // mls v29.8H, v21.8H, v7.H[0] // ...........................................*.... - // mls v23.8H, v4.8H, v7.H[0] // ............................................*... - // ldr q6, [x3], #16 // .*.............................................. - // add v2.8H, v29.8H, v23.8H // .............................................*.. - // str q2, [x1], #(64) // ..............................................*. + // ld4 {v15.4S, v16.4S, v17.4S, v18.4S}, [x1] // *..................................................... + // ldr q30, [x4, #16] // .........*............................................ + // ldr q1, [x4, #64] // ..*................................................... + // ldr q14, [x4, #80] // ...*.................................................. + // ldr q8, [x4, #48] // .....*................................................ + // ldr q13, [x4, #32] // ....*................................................. + // add v22.8H, v17.8H, v18.8H // .......*.............................................. + // sub v6.8H, v17.8H, v18.8H // ........*............................................. + // ldr q29, [x4], #(6*16) // ......*............................................... + // sub v26.8H, v15.8H, v16.8H // ..........*........................................... + // sqrdmulh v31.8H, v6.8H, v14.8H // ............*......................................... + // mul v24.8H, v6.8H, v1.8H // ...........*.......................................... + // sqrdmulh v20.8H, v26.8H, v8.8H // ..............*....................................... + // mul v26.8H, v26.8H, v13.8H // .............*........................................ + // add v8.8H, v15.8H, v16.8H // ...............*...................................... + // mls v24.8H, v31.8H, v7.H[0] // ................*..................................... + // mls v26.8H, v20.8H, v7.H[0] // .................*.................................... + // sub v27.8H, v8.8H, v22.8H // ..................*................................... 
+ // add v3.8H, v8.8H, v22.8H // ...................*.................................. + // sqrdmulh v19.8H, v27.8H, v30.8H // ......................*............................... + // mul v16.8H, v27.8H, v29.8H // ....................*................................. + // sub v2.8H, v26.8H, v24.8H // .....................*................................ + // sqrdmulh v14.8H, v2.8H, v30.8H // ........................*............................. + // mul v22.8H, v2.8H, v29.8H // .......................*.............................. + // add v2.8H, v26.8H, v24.8H // .........................*............................ + // mls v16.8H, v19.8H, v7.H[0] // ..........................*........................... + // mls v22.8H, v14.8H, v7.H[0] // ...........................*.......................... + // trn1 v23.4S, v3.4S, v2.4S // ............................*......................... + // trn2 v19.4S, v3.4S, v2.4S // .............................*........................ + // trn1 v20.4S, v16.4S, v22.4S // ..............................*....................... + // trn2 v13.4S, v16.4S, v22.4S // ...............................*...................... + // trn2 v27.2D, v19.2D, v13.2D // ................................*..................... + // trn2 v22.2D, v23.2D, v20.2D // .................................*.................... + // trn1 v26.2D, v23.2D, v20.2D // ..................................*................... + // trn1 v24.2D, v19.2D, v13.2D // ...................................*.................. + // add v23.8H, v22.8H, v27.8H // ....................................*................. + // add v10.8H, v26.8H, v24.8H // .....................................*................ + // sub v0.8H, v22.8H, v27.8H // .......................................*.............. + // sub v20.8H, v26.8H, v24.8H // .........................................*............ + // sqdmulh v28.8H, v23.8H, v7.H[1] // ......................................*............... 
+ // sqdmulh v22.8H, v10.8H, v7.H[1] // ........................................*............. + // ldr q5, [x3], #16 // .*.................................................... + // srshr v28.8H, v28.8H, #11 // ..........................................*........... + // srshr v22.8H, v22.8H, #11 // ...........................................*.......... + // mls v23.8H, v28.8H, v7.H[0] // .............................................*........ + // mls v10.8H, v22.8H, v7.H[0] // ..............................................*....... + // sqrdmulh v30.8H, v20.8H, v5.H[3] // ............................................*......... + // mul v28.8H, v0.8H, v5.H[4] // ...............................................*...... + // sqrdmulh v19.8H, v0.8H, v5.H[5] // ...................................................*.. + // mul v0.8H, v20.8H, v5.H[2] // ................................................*..... + // add v25.8H, v10.8H, v23.8H // .................................................*.... + // sub v21.8H, v10.8H, v23.8H // ..................................................*... + // mls v0.8H, v30.8H, v7.H[0] // ....................................................*. + // str q25, [x1], #(64) // .....................................................* sub count, count, #1 layer4567_start: - ld4 {v0.4S, v1.4S, v2.4S, v3.4S}, [x1] // e................................................................. - ldr q5, [x4], #(6*16) // .e................................................................ - mul v27.8H, v12.8H, v6.H[4] // ...........................................*...................... - // gap // .................................................................. - // gap // .................................................................. - // gap // .................................................................. - // gap // .................................................................. 
- sqrdmulh v31.8H, v12.8H, v6.H[5] // ............................................*..................... - sub v18.8H, v29.8H, v23.8H // ....................................................*............. - // gap // .................................................................. - // gap // .................................................................. - ldr q23, [x4, #-48] // ....e............................................................. - // gap // .................................................................. - // gap // .................................................................. - // gap // .................................................................. - ldr q20, [x4, #-32] // .....e............................................................ - // gap // .................................................................. - // gap // .................................................................. - // gap // .................................................................. - ldr q8, [x4, #-64] // ...e.............................................................. - mls v27.8H, v31.8H, v7.H[0] // .............................................*.................... - ldr q10, [x4, #-16] // ......e........................................................... - // gap // .................................................................. - // gap // .................................................................. - add v21.8H, v0.8H, v1.8H // ........e......................................................... - sub v0.8H, v0.8H, v1.8H // .......e.......................................................... - ldr q15, [x4, #-80] // ..e............................................................... - // gap // .................................................................. - sub v4.8H, v2.8H, v3.8H // ............e..................................................... - // gap // .................................................................. 
- // gap // .................................................................. - // gap // .................................................................. - sqrdmulh v13.8H, v0.8H, v23.8H // ..........e....................................................... - mul v1.8H, v0.8H, v8.8H // .........e........................................................ - // gap // .................................................................. - // gap // .................................................................. - mul v28.8H, v4.8H, v20.8H // ..............e................................................... - sqrdmulh v19.8H, v4.8H, v10.8H // ...............e.................................................. - // gap // .................................................................. - // gap // .................................................................. - add v0.8H, v2.8H, v3.8H // .............e.................................................... - // gap // .................................................................. - // gap // .................................................................. - // gap // .................................................................. - mls v1.8H, v13.8H, v7.H[0] // ...........e...................................................... - // gap // .................................................................. - // gap // .................................................................. - // gap // .................................................................. - mls v28.8H, v19.8H, v7.H[0] // ................e................................................. - // gap // .................................................................. - // gap // .................................................................. - sub v8.8H, v21.8H, v0.8H // .................e................................................ - add v25.8H, v21.8H, v0.8H // ..................e............................................... 
- // gap // .................................................................. - // gap // .................................................................. - // gap // .................................................................. - // gap // .................................................................. - // gap // .................................................................. - // gap // .................................................................. - sqrdmulh v16.8H, v8.8H, v15.8H // ....................e............................................. - // gap // .................................................................. - // gap // .................................................................. - mul v14.8H, v18.8H, v6.H[0] // ......................................................*........... - sub v22.8H, v1.8H, v28.8H // ......................e........................................... - mul v4.8H, v8.8H, v5.8H // ...................e.............................................. - sqrdmulh v8.8H, v26.8H, v6.H[3] // .......................................*.......................... - // gap // .................................................................. - // gap // .................................................................. - mul v21.8H, v22.8H, v5.8H // ........................e......................................... - sqrdmulh v24.8H, v22.8H, v15.8H // .........................e........................................ - // gap // .................................................................. - // gap // .................................................................. - add v15.8H, v1.8H, v28.8H // .......................e.......................................... - mul v17.8H, v26.8H, v6.H[2] // ......................................*........................... - // gap // .................................................................. 
- // gap // .................................................................. - sqrdmulh v28.8H, v18.8H, v6.H[1] // .......................................................*.......... - mls v4.8H, v16.8H, v7.H[0] // .....................e............................................ - // gap // .................................................................. - // gap // .................................................................. - mls v21.8H, v24.8H, v7.H[0] // ..........................e....................................... - // gap // .................................................................. - // gap // .................................................................. - // gap // .................................................................. - mls v17.8H, v8.8H, v7.H[0] // ........................................*......................... - // gap // .................................................................. - // gap // .................................................................. - trn1 v30.4S, v25.4S, v15.4S // ...........................e...................................... - trn2 v9.4S, v25.4S, v15.4S // ............................e..................................... - mls v14.8H, v28.8H, v7.H[0] // ........................................................*......... - // gap // .................................................................. - // gap // .................................................................. - trn2 v16.4S, v4.4S, v21.4S // ..............................e................................... - // gap // .................................................................. - // gap // .................................................................. - trn1 v21.4S, v4.4S, v21.4S // .............................e.................................... - // gap // .................................................................. 
- // gap // .................................................................. - // gap // .................................................................. - sub v19.8H, v17.8H, v27.8H // .........................................................*........ - // gap // .................................................................. - trn1 v8.2D, v30.2D, v21.2D // .................................e................................ - trn2 v30.2D, v30.2D, v21.2D // ...............................e.................................. - // gap // .................................................................. - trn1 v25.2D, v9.2D, v16.2D // ..................................e............................... - trn2 v9.2D, v9.2D, v16.2D // ................................e................................. - // gap // .................................................................. - // gap // .................................................................. - mul v15.8H, v19.8H, v6.H[0] // ...........................................................*...... - // gap // .................................................................. - // gap // .................................................................. - // gap // .................................................................. - add v29.8H, v8.8H, v25.8H // .....................................e............................ - // gap // .................................................................. - // gap // .................................................................. - add v23.8H, v30.8H, v9.8H // ..........................................e....................... - sub v26.8H, v8.8H, v25.8H // ....................................e............................. - // gap // .................................................................. - // gap // .................................................................. - // gap // .................................................................. 
- sqdmulh v8.8H, v29.8H, v7.H[1] // ..............................................e................... - // gap // .................................................................. - // gap // .................................................................. - sqdmulh v21.8H, v23.8H, v7.H[1] // .................................................e................ - sqrdmulh v0.8H, v19.8H, v6.H[1] // ............................................................*..... - // gap // .................................................................. - // gap // .................................................................. - // gap // .................................................................. - add v12.8H, v17.8H, v27.8H // ..........................................................*....... - // gap // .................................................................. - // gap // .................................................................. - // gap // .................................................................. - srshr v4.8H, v21.8H, #11 // ..................................................e............... - srshr v21.8H, v8.8H, #11 // ...............................................e.................. - // gap // .................................................................. - // gap // .................................................................. - mls v15.8H, v0.8H, v7.H[0] // .............................................................*.... - str q12, [x1, #-48] // ...............................................................*.. - // gap // .................................................................. - // gap // .................................................................. - str q14, [x1, #-32] // ................................................................*. - sub v12.8H, v30.8H, v9.8H // .........................................e........................ 
- // gap // .................................................................. - // gap // .................................................................. - mls v29.8H, v21.8H, v7.H[0] // ................................................e................. - mls v23.8H, v4.8H, v7.H[0] // ...................................................e.............. - // gap // .................................................................. - // gap // .................................................................. - str q15, [x1, #-16] // .................................................................* - // gap // .................................................................. - // gap // .................................................................. - // gap // .................................................................. - ldr q6, [x3], #16 // ...................................e.............................. - // gap // .................................................................. - // gap // .................................................................. - // gap // .................................................................. - // gap // .................................................................. - // gap // .................................................................. - // gap // .................................................................. - add v2.8H, v29.8H, v23.8H // .....................................................e............ - // gap // .................................................................. - // gap // .................................................................. - // gap // .................................................................. - // gap // .................................................................. - str q2, [x1], #(64) // ..............................................................e... - // gap // .................................................................. 
- // gap // .................................................................. - // gap // .................................................................. + ld4 {v15.4S, v16.4S, v17.4S, v18.4S}, [x1] // e....................................................................... + mls v28.8H, v19.8H, v7.H[0] // .............................................*.......................... + sqrdmulh v19.8H, v21.8H, v5.H[1] // .............................................................*.......... + ldr q30, [x4, #16] // ..e..................................................................... + ldr q1, [x4, #64] // .....e.................................................................. + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + ldr q14, [x4, #80] // ......e................................................................. + mul v11.8H, v21.8H, v5.H[0] // ............................................................*........... + // gap // ........................................................................ + // gap // ........................................................................ + sqdmulh v12.8H, v28.8H, v7.H[1] // .......................................................*................ + ldr q8, [x4, #48] // ....e................................................................... + // gap // ........................................................................ + // gap // ........................................................................ + sqdmulh v27.8H, v0.8H, v7.H[1] // .................................................*...................... + // gap // ........................................................................ + // gap // ........................................................................ 
+ // gap // ........................................................................ + ldr q13, [x4, #32] // ...e.................................................................... + mls v11.8H, v19.8H, v7.H[0] // ..............................................................*......... + // gap // ........................................................................ + // gap // ........................................................................ + add v22.8H, v17.8H, v18.8H // .............e.......................................................... + sub v6.8H, v17.8H, v18.8H // ............e........................................................... + ldr q29, [x4], #(6*16) // .e...................................................................... + // gap // ........................................................................ + sub v26.8H, v15.8H, v16.8H // .......e................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + sqrdmulh v31.8H, v6.8H, v14.8H // ...............e........................................................ + mul v24.8H, v6.8H, v1.8H // ..............e......................................................... + str q11, [x1, #-32] // ......................................................................*. + // gap // ........................................................................ + sqrdmulh v20.8H, v26.8H, v8.8H // ..........e............................................................. + mul v26.8H, v26.8H, v13.8H // .........e.............................................................. + // gap // ........................................................................ + // gap // ........................................................................ 
+ add v8.8H, v15.8H, v16.8H // ........e............................................................... + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + srshr v13.8H, v27.8H, #11 // ..................................................*..................... + mls v24.8H, v31.8H, v7.H[0] // ................e....................................................... + // gap // ........................................................................ + // gap // ........................................................................ + mls v26.8H, v20.8H, v7.H[0] // ...........e............................................................ + sub v27.8H, v8.8H, v22.8H // .................e...................................................... + // gap // ........................................................................ + // gap // ........................................................................ + add v3.8H, v8.8H, v22.8H // ..................e..................................................... + srshr v22.8H, v12.8H, #11 // ........................................................*............... + // gap // ........................................................................ + // gap // ........................................................................ + sqrdmulh v19.8H, v27.8H, v30.8H // ....................e................................................... + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + mul v16.8H, v27.8H, v29.8H // ...................e.................................................... 
+ sub v2.8H, v26.8H, v24.8H // ......................e................................................. + // gap // ........................................................................ + // gap // ........................................................................ + mls v28.8H, v22.8H, v7.H[0] // .........................................................*.............. + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + sqrdmulh v14.8H, v2.8H, v30.8H // .........................e.............................................. + mul v22.8H, v2.8H, v29.8H // ........................e............................................... + // gap // ........................................................................ + // gap // ........................................................................ + add v2.8H, v26.8H, v24.8H // .......................e................................................ + mls v16.8H, v19.8H, v7.H[0] // .....................e.................................................. + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + mls v22.8H, v14.8H, v7.H[0] // ..........................e............................................. + mls v0.8H, v13.8H, v7.H[0] // ...................................................*.................... 
+ // gap // ........................................................................ + // gap // ........................................................................ + trn1 v23.4S, v3.4S, v2.4S // ...........................e............................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + trn2 v19.4S, v3.4S, v2.4S // ............................e........................................... + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + trn1 v20.4S, v16.4S, v22.4S // .............................e.......................................... + trn2 v13.4S, v16.4S, v22.4S // ..............................e......................................... + // gap // ........................................................................ + // gap // ........................................................................ + sub v6.8H, v0.8H, v28.8H // ...............................................................*........ + add v28.8H, v0.8H, v28.8H // ................................................................*....... + // gap // ........................................................................ + // gap // ........................................................................ + trn2 v27.2D, v19.2D, v13.2D // ................................e....................................... + // gap // ........................................................................ + // gap // ........................................................................ 
+ trn2 v22.2D, v23.2D, v20.2D // ...............................e........................................ + trn1 v26.2D, v23.2D, v20.2D // .................................e...................................... + trn1 v24.2D, v19.2D, v13.2D // ..................................e..................................... + str q28, [x1, #-48] // .....................................................................*.. + // gap // ........................................................................ + add v23.8H, v22.8H, v27.8H // ..........................................e............................. + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + add v10.8H, v26.8H, v24.8H // .....................................e.................................. + sub v0.8H, v22.8H, v27.8H // .........................................e.............................. + // gap // ........................................................................ + // gap // ........................................................................ + sub v20.8H, v26.8H, v24.8H // ....................................e................................... + sqdmulh v28.8H, v23.8H, v7.H[1] // ....................................................e................... + // gap // ........................................................................ + // gap // ........................................................................ + sqdmulh v22.8H, v10.8H, v7.H[1] // ..............................................e......................... + mul v19.8H, v6.8H, v5.H[0] // .................................................................*...... + // gap // ........................................................................ 
+ // gap // ........................................................................ + sqrdmulh v8.8H, v6.8H, v5.H[1] // ..................................................................*..... + ldr q5, [x3], #16 // ...................................e.................................... + // gap // ........................................................................ + // gap // ........................................................................ + srshr v28.8H, v28.8H, #11 // .....................................................e.................. + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + srshr v22.8H, v22.8H, #11 // ...............................................e........................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + mls v19.8H, v8.8H, v7.H[0] // ...................................................................*.... + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + mls v23.8H, v28.8H, v7.H[0] // ......................................................e................. + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ 
+ mls v10.8H, v22.8H, v7.H[0] // ................................................e....................... + sqrdmulh v30.8H, v20.8H, v5.H[3] // .......................................e................................ + // gap // ........................................................................ + // gap // ........................................................................ + str q19, [x1, #-16] // .......................................................................* + mul v28.8H, v0.8H, v5.H[4] // ...........................................e............................ + sqrdmulh v19.8H, v0.8H, v5.H[5] // ............................................e........................... + // gap // ........................................................................ + mul v0.8H, v20.8H, v5.H[2] // ......................................e................................. + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + add v25.8H, v10.8H, v23.8H // ...........................................................e............ + sub v21.8H, v10.8H, v23.8H // ..........................................................e............. + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + mls v0.8H, v30.8H, v7.H[0] // ........................................e............................... 
+ str q25, [x1], #(64) // ....................................................................e... + // gap // ........................................................................ + // gap // ........................................................................ // original source code - // ld4 {v8.4S, v9.4S, v10.4S, v11.4S}, [x1] // e.................................................................e.............................................................. - // ldr q0, [x4], #(6*16) // .e................................................................|e............................................................. - // ldr q4, [x4, #(-6*16 + 1*16)] // ............e.....................................................|...........e.................................................. - // ldr q1, [x4, #(-6*16 + 2*16)] // .......e..........................................................|......e....................................................... - // ldr q5, [x4, #(-6*16 + 3*16)] // .....e............................................................|....e......................................................... - // ldr q2, [x4, #(-6*16 + 4*16)] // ......e...........................................................|.....e........................................................ - // ldr q6, [x4, #(-6*16 + 5*16)] // .........e........................................................|........e..................................................... - // sub v24.8h, v8.8h, v9.8h // ...........e......................................................|..........e................................................... - // add v8.8h, v8.8h, v9.8h // ..........e.......................................................|.........e.................................................... - // mul v9.8h, v24.8h, v1.8h // ...............e..................................................|..............e............................................... 
- // sqrdmulh v24.8h, v24.8h, v5.8h // ..............e...................................................|.............e................................................ - // mls v9.8h, v24.8h, v7.h[0] // ...................e..............................................|..................e........................................... - // sub v24.8h, v10.8h, v11.8h // .............e....................................................|............e................................................. - // add v10.8h, v10.8h, v11.8h // ..................e...............................................|.................e............................................ - // mul v11.8h, v24.8h, v2.8h // ................e.................................................|...............e.............................................. - // sqrdmulh v24.8h, v24.8h, v6.8h // .................e................................................|................e............................................. - // mls v11.8h, v24.8h, v7.h[0] // ....................e.............................................|...................e.......................................... - // sub v24.8h, v8.8h, v10.8h // .....................e............................................|....................e......................................... - // add v8.8h, v8.8h, v10.8h // ......................e...........................................|.....................e........................................ - // mul v10.8h, v24.8h, v0.8h // ..........................e.......................................|.........................e.................................... - // sqrdmulh v24.8h, v24.8h, v4.8h // .......................e..........................................|......................e....................................... - // mls v10.8h, v24.8h, v7.h[0] // .................................e................................|................................e............................. 
- // sub v24.8h, v9.8h, v11.8h // .........................e........................................|........................e..................................... - // add v9.8h, v9.8h, v11.8h // ..............................e...................................|.............................e................................ - // mul v11.8h, v24.8h, v0.8h // ............................e.....................................|...........................e.................................. - // sqrdmulh v24.8h, v24.8h, v4.8h // .............................e....................................|............................e................................. - // mls v11.8h, v24.8h, v7.h[0] // ..................................e...............................|.................................e............................ - // trn1 v25.4s, v8.4s, v9.4s // ....................................e.............................|...................................e.......................... - // trn2 v26.4s, v8.4s, v9.4s // .....................................e............................|....................................e......................... - // trn1 v27.4s, v10.4s, v11.4s // ........................................e.........................|.......................................e...................... - // trn2 v28.4s, v10.4s, v11.4s // .......................................e..........................|......................................e....................... - // trn2 v10.2d, v25.2d, v27.2d // ...........................................e......................|..........................................e................... - // trn2 v11.2d, v26.2d, v28.2d // .............................................e....................|............................................e................. - // trn1 v8.2d, v25.2d, v27.2d // ..........................................e.......................|.........................................e.................... 
- // trn1 v9.2d, v26.2d, v28.2d // ............................................e.....................|...........................................e.................. - // ldr q0, [x3], #16 // ...............................................................e..|.............................................................. - // sub v24.8h, v8.8h, v9.8h // .................................................e................|................................................e............. - // add v8.8h, v8.8h, v9.8h // ...............................................e..................|..............................................e............... - // mul v9.8h, v24.8h, v0.h[2] // ...............................*..................................|..............................*............................... - // sqrdmulh v24.8h, v24.8h, v0.h[3] // ...........................*......................................|..........................*................................... - // mls v9.8h, v24.8h, v7.h[0] // ...................................*..............................|..................................*........................... - // sub v24.8h, v10.8h, v11.8h // ...........................................................e......|..........................................................e... - // add v10.8h, v10.8h, v11.8h // ................................................e.................|...............................................e.............. - // mul v11.8h, v24.8h, v0.h[4] // ..*...............................................................|.*............................................................ - // sqrdmulh v24.8h, v24.8h, v0.h[5] // ...*..............................................................|..*........................................................... - // mls v11.8h, v24.8h, v7.h[0] // ........*.........................................................|.......*...................................................... 
- // sqdmulh v25.8h, v8.8h, v7.h[1] // ..................................................e...............|.................................................e............ - // srshr v25.8h, v25.8h, #11 // .......................................................e..........|......................................................e....... - // mls v8.8h, v25.8h, v7.h[0] // ............................................................e.....|...........................................................e.. - // sqdmulh v25.8h, v10.8h, v7.h[1] // ...................................................e..............|..................................................e........... - // srshr v25.8h, v25.8h, #11 // ......................................................e...........|.....................................................e........ - // mls v10.8h, v25.8h, v7.h[0] // .............................................................e....|............................................................e. - // sub v24.8h, v8.8h, v10.8h // ....*.............................................................|...*.......................................................... - // add v8.8h, v8.8h, v10.8h // ................................................................e.|.............................................................. - // mul v10.8h, v24.8h, v0.h[0] // ........................*.........................................|.......................*...................................... - // sqrdmulh v24.8h, v24.8h, v0.h[1] // ................................*.................................|...............................*.............................. - // mls v10.8h, v24.8h, v7.h[0] // ......................................*...........................|.....................................*........................ - // sub v24.8h, v9.8h, v11.8h // .........................................*........................|........................................*..................... 
- // add v9.8h, v9.8h, v11.8h // .....................................................*............|....................................................*......... - // mul v11.8h, v24.8h, v0.h[0] // ..............................................*...................|.............................................*................ - // sqrdmulh v24.8h, v24.8h, v0.h[1] // ....................................................*.............|...................................................*.......... - // mls v11.8h, v24.8h, v7.h[0] // ........................................................*.........|.......................................................*...... - // str q8, [x1], #(64) // .................................................................e|.............................................................. - // str q9, [x1, #(-64 + 16*1)] // .........................................................*........|........................................................*..... - // str q10, [x1, #(-64 + 16*2)] // ..........................................................*.......|.........................................................*.... - // str q11, [x1, #(-64 + 16*3)] // ..............................................................*...|.............................................................* + // ld4 {v8.4S, v9.4S, v10.4S, v11.4S}, [x1] // e.......................................................................e................................................................ + // ldr q0, [x4], #(6*16) // ..............e.........................................................|.............e.................................................. + // ldr q4, [x4, #(-6*16 + 1*16)] // ...e....................................................................|..e............................................................. 
+ // ldr q1, [x4, #(-6*16 + 2*16)] // ..........e.............................................................|.........e...................................................... + // ldr q5, [x4, #(-6*16 + 3*16)] // ........e...............................................................|.......e........................................................ + // ldr q2, [x4, #(-6*16 + 4*16)] // ....e...................................................................|...e............................................................ + // ldr q6, [x4, #(-6*16 + 5*16)] // .....e..................................................................|....e........................................................... + // sub v24.8h, v8.8h, v9.8h // ...............e........................................................|..............e................................................. + // add v8.8h, v8.8h, v9.8h // .....................e..................................................|....................e........................................... + // mul v9.8h, v24.8h, v1.8h // ....................e...................................................|...................e............................................ + // sqrdmulh v24.8h, v24.8h, v5.8h // ...................e....................................................|..................e............................................. + // mls v9.8h, v24.8h, v7.h[0] // ........................e...............................................|.......................e........................................ + // sub v24.8h, v10.8h, v11.8h // .............e..........................................................|............e................................................... + // add v10.8h, v10.8h, v11.8h // ............e...........................................................|...........e.................................................... 
+ // mul v11.8h, v24.8h, v2.8h // .................e......................................................|................e............................................... + // sqrdmulh v24.8h, v24.8h, v6.8h // ................e.......................................................|...............e................................................ + // mls v11.8h, v24.8h, v7.h[0] // .......................e................................................|......................e......................................... + // sub v24.8h, v8.8h, v10.8h // .........................e..............................................|........................e....................................... + // add v8.8h, v8.8h, v10.8h // ..........................e.............................................|.........................e...................................... + // mul v10.8h, v24.8h, v0.8h // .............................e..........................................|............................e................................... + // sqrdmulh v24.8h, v24.8h, v4.8h // ............................e...........................................|...........................e.................................... + // mls v10.8h, v24.8h, v7.h[0] // ...................................e....................................|..................................e............................. + // sub v24.8h, v9.8h, v11.8h // ..............................e.........................................|.............................e.................................. + // add v9.8h, v9.8h, v11.8h // ..................................e.....................................|.................................e.............................. + // mul v11.8h, v24.8h, v0.8h // .................................e......................................|................................e............................... 
+ // sqrdmulh v24.8h, v24.8h, v4.8h // ................................e.......................................|...............................e................................ + // mls v11.8h, v24.8h, v7.h[0] // ....................................e...................................|...................................e............................ + // trn1 v25.4s, v8.4s, v9.4s // ......................................e.................................|.....................................e.......................... + // trn2 v26.4s, v8.4s, v9.4s // .......................................e................................|......................................e......................... + // trn1 v27.4s, v10.4s, v11.4s // ........................................e...............................|.......................................e........................ + // trn2 v28.4s, v10.4s, v11.4s // .........................................e..............................|........................................e....................... + // trn2 v10.2d, v25.2d, v27.2d // .............................................e..........................|............................................e................... + // trn2 v11.2d, v26.2d, v28.2d // ............................................e...........................|...........................................e.................... + // trn1 v8.2d, v25.2d, v27.2d // ..............................................e.........................|.............................................e.................. + // trn1 v9.2d, v26.2d, v28.2d // ...............................................e........................|..............................................e................. + // ldr q0, [x3], #16 // .........................................................e..............|........................................................e....... 
+ // sub v24.8h, v8.8h, v9.8h // ....................................................e...................|...................................................e............ + // add v8.8h, v8.8h, v9.8h // ..................................................e.....................|.................................................e.............. + // mul v9.8h, v24.8h, v0.h[2] // ...................................................................e....|................................................................ + // sqrdmulh v24.8h, v24.8h, v0.h[3] // ...............................................................e........|..............................................................e. + // mls v9.8h, v24.8h, v7.h[0] // ......................................................................e.|................................................................ + // sub v24.8h, v10.8h, v11.8h // ...................................................e....................|..................................................e............. + // add v10.8h, v10.8h, v11.8h // .................................................e......................|................................................e............... + // mul v11.8h, v24.8h, v0.h[4] // .................................................................e......|................................................................ + // sqrdmulh v24.8h, v24.8h, v0.h[5] // ..................................................................e.....|................................................................ + // mls v11.8h, v24.8h, v7.h[0] // .*......................................................................|*............................................................... + // sqdmulh v25.8h, v8.8h, v7.h[1] // ......................................................e.................|.....................................................e.......... 
+ // srshr v25.8h, v25.8h, #11 // ...........................................................e............|..........................................................e..... + // mls v8.8h, v25.8h, v7.h[0] // ..............................................................e.........|.............................................................e.. + // sqdmulh v25.8h, v9.8h, v7.h[1] // .........*..............................................................|........*....................................................... + // srshr v25.8h, v25.8h, #11 // ......................*.................................................|.....................*.......................................... + // mls v9.8h, v25.8h, v7.h[0] // .....................................*..................................|....................................*........................... + // sqdmulh v25.8h, v10.8h, v7.h[1] // .....................................................e..................|....................................................e........... + // srshr v25.8h, v25.8h, #11 // ..........................................................e.............|.........................................................e...... + // mls v10.8h, v25.8h, v7.h[0] // .............................................................e..........|............................................................e... + // sqdmulh v25.8h, v11.8h, v7.h[1] // .......*................................................................|......*......................................................... + // srshr v25.8h, v25.8h, #11 // ...........................*............................................|..........................*..................................... + // mls v11.8h, v25.8h, v7.h[0] // ...............................*........................................|..............................*................................. 
+ // sub v24.8h, v8.8h, v10.8h // .....................................................................e..|................................................................ + // add v8.8h, v8.8h, v10.8h // ....................................................................e...|................................................................ + // mul v10.8h, v24.8h, v0.h[0] // ......*.................................................................|.....*.......................................................... + // sqrdmulh v24.8h, v24.8h, v0.h[1] // ..*.....................................................................|.*.............................................................. + // mls v10.8h, v24.8h, v7.h[0] // ...........*............................................................|..........*..................................................... + // sub v24.8h, v9.8h, v11.8h // ..........................................*.............................|.........................................*...................... + // add v9.8h, v9.8h, v11.8h // ...........................................*............................|..........................................*..................... + // mul v11.8h, v24.8h, v0.h[0] // .......................................................*................|......................................................*......... + // sqrdmulh v24.8h, v24.8h, v0.h[1] // ........................................................*...............|.......................................................*........ + // mls v11.8h, v24.8h, v7.h[0] // ............................................................*...........|...........................................................*.... + // str q8, [x1], #(64) // .......................................................................e|................................................................ 
+ // str q9, [x1, #(-64 + 16*1)] // ................................................*.......................|...............................................*................ + // str q10, [x1, #(-64 + 16*2)] // ..................*.....................................................|.................*.............................................. + // str q11, [x1, #(-64 + 16*3)] // ................................................................*.......|...............................................................* sub count, count, #1 cbnz count, layer4567_start - mul v4.8H, v26.8H, v6.H[2] // ......*........... - sqrdmulh v9.8H, v26.8H, v6.H[3] // .....*............ // gap // .................. + mls v28.8H, v19.8H, v7.H[0] // *................. // gap // .................. - sqrdmulh v5.8H, v12.8H, v6.H[5] // .*................ - mul v12.8H, v12.8H, v6.H[4] // *................. // gap // .................. + mul v27.8H, v21.8H, v5.H[0] // ..*............... // gap // .................. - sub v26.8H, v29.8H, v23.8H // ..*............... // gap // .................. // gap // .................. // gap // .................. - mls v4.8H, v9.8H, v7.H[0] // ........*......... + sqdmulh v19.8H, v0.8H, v7.H[1] // ....*............. // gap // .................. // gap // .................. // gap // .................. - mls v12.8H, v5.8H, v7.H[0] // ...*.............. - mul v22.8H, v26.8H, v6.H[0] // ....*............. + sqdmulh v22.8H, v28.8H, v7.H[1] // ...*.............. // gap // .................. // gap // .................. - sqrdmulh v29.8H, v26.8H, v6.H[1] // .......*.......... + sqrdmulh v24.8H, v21.8H, v5.H[1] // .*................ // gap // .................. // gap // .................. // gap // .................. + srshr v23.8H, v19.8H, #11 // .......*.......... // gap // .................. // gap // .................. // gap // .................. + srshr v19.8H, v22.8H, #11 // ........*......... // gap // .................. 
- sub v1.8H, v4.8H, v12.8H // ..........*....... - add v27.8H, v4.8H, v12.8H // .............*.... // gap // .................. // gap // .................. - mls v22.8H, v29.8H, v7.H[0] // .........*........ + mls v27.8H, v24.8H, v7.H[0] // .....*............ // gap // .................. // gap // .................. // gap // .................. - sqrdmulh v5.8H, v1.8H, v6.H[1] // ............*..... - mul v6.8H, v1.8H, v6.H[0] // ...........*...... - str q27, [x1, #-48] // ...............*.. // gap // .................. // gap // .................. // gap // .................. // gap // .................. // gap // .................. - str q22, [x1, #-32] // ................*. + mls v28.8H, v19.8H, v7.H[0] // .........*........ // gap // .................. + mls v0.8H, v23.8H, v7.H[0] // ..........*....... // gap // .................. // gap // .................. - mls v6.8H, v5.8H, v7.H[0] // ..............*... // gap // .................. // gap // .................. // gap // .................. // gap // .................. // gap // .................. // gap // .................. + sub v19.8H, v0.8H, v28.8H // ...........*...... // gap // .................. // gap // .................. // gap // .................. // gap // .................. // gap // .................. - str q6, [x1, #-16] // .................* + // gap // .................. + // gap // .................. + mul v23.8H, v19.8H, v5.H[0] // ..............*... + sqrdmulh v19.8H, v19.8H, v5.H[1] // ...............*.. + str q27, [x1, #-32] // ......*........... + // gap // .................. + add v22.8H, v0.8H, v28.8H // ............*..... + // gap // .................. + // gap // .................. + // gap // .................. + // gap // .................. + // gap // .................. + // gap // .................. + // gap // .................. + str q22, [x1, #-48] // .............*.... + mls v23.8H, v19.8H, v7.H[0] // ................*. + // gap // .................. 
+ // gap // .................. + // gap // .................. + // gap // .................. + // gap // .................. + // gap // .................. + // gap // .................. + // gap // .................. + // gap // .................. + // gap // .................. + str q23, [x1, #-16] // .................* // gap // .................. // gap // .................. // gap // .................. // original source code - // mul v27.8H, v12.8H, v6.H[4] // ...*.............. - // sqrdmulh v31.8H, v12.8H, v6.H[5] // ..*............... - // sub v18.8H, v29.8H, v23.8H // ....*............. - // mls v27.8H, v31.8H, v7.H[0] // ......*........... - // mul v14.8H, v18.8H, v6.H[0] // .......*.......... - // sqrdmulh v8.8H, v26.8H, v6.H[3] // .*................ - // mul v17.8H, v26.8H, v6.H[2] // *................. - // sqrdmulh v28.8H, v18.8H, v6.H[1] // ........*......... - // mls v17.8H, v8.8H, v7.H[0] // .....*............ - // mls v14.8H, v28.8H, v7.H[0] // ...........*...... - // sub v19.8H, v17.8H, v27.8H // .........*........ - // mul v15.8H, v19.8H, v6.H[0] // .............*.... - // sqrdmulh v0.8H, v19.8H, v6.H[1] // ............*..... - // add v12.8H, v17.8H, v27.8H // ..........*....... - // mls v15.8H, v0.8H, v7.H[0] // ................*. - // str q12, [x1, #-48] // ..............*... - // str q14, [x1, #-32] // ...............*.. - // str q15, [x1, #-16] // .................* + // mls v28.8H, v19.8H, v7.H[0] // *................. + // sqrdmulh v19.8H, v21.8H, v5.H[1] // ....*............. + // mul v11.8H, v21.8H, v5.H[0] // .*................ + // sqdmulh v12.8H, v28.8H, v7.H[1] // ...*.............. + // sqdmulh v27.8H, v0.8H, v7.H[1] // ..*............... + // mls v11.8H, v19.8H, v7.H[0] // .......*.......... + // str q11, [x1, #-32] // .............*.... + // srshr v13.8H, v27.8H, #11 // .....*............ + // srshr v22.8H, v12.8H, #11 // ......*........... + // mls v28.8H, v22.8H, v7.H[0] // ........*......... 
+ // mls v0.8H, v13.8H, v7.H[0] // .........*........ + // sub v6.8H, v0.8H, v28.8H // ..........*....... + // add v28.8H, v0.8H, v28.8H // ..............*... + // str q28, [x1, #-48] // ...............*.. + // mul v19.8H, v6.8H, v5.H[0] // ...........*...... + // sqrdmulh v8.8H, v6.8H, v5.H[1] // ............*..... + // mls v19.8H, v8.8H, v7.H[0] // ................*. + // str q19, [x1, #-16] // .................* // --------------------------------------------------------------------- @@ -916,554 +948,526 @@ layer4567_start: .p2align 2 - ldr q31, [x0, #256] // .....*........................ - ldr q20, [x0, #320] // *............................. - // gap // .............................. - // gap // .............................. - ldr q13, [x0, #128] // ...*.......................... - ldr q26, [x0, #192] // ....*......................... - // gap // .............................. - // gap // .............................. - ldr q23, [x0, #0] // .......*...................... - // gap // .............................. - // gap // .............................. - ldr q3, [x0, #64] // .*............................ - ldr q11, [x0, #384] // ......*....................... - // gap // .............................. - // gap // .............................. - ldr q15, [x0, #448] // ..*........................... - sub v2.8H, v31.8H, v20.8H // ..........*................... - add v21.8H, v31.8H, v20.8H // .........*.................... - // gap // .............................. - // gap // .............................. - sub v20.8H, v13.8H, v26.8H // ........................*..... - add v5.8H, v13.8H, v26.8H // ............*................. - // gap // .............................. - // gap // .............................. - mul v31.8H, v2.8H, v1.H[2] // ...................*.......... - add v16.8H, v23.8H, v3.8H // ........*..................... - // gap // .............................. - // gap // .............................. 
- sqrdmulh v2.8H, v2.8H, v1.H[3] // .................*............ - add v28.8H, v11.8H, v15.8H // ...........*.................. - // gap // .............................. - // gap // .............................. - sub v17.8H, v16.8H, v5.8H // ................*............. - // gap // .............................. - // gap // .............................. - mul v10.8H, v20.8H, v1.H[0] // ............................*. - add v8.8H, v16.8H, v5.8H // .....................*........ - // gap // .............................. - // gap // .............................. - add v13.8H, v21.8H, v28.8H // .............*................ - sub v5.8H, v23.8H, v3.8H // ..............*............... - mul v3.8H, v17.8H, v0.H[2] // .......................*...... - // gap // .............................. - // gap // .............................. - sqdmulh v16.8H, v13.8H, v7.H[1] // ...............*.............. - mls v31.8H, v2.8H, v7.H[0] // ......................*....... - // gap // .............................. - // gap // .............................. - mul v9.8H, v5.8H, v0.H[6] // ..................*........... - // gap // .............................. - // gap // .............................. - sqdmulh v24.8H, v8.8H, v7.H[1] // ...........................*.. - sqrdmulh v19.8H, v5.8H, v0.H[7] // ..........................*... - // gap // .............................. - // gap // .............................. - sqrdmulh v20.8H, v20.8H, v1.H[1] // .............................* - // gap // .............................. - // gap // .............................. - sqrdmulh v23.8H, v17.8H, v0.H[3] // .........................*.... - srshr v16.8H, v16.8H, #11 // ....................*......... + ldr q31, [x0, #320] // .*................................ + ldr q22, [x0, #256] // *................................. + // gap // .................................. + // gap // .................................. + ldr q8, [x0, #448] // ....*............................. 
+ ldr q16, [x0, #384] // ..*............................... + // gap // .................................. + // gap // .................................. + ldr q4, [x0, #64] // .......*.......................... + // gap // .................................. + // gap // .................................. + // gap // .................................. + ldr q9, [x0, #0] // .....*............................ + // gap // .................................. + // gap // .................................. + // gap // .................................. + sub v3.8H, v22.8H, v31.8H // ......*........................... + add v6.8H, v22.8H, v31.8H // ........*......................... + ldr q18, [x0, #128] // ...*.............................. + // gap // .................................. + sub v23.8H, v16.8H, v8.8H // ..........*....................... + add v22.8H, v16.8H, v8.8H // .............*.................... + ldr q19, [x0, #192] // ..................*............... + // gap // .................................. + sqrdmulh v31.8H, v3.8H, v1.H[3] // .........*........................ + mul v13.8H, v3.8H, v1.H[2] // ...........*...................... + // gap // .................................. + // gap // .................................. + mul v2.8H, v23.8H, v1.H[4] // ..............*................... + sqrdmulh v11.8H, v23.8H, v1.H[5] // ...............*.................. + // gap // .................................. + // gap // .................................. + sub v25.8H, v9.8H, v4.8H // ............*..................... + add v9.8H, v9.8H, v4.8H // ................*................. + // gap // .................................. + // gap // .................................. + sub v21.8H, v18.8H, v19.8H // .............................*.... + mls v13.8H, v31.8H, v7.H[0] // .................*................ + // gap // .................................. + // gap // .................................. 
+ add v26.8H, v18.8H, v19.8H // ....................*............. + mls v2.8H, v11.8H, v7.H[0] // ...................*.............. + // gap // .................................. + // gap // .................................. + sqrdmulh v23.8H, v25.8H, v0.H[7] // ...........................*...... + // gap // .................................. + // gap // .................................. + // gap // .................................. + add v31.8H, v9.8H, v26.8H // ............................*..... + sub v19.8H, v6.8H, v22.8H // ..........................*....... + // gap // .................................. + // gap // .................................. + add v6.8H, v6.8H, v22.8H // ........................*......... + sub v28.8H, v13.8H, v2.8H // ......................*........... + // gap // .................................. + // gap // .................................. + mul v15.8H, v25.8H, v0.H[6] // .....................*............ + // gap // .................................. + // gap // .................................. + // gap // .................................. + mul v16.8H, v28.8H, v0.H[4] // .......................*.......... + sqrdmulh v11.8H, v28.8H, v0.H[5] // .........................*........ + // gap // .................................. + // gap // .................................. + add v24.8H, v31.8H, v6.8H // ................................*. + mul v5.8H, v19.8H, v0.H[4] // ...............................*.. + // gap // .................................. + // gap // .................................. + mls v15.8H, v23.8H, v7.H[0] // .................................* + // gap // .................................. + // gap // .................................. + // gap // .................................. + mls v16.8H, v11.8H, v7.H[0] // ..............................*... + // gap // .................................. + // gap // .................................. + // gap // .................................. 
// original source code - // ldr q12, [x0, #320] // .*............................ - // ldr q14, [x0, #64] // .....*........................ - // ldr q15, [x0, #448] // .......*...................... - // ldr q4, [x0, #128] // ..*........................... - // ldr q5, [x0, #192] // ...*.......................... - // ldr q17, [x0, #256] // *............................. - // ldr q11, [x0, #384] // ......*....................... - // ldr q10, [x0, #0] // ....*......................... - // add v27.8H, v10.8H, v14.8H // .............*................ - // add v21.8H, v17.8H, v12.8H // .........*.................... - // sub v12.8H, v17.8H, v12.8H // ........*..................... - // add v28.8H, v11.8H, v15.8H // ...............*.............. - // add v18.8H, v4.8H, v5.8H // ...........*.................. - // add v13.8H, v21.8H, v28.8H // ...................*.......... - // sub v17.8H, v10.8H, v14.8H // ....................*......... - // sqdmulh v8.8H, v13.8H, v7.H[1] // ......................*....... - // sub v10.8H, v27.8H, v18.8H // ................*............. - // sqrdmulh v19.8H, v12.8H, v1.H[3] // ..............*............... - // mul v9.8H, v17.8H, v0.H[6] // ........................*..... - // mul v31.8H, v12.8H, v1.H[2] // ............*................. - // srshr v16.8H, v8.8H, #11 // .............................* - // add v8.8H, v27.8H, v18.8H // ..................*........... - // mls v31.8H, v19.8H, v7.H[0] // .......................*...... - // mul v3.8H, v10.8H, v0.H[2] // .....................*........ - // sub v4.8H, v4.8H, v5.8H // ..........*................... - // sqrdmulh v23.8H, v10.8H, v0.H[3] // ............................*. - // sqrdmulh v19.8H, v17.8H, v0.H[7] // ..........................*... - // sqdmulh v24.8H, v8.8H, v7.H[1] // .........................*.... - // mul v10.8H, v4.8H, v1.H[0] // .................*............ - // sqrdmulh v20.8H, v4.8H, v1.H[1] // ...........................*.. 
+ // ldr q28, [x0, #256] // .*................................ + // ldr q18, [x0, #320] // *................................. + // ldr q23, [x0, #384] // ...*.............................. + // ldr q10, [x0, #128] // ........*......................... + // ldr q12, [x0, #448] // ..*............................... + // ldr q4, [x0, #0] // .....*............................ + // sub v17.8H, v28.8H, v18.8H // ......*........................... + // ldr q9, [x0, #64] // ....*............................. + // add v18.8H, v28.8H, v18.8H // .......*.......................... + // sqrdmulh v19.8H, v17.8H, v1.H[3] // ............*..................... + // sub v6.8H, v23.8H, v12.8H // .........*........................ + // mul v13.8H, v17.8H, v1.H[2] // .............*.................... + // sub v11.8H, v4.8H, v9.8H // ................*................. + // add v25.8H, v23.8H, v12.8H // ..........*....................... + // mul v2.8H, v6.8H, v1.H[4] // ..............*................... + // sqrdmulh v17.8H, v6.8H, v1.H[5] // ...............*.................. + // add v9.8H, v4.8H, v9.8H // .................*................ + // mls v13.8H, v19.8H, v7.H[0] // ...................*.............. + // ldr q21, [x0, #192] // ...........*...................... + // mls v2.8H, v17.8H, v7.H[0] // .....................*............ + // add v26.8H, v10.8H, v21.8H // ....................*............. + // mul v15.8H, v11.8H, v0.H[6] // ...........................*...... + // sub v5.8H, v13.8H, v2.8H // ..........................*....... + // mul v16.8H, v5.8H, v0.H[4] // ............................*..... + // add v6.8H, v18.8H, v25.8H // .........................*........ + // sqrdmulh v22.8H, v5.8H, v0.H[5] // .............................*.... + // sub v19.8H, v18.8H, v25.8H // ........................*......... + // sqrdmulh v20.8H, v11.8H, v0.H[7] // ......................*........... + // add v31.8H, v9.8H, v26.8H // .......................*.......... 
+ // sub v21.8H, v10.8H, v21.8H // ..................*............... + // mls v16.8H, v22.8H, v7.H[0] // .................................* + // mul v5.8H, v19.8H, v0.H[4] // ...............................*.. + // add v24.8H, v31.8H, v6.8H // ..............................*... + // mls v15.8H, v20.8H, v7.H[0] // ................................*. sub count, count, #1 layer123_start: - ldr q12, [x0, #336] // .....e........................................................................................ - ldr q14, [x0, #80] // .e............................................................................................ - mls v3.8H, v23.8H, v7.H[0] // ................................*............................................................. - sub v5.8H, v11.8H, v15.8H // .......................*...................................................................... - mls v9.8H, v19.8H, v7.H[0] // ............*................................................................................. - srshr v26.8H, v24.8H, #11 // .................................................*............................................ - ldr q15, [x0, #464] // .......e...................................................................................... - // gap // .............................................................................................. - mul v6.8H, v5.8H, v1.H[4] // .........................*.................................................................... - sqrdmulh v2.8H, v5.8H, v1.H[5] // ..........................*................................................................... - // gap // .............................................................................................. - ldr q4, [x0, #144] // ..e........................................................................................... - // gap // .............................................................................................. 
- sub v24.8H, v21.8H, v28.8H // ......................................*....................................................... - ldr q5, [x0, #208] // ...e.......................................................................................... - mls v10.8H, v20.8H, v7.H[0] // .................*............................................................................ - mls v8.8H, v26.8H, v7.H[0] // ..................................................*........................................... - ldr q17, [x0, #272] // ....e......................................................................................... - // gap // .............................................................................................. - mls v13.8H, v16.8H, v7.H[0] // .....................................................*........................................ - mls v6.8H, v2.8H, v7.H[0] // ...........................*.................................................................. - mul v16.8H, v24.8H, v0.H[4] // ........................................*..................................................... - ldr q11, [x0, #400] // ......e....................................................................................... - // gap // .............................................................................................. - sqrdmulh v19.8H, v24.8H, v0.H[5] // .........................................*.................................................... - // gap // .............................................................................................. - // gap // .............................................................................................. - sub v28.8H, v9.8H, v10.8H // .................................*............................................................ - add v20.8H, v9.8H, v10.8H // ..................................*........................................................... 
- add v18.8H, v8.8H, v13.8H // .......................................................*...................................... - ldr q10, [x0, #16] // e............................................................................................. - // gap // .............................................................................................. - sub v2.8H, v8.8H, v13.8H // ......................................................*....................................... - add v26.8H, v31.8H, v6.8H // ............................................*................................................. - // gap // .............................................................................................. - // gap // .............................................................................................. - mls v16.8H, v19.8H, v7.H[0] // ..........................................*................................................... - // gap // .............................................................................................. - // gap // .............................................................................................. - sqrdmulh v13.8H, v18.8H, v30.8H // ...............................................................................*.............. - sub v8.8H, v31.8H, v6.8H // ...........................................*.................................................. - mul v31.8H, v18.8H, v29.8H // ..............................................................................*............... - // gap // .............................................................................................. - // gap // .............................................................................................. - mul v24.8H, v28.8H, v0.H[2] // ...................................*.......................................................... - // gap // .............................................................................................. 
- // gap // .............................................................................................. - mul v22.8H, v2.8H, v0.H[0] // ........................................................*..................................... - // gap // .............................................................................................. - // gap // .............................................................................................. - sqrdmulh v27.8H, v2.8H, v0.H[1] // .........................................................*.................................... - add v2.8H, v3.8H, v16.8H // .................................................................*............................ - sub v25.8H, v3.8H, v16.8H // ................................................................*............................. - mls v31.8H, v13.8H, v7.H[0] // ................................................................................*............. - // gap // .............................................................................................. - // gap // .............................................................................................. - mul v13.8H, v8.8H, v0.H[4] // .............................................*................................................ - sqrdmulh v23.8H, v2.8H, v30.8H // .....................................................................................*........ - // gap // .............................................................................................. - // gap // .............................................................................................. - sqrdmulh v8.8H, v8.8H, v0.H[5] // ..............................................*............................................... - // gap // .............................................................................................. - // gap // .............................................................................................. 
- sqrdmulh v3.8H, v25.8H, v0.H[1] // ...................................................................*.......................... - mls v22.8H, v27.8H, v7.H[0] // ..........................................................*................................... - // gap // .............................................................................................. - // gap // .............................................................................................. - sqrdmulh v19.8H, v28.8H, v0.H[3] // ....................................*......................................................... - // gap // .............................................................................................. - // gap // .............................................................................................. - mul v6.8H, v25.8H, v0.H[0] // ..................................................................*........................... - sub v25.8H, v20.8H, v26.8H // ...........................................................*.................................. - add v26.8H, v20.8H, v26.8H // ............................................................*................................. - mls v13.8H, v8.8H, v7.H[0] // ...............................................*.............................................. - // gap // .............................................................................................. - // gap // .............................................................................................. - mls v24.8H, v19.8H, v7.H[0] // .....................................*........................................................ - add v27.8H, v10.8H, v14.8H // .........e.................................................................................... - // gap // .............................................................................................. 
- // gap // .............................................................................................. - // gap // .............................................................................................. - // gap // .............................................................................................. - add v21.8H, v17.8H, v12.8H // ...................e.......................................................................... - sqrdmulh v16.8H, v26.8H, v30.8H // ..................................................................................*........... - sub v12.8H, v17.8H, v12.8H // ..................e........................................................................... - mul v8.8H, v26.8H, v29.8H // .................................................................................*............ - // gap // .............................................................................................. - // gap // .............................................................................................. - sub v19.8H, v24.8H, v13.8H // .....................................................................*........................ - // gap // .............................................................................................. - // gap // .............................................................................................. - add v28.8H, v11.8H, v15.8H // ........................e..................................................................... - mul v2.8H, v2.8H, v29.8H // ....................................................................................*......... - add v24.8H, v24.8H, v13.8H // ......................................................................*....................... - // gap // .............................................................................................. - // gap // .............................................................................................. 
- mls v8.8H, v16.8H, v7.H[0] // ...................................................................................*.......... - add v18.8H, v4.8H, v5.8H // ..............e............................................................................... - // gap // .............................................................................................. - // gap // .............................................................................................. - // gap // .............................................................................................. - // gap // .............................................................................................. - add v13.8H, v21.8H, v28.8H // .......................................e...................................................... - sqrdmulh v20.8H, v24.8H, v30.8H // ........................................................................................*..... - sub v17.8H, v10.8H, v14.8H // ........e..................................................................................... - // gap // .............................................................................................. - // gap // .............................................................................................. - mul v24.8H, v24.8H, v29.8H // .......................................................................................*...... - str q8, [x0, #64] // ...........................................................................................*.. - mul v14.8H, v19.8H, v0.H[0] // .......................................................................*...................... - // gap // .............................................................................................. - sqdmulh v8.8H, v13.8H, v7.H[1] // ...................................................e.......................................... 
- str q31, [x0], #(16) // ..........................................................................................*... - sqrdmulh v16.8H, v19.8H, v0.H[1] // ........................................................................*..................... - // gap // .............................................................................................. - sub v10.8H, v27.8H, v18.8H // ............................e................................................................. - mls v24.8H, v20.8H, v7.H[0] // .........................................................................................*.... - sqrdmulh v19.8H, v12.8H, v1.H[3] // .....................e........................................................................ - // gap // .............................................................................................. - // gap // .............................................................................................. - sqrdmulh v20.8H, v25.8H, v0.H[1] // ..............................................................*............................... - mul v9.8H, v17.8H, v0.H[6] // ..........e................................................................................... - // gap // .............................................................................................. - // gap // .............................................................................................. - mul v31.8H, v12.8H, v1.H[2] // ....................e......................................................................... - mls v14.8H, v16.8H, v7.H[0] // .........................................................................*.................... - // gap // .............................................................................................. - // gap // .............................................................................................. 
- mul v26.8H, v25.8H, v0.H[0] // .............................................................*................................ - srshr v16.8H, v8.8H, #11 // ....................................................e......................................... - str q24, [x0, #176] // .............................................................................................* - // gap // .............................................................................................. - mls v2.8H, v23.8H, v7.H[0] // ......................................................................................*....... - add v8.8H, v27.8H, v18.8H // .............................e................................................................ - // gap // .............................................................................................. - // gap // .............................................................................................. - mls v6.8H, v3.8H, v7.H[0] // ....................................................................*......................... - mls v31.8H, v19.8H, v7.H[0] // ......................e....................................................................... - str q14, [x0, #432] // .............................................................................*................ - // gap // .............................................................................................. - str q22, [x0, #240] // ..........................................................................*................... - mls v26.8H, v20.8H, v7.H[0] // ...............................................................*.............................. - mul v3.8H, v10.8H, v0.H[2] // ..............................e............................................................... - // gap // .............................................................................................. 
- sub v4.8H, v4.8H, v5.8H // .............e................................................................................ - sqrdmulh v23.8H, v10.8H, v0.H[3] // ...............................e.............................................................. - str q2, [x0, #112] // ............................................................................................*. - // gap // .............................................................................................. - sqrdmulh v19.8H, v17.8H, v0.H[7] // ...........e.................................................................................. - str q6, [x0, #368] // ............................................................................*................. - // gap // .............................................................................................. - sqdmulh v24.8H, v8.8H, v7.H[1] // ................................................e............................................. - str q26, [x0, #304] // ...........................................................................*.................. - mul v10.8H, v4.8H, v1.H[0] // ...............e.............................................................................. - sqrdmulh v20.8H, v4.8H, v1.H[1] // ................e............................................................................. - // gap // .............................................................................................. + sub v27.8H, v31.8H, v6.8H // ................................................*....................................... + sqrdmulh v22.8H, v19.8H, v0.H[5] // .........................................*.............................................. + ldr q28, [x0, #272] // ....e................................................................................... + ldr q18, [x0, #336] // .....e.................................................................................. 
+ // gap // ........................................................................................ + sub v19.8H, v9.8H, v26.8H // ............................*........................................................... + add v26.8H, v13.8H, v2.8H // ............................................*........................................... + ldr q23, [x0, #400] // ......e................................................................................. + ldr q10, [x0, #144] // ..e..................................................................................... + mul v20.8H, v24.8H, v29.8H // ........................................................................*............... + ldr q12, [x0, #464] // .......e................................................................................ + mul v31.8H, v27.8H, v0.H[0] // ..................................................*..................................... + mul v8.8H, v19.8H, v0.H[2] // ..............................*......................................................... + // gap // ........................................................................................ + sqrdmulh v14.8H, v19.8H, v0.H[3] // ...............................*........................................................ + // gap // ........................................................................................ + sqrdmulh v27.8H, v27.8H, v0.H[1] // ...................................................*.................................... + ldr q4, [x0, #16] // e....................................................................................... + // gap // ........................................................................................ + sub v17.8H, v28.8H, v18.8H // ..................e..................................................................... + mls v5.8H, v22.8H, v7.H[0] // ..........................................*............................................. 
+ // gap // ........................................................................................ + ldr q9, [x0, #80] // .e...................................................................................... + add v18.8H, v28.8H, v18.8H // ...................e.................................................................... + // gap // ........................................................................................ + // gap // ........................................................................................ + sqrdmulh v19.8H, v17.8H, v1.H[3] // .....................e.................................................................. + sub v6.8H, v23.8H, v12.8H // .......................e................................................................ + mul v13.8H, v17.8H, v1.H[2] // ....................e................................................................... + // gap // ........................................................................................ + mls v31.8H, v27.8H, v7.H[0] // ....................................................*................................... + // gap // ........................................................................................ + mul v27.8H, v21.8H, v1.H[0] // ...............*........................................................................ + sqrdmulh v17.8H, v21.8H, v1.H[1] // ................*....................................................................... + // gap // ........................................................................................ + // gap // ........................................................................................ + mls v8.8H, v14.8H, v7.H[0] // ................................*....................................................... + // gap // ........................................................................................ 
+ // gap // ........................................................................................ + sub v11.8H, v4.8H, v9.8H // ........e............................................................................... + // gap // ........................................................................................ + // gap // ........................................................................................ + add v25.8H, v23.8H, v12.8H // ........................e............................................................... + mul v2.8H, v6.8H, v1.H[4] // .........................e.............................................................. + str q31, [x0, #256] // ....................................................................*................... + mls v27.8H, v17.8H, v7.H[0] // .................*...................................................................... + // gap // ........................................................................................ + sqrdmulh v17.8H, v6.8H, v1.H[5] // ..........................e............................................................. + sub v3.8H, v8.8H, v5.8H // ..........................................................*............................. + add v28.8H, v8.8H, v5.8H // ...........................................................*............................ + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + add v9.8H, v4.8H, v9.8H // .........e.............................................................................. 
+ mls v13.8H, v19.8H, v7.H[0] // ......................e................................................................. + // gap // ........................................................................................ + ldr q21, [x0, #208] // ...e.................................................................................... + sub v12.8H, v15.8H, v27.8H // .................................*...................................................... + add v23.8H, v15.8H, v27.8H // ..................................*..................................................... + // gap // ........................................................................................ + // gap // ........................................................................................ + mul v27.8H, v28.8H, v29.8H // ..............................................................................*......... + mls v2.8H, v17.8H, v7.H[0] // ...........................e............................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + mul v17.8H, v12.8H, v0.H[2] // ...................................*.................................................... + add v19.8H, v23.8H, v26.8H // ......................................................*................................. + sqrdmulh v22.8H, v12.8H, v0.H[3] // ....................................*................................................... + // gap // ........................................................................................ + // gap // ........................................................................................ + sub v31.8H, v23.8H, v26.8H // .....................................................*.................................. 
+ // gap // ........................................................................................ + // gap // ........................................................................................ + mul v4.8H, v19.8H, v29.8H // ...........................................................................*............ + add v26.8H, v10.8H, v21.8H // ..............e......................................................................... + // gap // ........................................................................................ + // gap // ........................................................................................ + sqrdmulh v6.8H, v31.8H, v0.H[1] // ........................................................*............................... + mul v31.8H, v31.8H, v0.H[0] // .......................................................*................................ + mls v17.8H, v22.8H, v7.H[0] // .....................................*.................................................. + // gap // ........................................................................................ + // gap // ........................................................................................ + mul v15.8H, v11.8H, v0.H[6] // ..........e............................................................................. + sqrdmulh v8.8H, v19.8H, v30.8H // ............................................................................*........... + sub v5.8H, v13.8H, v2.8H // ...........................................e............................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ 
+ sqrdmulh v23.8H, v28.8H, v30.8H // ...............................................................................*........ + mls v31.8H, v6.8H, v7.H[0] // .........................................................*.............................. + sub v12.8H, v17.8H, v16.8H // ...............................................................*........................ + // gap // ........................................................................................ + add v22.8H, v17.8H, v16.8H // ................................................................*....................... + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + mul v16.8H, v5.8H, v0.H[4] // .............................................e.......................................... + add v6.8H, v18.8H, v25.8H // .......................................e................................................ + mul v19.8H, v22.8H, v29.8H // .................................................................................*...... + sqrdmulh v22.8H, v22.8H, v30.8H // ..................................................................................*..... + // gap // ........................................................................................ + // gap // ........................................................................................ + sqrdmulh v17.8H, v24.8H, v30.8H // .........................................................................*.............. + mul v24.8H, v12.8H, v0.H[0] // .................................................................*...................... + str q31, [x0, #320] // .....................................................................*.................. 
+ // gap // ........................................................................................ + sqrdmulh v28.8H, v12.8H, v0.H[1] // ..................................................................*..................... + // gap // ........................................................................................ + // gap // ........................................................................................ + mls v27.8H, v23.8H, v7.H[0] // ................................................................................*....... + mls v19.8H, v22.8H, v7.H[0] // ...................................................................................*.... + sqrdmulh v14.8H, v3.8H, v0.H[1] // .............................................................*.......................... + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + mls v20.8H, v17.8H, v7.H[0] // ..........................................................................*............. + // gap // ........................................................................................ + mul v12.8H, v3.8H, v0.H[0] // ............................................................*........................... + // gap // ........................................................................................ + mls v4.8H, v8.8H, v7.H[0] // .............................................................................*.......... + mls v24.8H, v28.8H, v7.H[0] // ...................................................................*.................... + str q27, [x0, #128] // ......................................................................................*. 
+ str q19, [x0, #192] // .......................................................................................* + sqrdmulh v22.8H, v5.8H, v0.H[5] // ..............................................e......................................... + // gap // ........................................................................................ + sub v19.8H, v18.8H, v25.8H // ......................................e................................................. + mls v12.8H, v14.8H, v7.H[0] // ..............................................................*......................... + str q20, [x0], #(16) // ....................................................................................*... + sqrdmulh v20.8H, v11.8H, v0.H[7] // ...........e............................................................................ + // gap // ........................................................................................ + str q24, [x0, #432] // .......................................................................*................ + add v31.8H, v9.8H, v26.8H // .............................e.......................................................... + sub v21.8H, v10.8H, v21.8H // .............e.......................................................................... + // gap // ........................................................................................ + mls v16.8H, v22.8H, v7.H[0] // ...............................................e........................................ + str q4, [x0, #48] // .....................................................................................*.. + // gap // ........................................................................................ + mul v5.8H, v19.8H, v0.H[4] // ........................................e............................................... + str q12, [x0, #368] // ......................................................................*................. 
+ add v24.8H, v31.8H, v6.8H // .................................................e...................................... + // gap // ........................................................................................ + mls v15.8H, v20.8H, v7.H[0] // ............e........................................................................... // original source code - // ldr q8, [x0, #0] // .......................e......................................................................|......................e.................................................................... - // ldr q9, [x0, #(1*(512/8))] // .e............................................................................................|e.......................................................................................... - // ldr q10, [x0, #(2*(512/8))] // .........e....................................................................................|........e.................................................................................. - // ldr q11, [x0, #(3*(512/8))] // ...........e..................................................................................|..........e................................................................................ - // ldr q12, [x0, #(4*(512/8))] // ..............e...............................................................................|.............e............................................................................. - // ldr q13, [x0, #(5*(512/8))] // e.............................................................................................e........................................................................................... - // ldr q14, [x0, #(6*(512/8))] // ..................e...........................................................................|.................e......................................................................... 
- // ldr q15, [x0, #(7*(512/8))] // ......e.......................................................................................|.....e..................................................................................... - // sub v24.8h, v8.8h, v9.8h // ............................................................e.................................|...........................................................e............................... - // add v8.8h, v8.8h, v9.8h // ...............................................e..............................................|..............................................e............................................ - // mul v9.8h, v24.8h, v0.h[6] // .......................................................................e......................|......................................................................e.................... - // sqrdmulh v24.8h, v24.8h, v0.h[7] // ........................................................................................e.....|.......................................................................................e... - // mls v9.8h, v24.8h, v7.h[0] // ....*.........................................................................................|...*....................................................................................... - // sub v24.8h, v10.8h, v11.8h // .....................................................................................e........|....................................................................................e...... - // add v10.8h, v10.8h, v11.8h // .........................................................e....................................|........................................................e.................................. 
- // mul v11.8h, v24.8h, v1.h[0] // ............................................................................................e.|........................................................................................... - // sqrdmulh v24.8h, v24.8h, v1.h[1] // .............................................................................................e|........................................................................................... - // mls v11.8h, v24.8h, v7.h[0] // ............*.................................................................................|...........*............................................................................... - // sub v24.8h, v12.8h, v13.8h // ..................................................e...........................................|.................................................e......................................... - // add v12.8h, v12.8h, v13.8h // ................................................e.............................................|...............................................e........................................... - // mul v13.8h, v24.8h, v1.h[2] // ........................................................................e.....................|.......................................................................e................... - // sqrdmulh v24.8h, v24.8h, v1.h[3] // .....................................................................e........................|....................................................................e...................... - // mls v13.8h, v24.8h, v7.h[0] // ................................................................................e.............|...............................................................................e........... 
- // sub v24.8h, v14.8h, v15.8h // ...*..........................................................................................|..*........................................................................................ - // add v14.8h, v14.8h, v15.8h // .....................................................e........................................|....................................................e...................................... - // mul v15.8h, v24.8h, v1.h[4] // .......*......................................................................................|......*.................................................................................... - // sqrdmulh v24.8h, v24.8h, v1.h[5] // ........*.....................................................................................|.......*................................................................................... - // mls v15.8h, v24.8h, v7.h[0] // ................*.............................................................................|...............*........................................................................... - // sub v24.8h, v8.8h, v10.8h // ...................................................................e..........................|..................................................................e........................ - // add v8.8h, v8.8h, v10.8h // ..............................................................................e...............|.............................................................................e............. - // mul v10.8h, v24.8h, v0.h[2] // ....................................................................................e.........|...................................................................................e....... 
- // sqrdmulh v24.8h, v24.8h, v0.h[3] // ......................................................................................e.......|.....................................................................................e..... - // mls v10.8h, v24.8h, v7.h[0] // ..*...........................................................................................|.*......................................................................................... - // sub v24.8h, v9.8h, v11.8h // ....................*.........................................................................|...................*....................................................................... - // add v9.8h, v9.8h, v11.8h // .....................*........................................................................|....................*...................................................................... - // mul v11.8h, v24.8h, v0.h[2] // ..............................*...............................................................|.............................*............................................................. - // sqrdmulh v24.8h, v24.8h, v0.h[3] // .........................................*....................................................|........................................*.................................................. - // mls v11.8h, v24.8h, v7.h[0] // ..............................................*...............................................|.............................................*............................................. - // sub v24.8h, v12.8h, v14.8h // ..........*...................................................................................|.........*................................................................................. 
- // add v12.8h, v12.8h, v14.8h // ..........................................................e...................................|.........................................................e................................. - // mul v14.8h, v24.8h, v0.h[4] // .................*............................................................................|................*.......................................................................... - // sqrdmulh v24.8h, v24.8h, v0.h[5] // ...................*..........................................................................|..................*........................................................................ - // mls v14.8h, v24.8h, v7.h[0] // ..........................*...................................................................|.........................*................................................................. - // sub v24.8h, v13.8h, v15.8h // ............................*.................................................................|...........................*............................................................... - // add v13.8h, v13.8h, v15.8h // .........................*....................................................................|........................*.................................................................. - // mul v15.8h, v24.8h, v0.h[4] // ....................................*.........................................................|...................................*....................................................... - // sqrdmulh v24.8h, v24.8h, v0.h[5] // ......................................*.......................................................|.....................................*..................................................... 
- // mls v15.8h, v24.8h, v7.h[0] // .............................................*................................................|............................................*.............................................. - // sqdmulh v25.8h, v8.8h, v7.h[1] // ..........................................................................................e...|.........................................................................................e. - // srshr v25.8h, v25.8h, #11 // .....*........................................................................................|....*...................................................................................... - // mls v8.8h, v25.8h, v7.h[0] // .............*................................................................................|............*.............................................................................. - // sqdmulh v25.8h, v12.8h, v7.h[1] // ................................................................e.............................|...............................................................e........................... - // srshr v25.8h, v25.8h, #11 // ...........................................................................e..................|..........................................................................e................ - // mls v12.8h, v25.8h, v7.h[0] // ...............*..............................................................................|..............*............................................................................ - // sub v24.8h, v8.8h, v12.8h // ........................*.....................................................................|.......................*................................................................... 
- // add v8.8h, v8.8h, v12.8h // ......................*.......................................................................|.....................*..................................................................... - // mul v12.8h, v24.8h, v0.h[0] // ...............................*..............................................................|..............................*............................................................ - // sqrdmulh v24.8h, v24.8h, v0.h[1] // ................................*.............................................................|...............................*........................................................... - // mls v12.8h, v24.8h, v7.h[0] // ........................................*.....................................................|.......................................*................................................... - // sub v24.8h, v9.8h, v13.8h // ...........................................*..................................................|..........................................*................................................ - // add v9.8h, v9.8h, v13.8h // ............................................*.................................................|...........................................*............................................... - // mul v13.8h, v24.8h, v0.h[0] // ..........................................................................*...................|.........................................................................*................. - // sqrdmulh v24.8h, v24.8h, v0.h[1] // ......................................................................*.......................|.....................................................................*..................... 
- // mls v13.8h, v24.8h, v7.h[0] // ...................................................................................*..........|..................................................................................*........ - // sub v24.8h, v10.8h, v14.8h // ..................................*...........................................................|.................................*......................................................... - // add v10.8h, v10.8h, v14.8h // .................................*............................................................|................................*.......................................................... - // mul v14.8h, v24.8h, v0.h[0] // ..........................................*...................................................|.........................................*................................................. - // sqrdmulh v24.8h, v24.8h, v0.h[1] // .......................................*......................................................|......................................*.................................................... - // mls v14.8h, v24.8h, v7.h[0] // ...............................................................................*..............|..............................................................................*............ - // sub v24.8h, v11.8h, v15.8h // ....................................................*.........................................|...................................................*....................................... - // add v11.8h, v11.8h, v15.8h // .......................................................*......................................|......................................................*.................................... 
- // mul v15.8h, v24.8h, v0.h[0] // ...............................................................*..............................|..............................................................*............................ - // sqrdmulh v24.8h, v24.8h, v0.h[1] // ..................................................................*...........................|.................................................................*......................... - // mls v15.8h, v24.8h, v7.h[0] // .........................................................................*....................|........................................................................*.................. - // str q12, [x0, #(4*(512/8))] // ..................................................................................*...........|.................................................................................*......... - // str q13, [x0, #(5*(512/8))] // ...........................................................................................*..|..........................................................................................* - // str q14, [x0, #(6*(512/8))] // .........................................................................................*....|........................................................................................*.. - // str q15, [x0, #(7*(512/8))] // .................................................................................*............|................................................................................*.......... - // mul v12.8h, v8.8h, v29.8h // .............................*................................................................|............................*.............................................................. 
- // sqrdmulh v8.8h, v8.8h, v30.8h // ...........................*..................................................................|..........................*................................................................ - // mls v12.8h, v8.8h, v7.h[0] // ...................................*..........................................................|..................................*........................................................ - // mul v13.8h, v9.8h, v29.8h // ...................................................*..........................................|..................................................*........................................ - // sqrdmulh v9.8h, v9.8h, v30.8h // .................................................*............................................|................................................*.......................................... - // mls v13.8h, v9.8h, v7.h[0] // ........................................................*.....................................|.......................................................*................................... - // mul v14.8h, v10.8h, v29.8h // ......................................................*.......................................|.....................................................*..................................... - // sqrdmulh v10.8h, v10.8h, v30.8h // .....................................*........................................................|....................................*...................................................... - // mls v14.8h, v10.8h, v7.h[0] // .............................................................................*................|............................................................................*.............. 
- // mul v15.8h, v11.8h, v29.8h // .............................................................*................................|............................................................*.............................. - // sqrdmulh v11.8h, v11.8h, v30.8h // ...........................................................*..................................|..........................................................*................................ - // mls v15.8h, v11.8h, v7.h[0] // ....................................................................*.........................|...................................................................*....................... - // str q12, [x0], #(16) // .................................................................*............................|................................................................*.......................... - // str q13, [x0, #(-16 + 1*(512/8))] // ..............................................................*...............................|.............................................................*............................. - // str q14, [x0, #(-16 + 2*(512/8))] // .......................................................................................*......|......................................................................................*.... - // str q15, [x0, #(-16 + 3*(512/8))] // ............................................................................*.................|...........................................................................*............... + // ldr q8, [x0, #0] // ............e.........................................................................|.............e....................................................................... + // ldr q9, [x0, #(1*(512/8))] // ...............e......................................................................|................e.................................................................... 
+ // ldr q10, [x0, #(2*(512/8))] // .....e................................................................................|......e.............................................................................. + // ldr q11, [x0, #(3*(512/8))] // ..................................e...................................................|...................................e................................................. + // ldr q12, [x0, #(4*(512/8))] // e.....................................................................................|.e................................................................................... + // ldr q13, [x0, #(5*(512/8))] // .e....................................................................................|..e.................................................................................. + // ldr q14, [x0, #(6*(512/8))] // ....e.................................................................................|.....e............................................................................... + // ldr q15, [x0, #(7*(512/8))] // .......e..............................................................................|........e............................................................................ + // sub v24.8h, v8.8h, v9.8h // ........................e.............................................................|.........................e........................................................... + // add v8.8h, v8.8h, v9.8h // ................................e.....................................................|.................................e................................................... + // mul v9.8h, v24.8h, v0.h[6] // ................................................e.....................................|.................................................e................................... 
+ // sqrdmulh v24.8h, v24.8h, v0.h[7] // ............................................................................e.........|.............................................................................e....... + // mls v9.8h, v24.8h, v7.h[0] // .....................................................................................e|..................................................................................... + // sub v24.8h, v10.8h, v11.8h // ...............................................................................e......|................................................................................e.... + // add v10.8h, v10.8h, v11.8h // ............................................e.........................................|.............................................e....................................... + // mul v11.8h, v24.8h, v1.h[0] // .....................*................................................................|......................*.............................................................. + // sqrdmulh v24.8h, v24.8h, v1.h[1] // ......................*...............................................................|.......................*............................................................. + // mls v11.8h, v24.8h, v7.h[0] // ............................*.........................................................|.............................*....................................................... + // sub v24.8h, v12.8h, v13.8h // .............e........................................................................|..............e...................................................................... + // add v12.8h, v12.8h, v13.8h // ................e.....................................................................|.................e................................................................... 
+ // mul v13.8h, v24.8h, v1.h[2] // ...................e..................................................................|....................e................................................................ + // sqrdmulh v24.8h, v24.8h, v1.h[3] // .................e....................................................................|..................e.................................................................. + // mls v13.8h, v24.8h, v7.h[0] // .................................e....................................................|..................................e.................................................. + // sub v24.8h, v14.8h, v15.8h // ..................e...................................................................|...................e................................................................. + // add v14.8h, v14.8h, v15.8h // .........................e............................................................|..........................e.......................................................... + // mul v15.8h, v24.8h, v1.h[4] // ..........................e...........................................................|...........................e......................................................... + // sqrdmulh v24.8h, v24.8h, v1.h[5] // .............................e........................................................|..............................e...................................................... + // mls v15.8h, v24.8h, v7.h[0] // ......................................e...............................................|.......................................e............................................. + // sub v24.8h, v8.8h, v10.8h // ..*...................................................................................|...*................................................................................. 
+ // add v8.8h, v8.8h, v10.8h // ..............................................................................e.......|...............................................................................e..... + // mul v10.8h, v24.8h, v0.h[2] // .........*............................................................................|..........*.......................................................................... + // sqrdmulh v24.8h, v24.8h, v0.h[3] // ..........*...........................................................................|...........*......................................................................... + // mls v10.8h, v24.8h, v7.h[0] // .......................*..............................................................|........................*............................................................ + // sub v24.8h, v9.8h, v11.8h // ...................................*..................................................|....................................*................................................ + // add v9.8h, v9.8h, v11.8h // ....................................*.................................................|.....................................*............................................... + // mul v11.8h, v24.8h, v0.h[2] // .......................................*..............................................|........................................*............................................ + // sqrdmulh v24.8h, v24.8h, v0.h[3] // .........................................*............................................|..........................................*.......................................... + // mls v11.8h, v24.8h, v7.h[0] // ...............................................*......................................|................................................*.................................... 
+ // sub v24.8h, v12.8h, v14.8h // .........................................................................e............|..........................................................................e.......... + // add v12.8h, v12.8h, v14.8h // ........................................................e.............................|.........................................................e........................... + // mul v14.8h, v24.8h, v0.h[4] // ..................................................................................e...|...................................................................................e. + // sqrdmulh v24.8h, v24.8h, v0.h[5] // ......................................................................................|*.................................................................................... + // mls v14.8h, v24.8h, v7.h[0] // ..............*.......................................................................|...............*..................................................................... + // sub v24.8h, v13.8h, v15.8h // ..................................................e...................................|...................................................e................................. + // add v13.8h, v13.8h, v15.8h // ...*..................................................................................|....*................................................................................ + // mul v15.8h, v24.8h, v0.h[4] // .......................................................e..............................|........................................................e............................ + // sqrdmulh v24.8h, v24.8h, v0.h[5] // ........................................................................e.............|.........................................................................e........... 
+ // mls v15.8h, v24.8h, v7.h[0] // ................................................................................e.....|.................................................................................e... + // sub v24.8h, v8.8h, v12.8h // ......................................................................................*..................................................................................... + // add v8.8h, v8.8h, v12.8h // ....................................................................................e.|..................................................................................... + // mul v12.8h, v24.8h, v0.h[0] // ........*.............................................................................|.........*........................................................................... + // sqrdmulh v24.8h, v24.8h, v0.h[1] // ...........*..........................................................................|............*........................................................................ + // mls v12.8h, v24.8h, v7.h[0] // ....................*.................................................................|.....................*............................................................... + // sub v24.8h, v9.8h, v13.8h // ..........................................*...........................................|...........................................*......................................... + // add v9.8h, v9.8h, v13.8h // ........................................*.............................................|.........................................*........................................... + // mul v13.8h, v24.8h, v0.h[0] // ..............................................*.......................................|...............................................*..................................... 
+ // sqrdmulh v24.8h, v24.8h, v0.h[1] // .............................................*........................................|..............................................*...................................... + // mls v13.8h, v24.8h, v7.h[0] // ....................................................*.................................|.....................................................*............................... + // sub v24.8h, v10.8h, v14.8h // ..............................*.......................................................|...............................*..................................................... + // add v10.8h, v10.8h, v14.8h // ...............................*......................................................|................................*.................................................... + // mul v14.8h, v24.8h, v0.h[0] // ...................................................................*..................|....................................................................*................ + // sqrdmulh v24.8h, v24.8h, v0.h[1] // .................................................................*....................|..................................................................*.................. + // mls v14.8h, v24.8h, v7.h[0] // ..........................................................................*...........|...........................................................................*......... + // sub v24.8h, v11.8h, v15.8h // .....................................................*................................|......................................................*.............................. + // add v11.8h, v11.8h, v15.8h // ......................................................*...............................|.......................................................*............................. 
+ // mul v15.8h, v24.8h, v0.h[0] // ............................................................*.........................|.............................................................*....................... + // sqrdmulh v24.8h, v24.8h, v0.h[1] // ..............................................................*.......................|...............................................................*..................... + // mls v15.8h, v24.8h, v7.h[0] // .....................................................................*................|......................................................................*.............. + // str q12, [x0, #(4*(512/8))] // ...........................*..........................................................|............................*........................................................ + // str q13, [x0, #(5*(512/8))] // .............................................................*........................|..............................................................*...................... + // str q14, [x0, #(6*(512/8))] // ...................................................................................*..|....................................................................................* + // str q15, [x0, #(7*(512/8))] // .............................................................................*........|..............................................................................*...... + // mul v12.8h, v8.8h, v29.8h // ......*...............................................................................|.......*............................................................................. + // sqrdmulh v8.8h, v8.8h, v30.8h // ...........................................................*..........................|............................................................*........................ 
+ // mls v12.8h, v8.8h, v7.h[0] // ..................................................................*...................|...................................................................*................. + // mul v13.8h, v9.8h, v29.8h // ...........................................*..........................................|............................................*........................................ + // sqrdmulh v9.8h, v9.8h, v30.8h // .................................................*....................................|..................................................*.................................. + // mls v13.8h, v9.8h, v7.h[0] // ....................................................................*.................|.....................................................................*............... + // mul v14.8h, v10.8h, v29.8h // .....................................*................................................|......................................*.............................................. + // sqrdmulh v10.8h, v10.8h, v30.8h // ...................................................*..................................|....................................................*................................ + // mls v14.8h, v10.8h, v7.h[0] // ...............................................................*......................|................................................................*.................... + // mul v15.8h, v11.8h, v29.8h // .........................................................*............................|..........................................................*.......................... + // sqrdmulh v11.8h, v11.8h, v30.8h // ..........................................................*...........................|...........................................................*......................... 
+ // mls v15.8h, v11.8h, v7.h[0] // ................................................................*.....................|.................................................................*................... + // str q12, [x0], #(16) // ...........................................................................*..........|............................................................................*........ + // str q13, [x0, #(-16 + 1*(512/8))] // .................................................................................*....|..................................................................................*.. + // str q14, [x0, #(-16 + 2*(512/8))] // ......................................................................*...............|.......................................................................*............. + // str q15, [x0, #(-16 + 3*(512/8))] // .......................................................................*..............|........................................................................*............ sub count, count, #1 cbnz count, layer123_start - mls v3.8H, v23.8H, v7.H[0] // *............................................................... - sub v2.8H, v11.8H, v15.8H // .*.............................................................. - // gap // ................................................................ - // gap // ................................................................ - mls v9.8H, v19.8H, v7.H[0] // ..*............................................................. - srshr v23.8H, v24.8H, #11 // ...*............................................................ - // gap // ................................................................ - // gap // ................................................................ - mul v26.8H, v2.8H, v1.H[4] // ....*........................................................... - sqrdmulh v2.8H, v2.8H, v1.H[5] // .....*.......................................................... 
- // gap // ................................................................ - // gap // ................................................................ - sub v21.8H, v21.8H, v28.8H // ......*......................................................... - mls v10.8H, v20.8H, v7.H[0] // .......*........................................................ - // gap // ................................................................ - // gap // ................................................................ - mls v8.8H, v23.8H, v7.H[0] // ........*....................................................... - mls v13.8H, v16.8H, v7.H[0] // .........*...................................................... - // gap // ................................................................ - // gap // ................................................................ - mls v26.8H, v2.8H, v7.H[0] // ..........*..................................................... - mul v2.8H, v21.8H, v0.H[4] // ...........*.................................................... - // gap // ................................................................ - // gap // ................................................................ - sqrdmulh v16.8H, v21.8H, v0.H[5] // ............*................................................... - sub v23.8H, v9.8H, v10.8H // .............*.................................................. - // gap // ................................................................ - // gap // ................................................................ - add v21.8H, v9.8H, v10.8H // ..............*................................................. - add v20.8H, v8.8H, v13.8H // ...............*................................................ - // gap // ................................................................ - // gap // ................................................................ - sub v17.8H, v31.8H, v26.8H // ....................*........................................... 
- add v26.8H, v31.8H, v26.8H // .................*.............................................. - // gap // ................................................................ - // gap // ................................................................ - mls v2.8H, v16.8H, v7.H[0] // ..................*............................................. - sub v16.8H, v8.8H, v13.8H // ................*............................................... - // gap // ................................................................ - // gap // ................................................................ - sqrdmulh v4.8H, v20.8H, v30.8H // ...................*............................................ - mul v20.8H, v20.8H, v29.8H // .....................*.......................................... - // gap // ................................................................ - // gap // ................................................................ - mul v25.8H, v16.8H, v0.H[0] // .......................*........................................ - mul v11.8H, v23.8H, v0.H[2] // ......................*......................................... - // gap // ................................................................ - // gap // ................................................................ - sub v13.8H, v3.8H, v2.8H // ..........................*..................................... - add v2.8H, v3.8H, v2.8H // .........................*...................................... - // gap // ................................................................ - // gap // ................................................................ - sqrdmulh v16.8H, v16.8H, v0.H[1] // ........................*....................................... - mls v20.8H, v4.8H, v7.H[0] // ...........................*.................................... - // gap // ................................................................ - // gap // ................................................................ 
- mul v4.8H, v17.8H, v0.H[4] // ............................*................................... - sqrdmulh v3.8H, v2.8H, v30.8H // .............................*.................................. - // gap // ................................................................ - // gap // ................................................................ - sqrdmulh v17.8H, v17.8H, v0.H[5] // ..............................*................................. - sqrdmulh v10.8H, v13.8H, v0.H[1] // ...............................*................................ - // gap // ................................................................ - // gap // ................................................................ - sqrdmulh v23.8H, v23.8H, v0.H[3] // .................................*.............................. - mls v25.8H, v16.8H, v7.H[0] // ................................*............................... - str q20, [x0], #(16) // .................................................*.............. - // gap // ................................................................ - mul v16.8H, v13.8H, v0.H[0] // ..................................*............................. - sub v20.8H, v21.8H, v26.8H // ...................................*............................ - // gap // ................................................................ - // gap // ................................................................ - add v21.8H, v21.8H, v26.8H // ....................................*........................... - mls v4.8H, v17.8H, v7.H[0] // .....................................*.......................... - // gap // ................................................................ - // gap // ................................................................ - mls v11.8H, v23.8H, v7.H[0] // ......................................*......................... - mul v2.8H, v2.8H, v29.8H // ..........................................*..................... 
- str q25, [x0, #240] // ...........................................................*.... - // gap // ................................................................ - sqrdmulh v23.8H, v21.8H, v30.8H // .......................................*........................ - mul v21.8H, v21.8H, v29.8H // ........................................*....................... - // gap // ................................................................ - // gap // ................................................................ - sqrdmulh v26.8H, v20.8H, v0.H[1] // ....................................................*........... - mul v20.8H, v20.8H, v0.H[0] // ......................................................*......... - // gap // ................................................................ - // gap // ................................................................ - add v17.8H, v11.8H, v4.8H // ...........................................*.................... - sub v4.8H, v11.8H, v4.8H // .........................................*...................... - // gap // ................................................................ - // gap // ................................................................ - mls v2.8H, v3.8H, v7.H[0] // ........................................................*....... - mls v21.8H, v23.8H, v7.H[0] // ............................................*................... - // gap // ................................................................ - // gap // ................................................................ - sqrdmulh v23.8H, v17.8H, v30.8H // .............................................*.................. - mul v17.8H, v17.8H, v29.8H // ..............................................*................. - // gap // ................................................................ - // gap // ................................................................ 
- mul v25.8H, v4.8H, v0.H[0] // ................................................*............... - sqrdmulh v4.8H, v4.8H, v0.H[1] // ..................................................*............. - // gap // ................................................................ - // gap // ................................................................ - mls v16.8H, v10.8H, v7.H[0] // .........................................................*...... - mls v20.8H, v26.8H, v7.H[0] // ............................................................*... - str q21, [x0, #48] // ...............................................*................ - // gap // ................................................................ - str q2, [x0, #112] // .............................................................*.. - mls v17.8H, v23.8H, v7.H[0] // ...................................................*............ - // gap // ................................................................ - // gap // ................................................................ - mls v25.8H, v4.8H, v7.H[0] // .....................................................*.......... - // gap // ................................................................ - // gap // ................................................................ - // gap // ................................................................ - str q16, [x0, #368] // ..............................................................*. - // gap // ................................................................ - // gap // ................................................................ - // gap // ................................................................ - str q17, [x0, #176] // .......................................................*........ - // gap // ................................................................ - // gap // ................................................................ 
- // gap // ................................................................ - str q25, [x0, #432] // ..........................................................*..... - // gap // ................................................................ - // gap // ................................................................ - // gap // ................................................................ - str q20, [x0, #304] // ...............................................................* - // gap // ................................................................ - // gap // ................................................................ - // gap // ................................................................ + mul v25.8H, v21.8H, v1.H[0] // ...........*.......................................... + sqrdmulh v3.8H, v21.8H, v1.H[1] // ............*......................................... + // gap // ...................................................... + // gap // ...................................................... + sub v12.8H, v31.8H, v6.8H // *..................................................... + sub v23.8H, v9.8H, v26.8H // ..*................................................... + // gap // ...................................................... + // gap // ...................................................... + mul v22.8H, v24.8H, v29.8H // ....*................................................. + sqrdmulh v28.8H, v19.8H, v0.H[5] // .*.................................................... + // gap // ...................................................... + // gap // ...................................................... + mls v25.8H, v3.8H, v7.H[0] // ...............*...................................... + sqrdmulh v3.8H, v12.8H, v0.H[1] // ........*............................................. + // gap // ...................................................... + // gap // ...................................................... 
+ mul v11.8H, v23.8H, v0.H[2] // ......*............................................... + sqrdmulh v4.8H, v23.8H, v0.H[3] // .......*.............................................. + // gap // ...................................................... + // gap // ...................................................... + add v27.8H, v13.8H, v2.8H // ...*.................................................. + sqrdmulh v18.8H, v24.8H, v30.8H // ....................................*................. + // gap // ...................................................... + // gap // ...................................................... + sub v10.8H, v15.8H, v25.8H // ..................*................................... + add v25.8H, v15.8H, v25.8H // ...................*.................................. + // gap // ...................................................... + // gap // ...................................................... + mls v11.8H, v4.8H, v7.H[0] // .............*........................................ + mls v5.8H, v28.8H, v7.H[0] // .........*............................................ + // gap // ...................................................... + // gap // ...................................................... + sub v21.8H, v25.8H, v27.8H // ........................*............................. + mls v22.8H, v18.8H, v7.H[0] // ...........................................*.......... + // gap // ...................................................... + // gap // ...................................................... + mul v13.8H, v10.8H, v0.H[2] // .....................*................................ + sqrdmulh v14.8H, v10.8H, v0.H[3] // .......................*.............................. + // gap // ...................................................... + // gap // ...................................................... + sub v9.8H, v11.8H, v5.8H // ................*..................................... 
+ mul v24.8H, v21.8H, v0.H[0] // ...........................*.......................... + // gap // ...................................................... + // gap // ...................................................... + sqrdmulh v19.8H, v21.8H, v0.H[1] // ..........................*........................... + add v17.8H, v11.8H, v5.8H // .................*.................................... + str q22, [x0], #(16) // ..................................................*... + // gap // ...................................................... + mls v13.8H, v14.8H, v7.H[0] // ............................*......................... + mul v26.8H, v9.8H, v0.H[0] // ............................................*......... + // gap // ...................................................... + // gap // ...................................................... + sqrdmulh v4.8H, v9.8H, v0.H[1] // ..........................................*........... + mul v6.8H, v17.8H, v29.8H // ....................*................................. + // gap // ...................................................... + // gap // ...................................................... + sqrdmulh v5.8H, v17.8H, v30.8H // ..............................*....................... + mul v31.8H, v12.8H, v0.H[0] // .....*................................................ + // gap // ...................................................... + // gap // ...................................................... + add v11.8H, v13.8H, v16.8H // .................................*.................... + add v28.8H, v25.8H, v27.8H // ......................*............................... + // gap // ...................................................... + // gap // ...................................................... + sub v16.8H, v13.8H, v16.8H // ................................*..................... + mls v26.8H, v4.8H, v7.H[0] // .................................................*.... 
+ // gap // ...................................................... + // gap // ...................................................... + mul v18.8H, v11.8H, v29.8H // ..................................*................... + sqrdmulh v10.8H, v11.8H, v30.8H // ...................................*.................. + // gap // ...................................................... + // gap // ...................................................... + mul v2.8H, v16.8H, v0.H[0] // .....................................*................ + sqrdmulh v11.8H, v16.8H, v0.H[1] // .......................................*.............. + // gap // ...................................................... + // gap // ...................................................... + str q26, [x0, #368] // .....................................................* + mls v31.8H, v3.8H, v7.H[0] // ..........*........................................... + sqrdmulh v8.8H, v28.8H, v30.8H // .............................*........................ + // gap // ...................................................... + mls v18.8H, v10.8H, v7.H[0] // .........................................*............ + // gap // ...................................................... + // gap // ...................................................... + // gap // ...................................................... + mul v20.8H, v28.8H, v29.8H // .........................*............................ + mls v2.8H, v11.8H, v7.H[0] // ..............................................*....... + // gap // ...................................................... + // gap // ...................................................... + mls v6.8H, v5.8H, v7.H[0] // ........................................*............. + str q31, [x0, #240] // ..............*....................................... + // gap // ...................................................... + // gap // ...................................................... 
+ str q18, [x0, #176] // ................................................*..... + mls v24.8H, v19.8H, v7.H[0] // ...............................*...................... + // gap // ...................................................... + // gap // ...................................................... + mls v20.8H, v8.8H, v7.H[0] // .............................................*........ + str q2, [x0, #432] // ...................................................*.. + // gap // ...................................................... + // gap // ...................................................... + str q6, [x0, #112] // ...............................................*...... + // gap // ...................................................... + // gap // ...................................................... + // gap // ...................................................... + str q24, [x0, #304] // ......................................*............... + // gap // ...................................................... + // gap // ...................................................... + // gap // ...................................................... + str q20, [x0, #48] // ....................................................*. + // gap // ...................................................... + // gap // ...................................................... + // gap // ...................................................... // original source code - // mls v3.8H, v23.8H, v7.H[0] // *............................................................... - // sub v5.8H, v11.8H, v15.8H // .*.............................................................. - // mls v9.8H, v19.8H, v7.H[0] // ..*............................................................. - // srshr v26.8H, v24.8H, #11 // ...*............................................................ - // mul v6.8H, v5.8H, v1.H[4] // ....*........................................................... 
- // sqrdmulh v2.8H, v5.8H, v1.H[5] // .....*.......................................................... - // sub v24.8H, v21.8H, v28.8H // ......*......................................................... - // mls v10.8H, v20.8H, v7.H[0] // .......*........................................................ - // mls v8.8H, v26.8H, v7.H[0] // ........*....................................................... - // mls v13.8H, v16.8H, v7.H[0] // .........*...................................................... - // mls v6.8H, v2.8H, v7.H[0] // ..........*..................................................... - // mul v16.8H, v24.8H, v0.H[4] // ...........*.................................................... - // sqrdmulh v19.8H, v24.8H, v0.H[5] // ............*................................................... - // sub v28.8H, v9.8H, v10.8H // .............*.................................................. - // add v20.8H, v9.8H, v10.8H // ..............*................................................. - // add v18.8H, v8.8H, v13.8H // ...............*................................................ - // sub v2.8H, v8.8H, v13.8H // ...................*............................................ - // add v26.8H, v31.8H, v6.8H // .................*.............................................. - // mls v16.8H, v19.8H, v7.H[0] // ..................*............................................. - // sqrdmulh v13.8H, v18.8H, v30.8H // ....................*........................................... - // sub v8.8H, v31.8H, v6.8H // ................*............................................... - // mul v31.8H, v18.8H, v29.8H // .....................*.......................................... - // mul v24.8H, v28.8H, v0.H[2] // .......................*........................................ - // mul v22.8H, v2.8H, v0.H[0] // ......................*......................................... 
- // sqrdmulh v27.8H, v2.8H, v0.H[1] // ..........................*..................................... - // add v2.8H, v3.8H, v16.8H // .........................*...................................... - // sub v25.8H, v3.8H, v16.8H // ........................*....................................... - // mls v31.8H, v13.8H, v7.H[0] // ...........................*.................................... - // mul v13.8H, v8.8H, v0.H[4] // ............................*................................... - // sqrdmulh v23.8H, v2.8H, v30.8H // .............................*.................................. - // sqrdmulh v8.8H, v8.8H, v0.H[5] // ..............................*................................. - // sqrdmulh v3.8H, v25.8H, v0.H[1] // ...............................*................................ - // mls v22.8H, v27.8H, v7.H[0] // .................................*.............................. - // sqrdmulh v19.8H, v28.8H, v0.H[3] // ................................*............................... - // mul v6.8H, v25.8H, v0.H[0] // ...................................*............................ - // sub v25.8H, v20.8H, v26.8H // ....................................*........................... - // add v26.8H, v20.8H, v26.8H // .....................................*.......................... - // mls v13.8H, v8.8H, v7.H[0] // ......................................*......................... - // mls v24.8H, v19.8H, v7.H[0] // .......................................*........................ - // sqrdmulh v16.8H, v26.8H, v30.8H // ..........................................*..................... - // mul v8.8H, v26.8H, v29.8H // ...........................................*.................... - // sub v19.8H, v24.8H, v13.8H // ...............................................*................ - // mul v2.8H, v2.8H, v29.8H // ........................................*....................... 
- // add v24.8H, v24.8H, v13.8H // ..............................................*................. - // mls v8.8H, v16.8H, v7.H[0] // .................................................*.............. - // sqrdmulh v20.8H, v24.8H, v30.8H // ..................................................*............. - // mul v24.8H, v24.8H, v29.8H // ...................................................*............ - // str q8, [x0, #64] // ........................................................*....... - // mul v14.8H, v19.8H, v0.H[0] // ....................................................*........... - // str q31, [x0], #(16) // ..................................*............................. - // sqrdmulh v16.8H, v19.8H, v0.H[1] // .....................................................*.......... - // mls v24.8H, v20.8H, v7.H[0] // ..........................................................*..... - // sqrdmulh v20.8H, v25.8H, v0.H[1] // ............................................*................... - // mls v14.8H, v16.8H, v7.H[0] // ...........................................................*.... - // mul v26.8H, v25.8H, v0.H[0] // .............................................*.................. - // str q24, [x0, #176] // .............................................................*.. - // mls v2.8H, v23.8H, v7.H[0] // ................................................*............... - // mls v6.8H, v3.8H, v7.H[0] // ......................................................*......... - // str q14, [x0, #432] // ..............................................................*. - // str q22, [x0, #240] // .........................................*...................... - // mls v26.8H, v20.8H, v7.H[0] // .......................................................*........ - // str q2, [x0, #112] // .........................................................*...... - // str q6, [x0, #368] // ............................................................*... 
- // str q26, [x0, #304] // ...............................................................* + // sub v27.8H, v31.8H, v6.8H // ..*................................................... + // sqrdmulh v22.8H, v19.8H, v0.H[5] // .....*................................................ + // sub v19.8H, v9.8H, v26.8H // ...*.................................................. + // add v26.8H, v13.8H, v2.8H // ..........*........................................... + // mul v20.8H, v24.8H, v29.8H // ....*................................................. + // mul v31.8H, v27.8H, v0.H[0] // ..............................*....................... + // mul v8.8H, v19.8H, v0.H[2] // ........*............................................. + // sqrdmulh v14.8H, v19.8H, v0.H[3] // .........*............................................ + // sqrdmulh v27.8H, v27.8H, v0.H[1] // .......*.............................................. + // mls v5.8H, v22.8H, v7.H[0] // ...............*...................................... + // mls v31.8H, v27.8H, v7.H[0] // ........................................*............. + // mul v27.8H, v21.8H, v1.H[0] // *..................................................... + // sqrdmulh v17.8H, v21.8H, v1.H[1] // .*.................................................... + // mls v8.8H, v14.8H, v7.H[0] // ..............*....................................... + // str q31, [x0, #256] // ..............................................*....... + // mls v27.8H, v17.8H, v7.H[0] // ......*............................................... + // sub v3.8H, v8.8H, v5.8H // ....................*................................. + // add v28.8H, v8.8H, v5.8H // .......................*.............................. + // sub v12.8H, v15.8H, v27.8H // ............*......................................... + // add v23.8H, v15.8H, v27.8H // .............*........................................ + // mul v27.8H, v28.8H, v29.8H // ............................*......................... 
+ // mul v17.8H, v12.8H, v0.H[2] // ..................*................................... + // add v19.8H, v23.8H, v26.8H // ................................*..................... + // sqrdmulh v22.8H, v12.8H, v0.H[3] // ...................*.................................. + // sub v31.8H, v23.8H, v26.8H // ................*..................................... + // mul v4.8H, v19.8H, v29.8H // ...........................................*.......... + // sqrdmulh v6.8H, v31.8H, v0.H[1] // ......................*............................... + // mul v31.8H, v31.8H, v0.H[0] // .....................*................................ + // mls v17.8H, v22.8H, v7.H[0] // .........................*............................ + // sqrdmulh v8.8H, v19.8H, v30.8H // .........................................*............ + // sqrdmulh v23.8H, v28.8H, v30.8H // .............................*........................ + // mls v31.8H, v6.8H, v7.H[0] // ................................................*..... + // sub v12.8H, v17.8H, v16.8H // .................................*.................... + // add v22.8H, v17.8H, v16.8H // ...............................*...................... + // mul v19.8H, v22.8H, v29.8H // ...................................*.................. + // sqrdmulh v22.8H, v22.8H, v30.8H // ....................................*................. + // sqrdmulh v17.8H, v24.8H, v30.8H // ...........*.......................................... + // mul v24.8H, v12.8H, v0.H[0] // .....................................*................ + // str q31, [x0, #320] // ....................................................*. + // sqrdmulh v28.8H, v12.8H, v0.H[1] // ......................................*............... + // mls v27.8H, v23.8H, v7.H[0] // .............................................*........ + // mls v19.8H, v22.8H, v7.H[0] // ..........................................*........... 
+ // sqrdmulh v14.8H, v3.8H, v0.H[1] // ...........................*.......................... + // mls v20.8H, v17.8H, v7.H[0] // .................*.................................... + // mul v12.8H, v3.8H, v0.H[0] // ..........................*........................... + // mls v4.8H, v8.8H, v7.H[0] // .................................................*.... + // mls v24.8H, v28.8H, v7.H[0] // ............................................*......... + // str q27, [x0, #128] // ...................................................*.. + // str q19, [x0, #192] // ...............................................*...... + // mls v12.8H, v14.8H, v7.H[0] // ..................................*................... + // str q20, [x0], #(16) // ........................*............................. + // str q24, [x0, #432] // ..................................................*... + // str q4, [x0, #48] // .....................................................* + // str q12, [x0, #368] // .......................................*.............. pop_stack diff --git a/tests/ntt_kyber/manual/intt_kyber_123_4567_opt_a55.s b/tests/ntt_kyber/manual/intt_kyber_123_4567_opt_a55.s index 3552189..a524ab5 100644 --- a/tests/ntt_kyber/manual/intt_kyber_123_4567_opt_a55.s +++ b/tests/ntt_kyber/manual/intt_kyber_123_4567_opt_a55.s @@ -354,534 +354,570 @@ _intt_kyber_123_4567_opt_a55: mov count, #8 .p2align 2 - ldr q8, [x1, #0] // *.......... - // gap // ........... - // gap // ........... - // gap // ........... - ldr q26, [x1, #16] // .*......... - // gap // ........... - // gap // ........... - // gap // ........... - ldr q28, [x1, #32] // ..*........ - // gap // ........... - // gap // ........... - // gap // ........... - ldr q18, [x1, #48] // ...*....... - // gap // ........... - // gap // ........... - // gap // ........... - ldr q5, [x4], #(6*16) // .....*..... - // gap // ........... - // gap // ........... - // gap // ........... - trn1 v19.4S, v28.4S, v18.4S // ....*...... 
- // gap // ........... - ldr q9, [x4, #-80] // ......*.... - // gap // ........... - // gap // ........... - // gap // ........... - ldr q17, [x4, #-64] // .......*... - // gap // ........... - // gap // ........... - // gap // ........... - ldr q4, [x4, #-48] // ........*.. - // gap // ........... - // gap // ........... - // gap // ........... - ldr q21, [x4, #-32] // .........*. - // gap // ........... - // gap // ........... - // gap // ........... - ldr q1, [x4, #-16] // ..........* - // gap // ........... + ldr q23, [x1, #0] // *.......... + // gap // ........... + // gap // ........... + // gap // ........... + ldr q27, [x1, #16] // .*......... + // gap // ........... + // gap // ........... + // gap // ........... + ldr q3, [x1, #32] // ..*........ + // gap // ........... + // gap // ........... + // gap // ........... + ldr q28, [x1, #48] // ...*....... + // gap // ........... + // gap // ........... + // gap // ........... + ldr q29, [x4], #(6*16) // .....*..... + // gap // ........... + // gap // ........... + // gap // ........... + trn1 v26.4S, v3.4S, v28.4S // ....*...... + // gap // ........... + ldr q20, [x4, #-80] // ......*.... + // gap // ........... + // gap // ........... + // gap // ........... + ldr q24, [x4, #-64] // .......*... + // gap // ........... + // gap // ........... + // gap // ........... + ldr q11, [x4, #-48] // ........*.. + // gap // ........... + // gap // ........... + // gap // ........... + ldr q14, [x4, #-32] // .........*. + // gap // ........... + // gap // ........... + // gap // ........... + ldr q25, [x4, #-16] // ..........* + // gap // ........... // original source code - // ldr q8, [x1, #0] // *.......... - // ldr q26, [x1, #16] // .*......... - // ldr q28, [x1, #32] // ..*........ - // ldr q18, [x1, #48] // ...*....... - // trn1 v19.4S, v28.4S, v18.4S // .....*..... - // ldr q5, [x4], #(6*16) // ....*...... - // ldr q9, [x4, #-80] // ......*.... - // ldr q17, [x4, #-64] // .......*... 
- // ldr q4, [x4, #-48] // ........*.. - // ldr q21, [x4, #-32] // .........*. - // ldr q1, [x4, #-16] // ..........* + // ldr q23, [x1, #0] // *.......... + // ldr q27, [x1, #16] // .*......... + // ldr q3, [x1, #32] // ..*........ + // ldr q28, [x1, #48] // ...*....... + // trn1 v26.4S, v3.4S, v28.4S // .....*..... + // ldr q29, [x4], #(6*16) // ....*...... + // ldr q20, [x4, #-80] // ......*.... + // ldr q24, [x4, #-64] // .......*... + // ldr q11, [x4, #-48] // ........*.. + // ldr q14, [x4, #-32] // .........*. + // ldr q25, [x4, #-16] // ..........* sub count, count, #1 layer4567_start: - trn1 v23.4S, v8.4S, v26.4S // ....*........................................................................ - // gap // ............................................................................. - trn2 v8.4S, v8.4S, v26.4S // .....*....................................................................... - // gap // ............................................................................. - trn2 v12.4S, v28.4S, v18.4S // .......*..................................................................... - // gap // ............................................................................. - trn2 v18.2D, v23.2D, v19.2D // ........*.................................................................... - // gap // ............................................................................. - trn1 v23.2D, v23.2D, v19.2D // ..........*.................................................................. - // gap // ............................................................................. - trn2 v26.2D, v8.2D, v12.2D // .........*................................................................... - // gap // ............................................................................. - trn1 v8.2D, v8.2D, v12.2D // ...........*................................................................. 
- // gap // ............................................................................. - sub v12.8H, v18.8H, v26.8H // .......................*..................................................... - // gap // ............................................................................. - add v18.8H, v18.8H, v26.8H // ........................*.................................................... - // gap // ............................................................................. - sub v26.8H, v23.8H, v8.8H // ..................*.......................................................... - // gap // ............................................................................. - add v23.8H, v23.8H, v8.8H // ...................*......................................................... - // gap // ............................................................................. - mul v8.8H, v12.8H, v21.8H // .........................*................................................... - // gap // ............................................................................. - mul v17.8H, v26.8H, v17.8H // ....................*........................................................ - // gap // ............................................................................. - sqrdmulh v26.8H, v26.8H, v4.8H // .....................*....................................................... - // gap // ............................................................................. - sqrdmulh v12.8H, v12.8H, v1.8H // ..........................*.................................................. - // gap // ............................................................................. - sub v28.8H, v23.8H, v18.8H // ............................*................................................ - // gap // ............................................................................. 
- add v23.8H, v23.8H, v18.8H // .............................*............................................... - // gap // ............................................................................. - mls v17.8H, v26.8H, v7.H[0] // ......................*...................................................... - // gap // ............................................................................. - mls v8.8H, v12.8H, v7.H[0] // ...........................*................................................. - // gap // ............................................................................. - mul v12.8H, v28.8H, v5.8H // ..............................*.............................................. - // gap // ............................................................................. - sqrdmulh v18.8H, v28.8H, v9.8H // ...............................*............................................. - // gap // ............................................................................. - ldr q4, [x3], #16 // ..............................................*.............................. - // gap // ............................................................................. - // gap // ............................................................................. - // gap // ............................................................................. - sub v26.8H, v17.8H, v8.8H // .................................*........................................... - // gap // ............................................................................. - mls v12.8H, v18.8H, v7.H[0] // ................................*............................................ - // gap // ............................................................................. - add v8.8H, v17.8H, v8.8H // ..................................*.......................................... - // gap // ............................................................................. 
- mul v18.8H, v26.8H, v5.8H // ...................................*......................................... - // gap // ............................................................................. - sqrdmulh v26.8H, v26.8H, v9.8H // ....................................*........................................ - // gap // ............................................................................. - trn1 v17.4S, v23.4S, v8.4S // ......................................*...................................... - // gap // ............................................................................. - trn2 v23.4S, v23.4S, v8.4S // .......................................*..................................... - // gap // ............................................................................. - ldr q8, [x1, #64] // e............................................................................ - // gap // ............................................................................. - // gap // ............................................................................. - // gap // ............................................................................. - mls v18.8H, v26.8H, v7.H[0] // .....................................*....................................... - // gap // ............................................................................. - ldr q26, [x1, #80] // .e........................................................................... - // gap // ............................................................................. - // gap // ............................................................................. - // gap // ............................................................................. - ldr q28, [x1, #96] // ..e.......................................................................... - // gap // ............................................................................. 
- // gap // ............................................................................. - // gap // ............................................................................. - trn1 v19.4S, v12.4S, v18.4S // ........................................*.................................... - // gap // ............................................................................. - trn2 v12.4S, v12.4S, v18.4S // .........................................*................................... - // gap // ............................................................................. - ldr q18, [x1, #112] // ...e......................................................................... - // gap // ............................................................................. - // gap // ............................................................................. - // gap // ............................................................................. - trn2 v5.2D, v17.2D, v19.2D // ..........................................*.................................. - // gap // ............................................................................. - trn2 v9.2D, v23.2D, v12.2D // ...........................................*................................. - // gap // ............................................................................. - trn1 v17.2D, v17.2D, v19.2D // ............................................*................................ - // gap // ............................................................................. - trn1 v23.2D, v23.2D, v12.2D // .............................................*............................... - // gap // ............................................................................. - sub v12.8H, v5.8H, v9.8H // ....................................................*........................ - // gap // ............................................................................. 
- sub v19.8H, v17.8H, v23.8H // ...............................................*............................. - // gap // ............................................................................. - add v23.8H, v17.8H, v23.8H // ................................................*............................ - // gap // ............................................................................. - add v17.8H, v5.8H, v9.8H // .....................................................*....................... - // gap // ............................................................................. - mul v5.8H, v19.8H, v4.H[2] // .................................................*........................... - // gap // ............................................................................. - sqrdmulh v19.8H, v19.8H, v4.H[3] // ..................................................*.......................... - // gap // ............................................................................. - mul v9.8H, v12.8H, v4.H[4] // ......................................................*...................... - // gap // ............................................................................. - sqdmulh v21.8H, v23.8H, v7.H[1] // .........................................................*................... - // gap // ............................................................................. - sqdmulh v1.8H, v17.8H, v7.H[1] // ............................................................*................ - // gap // ............................................................................. - sqrdmulh v12.8H, v12.8H, v4.H[5] // .......................................................*..................... - // gap // ............................................................................. - mls v5.8H, v19.8H, v7.H[0] // ...................................................*......................... 
- // gap // ............................................................................. - srshr v19.8H, v21.8H, #11 // ..........................................................*.................. - // gap // ............................................................................. - srshr v21.8H, v1.8H, #11 // .............................................................*............... - // gap // ............................................................................. - mls v9.8H, v12.8H, v7.H[0] // ........................................................*.................... - // gap // ............................................................................. - mls v23.8H, v19.8H, v7.H[0] // ...........................................................*................. - // gap // ............................................................................. - mls v17.8H, v21.8H, v7.H[0] // ..............................................................*.............. - // gap // ............................................................................. - trn1 v19.4S, v28.4S, v18.4S // ......e...................................................................... - // gap // ............................................................................. - sub v12.8H, v5.8H, v9.8H // ....................................................................*........ - // gap // ............................................................................. - add v5.8H, v5.8H, v9.8H // .....................................................................*....... - // gap // ............................................................................. - sub v9.8H, v23.8H, v17.8H // ...............................................................*............. - // gap // ............................................................................. - mul v21.8H, v12.8H, v4.H[0] // ......................................................................*...... 
- // gap // ............................................................................. - sqrdmulh v12.8H, v12.8H, v4.H[1] // .......................................................................*..... - // gap // ............................................................................. - mul v1.8H, v9.8H, v4.H[0] // .................................................................*........... - // gap // ............................................................................. - sqrdmulh v4.8H, v9.8H, v4.H[1] // ..................................................................*.......... - // gap // ............................................................................. - add v23.8H, v23.8H, v17.8H // ................................................................*............ - // gap // ............................................................................. - mls v21.8H, v12.8H, v7.H[0] // ........................................................................*.... - // gap // ............................................................................. - str q5, [x1, #16] // ..........................................................................*.. - // gap // ............................................................................. - mls v1.8H, v4.8H, v7.H[0] // ...................................................................*......... - // gap // ............................................................................. - str q23, [x1], #(64) // .........................................................................*... - // gap // ............................................................................. - ldr q5, [x4], #(6*16) // ............e................................................................ - // gap // ............................................................................. - // gap // ............................................................................. 
- // gap // ............................................................................. - str q1, [x1, #-32] // ...........................................................................*. - // gap // ............................................................................. - ldr q9, [x4, #-80] // .............e............................................................... - // gap // ............................................................................. - // gap // ............................................................................. - // gap // ............................................................................. - str q21, [x1, #-16] // ............................................................................* - // gap // ............................................................................. - ldr q17, [x4, #-64] // ..............e.............................................................. - // gap // ............................................................................. - // gap // ............................................................................. - // gap // ............................................................................. - ldr q4, [x4, #-48] // ...............e............................................................. - // gap // ............................................................................. - // gap // ............................................................................. - // gap // ............................................................................. - ldr q21, [x4, #-32] // ................e............................................................ - // gap // ............................................................................. - // gap // ............................................................................. - // gap // ............................................................................. 
- ldr q1, [x4, #-16] // .................e........................................................... - // gap // ............................................................................. - // gap // ............................................................................. - // gap // ............................................................................. + trn1 v19.4S, v23.4S, v27.4S // ....*.............................................................................. + // gap // ................................................................................... + trn2 v23.4S, v23.4S, v27.4S // .....*............................................................................. + // gap // ................................................................................... + trn2 v22.4S, v3.4S, v28.4S // .......*........................................................................... + // gap // ................................................................................... + trn2 v28.2D, v19.2D, v26.2D // ........*.......................................................................... + // gap // ................................................................................... + trn1 v19.2D, v19.2D, v26.2D // ..........*........................................................................ + // gap // ................................................................................... + trn2 v0.2D, v23.2D, v22.2D // .........*......................................................................... + // gap // ................................................................................... + trn1 v23.2D, v23.2D, v22.2D // ...........*....................................................................... + // gap // ................................................................................... + sub v22.8H, v28.8H, v0.8H // .......................*........................................................... 
+ // gap // ................................................................................... + add v28.8H, v28.8H, v0.8H // ........................*.......................................................... + // gap // ................................................................................... + sub v0.8H, v19.8H, v23.8H // ..................*................................................................ + // gap // ................................................................................... + add v19.8H, v19.8H, v23.8H // ...................*............................................................... + // gap // ................................................................................... + mul v23.8H, v22.8H, v14.8H // .........................*......................................................... + // gap // ................................................................................... + mul v27.8H, v0.8H, v24.8H // ....................*.............................................................. + // gap // ................................................................................... + sqrdmulh v0.8H, v0.8H, v11.8H // .....................*............................................................. + // gap // ................................................................................... + sqrdmulh v22.8H, v22.8H, v25.8H // ..........................*........................................................ + // gap // ................................................................................... + sub v24.8H, v19.8H, v28.8H // ............................*...................................................... + // gap // ................................................................................... + add v19.8H, v19.8H, v28.8H // .............................*..................................................... 
+ // gap // ................................................................................... + mls v27.8H, v0.8H, v7.H[0] // ......................*............................................................ + // gap // ................................................................................... + mls v23.8H, v22.8H, v7.H[0] // ...........................*....................................................... + // gap // ................................................................................... + mul v22.8H, v24.8H, v29.8H // ..............................*.................................................... + // gap // ................................................................................... + sqrdmulh v28.8H, v24.8H, v20.8H // ...............................*................................................... + // gap // ................................................................................... + ldr q0, [x3], #16 // ..............................................*.................................... + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + sub v24.8H, v27.8H, v23.8H // .................................*................................................. + // gap // ................................................................................... + mls v22.8H, v28.8H, v7.H[0] // ................................*.................................................. + // gap // ................................................................................... + add v23.8H, v27.8H, v23.8H // ..................................*................................................ + // gap // ................................................................................... 
+ mul v28.8H, v24.8H, v29.8H // ...................................*............................................... + // gap // ................................................................................... + sqrdmulh v27.8H, v24.8H, v20.8H // ....................................*.............................................. + // gap // ................................................................................... + trn1 v24.4S, v19.4S, v23.4S // ......................................*............................................ + // gap // ................................................................................... + trn2 v19.4S, v19.4S, v23.4S // .......................................*........................................... + // gap // ................................................................................... + ldr q23, [x1, #64] // e.................................................................................. + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + mls v28.8H, v27.8H, v7.H[0] // .....................................*............................................. + // gap // ................................................................................... + ldr q27, [x1, #80] // .e................................................................................. + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + ldr q3, [x1, #96] // ..e................................................................................ 
+ // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + trn1 v26.4S, v22.4S, v28.4S // ........................................*.......................................... + // gap // ................................................................................... + trn2 v22.4S, v22.4S, v28.4S // .........................................*......................................... + // gap // ................................................................................... + ldr q28, [x1, #112] // ...e............................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + trn2 v29.2D, v24.2D, v26.2D // ..........................................*........................................ + // gap // ................................................................................... + trn2 v20.2D, v19.2D, v22.2D // ...........................................*....................................... + // gap // ................................................................................... + trn1 v24.2D, v24.2D, v26.2D // ............................................*...................................... + // gap // ................................................................................... + trn1 v19.2D, v19.2D, v22.2D // .............................................*..................................... + // gap // ................................................................................... 
+ sub v22.8H, v29.8H, v20.8H // ....................................................*.............................. + // gap // ................................................................................... + sub v26.8H, v24.8H, v19.8H // ...............................................*................................... + // gap // ................................................................................... + add v19.8H, v24.8H, v19.8H // ................................................*.................................. + // gap // ................................................................................... + mul v24.8H, v22.8H, v0.H[4] // ......................................................*............................ + // gap // ................................................................................... + mul v11.8H, v26.8H, v0.H[2] // .................................................*................................. + // gap // ................................................................................... + sqrdmulh v26.8H, v26.8H, v0.H[3] // ..................................................*................................ + // gap // ................................................................................... + sqrdmulh v22.8H, v22.8H, v0.H[5] // .......................................................*........................... + // gap // ................................................................................... + add v29.8H, v29.8H, v20.8H // .....................................................*............................. + // gap // ................................................................................... + sqdmulh v20.8H, v19.8H, v7.H[1] // .........................................................*......................... + // gap // ................................................................................... 
+ mls v11.8H, v26.8H, v7.H[0] // ...................................................*............................... + // gap // ................................................................................... + mls v24.8H, v22.8H, v7.H[0] // ........................................................*.......................... + // gap // ................................................................................... + sqdmulh v22.8H, v29.8H, v7.H[1] // ............................................................*...................... + // gap // ................................................................................... + srshr v26.8H, v20.8H, #11 // ..........................................................*........................ + // gap // ................................................................................... + sqdmulh v20.8H, v11.8H, v7.H[1] // ...............................................................*................... + // gap // ................................................................................... + sqdmulh v14.8H, v24.8H, v7.H[1] // ..................................................................*................ + // gap // ................................................................................... + mls v19.8H, v26.8H, v7.H[0] // ...........................................................*....................... + // gap // ................................................................................... + srshr v22.8H, v22.8H, #11 // .............................................................*..................... + // gap // ................................................................................... + srshr v26.8H, v20.8H, #11 // ................................................................*.................. + // gap // ................................................................................... 
+ srshr v20.8H, v14.8H, #11 // ...................................................................*............... + // gap // ................................................................................... + mls v29.8H, v22.8H, v7.H[0] // ..............................................................*.................... + // gap // ................................................................................... + mls v11.8H, v26.8H, v7.H[0] // .................................................................*................. + // gap // ................................................................................... + mls v24.8H, v20.8H, v7.H[0] // ....................................................................*.............. + // gap // ................................................................................... + trn1 v26.4S, v3.4S, v28.4S // ......e............................................................................ + // gap // ................................................................................... + sub v22.8H, v19.8H, v29.8H // .....................................................................*............. + // gap // ................................................................................... + add v19.8H, v19.8H, v29.8H // ......................................................................*............ + // gap // ................................................................................... + sub v29.8H, v11.8H, v24.8H // ..........................................................................*........ + // gap // ................................................................................... + mul v20.8H, v22.8H, v0.H[0] // .......................................................................*........... + // gap // ................................................................................... 
+ sqrdmulh v22.8H, v22.8H, v0.H[1] // ........................................................................*.......... + // gap // ................................................................................... + mul v14.8H, v29.8H, v0.H[0] // ............................................................................*...... + // gap // ................................................................................... + sqrdmulh v0.8H, v29.8H, v0.H[1] // .............................................................................*..... + // gap // ................................................................................... + add v24.8H, v11.8H, v24.8H // ...........................................................................*....... + // gap // ................................................................................... + mls v20.8H, v22.8H, v7.H[0] // .........................................................................*......... + // gap // ................................................................................... + str q19, [x1], #(64) // ...............................................................................*... + // gap // ................................................................................... + mls v14.8H, v0.8H, v7.H[0] // ..............................................................................*.... + // gap // ................................................................................... + str q24, [x1, #-48] // ................................................................................*.. + // gap // ................................................................................... + ldr q29, [x4], #(6*16) // ............e...................................................................... + // gap // ................................................................................... + // gap // ................................................................................... 
+ // gap // ................................................................................... + str q20, [x1, #-32] // .................................................................................*. + // gap // ................................................................................... + ldr q20, [x4, #-80] // .............e..................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + str q14, [x1, #-16] // ..................................................................................* + // gap // ................................................................................... + ldr q24, [x4, #-64] // ..............e.................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + ldr q11, [x4, #-48] // ...............e................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + ldr q14, [x4, #-32] // ................e.................................................................. + // gap // ................................................................................... + // gap // ................................................................................... 
+ // gap // ................................................................................... + ldr q25, [x4, #-16] // .................e................................................................. + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... // original source code - // ldr q8, [x1, #(16*0)] // e...............................................|............................e........................................... - // ldr q9, [x1, #(16*1)] // ..e.............................................|..............................e......................................... - // ldr q10, [x1, #(16*2)] // ...e............................................|...............................e........................................ - // ldr q11, [x1, #(16*3)] // ......e.........................................|..................................e..................................... - // trn1 v25.4s, v8.4s, v9.4s // ................................................*........................................................................ - // trn2 v26.4s, v8.4s, v9.4s // ................................................|*....................................................................... - // trn1 v27.4s, v10.4s, v11.4s // ...........................e....................|.......................................................e................ - // trn2 v28.4s, v10.4s, v11.4s // ................................................|.*...................................................................... - // trn2 v10.2d, v25.2d, v27.2d // ................................................|..*..................................................................... 
- // trn2 v11.2d, v26.2d, v28.2d // ................................................|....*................................................................... - // trn1 v8.2d, v25.2d, v27.2d // ................................................|...*.................................................................... - // trn1 v9.2d, v26.2d, v28.2d // ................................................|.....*.................................................................. - // ldr q0, [x4], #(6*16) // ........................................e.......|....................................................................e... - // ldr q4, [x4, #(-6*16 + 1*16)] // ..........................................e.....|......................................................................e. - // ldr q1, [x4, #(-6*16 + 2*16)] // ............................................e...|........................................................................ - // ldr q5, [x4, #(-6*16 + 3*16)] // .............................................e..|........................................................................ - // ldr q2, [x4, #(-6*16 + 4*16)] // ..............................................e.|........................................................................ - // ldr q6, [x4, #(-6*16 + 5*16)] // ...............................................e|........................................................................ - // sub v24.8h, v8.8h, v9.8h // ................................................|........*............................................................... - // add v8.8h, v8.8h, v9.8h // ................................................|.........*.............................................................. - // mul v9.8h, v24.8h, v1.8h // ................................................|...........*............................................................ 
- // sqrdmulh v24.8h, v24.8h, v5.8h // ................................................|............*........................................................... - // mls v9.8h, v24.8h, v7.h[0] // ................................................|................*....................................................... - // sub v24.8h, v10.8h, v11.8h // ................................................|......*................................................................. - // add v10.8h, v10.8h, v11.8h // ................................................|.......*................................................................ - // mul v11.8h, v24.8h, v2.8h // ................................................|..........*............................................................. - // sqrdmulh v24.8h, v24.8h, v6.8h // ................................................|.............*.......................................................... - // mls v11.8h, v24.8h, v7.h[0] // ................................................|.................*...................................................... - // sub v24.8h, v8.8h, v10.8h // ................................................|..............*......................................................... - // add v8.8h, v8.8h, v10.8h // ................................................|...............*........................................................ - // mul v10.8h, v24.8h, v0.8h // ................................................|..................*..................................................... - // sqrdmulh v24.8h, v24.8h, v4.8h // ................................................|...................*.................................................... - // mls v10.8h, v24.8h, v7.h[0] // ................................................|......................*................................................. 
- // sub v24.8h, v9.8h, v11.8h // ................................................|.....................*.................................................. - // add v9.8h, v9.8h, v11.8h // ................................................|.......................*................................................ - // mul v11.8h, v24.8h, v0.8h // ................................................|........................*............................................... - // sqrdmulh v24.8h, v24.8h, v4.8h // ................................................|.........................*.............................................. - // mls v11.8h, v24.8h, v7.h[0] // .*..............................................|.............................*.......................................... - // trn1 v25.4s, v8.4s, v9.4s // ................................................|..........................*............................................. - // trn2 v26.4s, v8.4s, v9.4s // ................................................|...........................*............................................ - // trn1 v27.4s, v10.4s, v11.4s // ....*...........................................|................................*....................................... - // trn2 v28.4s, v10.4s, v11.4s // .....*..........................................|.................................*...................................... - // trn2 v10.2d, v25.2d, v27.2d // .......*........................................|...................................*.................................... - // trn2 v11.2d, v26.2d, v28.2d // ........*.......................................|....................................*................................... - // trn1 v8.2d, v25.2d, v27.2d // .........*......................................|.....................................*.................................. 
- // trn1 v9.2d, v26.2d, v28.2d // ..........*.....................................|......................................*................................. - // ldr q0, [x3], #16 // ................................................|....................*................................................... - // sub v24.8h, v8.8h, v9.8h // ............*...................................|........................................*............................... - // add v8.8h, v8.8h, v9.8h // .............*..................................|.........................................*.............................. - // mul v9.8h, v24.8h, v0.h[2] // ...............*................................|...........................................*............................ - // sqrdmulh v24.8h, v24.8h, v0.h[3] // ................*...............................|............................................*........................... - // mls v9.8h, v24.8h, v7.h[0] // .....................*..........................|.................................................*...................... - // sub v24.8h, v10.8h, v11.8h // ...........*....................................|.......................................*................................ - // add v10.8h, v10.8h, v11.8h // ..............*.................................|..........................................*............................. - // mul v11.8h, v24.8h, v0.h[4] // .................*..............................|.............................................*.......................... - // sqrdmulh v24.8h, v24.8h, v0.h[5] // ....................*...........................|................................................*....................... - // mls v11.8h, v24.8h, v7.h[0] // ........................*.......................|....................................................*................... 
- // sqdmulh v25.8h, v8.8h, v7.h[1] // ..................*.............................|..............................................*......................... - // srshr v25.8h, v25.8h, #11 // ......................*.........................|..................................................*..................... - // mls v8.8h, v25.8h, v7.h[0] // .........................*......................|.....................................................*.................. - // sqdmulh v25.8h, v10.8h, v7.h[1] // ...................*............................|...............................................*........................ - // srshr v25.8h, v25.8h, #11 // .......................*........................|...................................................*.................... - // mls v10.8h, v25.8h, v7.h[0] // ..........................*.....................|......................................................*................. - // sub v24.8h, v8.8h, v10.8h // ..............................*.................|..........................................................*............. - // add v8.8h, v8.8h, v10.8h // ...................................*............|...............................................................*........ - // mul v10.8h, v24.8h, v0.h[0] // .................................*..............|.............................................................*.......... - // sqrdmulh v24.8h, v24.8h, v0.h[1] // ..................................*.............|..............................................................*......... - // mls v10.8h, v24.8h, v7.h[0] // ......................................*.........|..................................................................*..... - // sub v24.8h, v9.8h, v11.8h // ............................*...................|........................................................*............... 
- // add v9.8h, v9.8h, v11.8h // .............................*..................|.........................................................*.............. - // mul v11.8h, v24.8h, v0.h[0] // ...............................*................|...........................................................*............ - // sqrdmulh v24.8h, v24.8h, v0.h[1] // ................................*...............|............................................................*........... - // mls v11.8h, v24.8h, v7.h[0] // ....................................*...........|................................................................*....... - // str q8, [x1], #(64) // .......................................*........|...................................................................*.... - // str q9, [x1, #(-64 + 16*1)] // .....................................*..........|.................................................................*...... - // str q10, [x1, #(-64 + 16*2)] // .........................................*......|.....................................................................*.. - // str q11, [x1, #(-64 + 16*3)] // ...........................................*....|.......................................................................* + // ldr q8, [x1, #(16*0)] // e.....................................................|............................e................................................. + // ldr q9, [x1, #(16*1)] // ..e...................................................|..............................e............................................... + // ldr q10, [x1, #(16*2)] // ...e..................................................|...............................e.............................................. + // ldr q11, [x1, #(16*3)] // ......e...............................................|..................................e........................................... 
+ // trn1 v25.4s, v8.4s, v9.4s // ......................................................*.............................................................................. + // trn2 v26.4s, v8.4s, v9.4s // ......................................................|*............................................................................. + // trn1 v27.4s, v10.4s, v11.4s // .................................e....................|.............................................................e................ + // trn2 v28.4s, v10.4s, v11.4s // ......................................................|.*............................................................................ + // trn2 v10.2d, v25.2d, v27.2d // ......................................................|..*........................................................................... + // trn2 v11.2d, v26.2d, v28.2d // ......................................................|....*......................................................................... + // trn1 v8.2d, v25.2d, v27.2d // ......................................................|...*.......................................................................... + // trn1 v9.2d, v26.2d, v28.2d // ......................................................|.....*........................................................................ + // ldr q0, [x4], #(6*16) // ..............................................e.......|..........................................................................e... + // ldr q4, [x4, #(-6*16 + 1*16)] // ................................................e.....|............................................................................e. + // ldr q1, [x4, #(-6*16 + 2*16)] // ..................................................e...|.............................................................................. 
+ // ldr q5, [x4, #(-6*16 + 3*16)] // ...................................................e..|.............................................................................. + // ldr q2, [x4, #(-6*16 + 4*16)] // ....................................................e.|.............................................................................. + // ldr q6, [x4, #(-6*16 + 5*16)] // .....................................................e|.............................................................................. + // sub v24.8h, v8.8h, v9.8h // ......................................................|........*..................................................................... + // add v8.8h, v8.8h, v9.8h // ......................................................|.........*.................................................................... + // mul v9.8h, v24.8h, v1.8h // ......................................................|...........*.................................................................. + // sqrdmulh v24.8h, v24.8h, v5.8h // ......................................................|............*................................................................. + // mls v9.8h, v24.8h, v7.h[0] // ......................................................|................*............................................................. + // sub v24.8h, v10.8h, v11.8h // ......................................................|......*....................................................................... + // add v10.8h, v10.8h, v11.8h // ......................................................|.......*...................................................................... + // mul v11.8h, v24.8h, v2.8h // ......................................................|..........*................................................................... 
+ // sqrdmulh v24.8h, v24.8h, v6.8h // ......................................................|.............*................................................................ + // mls v11.8h, v24.8h, v7.h[0] // ......................................................|.................*............................................................ + // sub v24.8h, v8.8h, v10.8h // ......................................................|..............*............................................................... + // add v8.8h, v8.8h, v10.8h // ......................................................|...............*.............................................................. + // mul v10.8h, v24.8h, v0.8h // ......................................................|..................*........................................................... + // sqrdmulh v24.8h, v24.8h, v4.8h // ......................................................|...................*.......................................................... + // mls v10.8h, v24.8h, v7.h[0] // ......................................................|......................*....................................................... + // sub v24.8h, v9.8h, v11.8h // ......................................................|.....................*........................................................ + // add v9.8h, v9.8h, v11.8h // ......................................................|.......................*...................................................... + // mul v11.8h, v24.8h, v0.8h // ......................................................|........................*..................................................... + // sqrdmulh v24.8h, v24.8h, v4.8h // ......................................................|.........................*.................................................... 
+ // mls v11.8h, v24.8h, v7.h[0] // .*....................................................|.............................*................................................ + // trn1 v25.4s, v8.4s, v9.4s // ......................................................|..........................*................................................... + // trn2 v26.4s, v8.4s, v9.4s // ......................................................|...........................*.................................................. + // trn1 v27.4s, v10.4s, v11.4s // ....*.................................................|................................*............................................. + // trn2 v28.4s, v10.4s, v11.4s // .....*................................................|.................................*............................................ + // trn2 v10.2d, v25.2d, v27.2d // .......*..............................................|...................................*.......................................... + // trn2 v11.2d, v26.2d, v28.2d // ........*.............................................|....................................*......................................... + // trn1 v8.2d, v25.2d, v27.2d // .........*............................................|.....................................*........................................ + // trn1 v9.2d, v26.2d, v28.2d // ..........*...........................................|......................................*....................................... + // ldr q0, [x3], #16 // ......................................................|....................*......................................................... + // sub v24.8h, v8.8h, v9.8h // ............*.........................................|........................................*..................................... 
+ // add v8.8h, v8.8h, v9.8h // .............*........................................|.........................................*.................................... + // mul v9.8h, v24.8h, v0.h[2] // ...............*......................................|...........................................*.................................. + // sqrdmulh v24.8h, v24.8h, v0.h[3] // ................*.....................................|............................................*................................. + // mls v9.8h, v24.8h, v7.h[0] // ....................*.................................|................................................*............................. + // sub v24.8h, v10.8h, v11.8h // ...........*..........................................|.......................................*...................................... + // add v10.8h, v10.8h, v11.8h // ..................*...................................|..............................................*............................... + // mul v11.8h, v24.8h, v0.h[4] // ..............*.......................................|..........................................*................................... + // sqrdmulh v24.8h, v24.8h, v0.h[5] // .................*....................................|.............................................*................................ + // mls v11.8h, v24.8h, v7.h[0] // .....................*................................|.................................................*............................ + // sqdmulh v25.8h, v8.8h, v7.h[1] // ...................*..................................|...............................................*.............................. + // srshr v25.8h, v25.8h, #11 // .......................*..............................|...................................................*.......................... 
+ // mls v8.8h, v25.8h, v7.h[0] // ..........................*...........................|......................................................*....................... + // sqdmulh v25.8h, v10.8h, v7.h[1] // ......................*...............................|..................................................*........................... + // srshr v25.8h, v25.8h, #11 // ...........................*..........................|.......................................................*...................... + // mls v10.8h, v25.8h, v7.h[0] // ..............................*.......................|..........................................................*................... + // sqdmulh v25.8h, v9.8h, v7.h[1] // ........................*.............................|....................................................*......................... + // srshr v25.8h, v25.8h, #11 // ............................*.........................|........................................................*..................... + // mls v9.8h, v25.8h, v7.h[0] // ...............................*......................|...........................................................*.................. + // sqdmulh v25.8h, v11.8h, v7.h[1] // .........................*............................|.....................................................*........................ + // srshr v25.8h, v25.8h, #11 // .............................*........................|.........................................................*.................... + // mls v11.8h, v25.8h, v7.h[0] // ................................*.....................|............................................................*................. + // sub v24.8h, v8.8h, v10.8h // ..................................*...................|..............................................................*............... 
+ // add v8.8h, v8.8h, v10.8h // ...................................*..................|...............................................................*.............. + // mul v10.8h, v24.8h, v0.h[0] // .....................................*................|.................................................................*............ + // sqrdmulh v24.8h, v24.8h, v0.h[1] // ......................................*...............|..................................................................*........... + // mls v10.8h, v24.8h, v7.h[0] // ..........................................*...........|......................................................................*....... + // sub v24.8h, v9.8h, v11.8h // ....................................*.................|................................................................*............. + // add v9.8h, v9.8h, v11.8h // .........................................*............|.....................................................................*........ + // mul v11.8h, v24.8h, v0.h[0] // .......................................*..............|...................................................................*.......... + // sqrdmulh v24.8h, v24.8h, v0.h[1] // ........................................*.............|....................................................................*......... + // mls v11.8h, v24.8h, v7.h[0] // ............................................*.........|........................................................................*..... + // str q8, [x1], #(64) // ...........................................*..........|.......................................................................*...... + // str q9, [x1, #(-64 + 16*1)] // .............................................*........|.........................................................................*.... 
+ // str q10, [x1, #(-64 + 16*2)] // ...............................................*......|...........................................................................*.. + // str q11, [x1, #(-64 + 16*3)] // .................................................*....|.............................................................................* sub count, count, #1 cbnz count, layer4567_start - trn1 v3.4S, v8.4S, v26.4S // *................................................................. - // gap // .................................................................. - trn2 v26.4S, v8.4S, v26.4S // .*................................................................ - // gap // .................................................................. - trn2 v8.4S, v28.4S, v18.4S // ..*............................................................... - // gap // .................................................................. - trn2 v29.2D, v3.2D, v19.2D // ...*.............................................................. - // gap // .................................................................. - trn1 v18.2D, v3.2D, v19.2D // ....*............................................................. - // gap // .................................................................. - trn2 v23.2D, v26.2D, v8.2D // .....*............................................................ - // gap // .................................................................. - trn1 v28.2D, v26.2D, v8.2D // ......*........................................................... - // gap // .................................................................. - sub v8.8H, v29.8H, v23.8H // .......*.......................................................... - // gap // .................................................................. - add v19.8H, v29.8H, v23.8H // ........*......................................................... - // gap // .................................................................. 
- sub v12.8H, v18.8H, v28.8H // .........*........................................................ - // gap // .................................................................. - sqrdmulh v23.8H, v8.8H, v1.8H // ..............*................................................... - // gap // .................................................................. - mul v21.8H, v8.8H, v21.8H // ...........*...................................................... - // gap // .................................................................. - mul v26.8H, v12.8H, v17.8H // ............*..................................................... - // gap // .................................................................. - sqrdmulh v12.8H, v12.8H, v4.8H // .............*.................................................... - // gap // .................................................................. - add v17.8H, v18.8H, v28.8H // ..........*....................................................... - // gap // .................................................................. - mls v21.8H, v23.8H, v7.H[0] // ..................*............................................... - // gap // .................................................................. - // gap // .................................................................. - // gap // .................................................................. - mls v26.8H, v12.8H, v7.H[0] // .................*................................................ - // gap // .................................................................. - sub v8.8H, v17.8H, v19.8H // ...............*.................................................. - // gap // .................................................................. - ldr q1, [x3], #16 // .....................*............................................ - // gap // .................................................................. 
- // gap // .................................................................. - // gap // .................................................................. - sub v23.8H, v26.8H, v21.8H // ......................*........................................... - // gap // .................................................................. - sqrdmulh v18.8H, v8.8H, v9.8H // ....................*............................................. - // gap // .................................................................. - mul v28.8H, v8.8H, v5.8H // ...................*.............................................. - // gap // .................................................................. - mul v8.8H, v23.8H, v5.8H // .........................*........................................ - // gap // .................................................................. - sqrdmulh v23.8H, v23.8H, v9.8H // ..........................*....................................... - // gap // .................................................................. - add v12.8H, v26.8H, v21.8H // ........................*......................................... - // gap // .................................................................. - add v9.8H, v17.8H, v19.8H // ................*................................................. - // gap // .................................................................. - mls v28.8H, v18.8H, v7.H[0] // .......................*.......................................... - // gap // .................................................................. - mls v8.8H, v23.8H, v7.H[0] // .............................*.................................... - // gap // .................................................................. - trn1 v17.4S, v9.4S, v12.4S // ...........................*...................................... - // gap // .................................................................. 
- trn2 v26.4S, v9.4S, v12.4S // ............................*..................................... - // gap // .................................................................. - // gap // .................................................................. - // gap // .................................................................. - trn2 v12.4S, v28.4S, v8.4S // ...............................*.................................. - // gap // .................................................................. - trn1 v23.4S, v28.4S, v8.4S // ..............................*................................... - // gap // .................................................................. - // gap // .................................................................. - // gap // .................................................................. - trn2 v8.2D, v26.2D, v12.2D // .................................*................................ - // gap // .................................................................. - trn1 v12.2D, v26.2D, v12.2D // ...................................*.............................. - // gap // .................................................................. - trn1 v29.2D, v17.2D, v23.2D // ..................................*............................... - // gap // .................................................................. - trn2 v18.2D, v17.2D, v23.2D // ................................*................................. - // gap // .................................................................. - sub v17.8H, v29.8H, v12.8H // .....................................*............................ - // gap // .................................................................. - sub v23.8H, v18.8H, v8.8H // ....................................*............................. - // gap // .................................................................. 
- add v28.8H, v18.8H, v8.8H // .......................................*.......................... - // gap // .................................................................. - sqrdmulh v18.8H, v17.8H, v1.H[3] // .........................................*........................ - // gap // .................................................................. - add v4.8H, v29.8H, v12.8H // ......................................*........................... - // gap // .................................................................. - mul v26.8H, v23.8H, v1.H[4] // ..........................................*....................... - // gap // .................................................................. - sqrdmulh v23.8H, v23.8H, v1.H[5] // .............................................*.................... - // gap // .................................................................. - sqdmulh v12.8H, v4.8H, v7.H[1] // ...........................................*...................... - // gap // .................................................................. - sqdmulh v8.8H, v28.8H, v7.H[1] // ............................................*..................... - // gap // .................................................................. - mul v21.8H, v17.8H, v1.H[2] // ........................................*......................... - // gap // .................................................................. - mls v26.8H, v23.8H, v7.H[0] // .................................................*................ - // gap // .................................................................. - srshr v12.8H, v12.8H, #11 // ...............................................*.................. - // gap // .................................................................. - srshr v23.8H, v8.8H, #11 // ................................................*................. - // gap // .................................................................. 
- mls v21.8H, v18.8H, v7.H[0] // ..............................................*................... - // gap // .................................................................. - mls v4.8H, v12.8H, v7.H[0] // ..................................................*............... - // gap // .................................................................. - mls v28.8H, v23.8H, v7.H[0] // ...................................................*.............. - // gap // .................................................................. - // gap // .................................................................. - // gap // .................................................................. - sub v18.8H, v21.8H, v26.8H // ....................................................*............. - // gap // .................................................................. - add v17.8H, v21.8H, v26.8H // .....................................................*............ - // gap // .................................................................. - sub v12.8H, v4.8H, v28.8H // ......................................................*........... - // gap // .................................................................. - mul v26.8H, v18.8H, v1.H[0] // .......................................................*.......... - // gap // .................................................................. - sqrdmulh v18.8H, v18.8H, v1.H[1] // ........................................................*......... - // gap // .................................................................. - sqrdmulh v8.8H, v12.8H, v1.H[1] // ..........................................................*....... - // gap // .................................................................. - add v23.8H, v4.8H, v28.8H // ...........................................................*...... - // gap // .................................................................. 
- mul v12.8H, v12.8H, v1.H[0] // .........................................................*........ - // gap // .................................................................. - str q17, [x1, #16] // .............................................................*.... - // gap // .................................................................. - mls v26.8H, v18.8H, v7.H[0] // ............................................................*..... - // gap // .................................................................. - str q23, [x1], #(64) // ...............................................................*.. - // gap // .................................................................. - mls v12.8H, v8.8H, v7.H[0] // ..............................................................*... - // gap // .................................................................. - // gap // .................................................................. - // gap // .................................................................. - str q26, [x1, #-16] // .................................................................* - // gap // .................................................................. - // gap // .................................................................. - // gap // .................................................................. - str q12, [x1, #-32] // ................................................................*. - // gap // .................................................................. + trn1 v22.4S, v23.4S, v27.4S // *....................................................................... + // gap // ........................................................................ + trn2 v0.4S, v23.4S, v27.4S // .*...................................................................... + // gap // ........................................................................ 
+ trn2 v23.4S, v3.4S, v28.4S // ..*..................................................................... + // gap // ........................................................................ + trn2 v28.2D, v22.2D, v26.2D // ...*.................................................................... + // gap // ........................................................................ + trn1 v26.2D, v22.2D, v26.2D // ....*................................................................... + // gap // ........................................................................ + trn1 v22.2D, v0.2D, v23.2D // ......*................................................................. + // gap // ........................................................................ + trn2 v19.2D, v0.2D, v23.2D // .....*.................................................................. + // gap // ........................................................................ + sub v23.8H, v26.8H, v22.8H // .........*.............................................................. + // gap // ........................................................................ + add v3.8H, v28.8H, v19.8H // ........*............................................................... + // gap // ........................................................................ + sub v0.8H, v28.8H, v19.8H // .......*................................................................ + // gap // ........................................................................ + sqrdmulh v19.8H, v23.8H, v11.8H // .............*.......................................................... + // gap // ........................................................................ + mul v28.8H, v23.8H, v24.8H // ............*........................................................... + // gap // ........................................................................ 
+ sqrdmulh v23.8H, v0.8H, v25.8H // ..............*......................................................... + // gap // ........................................................................ + mul v27.8H, v0.8H, v14.8H // ...........*............................................................ + // gap // ........................................................................ + add v24.8H, v26.8H, v22.8H // ..........*............................................................. + // gap // ........................................................................ + mls v28.8H, v19.8H, v7.H[0] // .................*...................................................... + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + mls v27.8H, v23.8H, v7.H[0] // ..................*..................................................... + // gap // ........................................................................ + sub v19.8H, v24.8H, v3.8H // ...............*........................................................ + // gap // ........................................................................ + ldr q11, [x3], #16 // .....................*.................................................. + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + sub v23.8H, v28.8H, v27.8H // ......................*................................................. + // gap // ........................................................................ + sqrdmulh v22.8H, v19.8H, v20.8H // ....................*................................................... 
+ // gap // ........................................................................ + mul v0.8H, v19.8H, v29.8H // ...................*.................................................... + // gap // ........................................................................ + sqrdmulh v19.8H, v23.8H, v20.8H // ..........................*............................................. + // gap // ........................................................................ + mul v23.8H, v23.8H, v29.8H // .........................*.............................................. + // gap // ........................................................................ + add v28.8H, v28.8H, v27.8H // ........................*............................................... + // gap // ........................................................................ + add v27.8H, v24.8H, v3.8H // ................*....................................................... + // gap // ........................................................................ + mls v0.8H, v22.8H, v7.H[0] // .......................*................................................ + // gap // ........................................................................ + mls v23.8H, v19.8H, v7.H[0] // .............................*.......................................... + // gap // ........................................................................ + trn2 v22.4S, v27.4S, v28.4S // ............................*........................................... + // gap // ........................................................................ + trn1 v27.4S, v27.4S, v28.4S // ...........................*............................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ 
+ trn2 v19.4S, v0.4S, v23.4S // ...............................*........................................ + // gap // ........................................................................ + trn1 v0.4S, v0.4S, v23.4S // ..............................*......................................... + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + trn2 v23.2D, v22.2D, v19.2D // .................................*...................................... + // gap // ........................................................................ + trn2 v28.2D, v27.2D, v0.2D // ................................*....................................... + // gap // ........................................................................ + trn1 v22.2D, v22.2D, v19.2D // ...................................*.................................... + // gap // ........................................................................ + sub v19.8H, v28.8H, v23.8H // ....................................*................................... + // gap // ........................................................................ + trn1 v26.2D, v27.2D, v0.2D // ..................................*..................................... + // gap // ........................................................................ + add v0.8H, v28.8H, v23.8H // ...........................................*............................ + // gap // ........................................................................ + sub v27.8H, v26.8H, v22.8H // .....................................*.................................. + // gap // ........................................................................ + sqrdmulh v23.8H, v19.8H, v11.H[5] // ..........................................*............................. 
+ // gap // ........................................................................ + mul v24.8H, v19.8H, v11.H[4] // .......................................*................................ + // gap // ........................................................................ + sqrdmulh v28.8H, v27.8H, v11.H[3] // .........................................*.............................. + // gap // ........................................................................ + mul v3.8H, v27.8H, v11.H[2] // ........................................*............................... + // gap // ........................................................................ + sqdmulh v19.8H, v0.8H, v7.H[1] // ...............................................*........................ + // gap // ........................................................................ + mls v24.8H, v23.8H, v7.H[0] // ..............................................*......................... + // gap // ........................................................................ + add v27.8H, v26.8H, v22.8H // ......................................*................................. + // gap // ........................................................................ + mls v3.8H, v28.8H, v7.H[0] // .............................................*.......................... + // gap // ........................................................................ + srshr v19.8H, v19.8H, #11 // ....................................................*................... + // gap // ........................................................................ + sqdmulh v22.8H, v27.8H, v7.H[1] // ............................................*........................... + // gap // ........................................................................ + sqdmulh v26.8H, v24.8H, v7.H[1] // ..................................................*..................... 
+ // gap // ........................................................................ + sqdmulh v23.8H, v3.8H, v7.H[1] // .................................................*...................... + // gap // ........................................................................ + mls v0.8H, v19.8H, v7.H[0] // .......................................................*................ + // gap // ........................................................................ + srshr v22.8H, v22.8H, #11 // ................................................*....................... + // gap // ........................................................................ + srshr v19.8H, v26.8H, #11 // ......................................................*................. + // gap // ........................................................................ + srshr v23.8H, v23.8H, #11 // .....................................................*.................. + // gap // ........................................................................ + mls v27.8H, v22.8H, v7.H[0] // ...................................................*.................... + // gap // ........................................................................ + mls v24.8H, v19.8H, v7.H[0] // .........................................................*.............. + // gap // ........................................................................ + mls v3.8H, v23.8H, v7.H[0] // ........................................................*............... + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + sub v19.8H, v27.8H, v0.8H // ..........................................................*............. + // gap // ........................................................................ 
+ add v27.8H, v27.8H, v0.8H // ...........................................................*............ + // gap // ........................................................................ + sub v22.8H, v3.8H, v24.8H // ............................................................*........... + // gap // ........................................................................ + mul v0.8H, v19.8H, v11.H[0] // .............................................................*.......... + // gap // ........................................................................ + sqrdmulh v28.8H, v19.8H, v11.H[1] // ..............................................................*......... + // gap // ........................................................................ + sqrdmulh v23.8H, v22.8H, v11.H[1] // ................................................................*....... + // gap // ........................................................................ + mul v22.8H, v22.8H, v11.H[0] // ...............................................................*........ + // gap // ........................................................................ + add v19.8H, v3.8H, v24.8H // .................................................................*...... + // gap // ........................................................................ + mls v0.8H, v28.8H, v7.H[0] // ..................................................................*..... + // gap // ........................................................................ + str q27, [x1], #(64) // ...................................................................*.... + // gap // ........................................................................ + mls v22.8H, v23.8H, v7.H[0] // ....................................................................*... + // gap // ........................................................................ 
+ str q19, [x1, #-48] // .....................................................................*.. + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + str q0, [x1, #-32] // ......................................................................*. + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + str q22, [x1, #-16] // .......................................................................* + // gap // ........................................................................ // original source code - // trn1 v23.4S, v8.4S, v26.4S // *................................................................. - // trn2 v8.4S, v8.4S, v26.4S // .*................................................................ - // trn2 v12.4S, v28.4S, v18.4S // ..*............................................................... - // trn2 v18.2D, v23.2D, v19.2D // ...*.............................................................. - // trn1 v23.2D, v23.2D, v19.2D // ....*............................................................. - // trn2 v26.2D, v8.2D, v12.2D // .....*............................................................ - // trn1 v8.2D, v8.2D, v12.2D // ......*........................................................... - // sub v12.8H, v18.8H, v26.8H // .......*.......................................................... - // add v18.8H, v18.8H, v26.8H // ........*......................................................... - // sub v26.8H, v23.8H, v8.8H // .........*........................................................ 
- // add v23.8H, v23.8H, v8.8H // ..............*................................................... - // mul v8.8H, v12.8H, v21.8H // ...........*...................................................... - // mul v17.8H, v26.8H, v17.8H // ............*..................................................... - // sqrdmulh v26.8H, v26.8H, v4.8H // .............*.................................................... - // sqrdmulh v12.8H, v12.8H, v1.8H // ..........*....................................................... - // sub v28.8H, v23.8H, v18.8H // .................*................................................ - // add v23.8H, v23.8H, v18.8H // .........................*........................................ - // mls v17.8H, v26.8H, v7.H[0] // ................*................................................. - // mls v8.8H, v12.8H, v7.H[0] // ...............*.................................................. - // mul v12.8H, v28.8H, v5.8H // .....................*............................................ - // sqrdmulh v18.8H, v28.8H, v9.8H // ....................*............................................. - // ldr q4, [x3], #16 // ..................*............................................... - // sub v26.8H, v17.8H, v8.8H // ...................*.............................................. - // mls v12.8H, v18.8H, v7.H[0] // ..........................*....................................... - // add v8.8H, v17.8H, v8.8H // ........................*......................................... - // mul v18.8H, v26.8H, v5.8H // ......................*........................................... - // sqrdmulh v26.8H, v26.8H, v9.8H // .......................*.......................................... - // trn1 v17.4S, v23.4S, v8.4S // ............................*..................................... - // trn2 v23.4S, v23.4S, v8.4S // .............................*.................................... 
- // mls v18.8H, v26.8H, v7.H[0] // ...........................*...................................... - // trn1 v19.4S, v12.4S, v18.4S // ...............................*.................................. - // trn2 v12.4S, v12.4S, v18.4S // ..............................*................................... - // trn2 v5.2D, v17.2D, v19.2D // ...................................*.............................. - // trn2 v9.2D, v23.2D, v12.2D // ................................*................................. - // trn1 v17.2D, v17.2D, v19.2D // ..................................*............................... - // trn1 v23.2D, v23.2D, v12.2D // .................................*................................ - // sub v12.8H, v5.8H, v9.8H // .....................................*............................ - // sub v19.8H, v17.8H, v23.8H // ....................................*............................. - // add v23.8H, v17.8H, v23.8H // ........................................*......................... - // add v17.8H, v5.8H, v9.8H // ......................................*........................... - // mul v5.8H, v19.8H, v4.H[2] // .............................................*.................... - // sqrdmulh v19.8H, v19.8H, v4.H[3] // .......................................*.......................... - // mul v9.8H, v12.8H, v4.H[4] // .........................................*........................ - // sqdmulh v21.8H, v23.8H, v7.H[1] // ...........................................*...................... - // sqdmulh v1.8H, v17.8H, v7.H[1] // ............................................*..................... - // sqrdmulh v12.8H, v12.8H, v4.H[5] // ..........................................*....................... - // mls v5.8H, v19.8H, v7.H[0] // .................................................*................ - // srshr v19.8H, v21.8H, #11 // ...............................................*.................. 
- // srshr v21.8H, v1.8H, #11 // ................................................*................. - // mls v9.8H, v12.8H, v7.H[0] // ..............................................*................... - // mls v23.8H, v19.8H, v7.H[0] // ..................................................*............... - // mls v17.8H, v21.8H, v7.H[0] // ...................................................*.............. - // sub v12.8H, v5.8H, v9.8H // ....................................................*............. - // add v5.8H, v5.8H, v9.8H // .....................................................*............ - // sub v9.8H, v23.8H, v17.8H // ......................................................*........... - // mul v21.8H, v12.8H, v4.H[0] // .......................................................*.......... - // sqrdmulh v12.8H, v12.8H, v4.H[1] // ........................................................*......... - // mul v1.8H, v9.8H, v4.H[0] // ...........................................................*...... - // sqrdmulh v4.8H, v9.8H, v4.H[1] // .........................................................*........ - // add v23.8H, v23.8H, v17.8H // ..........................................................*....... - // mls v21.8H, v12.8H, v7.H[0] // .............................................................*.... - // str q5, [x1, #16] // ............................................................*..... - // mls v1.8H, v4.8H, v7.H[0] // ...............................................................*.. - // str q23, [x1], #(64) // ..............................................................*... - // str q1, [x1, #-32] // .................................................................* - // str q21, [x1, #-16] // ................................................................*. + // trn1 v19.4S, v23.4S, v27.4S // *....................................................................... 
+ // trn2 v23.4S, v23.4S, v27.4S // .*...................................................................... + // trn2 v22.4S, v3.4S, v28.4S // ..*..................................................................... + // trn2 v28.2D, v19.2D, v26.2D // ...*.................................................................... + // trn1 v19.2D, v19.2D, v26.2D // ....*................................................................... + // trn2 v0.2D, v23.2D, v22.2D // ......*................................................................. + // trn1 v23.2D, v23.2D, v22.2D // .....*.................................................................. + // sub v22.8H, v28.8H, v0.8H // .........*.............................................................. + // add v28.8H, v28.8H, v0.8H // ........*............................................................... + // sub v0.8H, v19.8H, v23.8H // .......*................................................................ + // add v19.8H, v19.8H, v23.8H // ..............*......................................................... + // mul v23.8H, v22.8H, v14.8H // .............*.......................................................... + // mul v27.8H, v0.8H, v24.8H // ...........*............................................................ + // sqrdmulh v0.8H, v0.8H, v11.8H // ..........*............................................................. + // sqrdmulh v22.8H, v22.8H, v25.8H // ............*........................................................... + // sub v24.8H, v19.8H, v28.8H // .................*...................................................... + // add v19.8H, v19.8H, v28.8H // .........................*.............................................. + // mls v27.8H, v0.8H, v7.H[0] // ...............*........................................................ + // mls v23.8H, v22.8H, v7.H[0] // ................*....................................................... 
+ // mul v22.8H, v24.8H, v29.8H // .....................*.................................................. + // sqrdmulh v28.8H, v24.8H, v20.8H // ....................*................................................... + // ldr q0, [x3], #16 // ..................*..................................................... + // sub v24.8H, v27.8H, v23.8H // ...................*.................................................... + // mls v22.8H, v28.8H, v7.H[0] // ..........................*............................................. + // add v23.8H, v27.8H, v23.8H // ........................*............................................... + // mul v28.8H, v24.8H, v29.8H // .......................*................................................ + // sqrdmulh v27.8H, v24.8H, v20.8H // ......................*................................................. + // trn1 v24.4S, v19.4S, v23.4S // .............................*.......................................... + // trn2 v19.4S, v19.4S, v23.4S // ............................*........................................... + // mls v28.8H, v27.8H, v7.H[0] // ...........................*............................................ + // trn1 v26.4S, v22.4S, v28.4S // ...............................*........................................ + // trn2 v22.4S, v22.4S, v28.4S // ..............................*......................................... + // trn2 v29.2D, v24.2D, v26.2D // .................................*...................................... + // trn2 v20.2D, v19.2D, v22.2D // ................................*....................................... + // trn1 v24.2D, v24.2D, v26.2D // ....................................*................................... + // trn1 v19.2D, v19.2D, v22.2D // ..................................*..................................... + // sub v22.8H, v29.8H, v20.8H // ...................................*.................................... 
+ // sub v26.8H, v24.8H, v19.8H // ......................................*................................. + // add v19.8H, v24.8H, v19.8H // .............................................*.......................... + // mul v24.8H, v22.8H, v0.H[4] // ........................................*............................... + // mul v11.8H, v26.8H, v0.H[2] // ..........................................*............................. + // sqrdmulh v26.8H, v26.8H, v0.H[3] // .........................................*.............................. + // sqrdmulh v22.8H, v22.8H, v0.H[5] // .......................................*................................ + // add v29.8H, v29.8H, v20.8H // .....................................*.................................. + // sqdmulh v20.8H, v19.8H, v7.H[1] // ................................................*....................... + // mls v11.8H, v26.8H, v7.H[0] // ..............................................*......................... + // mls v24.8H, v22.8H, v7.H[0] // ............................................*........................... + // sqdmulh v22.8H, v29.8H, v7.H[1] // ...........................................*............................ + // srshr v26.8H, v20.8H, #11 // ....................................................*................... + // sqdmulh v20.8H, v11.8H, v7.H[1] // ..................................................*..................... + // sqdmulh v14.8H, v24.8H, v7.H[1] // .................................................*...................... + // mls v19.8H, v26.8H, v7.H[0] // .......................................................*................ + // srshr v22.8H, v22.8H, #11 // ...............................................*........................ + // srshr v26.8H, v20.8H, #11 // ......................................................*................. + // srshr v20.8H, v14.8H, #11 // .....................................................*.................. 
+ // mls v29.8H, v22.8H, v7.H[0] // ...................................................*.................... + // mls v11.8H, v26.8H, v7.H[0] // .........................................................*.............. + // mls v24.8H, v20.8H, v7.H[0] // ........................................................*............... + // sub v22.8H, v19.8H, v29.8H // ..........................................................*............. + // add v19.8H, v19.8H, v29.8H // ...........................................................*............ + // sub v29.8H, v11.8H, v24.8H // ............................................................*........... + // mul v20.8H, v22.8H, v0.H[0] // .............................................................*.......... + // sqrdmulh v22.8H, v22.8H, v0.H[1] // ..............................................................*......... + // mul v14.8H, v29.8H, v0.H[0] // ................................................................*....... + // sqrdmulh v0.8H, v29.8H, v0.H[1] // ...............................................................*........ + // add v24.8H, v11.8H, v24.8H // .................................................................*...... + // mls v20.8H, v22.8H, v7.H[0] // ..................................................................*..... + // str q19, [x1], #(64) // ...................................................................*.... + // mls v14.8H, v0.8H, v7.H[0] // ....................................................................*... + // str q24, [x1, #-48] // .....................................................................*.. + // str q20, [x1, #-32] // ......................................................................*. + // str q14, [x1, #-16] // .......................................................................* // --------------------------------------------------------------------- @@ -900,616 +936,580 @@ layer4567_start: .p2align 2 - ldr q17, [x0, #64] // *...... 
+ ldr q24, [x0, #64] // *...... // gap // ....... // gap // ....... // gap // ....... - ldr q12, [x0, #128] // .*..... + ldr q22, [x0, #128] // .*..... // gap // ....... // gap // ....... // gap // ....... - ldr q18, [x0, #192] // ..*.... + ldr q28, [x0, #192] // ..*.... // gap // ....... // gap // ....... // gap // ....... - ldr q28, [x0, #256] // ...*... + ldr q3, [x0, #256] // ...*... // gap // ....... // gap // ....... // gap // ....... - ldr q4, [x0, #320] // ....*.. + ldr q26, [x0, #320] // ....*.. // gap // ....... // gap // ....... // gap // ....... - ldr q9, [x0, #384] // .....*. + ldr q11, [x0, #384] // .....*. // gap // ....... // gap // ....... // gap // ....... - ldr q21, [x0, #448] // ......* + ldr q14, [x0, #448] // ......* // gap // ....... // original source code - // ldr q17, [x0, #64] // *...... - // ldr q12, [x0, #128] // .*..... - // ldr q18, [x0, #192] // ..*.... - // ldr q28, [x0, #256] // ...*... - // ldr q4, [x0, #320] // ....*.. - // ldr q9, [x0, #384] // .....*. - // ldr q21, [x0, #448] // ......* + // ldr q24, [x0, #64] // *...... + // ldr q22, [x0, #128] // .*..... + // ldr q28, [x0, #192] // ..*.... + // ldr q3, [x0, #256] // ...*... + // ldr q26, [x0, #320] // ....*.. + // ldr q11, [x0, #384] // .....*. + // ldr q14, [x0, #448] // ......* sub count, count, #1 layer123_start: - ldr q8, [x0, #0] // *............................................................................................. - // gap // .............................................................................................. - // gap // .............................................................................................. - // gap // .............................................................................................. - sub v23.8H, v12.8H, v18.8H // .............*................................................................................ - // gap // .............................................................................................. 
- add v12.8H, v12.8H, v18.8H // ..............*............................................................................... - // gap // .............................................................................................. - sub v18.8H, v8.8H, v17.8H // ........*..................................................................................... - // gap // .............................................................................................. - add v8.8H, v8.8H, v17.8H // .........*.................................................................................... - // gap // .............................................................................................. - mul v26.8H, v23.8H, v1.H[0] // ...............*.............................................................................. - // gap // .............................................................................................. - sqrdmulh v23.8H, v23.8H, v1.H[1] // ................*............................................................................. - // gap // .............................................................................................. - sub v17.8H, v8.8H, v12.8H // ............................*................................................................. - // gap // .............................................................................................. - add v8.8H, v8.8H, v12.8H // .............................*................................................................ - // gap // .............................................................................................. - mul v12.8H, v18.8H, v0.H[6] // ..........*................................................................................... - // gap // .............................................................................................. 
- sqrdmulh v18.8H, v18.8H, v0.H[7] // ...........*.................................................................................. - // gap // .............................................................................................. - mls v26.8H, v23.8H, v7.H[0] // .................*............................................................................ - // gap // .............................................................................................. - sub v23.8H, v28.8H, v4.8H // ..................*........................................................................... - // gap // .............................................................................................. - add v28.8H, v28.8H, v4.8H // ...................*.......................................................................... - // gap // .............................................................................................. - mls v12.8H, v18.8H, v7.H[0] // ............*................................................................................. - // gap // .............................................................................................. - mul v18.8H, v23.8H, v1.H[2] // ....................*......................................................................... - // gap // .............................................................................................. - mul v4.8H, v17.8H, v0.H[2] // ..............................*............................................................... - // gap // .............................................................................................. - sqrdmulh v17.8H, v17.8H, v0.H[3] // ...............................*.............................................................. - // gap // .............................................................................................. 
- sqdmulh v19.8H, v8.8H, v7.H[1] // ................................................*............................................. - // gap // .............................................................................................. - sqrdmulh v23.8H, v23.8H, v1.H[3] // .....................*........................................................................ - // gap // .............................................................................................. - sub v5.8H, v9.8H, v21.8H // .......................*...................................................................... - // gap // .............................................................................................. - add v9.8H, v9.8H, v21.8H // ........................*..................................................................... - // gap // .............................................................................................. - srshr v19.8H, v19.8H, #11 // .................................................*............................................ - // gap // .............................................................................................. - mls v18.8H, v23.8H, v7.H[0] // ......................*....................................................................... - // gap // .............................................................................................. - sub v23.8H, v12.8H, v26.8H // .................................*............................................................ - // gap // .............................................................................................. - mls v8.8H, v19.8H, v7.H[0] // ..................................................*........................................... - // gap // .............................................................................................. 
- add v12.8H, v12.8H, v26.8H // ..................................*........................................................... - // gap // .............................................................................................. - mul v26.8H, v5.8H, v1.H[4] // .........................*.................................................................... - // gap // .............................................................................................. - sub v19.8H, v28.8H, v9.8H // ......................................*....................................................... - // gap // .............................................................................................. - add v28.8H, v28.8H, v9.8H // .......................................*...................................................... - // gap // .............................................................................................. - sqrdmulh v5.8H, v5.8H, v1.H[5] // ..........................*................................................................... - // gap // .............................................................................................. - mls v4.8H, v17.8H, v7.H[0] // ................................*............................................................. - // gap // .............................................................................................. - mul v17.8H, v23.8H, v0.H[2] // ...................................*.......................................................... - // gap // .............................................................................................. - sqrdmulh v23.8H, v23.8H, v0.H[3] // ....................................*......................................................... - // gap // .............................................................................................. 
- mls v26.8H, v5.8H, v7.H[0] // ...........................*.................................................................. - // gap // .............................................................................................. - mul v5.8H, v19.8H, v0.H[4] // ........................................*..................................................... - // gap // .............................................................................................. - sqrdmulh v19.8H, v19.8H, v0.H[5] // .........................................*.................................................... - // gap // .............................................................................................. - mls v17.8H, v23.8H, v7.H[0] // .....................................*........................................................ - // gap // .............................................................................................. - sub v23.8H, v18.8H, v26.8H // ...........................................*.................................................. - // gap // .............................................................................................. - add v18.8H, v18.8H, v26.8H // ............................................*................................................. - // gap // .............................................................................................. - mls v5.8H, v19.8H, v7.H[0] // ..........................................*................................................... - // gap // .............................................................................................. - mul v26.8H, v23.8H, v0.H[4] // .............................................*................................................ - // gap // .............................................................................................. 
- sqrdmulh v23.8H, v23.8H, v0.H[5] // ..............................................*............................................... - // gap // .............................................................................................. - sqdmulh v19.8H, v28.8H, v7.H[1] // ...................................................*.......................................... - // gap // .............................................................................................. - sub v9.8H, v12.8H, v18.8H // ...........................................................*.................................. - // gap // .............................................................................................. - add v12.8H, v12.8H, v18.8H // ............................................................*................................. - // gap // .............................................................................................. - mls v26.8H, v23.8H, v7.H[0] // ...............................................*.............................................. - // gap // .............................................................................................. - srshr v23.8H, v19.8H, #11 // ....................................................*......................................... - // gap // .............................................................................................. - mul v18.8H, v9.8H, v0.H[0] // .............................................................*................................ - // gap // .............................................................................................. - sqrdmulh v19.8H, v9.8H, v0.H[1] // ..............................................................*............................... - // gap // .............................................................................................. 
- mls v28.8H, v23.8H, v7.H[0] // .....................................................*........................................ - // gap // .............................................................................................. - sub v23.8H, v4.8H, v5.8H // ................................................................*............................. - // gap // .............................................................................................. - add v4.8H, v4.8H, v5.8H // .................................................................*............................ - // gap // .............................................................................................. - mls v18.8H, v19.8H, v7.H[0] // ...............................................................*.............................. - // gap // .............................................................................................. - sub v19.8H, v8.8H, v28.8H // ......................................................*....................................... - // gap // .............................................................................................. - add v8.8H, v8.8H, v28.8H // .......................................................*...................................... - // gap // .............................................................................................. - mul v28.8H, v23.8H, v0.H[0] // ..................................................................*........................... - // gap // .............................................................................................. - mul v5.8H, v19.8H, v0.H[0] // ........................................................*..................................... - // gap // .............................................................................................. - sqrdmulh v19.8H, v19.8H, v0.H[1] // .........................................................*.................................... 
- // gap // .............................................................................................. - sqrdmulh v23.8H, v23.8H, v0.H[1] // ...................................................................*.......................... - // gap // .............................................................................................. - sub v9.8H, v17.8H, v26.8H // .....................................................................*........................ - // gap // .............................................................................................. - add v26.8H, v17.8H, v26.8H // ......................................................................*....................... - // gap // .............................................................................................. - mls v5.8H, v19.8H, v7.H[0] // ..........................................................*................................... - // gap // .............................................................................................. - mls v28.8H, v23.8H, v7.H[0] // ....................................................................*......................... - // gap // .............................................................................................. - mul v23.8H, v9.8H, v0.H[0] // .......................................................................*...................... - // gap // .............................................................................................. - sqrdmulh v17.8H, v9.8H, v0.H[1] // ........................................................................*..................... - // gap // .............................................................................................. - str q5, [x0, #256] // ..........................................................................*................... - // gap // .............................................................................................. 
- mul v19.8H, v8.8H, v29.8H // ..............................................................................*............... - // gap // .............................................................................................. - str q18, [x0, #320] // ...........................................................................*.................. - // gap // .............................................................................................. - mls v23.8H, v17.8H, v7.H[0] // .........................................................................*.................... - // gap // .............................................................................................. - str q28, [x0, #384] // ............................................................................*................. - // gap // .............................................................................................. - sqrdmulh v8.8H, v8.8H, v30.8H // ...............................................................................*.............. - // gap // .............................................................................................. - mul v18.8H, v12.8H, v29.8H // .................................................................................*............ - // gap // .............................................................................................. - str q23, [x0, #448] // .............................................................................*................ - // gap // .............................................................................................. - sqrdmulh v23.8H, v12.8H, v30.8H // ..................................................................................*........... - // gap // .............................................................................................. - mls v19.8H, v8.8H, v7.H[0] // ................................................................................*............. 
- // gap // .............................................................................................. - mul v8.8H, v4.8H, v29.8H // ....................................................................................*......... - // gap // .............................................................................................. - sqrdmulh v12.8H, v4.8H, v30.8H // .....................................................................................*........ - // gap // .............................................................................................. - mls v18.8H, v23.8H, v7.H[0] // ...................................................................................*.......... - // gap // .............................................................................................. - mul v23.8H, v26.8H, v29.8H // .......................................................................................*...... - // gap // .............................................................................................. - sqrdmulh v26.8H, v26.8H, v30.8H // ........................................................................................*..... - // gap // .............................................................................................. - mls v8.8H, v12.8H, v7.H[0] // ......................................................................................*....... - // gap // .............................................................................................. - str q19, [x0], #(16) // ..........................................................................................*... - // gap // .............................................................................................. - ldr q17, [x0, #64] // .e............................................................................................ - // gap // .............................................................................................. 
- // gap // .............................................................................................. - // gap // .............................................................................................. - mls v23.8H, v26.8H, v7.H[0] // .........................................................................................*.... - // gap // .............................................................................................. - str q18, [x0, #48] // ...........................................................................................*.. - // gap // .............................................................................................. - ldr q12, [x0, #128] // ..e........................................................................................... - // gap // .............................................................................................. - // gap // .............................................................................................. - // gap // .............................................................................................. - str q8, [x0, #112] // ............................................................................................*. - // gap // .............................................................................................. - ldr q18, [x0, #192] // ...e.......................................................................................... - // gap // .............................................................................................. - // gap // .............................................................................................. - // gap // .............................................................................................. 
- str q23, [x0, #176] // .............................................................................................* - // gap // .............................................................................................. - ldr q28, [x0, #256] // ....e......................................................................................... - // gap // .............................................................................................. - // gap // .............................................................................................. - // gap // .............................................................................................. - ldr q4, [x0, #320] // .....e........................................................................................ - // gap // .............................................................................................. - // gap // .............................................................................................. - // gap // .............................................................................................. - ldr q9, [x0, #384] // ......e....................................................................................... - // gap // .............................................................................................. - // gap // .............................................................................................. - // gap // .............................................................................................. - ldr q21, [x0, #448] // .......e...................................................................................... - // gap // .............................................................................................. - // gap // .............................................................................................. - // gap // .............................................................................................. 
+ ldr q23, [x0, #0] // *....................................................................................... + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + sub v19.8H, v22.8H, v28.8H // .............*.......................................................................... + // gap // ........................................................................................ + add v22.8H, v22.8H, v28.8H // ..............*......................................................................... + // gap // ........................................................................................ + sub v28.8H, v23.8H, v24.8H // ........*............................................................................... + // gap // ........................................................................................ + add v23.8H, v23.8H, v24.8H // .........*.............................................................................. + // gap // ........................................................................................ + mul v27.8H, v19.8H, v1.H[0] // ...............*........................................................................ + // gap // ........................................................................................ + sqrdmulh v8.8H, v19.8H, v1.H[1] // ................*....................................................................... + // gap // ........................................................................................ + sub v24.8H, v23.8H, v22.8H // ............................*........................................................... + // gap // ........................................................................................ 
+ add v21.8H, v23.8H, v22.8H // .............................*.......................................................... + // gap // ........................................................................................ + mul v22.8H, v28.8H, v0.H[6] // ..........*............................................................................. + // gap // ........................................................................................ + sqrdmulh v28.8H, v28.8H, v0.H[7] // ...........*............................................................................ + // gap // ........................................................................................ + mls v27.8H, v8.8H, v7.H[0] // .................*...................................................................... + // gap // ........................................................................................ + sub v19.8H, v3.8H, v26.8H // ..................*..................................................................... + // gap // ........................................................................................ + add v3.8H, v3.8H, v26.8H // ...................*.................................................................... + // gap // ........................................................................................ + mls v22.8H, v28.8H, v7.H[0] // ............*........................................................................... + // gap // ........................................................................................ + mul v28.8H, v19.8H, v1.H[2] // ....................*................................................................... + // gap // ........................................................................................ + add v25.8H, v11.8H, v14.8H // ........................*............................................................... 
+ // gap // ........................................................................................ + mul v26.8H, v24.8H, v0.H[2] // ..............................*......................................................... + // gap // ........................................................................................ + sqrdmulh v24.8H, v24.8H, v0.H[3] // ...............................*........................................................ + // gap // ........................................................................................ + add v20.8H, v3.8H, v25.8H // .......................................*................................................ + // gap // ........................................................................................ + sqrdmulh v19.8H, v19.8H, v1.H[3] // .....................*.................................................................. + // gap // ........................................................................................ + sub v11.8H, v11.8H, v14.8H // .......................*................................................................ + // gap // ........................................................................................ + add v23.8H, v21.8H, v20.8H // .................................................*...................................... + // gap // ........................................................................................ + sub v14.8H, v21.8H, v20.8H // ................................................*....................................... + // gap // ........................................................................................ + mls v28.8H, v19.8H, v7.H[0] // ......................*................................................................. + // gap // ........................................................................................ 
+ sub v19.8H, v22.8H, v27.8H // .................................*...................................................... + // gap // ........................................................................................ + add v22.8H, v22.8H, v27.8H // ..................................*..................................................... + // gap // ........................................................................................ + mul v27.8H, v11.8H, v1.H[4] // .........................*.............................................................. + // gap // ........................................................................................ + sqrdmulh v20.8H, v11.8H, v1.H[5] // ..........................*............................................................. + // gap // ........................................................................................ + sub v3.8H, v3.8H, v25.8H // ......................................*................................................. + // gap // ........................................................................................ + mls v26.8H, v24.8H, v7.H[0] // ................................*....................................................... + // gap // ........................................................................................ + mul v24.8H, v19.8H, v0.H[2] // ...................................*.................................................... + // gap // ........................................................................................ + mls v27.8H, v20.8H, v7.H[0] // ...........................*............................................................ + // gap // ........................................................................................ + sqrdmulh v19.8H, v19.8H, v0.H[3] // ....................................*................................................... 
+ // gap // ........................................................................................ + mul v20.8H, v3.8H, v0.H[4] // ........................................*............................................... + // gap // ........................................................................................ + sqrdmulh v3.8H, v3.8H, v0.H[5] // .........................................*.............................................. + // gap // ........................................................................................ + sub v11.8H, v28.8H, v27.8H // ...........................................*............................................ + // gap // ........................................................................................ + mls v24.8H, v19.8H, v7.H[0] // .....................................*.................................................. + // gap // ........................................................................................ + add v19.8H, v28.8H, v27.8H // ............................................*........................................... + // gap // ........................................................................................ + mul v28.8H, v11.8H, v0.H[4] // .............................................*.......................................... + // gap // ........................................................................................ + sqrdmulh v27.8H, v11.8H, v0.H[5] // ..............................................*......................................... + // gap // ........................................................................................ + mls v20.8H, v3.8H, v7.H[0] // ..........................................*............................................. + // gap // ........................................................................................ 
+ mul v3.8H, v14.8H, v0.H[0] // ..................................................*..................................... + // gap // ........................................................................................ + sqrdmulh v25.8H, v14.8H, v0.H[1] // ...................................................*.................................... + // gap // ........................................................................................ + add v8.8H, v22.8H, v19.8H // ......................................................*................................. + // gap // ........................................................................................ + sub v14.8H, v22.8H, v19.8H // .....................................................*.................................. + // gap // ........................................................................................ + mls v28.8H, v27.8H, v7.H[0] // ...............................................*........................................ + // gap // ........................................................................................ + mul v22.8H, v23.8H, v29.8H // ........................................................................*............... + // gap // ........................................................................................ + sqrdmulh v11.8H, v14.8H, v0.H[1] // ........................................................*............................... + // gap // ........................................................................................ + sqrdmulh v23.8H, v23.8H, v30.8H // .........................................................................*.............. + // gap // ........................................................................................ + mls v3.8H, v25.8H, v7.H[0] // ....................................................*................................... 
+ // gap // ........................................................................................ + mul v27.8H, v14.8H, v0.H[0] // .......................................................*................................ + // gap // ........................................................................................ + sub v14.8H, v26.8H, v20.8H // ..........................................................*............................. + // gap // ........................................................................................ + add v26.8H, v26.8H, v20.8H // ...........................................................*............................ + // gap // ........................................................................................ + sub v20.8H, v24.8H, v28.8H // ...............................................................*........................ + // gap // ........................................................................................ + mls v27.8H, v11.8H, v7.H[0] // .........................................................*.............................. + // gap // ........................................................................................ + mul v11.8H, v14.8H, v0.H[0] // ............................................................*........................... + // gap // ........................................................................................ + sqrdmulh v14.8H, v14.8H, v0.H[1] // .............................................................*.......................... + // gap // ........................................................................................ + add v28.8H, v24.8H, v28.8H // ................................................................*....................... + // gap // ........................................................................................ + str q27, [x0, #320] // .....................................................................*.................. 
+ // gap // ........................................................................................ + mul v24.8H, v20.8H, v0.H[0] // .................................................................*...................... + // gap // ........................................................................................ + sqrdmulh v20.8H, v20.8H, v0.H[1] // ..................................................................*..................... + // gap // ........................................................................................ + str q3, [x0, #256] // ....................................................................*................... + // gap // ........................................................................................ + mls v11.8H, v14.8H, v7.H[0] // ..............................................................*......................... + // gap // ........................................................................................ + mls v22.8H, v23.8H, v7.H[0] // ..........................................................................*............. + // gap // ........................................................................................ + mls v24.8H, v20.8H, v7.H[0] // ...................................................................*.................... + // gap // ........................................................................................ + mul v23.8H, v8.8H, v29.8H // ...........................................................................*............ + // gap // ........................................................................................ + str q11, [x0, #384] // ......................................................................*................. + // gap // ........................................................................................ + sqrdmulh v19.8H, v8.8H, v30.8H // ............................................................................*........... 
+ // gap // ........................................................................................ + str q24, [x0, #448] // .......................................................................*................ + // gap // ........................................................................................ + mul v27.8H, v26.8H, v29.8H // ..............................................................................*......... + // gap // ........................................................................................ + str q22, [x0], #(16) // ....................................................................................*... + // gap // ........................................................................................ + sqrdmulh v24.8H, v26.8H, v30.8H // ...............................................................................*........ + // gap // ........................................................................................ + mls v23.8H, v19.8H, v7.H[0] // .............................................................................*.......... + // gap // ........................................................................................ + mul v19.8H, v28.8H, v29.8H // .................................................................................*...... + // gap // ........................................................................................ + sqrdmulh v8.8H, v28.8H, v30.8H // ..................................................................................*..... + // gap // ........................................................................................ + mls v27.8H, v24.8H, v7.H[0] // ................................................................................*....... + // gap // ........................................................................................ + ldr q24, [x0, #64] // .e...................................................................................... 
+ // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + mls v19.8H, v8.8H, v7.H[0] // ...................................................................................*.... + // gap // ........................................................................................ + str q23, [x0, #48] // .....................................................................................*.. + // gap // ........................................................................................ + ldr q22, [x0, #128] // ..e..................................................................................... + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + str q27, [x0, #112] // ......................................................................................*. + // gap // ........................................................................................ + ldr q28, [x0, #192] // ...e.................................................................................... + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ 
+ str q19, [x0, #176] // .......................................................................................* + // gap // ........................................................................................ + ldr q3, [x0, #256] // ....e................................................................................... + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + ldr q26, [x0, #320] // .....e.................................................................................. + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + ldr q11, [x0, #384] // ......e................................................................................. + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + ldr q14, [x0, #448] // .......e................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ 
// original source code - // ldr q8, [x0, #0] // ...........*......................................................................................... - // ldr q9, [x0, #(1*(512/8))] // e..........|..................................................................................e...... - // ldr q10, [x0, #(2*(512/8))] // ...e.......|.....................................................................................e... - // ldr q11, [x0, #(3*(512/8))] // .....e.....|.......................................................................................e. - // ldr q12, [x0, #(4*(512/8))] // .......e...|......................................................................................... - // ldr q13, [x0, #(5*(512/8))] // ........e..|......................................................................................... - // ldr q14, [x0, #(6*(512/8))] // .........e.|......................................................................................... - // ldr q15, [x0, #(7*(512/8))] // ..........e|......................................................................................... - // sub v24.8h, v8.8h, v9.8h // ...........|..*...................................................................................... - // add v8.8h, v8.8h, v9.8h // ...........|...*..................................................................................... - // mul v9.8h, v24.8h, v0.h[6] // ...........|........*................................................................................ - // sqrdmulh v24.8h, v24.8h, v0.h[7] // ...........|.........*............................................................................... - // mls v9.8h, v24.8h, v7.h[0] // ...........|.............*........................................................................... - // sub v24.8h, v10.8h, v11.8h // ...........|*........................................................................................ 
- // add v10.8h, v10.8h, v11.8h // ...........|.*....................................................................................... - // mul v11.8h, v24.8h, v1.h[0] // ...........|....*.................................................................................... - // sqrdmulh v24.8h, v24.8h, v1.h[1] // ...........|.....*................................................................................... - // mls v11.8h, v24.8h, v7.h[0] // ...........|..........*.............................................................................. - // sub v24.8h, v12.8h, v13.8h // ...........|...........*............................................................................. - // add v12.8h, v12.8h, v13.8h // ...........|............*............................................................................ - // mul v13.8h, v24.8h, v1.h[2] // ...........|..............*.......................................................................... - // sqrdmulh v24.8h, v24.8h, v1.h[3] // ...........|..................*...................................................................... - // mls v13.8h, v24.8h, v7.h[0] // ...........|......................*.................................................................. - // sub v24.8h, v14.8h, v15.8h // ...........|...................*..................................................................... - // add v14.8h, v14.8h, v15.8h // ...........|....................*.................................................................... - // mul v15.8h, v24.8h, v1.h[4] // ...........|..........................*.............................................................. - // sqrdmulh v24.8h, v24.8h, v1.h[5] // ...........|.............................*........................................................... - // mls v15.8h, v24.8h, v7.h[0] // ...........|.................................*....................................................... 
- // sub v24.8h, v8.8h, v10.8h // ...........|......*.................................................................................. - // add v8.8h, v8.8h, v10.8h // ...........|.......*................................................................................. - // mul v10.8h, v24.8h, v0.h[2] // ...........|...............*......................................................................... - // sqrdmulh v24.8h, v24.8h, v0.h[3] // ...........|................*........................................................................ - // mls v10.8h, v24.8h, v7.h[0] // ...........|..............................*.......................................................... - // sub v24.8h, v9.8h, v11.8h // ...........|.......................*................................................................. - // add v9.8h, v9.8h, v11.8h // ...........|.........................*............................................................... - // mul v11.8h, v24.8h, v0.h[2] // ...........|...............................*......................................................... - // sqrdmulh v24.8h, v24.8h, v0.h[3] // ...........|................................*........................................................ - // mls v11.8h, v24.8h, v7.h[0] // ...........|....................................*.................................................... - // sub v24.8h, v12.8h, v14.8h // ...........|...........................*............................................................. - // add v12.8h, v12.8h, v14.8h // ...........|............................*............................................................ - // mul v14.8h, v24.8h, v0.h[4] // ...........|..................................*...................................................... - // sqrdmulh v24.8h, v24.8h, v0.h[5] // ...........|...................................*..................................................... 
- // mls v14.8h, v24.8h, v7.h[0] // ...........|.......................................*................................................. - // sub v24.8h, v13.8h, v15.8h // ...........|.....................................*................................................... - // add v13.8h, v13.8h, v15.8h // ...........|......................................*.................................................. - // mul v15.8h, v24.8h, v0.h[4] // ...........|........................................*................................................ - // sqrdmulh v24.8h, v24.8h, v0.h[5] // ...........|.........................................*............................................... - // mls v15.8h, v24.8h, v7.h[0] // ...........|.............................................*........................................... - // sqdmulh v25.8h, v8.8h, v7.h[1] // ...........|.................*....................................................................... - // srshr v25.8h, v25.8h, #11 // ...........|.....................*................................................................... - // mls v8.8h, v25.8h, v7.h[0] // ...........|........................*................................................................ - // sqdmulh v25.8h, v12.8h, v7.h[1] // ...........|..........................................*.............................................. - // srshr v25.8h, v25.8h, #11 // ...........|..............................................*.......................................... - // mls v12.8h, v25.8h, v7.h[0] // ...........|.................................................*....................................... - // sub v24.8h, v8.8h, v12.8h // ...........|.....................................................*................................... - // add v8.8h, v8.8h, v12.8h // ...........|......................................................*.................................. 
- // mul v12.8h, v24.8h, v0.h[0] // ...........|........................................................*................................ - // sqrdmulh v24.8h, v24.8h, v0.h[1] // ...........|.........................................................*............................... - // mls v12.8h, v24.8h, v7.h[0] // ...........|.............................................................*........................... - // sub v24.8h, v9.8h, v13.8h // ...........|...........................................*............................................. - // add v9.8h, v9.8h, v13.8h // ...........|............................................*............................................ - // mul v13.8h, v24.8h, v0.h[0] // ...........|...............................................*......................................... - // sqrdmulh v24.8h, v24.8h, v0.h[1] // ...........|................................................*........................................ - // mls v13.8h, v24.8h, v7.h[0] // ...........|....................................................*.................................... - // sub v24.8h, v10.8h, v14.8h // ...........|..................................................*...................................... - // add v10.8h, v10.8h, v14.8h // ...........|...................................................*..................................... - // mul v14.8h, v24.8h, v0.h[0] // ...........|.......................................................*................................. - // sqrdmulh v24.8h, v24.8h, v0.h[1] // ...........|..........................................................*.............................. - // mls v14.8h, v24.8h, v7.h[0] // ...........|..............................................................*.......................... - // sub v24.8h, v11.8h, v15.8h // ...........|...........................................................*............................. 
- // add v11.8h, v11.8h, v15.8h // ...........|............................................................*............................ - // mul v15.8h, v24.8h, v0.h[0] // ...........|...............................................................*......................... - // sqrdmulh v24.8h, v24.8h, v0.h[1] // ...........|................................................................*........................ - // mls v15.8h, v24.8h, v7.h[0] // ...........|....................................................................*.................... - // str q12, [x0, #(4*(512/8))] // ...........|.................................................................*....................... - // str q13, [x0, #(5*(512/8))] // ...........|...................................................................*..................... - // str q14, [x0, #(6*(512/8))] // ...........|.....................................................................*................... - // str q15, [x0, #(7*(512/8))] // ...........|........................................................................*................ - // mul v12.8h, v8.8h, v29.8h // ...........|..................................................................*...................... - // sqrdmulh v8.8h, v8.8h, v30.8h // ...........|......................................................................*.................. - // mls v12.8h, v8.8h, v7.h[0] // ...........|..........................................................................*.............. - // mul v13.8h, v9.8h, v29.8h // ...........|.......................................................................*................. - // sqrdmulh v9.8h, v9.8h, v30.8h // ...........|.........................................................................*............... - // mls v13.8h, v9.8h, v7.h[0] // ...........|.............................................................................*........... 
- // mul v14.8h, v10.8h, v29.8h // ...........|...........................................................................*............. - // sqrdmulh v10.8h, v10.8h, v30.8h // ...........|............................................................................*............ - // mls v14.8h, v10.8h, v7.h[0] // ...........|................................................................................*........ - // mul v15.8h, v11.8h, v29.8h // ...........|..............................................................................*.......... - // sqrdmulh v11.8h, v11.8h, v30.8h // ...........|...............................................................................*......... - // mls v15.8h, v11.8h, v7.h[0] // .*.........|...................................................................................*..... - // str q12, [x0], #(16) // ...........|.................................................................................*....... - // str q13, [x0, #(-16 + 1*(512/8))] // ..*........|....................................................................................*.... - // str q14, [x0, #(-16 + 2*(512/8))] // ....*......|......................................................................................*.. - // str q15, [x0, #(-16 + 3*(512/8))] // ......*....|........................................................................................* + // ldr q8, [x0, #0] // ...........*................................................................................... + // ldr q9, [x0, #(1*(512/8))] // e..........|............................................................................e...... + // ldr q10, [x0, #(2*(512/8))] // ...e.......|...............................................................................e... + // ldr q11, [x0, #(3*(512/8))] // .....e.....|.................................................................................e. 
+ // ldr q12, [x0, #(4*(512/8))] // .......e...|................................................................................... + // ldr q13, [x0, #(5*(512/8))] // ........e..|................................................................................... + // ldr q14, [x0, #(6*(512/8))] // .........e.|................................................................................... + // ldr q15, [x0, #(7*(512/8))] // ..........e|................................................................................... + // sub v24.8h, v8.8h, v9.8h // ...........|..*................................................................................ + // add v8.8h, v8.8h, v9.8h // ...........|...*............................................................................... + // mul v9.8h, v24.8h, v0.h[6] // ...........|........*.......................................................................... + // sqrdmulh v24.8h, v24.8h, v0.h[7] // ...........|.........*......................................................................... + // mls v9.8h, v24.8h, v7.h[0] // ...........|.............*..................................................................... + // sub v24.8h, v10.8h, v11.8h // ...........|*.................................................................................. + // add v10.8h, v10.8h, v11.8h // ...........|.*................................................................................. + // mul v11.8h, v24.8h, v1.h[0] // ...........|....*.............................................................................. + // sqrdmulh v24.8h, v24.8h, v1.h[1] // ...........|.....*............................................................................. + // mls v11.8h, v24.8h, v7.h[0] // ...........|..........*........................................................................ + // sub v24.8h, v12.8h, v13.8h // ...........|...........*....................................................................... 
+ // add v12.8h, v12.8h, v13.8h // ...........|............*...................................................................... + // mul v13.8h, v24.8h, v1.h[2] // ...........|..............*.................................................................... + // sqrdmulh v24.8h, v24.8h, v1.h[3] // ...........|...................*............................................................... + // mls v13.8h, v24.8h, v7.h[0] // ...........|.......................*........................................................... + // sub v24.8h, v14.8h, v15.8h // ...........|....................*.............................................................. + // add v14.8h, v14.8h, v15.8h // ...........|...............*................................................................... + // mul v15.8h, v24.8h, v1.h[4] // ...........|..........................*........................................................ + // sqrdmulh v24.8h, v24.8h, v1.h[5] // ...........|...........................*....................................................... + // mls v15.8h, v24.8h, v7.h[0] // ...........|...............................*................................................... + // sub v24.8h, v8.8h, v10.8h // ...........|......*............................................................................ + // add v8.8h, v8.8h, v10.8h // ...........|.......*........................................................................... + // mul v10.8h, v24.8h, v0.h[2] // ...........|................*.................................................................. + // sqrdmulh v24.8h, v24.8h, v0.h[3] // ...........|.................*................................................................. + // mls v10.8h, v24.8h, v7.h[0] // ...........|.............................*..................................................... + // sub v24.8h, v9.8h, v11.8h // ...........|........................*.......................................................... 
+ // add v9.8h, v9.8h, v11.8h // ...........|.........................*......................................................... + // mul v11.8h, v24.8h, v0.h[2] // ...........|..............................*.................................................... + // sqrdmulh v24.8h, v24.8h, v0.h[3] // ...........|................................*.................................................. + // mls v11.8h, v24.8h, v7.h[0] // ...........|....................................*.............................................. + // sub v24.8h, v12.8h, v14.8h // ...........|............................*...................................................... + // add v12.8h, v12.8h, v14.8h // ...........|..................*................................................................ + // mul v14.8h, v24.8h, v0.h[4] // ...........|.................................*................................................. + // sqrdmulh v24.8h, v24.8h, v0.h[5] // ...........|..................................*................................................ + // mls v14.8h, v24.8h, v7.h[0] // ...........|........................................*.......................................... + // sub v24.8h, v13.8h, v15.8h // ...........|...................................*............................................... + // add v13.8h, v13.8h, v15.8h // ...........|.....................................*............................................. + // mul v15.8h, v24.8h, v0.h[4] // ...........|......................................*............................................ + // sqrdmulh v24.8h, v24.8h, v0.h[5] // ...........|.......................................*........................................... + // mls v15.8h, v24.8h, v7.h[0] // ...........|.............................................*..................................... + // sub v24.8h, v8.8h, v12.8h // ...........|......................*............................................................ 
+ // add v8.8h, v8.8h, v12.8h // ...........|.....................*............................................................. + // mul v12.8h, v24.8h, v0.h[0] // ...........|.........................................*......................................... + // sqrdmulh v24.8h, v24.8h, v0.h[1] // ...........|..........................................*........................................ + // mls v12.8h, v24.8h, v7.h[0] // ...........|.................................................*................................. + // sub v24.8h, v9.8h, v13.8h // ...........|............................................*...................................... + // add v9.8h, v9.8h, v13.8h // ...........|...........................................*....................................... + // mul v13.8h, v24.8h, v0.h[0] // ...........|..................................................*................................ + // sqrdmulh v24.8h, v24.8h, v0.h[1] // ...........|...............................................*................................... + // mls v13.8h, v24.8h, v7.h[0] // ...........|......................................................*............................ + // sub v24.8h, v10.8h, v14.8h // ...........|...................................................*............................... + // add v10.8h, v10.8h, v14.8h // ...........|....................................................*.............................. + // mul v14.8h, v24.8h, v0.h[0] // ...........|.......................................................*........................... + // sqrdmulh v24.8h, v24.8h, v0.h[1] // ...........|........................................................*.......................... + // mls v14.8h, v24.8h, v7.h[0] // ...........|..............................................................*.................... + // sub v24.8h, v11.8h, v15.8h // ...........|.....................................................*............................. 
+ // add v11.8h, v11.8h, v15.8h // ...........|.........................................................*......................... + // mul v15.8h, v24.8h, v0.h[0] // ...........|...........................................................*....................... + // sqrdmulh v24.8h, v24.8h, v0.h[1] // ...........|............................................................*...................... + // mls v15.8h, v24.8h, v7.h[0] // ...........|................................................................*.................. + // str q12, [x0, #(4*(512/8))] // ...........|.............................................................*..................... + // str q13, [x0, #(5*(512/8))] // ...........|..........................................................*........................ + // str q14, [x0, #(6*(512/8))] // ...........|..................................................................*................ + // str q15, [x0, #(7*(512/8))] // ...........|....................................................................*.............. + // mul v12.8h, v8.8h, v29.8h // ...........|..............................................*.................................... + // sqrdmulh v8.8h, v8.8h, v30.8h // ...........|................................................*.................................. + // mls v12.8h, v8.8h, v7.h[0] // ...........|...............................................................*................... + // mul v13.8h, v9.8h, v29.8h // ...........|.................................................................*................. + // sqrdmulh v9.8h, v9.8h, v30.8h // ...........|...................................................................*............... + // mls v13.8h, v9.8h, v7.h[0] // ...........|........................................................................*.......... + // mul v14.8h, v10.8h, v29.8h // ...........|.....................................................................*............. 
+ // sqrdmulh v10.8h, v10.8h, v30.8h // ...........|.......................................................................*........... + // mls v14.8h, v10.8h, v7.h[0] // ...........|...........................................................................*....... + // mul v15.8h, v11.8h, v29.8h // ...........|.........................................................................*......... + // sqrdmulh v11.8h, v11.8h, v30.8h // ...........|..........................................................................*........ + // mls v15.8h, v11.8h, v7.h[0] // .*.........|.............................................................................*..... + // str q12, [x0], #(16) // ...........|......................................................................*............ + // str q13, [x0, #(-16 + 1*(512/8))] // ..*........|..............................................................................*.... + // str q14, [x0, #(-16 + 2*(512/8))] // ....*......|................................................................................*.. + // str q15, [x0, #(-16 + 3*(512/8))] // ......*....|..................................................................................* sub count, count, #1 cbnz count, layer123_start - ldr q16, [x0, #0] // *...................................................................................... - // gap // ....................................................................................... - // gap // ....................................................................................... - // gap // ....................................................................................... - add v5.8H, v9.8H, v21.8H // .....................*................................................................. - // gap // ....................................................................................... 
- add v31.8H, v12.8H, v18.8H // ..*.................................................................................... - // gap // ....................................................................................... - add v27.8H, v16.8H, v17.8H // ....*.................................................................................. - // gap // ....................................................................................... - sub v13.8H, v16.8H, v17.8H // ...*................................................................................... - // gap // ....................................................................................... - add v6.8H, v28.8H, v4.8H // .............*......................................................................... - // gap // ....................................................................................... - sub v16.8H, v27.8H, v31.8H // .......*............................................................................... - // gap // ....................................................................................... - sub v15.8H, v12.8H, v18.8H // .*..................................................................................... - // gap // ....................................................................................... - sub v17.8H, v6.8H, v5.8H // ............................*.......................................................... - // gap // ....................................................................................... - mul v8.8H, v16.8H, v0.H[2] // ................*...................................................................... - // gap // ....................................................................................... - sqrdmulh v23.8H, v16.8H, v0.H[3] // .................*..................................................................... - // gap // ....................................................................................... 
- mul v16.8H, v17.8H, v0.H[4] // ...................................*................................................... - // gap // ....................................................................................... - sqrdmulh v3.8H, v17.8H, v0.H[5] // ....................................*.................................................. - // gap // ....................................................................................... - mul v17.8H, v15.8H, v1.H[0] // .....*................................................................................. - // gap // ....................................................................................... - sqrdmulh v14.8H, v15.8H, v1.H[1] // ......*................................................................................ - // gap // ....................................................................................... - mls v8.8H, v23.8H, v7.H[0] // ...............................*....................................................... - // gap // ....................................................................................... - mls v16.8H, v3.8H, v7.H[0] // ........................................*.............................................. - // gap // ....................................................................................... - sub v26.8H, v28.8H, v4.8H // ............*.......................................................................... - // gap // ....................................................................................... - mls v17.8H, v14.8H, v7.H[0] // ...........*........................................................................... - // gap // ....................................................................................... - mul v28.8H, v13.8H, v0.H[6] // .........*............................................................................. - // gap // ....................................................................................... 
- sub v23.8H, v8.8H, v16.8H // ...................................................*................................... - // gap // ....................................................................................... - mul v4.8H, v26.8H, v1.H[2] // ...............*....................................................................... - // gap // ....................................................................................... - sqrdmulh v12.8H, v26.8H, v1.H[3] // ...................*................................................................... - // gap // ....................................................................................... - mul v18.8H, v23.8H, v0.H[0] // ........................................................*.............................. - // gap // ....................................................................................... - sqrdmulh v23.8H, v23.8H, v0.H[1] // ...........................................................*........................... - // gap // ....................................................................................... - sqrdmulh v26.8H, v13.8H, v0.H[7] // ..........*............................................................................ - // gap // ....................................................................................... - mls v4.8H, v12.8H, v7.H[0] // .......................*............................................................... - // gap // ....................................................................................... - sub v19.8H, v9.8H, v21.8H // ....................*.................................................................. - // gap // ....................................................................................... - mls v18.8H, v23.8H, v7.H[0] // ...............................................................*....................... - // gap // ....................................................................................... 
- mls v28.8H, v26.8H, v7.H[0] // ..............*........................................................................ - // gap // ....................................................................................... - sqrdmulh v12.8H, v19.8H, v1.H[5] // ..............................*........................................................ - // gap // ....................................................................................... - mul v19.8H, v19.8H, v1.H[4] // ...........................*........................................................... - // gap // ....................................................................................... - str q18, [x0, #384] // ......................................................................*................ - // gap // ....................................................................................... - sub v26.8H, v28.8H, v17.8H // ........................*.............................................................. - // gap // ....................................................................................... - add v5.8H, v6.8H, v5.8H // .............................*......................................................... - // gap // ....................................................................................... - mls v19.8H, v12.8H, v7.H[0] // ..................................*.................................................... - // gap // ....................................................................................... - sqrdmulh v6.8H, v26.8H, v0.H[3] // .................................*..................................................... - // gap // ....................................................................................... - sqdmulh v21.8H, v5.8H, v7.H[1] // ...........................................*........................................... - // gap // ....................................................................................... 
- add v2.8H, v27.8H, v31.8H // ........*.............................................................................. - // gap // ....................................................................................... - sub v23.8H, v4.8H, v19.8H // ......................................*................................................ - // gap // ....................................................................................... - mul v3.8H, v26.8H, v0.H[2] // ................................*...................................................... - // gap // ....................................................................................... - sqdmulh v26.8H, v2.8H, v7.H[1] // ..................*.................................................................... - // gap // ....................................................................................... - sqrdmulh v22.8H, v23.8H, v0.H[5] // ..........................................*............................................ - // gap // ....................................................................................... - mul v14.8H, v23.8H, v0.H[4] // .........................................*............................................. - // gap // ....................................................................................... - srshr v18.8H, v21.8H, #11 // ...............................................*....................................... - // gap // ....................................................................................... - srshr v12.8H, v26.8H, #11 // ......................*................................................................ - // gap // ....................................................................................... - mls v3.8H, v6.8H, v7.H[0] // .....................................*................................................. - // gap // ....................................................................................... 
- mls v14.8H, v22.8H, v7.H[0] // ..............................................*........................................ - // gap // ....................................................................................... - mls v5.8H, v18.8H, v7.H[0] // ..................................................*.................................... - // gap // ....................................................................................... - mls v2.8H, v12.8H, v7.H[0] // .........................*............................................................. - // gap // ....................................................................................... - add v6.8H, v28.8H, v17.8H // ..........................*............................................................ - // gap // ....................................................................................... - sub v28.8H, v3.8H, v14.8H // ............................................................*.......................... - // gap // ....................................................................................... - add v9.8H, v4.8H, v19.8H // .......................................*............................................... - // gap // ....................................................................................... - add v12.8H, v2.8H, v5.8H // .......................................................*............................... - // gap // ....................................................................................... - sqrdmulh v23.8H, v28.8H, v0.H[1] // .................................................................*..................... - // gap // ....................................................................................... - mul v28.8H, v28.8H, v0.H[0] // ................................................................*...................... - // gap // ....................................................................................... 
- sqrdmulh v17.8H, v12.8H, v30.8H // .......................................................................*............... - // gap // ....................................................................................... - sub v5.8H, v2.8H, v5.8H // ......................................................*................................ - // gap // ....................................................................................... - mul v18.8H, v12.8H, v29.8H // ...................................................................*................... - // gap // ....................................................................................... - mls v28.8H, v23.8H, v7.H[0] // .....................................................................*................. - // gap // ....................................................................................... - sqrdmulh v4.8H, v5.8H, v0.H[1] // ..........................................................*............................ - // gap // ....................................................................................... - mul v19.8H, v5.8H, v0.H[0] // .........................................................*............................. - // gap // ....................................................................................... - add v26.8H, v6.8H, v9.8H // .............................................*......................................... - // gap // ....................................................................................... - str q28, [x0, #448] // .........................................................................*............. - // gap // ....................................................................................... - add v28.8H, v8.8H, v16.8H // ....................................................*.................................. - // gap // ....................................................................................... 
- sqrdmulh v8.8H, v26.8H, v30.8H // ..........................................................................*............ - // gap // ....................................................................................... - mul v12.8H, v26.8H, v29.8H // ........................................................................*.............. - // gap // ....................................................................................... - mul v26.8H, v28.8H, v29.8H // ............................................................................*.......... - // gap // ....................................................................................... - sqrdmulh v23.8H, v28.8H, v30.8H // .............................................................................*......... - // gap // ....................................................................................... - sub v11.8H, v6.8H, v9.8H // ............................................*.......................................... - // gap // ....................................................................................... - mls v12.8H, v8.8H, v7.H[0] // ..............................................................................*........ - // gap // ....................................................................................... - add v3.8H, v3.8H, v14.8H // .............................................................*......................... - // gap // ....................................................................................... - mls v26.8H, v23.8H, v7.H[0] // .................................................................................*..... - // gap // ....................................................................................... - mul v28.8H, v11.8H, v0.H[0] // ................................................*...................................... - // gap // ....................................................................................... 
- str q12, [x0, #64] // ....................................................................................*.. - // gap // ....................................................................................... - sqrdmulh v23.8H, v11.8H, v0.H[1] // .................................................*..................................... - // gap // ....................................................................................... - str q26, [x0, #128] // .....................................................................................*. - // gap // ....................................................................................... - mls v19.8H, v4.8H, v7.H[0] // ..............................................................*........................ - // gap // ....................................................................................... - mul v12.8H, v3.8H, v29.8H // ...............................................................................*....... - // gap // ....................................................................................... - mls v28.8H, v23.8H, v7.H[0] // .....................................................*................................. - // gap // ....................................................................................... - sqrdmulh v8.8H, v3.8H, v30.8H // ................................................................................*...... - // gap // ....................................................................................... - str q19, [x0, #256] // ..................................................................*.................... - // gap // ....................................................................................... - mls v18.8H, v17.8H, v7.H[0] // ...........................................................................*........... - // gap // ....................................................................................... 
- str q28, [x0, #320] // ....................................................................*.................. - // gap // ....................................................................................... - mls v12.8H, v8.8H, v7.H[0] // ...................................................................................*... - // gap // ....................................................................................... - // gap // ....................................................................................... - // gap // ....................................................................................... - str q18, [x0], #(16) // ..................................................................................*.... - // gap // ....................................................................................... - // gap // ....................................................................................... - // gap // ....................................................................................... - str q12, [x0, #176] // ......................................................................................* - // gap // ....................................................................................... + ldr q4, [x0, #0] // *................................................................................ + // gap // ................................................................................. + // gap // ................................................................................. + // gap // ................................................................................. + sub v9.8H, v22.8H, v28.8H // .*............................................................................... + // gap // ................................................................................. + sub v5.8H, v3.8H, v26.8H // ............*.................................................................... 
+ // gap // ................................................................................. + sub v6.8H, v4.8H, v24.8H // ...*............................................................................. + // gap // ................................................................................. + sqrdmulh v15.8H, v9.8H, v1.H[1] // ......*.......................................................................... + // gap // ................................................................................. + sub v20.8H, v11.8H, v14.8H // .....................*........................................................... + // gap // ................................................................................. + mul v19.8H, v6.8H, v0.H[6] // .........*....................................................................... + // gap // ................................................................................. + mul v23.8H, v5.8H, v1.H[2] // ...............*................................................................. + // gap // ................................................................................. + mul v27.8H, v20.8H, v1.H[4] // ...........................*..................................................... + // gap // ................................................................................. + sqrdmulh v21.8H, v20.8H, v1.H[5] // ............................*.................................................... + // gap // ................................................................................. + sqrdmulh v10.8H, v5.8H, v1.H[3] // ....................*............................................................ + // gap // ................................................................................. + mul v8.8H, v9.8H, v1.H[0] // .....*........................................................................... + // gap // ................................................................................. 
+ sqrdmulh v9.8H, v6.8H, v0.H[7] // ..........*...................................................................... + // gap // ................................................................................. + mls v27.8H, v21.8H, v7.H[0] // ................................*................................................ + // gap // ................................................................................. + mls v23.8H, v10.8H, v7.H[0] // ........................*........................................................ + // gap // ................................................................................. + mls v8.8H, v15.8H, v7.H[0] // ...........*..................................................................... + // gap // ................................................................................. + mls v19.8H, v9.8H, v7.H[0] // ..............*.................................................................. + // gap // ................................................................................. + add v16.8H, v11.8H, v14.8H // ................*................................................................ + // gap // ................................................................................. + sub v20.8H, v23.8H, v27.8H // ....................................*............................................ + // gap // ................................................................................. + add v14.8H, v23.8H, v27.8H // ......................................*.......................................... + // gap // ................................................................................. + add v5.8H, v19.8H, v8.8H // ..........................*...................................................... + // gap // ................................................................................. + sub v15.8H, v19.8H, v8.8H // .........................*....................................................... 
+ // gap // ................................................................................. + add v10.8H, v3.8H, v26.8H // .............*................................................................... + // gap // ................................................................................. + sub v27.8H, v5.8H, v14.8H // .............................................*................................... + // gap // ................................................................................. + add v9.8H, v22.8H, v28.8H // ..*.............................................................................. + // gap // ................................................................................. + add v4.8H, v4.8H, v24.8H // ....*............................................................................ + // gap // ................................................................................. + sqrdmulh v19.8H, v27.8H, v0.H[1] // ................................................*................................ + // gap // ................................................................................. + mul v23.8H, v27.8H, v0.H[0] // ...................................................*............................. + // gap // ................................................................................. + sqrdmulh v18.8H, v15.8H, v0.H[3] // .................................*............................................... + // gap // ................................................................................. + add v3.8H, v10.8H, v16.8H // ...................*............................................................. + // gap // ................................................................................. + add v24.8H, v4.8H, v9.8H // ........*........................................................................ + // gap // ................................................................................. 
+ mls v23.8H, v19.8H, v7.H[0] // .......................................................*......................... + // gap // ................................................................................. + sqrdmulh v26.8H, v20.8H, v0.H[5] // ........................................*........................................ + // gap // ................................................................................. + sub v22.8H, v24.8H, v3.8H // .......................*......................................................... + // gap // ................................................................................. + mul v11.8H, v20.8H, v0.H[4] // .......................................*......................................... + // gap // ................................................................................. + str q23, [x0, #320] // ...........................................................*..................... + // gap // ................................................................................. + sqrdmulh v19.8H, v22.8H, v0.H[1] // ...........................................*..................................... + // gap // ................................................................................. + mul v25.8H, v22.8H, v0.H[0] // ..........................................*...................................... + // gap // ................................................................................. + mls v11.8H, v26.8H, v7.H[0] // ..............................................*.................................. + // gap // ................................................................................. + mul v20.8H, v15.8H, v0.H[2] // ...............................*................................................. + // gap // ................................................................................. + sub v12.8H, v10.8H, v16.8H // .............................*................................................... 
+ // gap // ................................................................................. + mls v25.8H, v19.8H, v7.H[0] // ..................................................*.............................. + // gap // ................................................................................. + add v23.8H, v5.8H, v14.8H // ............................................*.................................... + // gap // ................................................................................. + mls v20.8H, v18.8H, v7.H[0] // .....................................*........................................... + // gap // ................................................................................. + mul v5.8H, v12.8H, v0.H[4] // ..................................*.............................................. + // gap // ................................................................................. + str q25, [x0, #256] // ..............................................................*.................. + // gap // ................................................................................. + mul v28.8H, v23.8H, v29.8H // ..................................................................*.............. + // gap // ................................................................................. + add v27.8H, v20.8H, v11.8H // ..........................................................*...................... + // gap // ................................................................................. + sqrdmulh v19.8H, v23.8H, v30.8H // ....................................................................*............ + // gap // ................................................................................. + sqrdmulh v22.8H, v12.8H, v0.H[5] // ...................................*............................................. + // gap // ................................................................................. 
+ mul v23.8H, v27.8H, v29.8H // ..........................................................................*...... + // gap // ................................................................................. + sqrdmulh v14.8H, v27.8H, v30.8H // ...........................................................................*..... + // gap // ................................................................................. + mls v28.8H, v19.8H, v7.H[0] // .........................................................................*....... + // gap // ................................................................................. + mls v5.8H, v22.8H, v7.H[0] // .........................................*....................................... + // gap // ................................................................................. + sub v21.8H, v4.8H, v9.8H // .......*......................................................................... + // gap // ................................................................................. + mls v23.8H, v14.8H, v7.H[0] // .............................................................................*... + // gap // ................................................................................. + str q28, [x0, #64] // ..............................................................................*.. + // gap // ................................................................................. + mul v8.8H, v21.8H, v0.H[2] // .................*............................................................... + // gap // ................................................................................. + sqrdmulh v19.8H, v21.8H, v0.H[3] // ..................*.............................................................. + // gap // ................................................................................. 
+ str q23, [x0, #192] // ................................................................................* + // gap // ................................................................................. + sub v27.8H, v20.8H, v11.8H // ......................................................*.......................... + // gap // ................................................................................. + add v25.8H, v24.8H, v3.8H // ......................*.......................................................... + // gap // ................................................................................. + mls v8.8H, v19.8H, v7.H[0] // ..............................*.................................................. + // gap // ................................................................................. + mul v3.8H, v27.8H, v0.H[0] // ............................................................*.................... + // gap // ................................................................................. + sqrdmulh v19.8H, v27.8H, v0.H[1] // .............................................................*................... + // gap // ................................................................................. + sqrdmulh v27.8H, v25.8H, v30.8H // .................................................*............................... + // gap // ................................................................................. + add v23.8H, v8.8H, v5.8H // .....................................................*........................... + // gap // ................................................................................. + sub v26.8H, v8.8H, v5.8H // ....................................................*............................ + // gap // ................................................................................. + mls v3.8H, v19.8H, v7.H[0] // .................................................................*............... 
+ // gap // ................................................................................. + sqrdmulh v19.8H, v23.8H, v30.8H // ........................................................................*........ + // gap // ................................................................................. + mul v22.8H, v23.8H, v29.8H // ......................................................................*.......... + // gap // ................................................................................. + sqrdmulh v28.8H, v26.8H, v0.H[1] // .........................................................*....................... + // gap // ................................................................................. + mul v24.8H, v26.8H, v0.H[0] // ........................................................*........................ + // gap // ................................................................................. + mul v23.8H, v25.8H, v29.8H // ...............................................*................................. + // gap // ................................................................................. + mls v22.8H, v19.8H, v7.H[0] // ............................................................................*.... + // gap // ................................................................................. + str q3, [x0, #448] // .....................................................................*........... + // gap // ................................................................................. + mls v24.8H, v28.8H, v7.H[0] // ...............................................................*................. + // gap // ................................................................................. + mls v23.8H, v27.8H, v7.H[0] // ................................................................*................ + // gap // ................................................................................. 
+ str q22, [x0, #128] // ...............................................................................*. + // gap // ................................................................................. + // gap // ................................................................................. + // gap // ................................................................................. + str q24, [x0, #384] // ...................................................................*............. + // gap // ................................................................................. + // gap // ................................................................................. + // gap // ................................................................................. + str q23, [x0], #(16) // .......................................................................*......... + // gap // ................................................................................. // original source code - // ldr q8, [x0, #0] // *...................................................................................... - // sub v23.8H, v12.8H, v18.8H // .......*............................................................................... - // add v12.8H, v12.8H, v18.8H // ..*.................................................................................... - // sub v18.8H, v8.8H, v17.8H // ....*.................................................................................. - // add v8.8H, v8.8H, v17.8H // ...*................................................................................... - // mul v26.8H, v23.8H, v1.H[0] // .............*......................................................................... - // sqrdmulh v23.8H, v23.8H, v1.H[1] // ..............*........................................................................ - // sub v17.8H, v8.8H, v12.8H // ......*................................................................................ 
- // add v8.8H, v8.8H, v12.8H // ......................................*................................................ - // mul v12.8H, v18.8H, v0.H[6] // ...................*................................................................... - // sqrdmulh v18.8H, v18.8H, v0.H[7] // .........................*............................................................. - // mls v26.8H, v23.8H, v7.H[0] // ..................*.................................................................... - // sub v23.8H, v28.8H, v4.8H // .................*..................................................................... - // add v28.8H, v28.8H, v4.8H // .....*................................................................................. - // mls v12.8H, v18.8H, v7.H[0] // .............................*......................................................... - // mul v18.8H, v23.8H, v1.H[2] // .....................*................................................................. - // mul v4.8H, v17.8H, v0.H[2] // .........*............................................................................. - // sqrdmulh v17.8H, v17.8H, v0.H[3] // ..........*............................................................................ - // sqdmulh v19.8H, v8.8H, v7.H[1] // .........................................*............................................. - // sqrdmulh v23.8H, v23.8H, v1.H[3] // ......................*................................................................ - // sub v5.8H, v9.8H, v21.8H // ...........................*........................................................... - // add v9.8H, v9.8H, v21.8H // .*..................................................................................... - // srshr v19.8H, v19.8H, #11 // .............................................*......................................... - // mls v18.8H, v23.8H, v7.H[0] // ..........................*............................................................ 
- // sub v23.8H, v12.8H, v26.8H // .................................*..................................................... - // mls v8.8H, v19.8H, v7.H[0] // .................................................*..................................... - // add v12.8H, v12.8H, v26.8H // ..................................................*.................................... - // mul v26.8H, v5.8H, v1.H[4] // ...............................*....................................................... - // sub v19.8H, v28.8H, v9.8H // ........*.............................................................................. - // add v28.8H, v28.8H, v9.8H // ..................................*.................................................... - // sqrdmulh v5.8H, v5.8H, v1.H[5] // ..............................*........................................................ - // mls v4.8H, v17.8H, v7.H[0] // ...............*....................................................................... - // mul v17.8H, v23.8H, v0.H[2] // ........................................*.............................................. - // sqrdmulh v23.8H, v23.8H, v0.H[3] // ....................................*.................................................. - // mls v26.8H, v5.8H, v7.H[0] // ...................................*................................................... - // mul v5.8H, v19.8H, v0.H[4] // ...........*........................................................................... - // sqrdmulh v19.8H, v19.8H, v0.H[5] // ............*.......................................................................... - // mls v17.8H, v23.8H, v7.H[0] // ..............................................*........................................ - // sub v23.8H, v18.8H, v26.8H // .......................................*............................................... - // add v18.8H, v18.8H, v26.8H // ....................................................*.................................. 
- // mls v5.8H, v19.8H, v7.H[0] // ................*...................................................................... - // mul v26.8H, v23.8H, v0.H[4] // ...........................................*........................................... - // sqrdmulh v23.8H, v23.8H, v0.H[5] // ..........................................*............................................ - // sqdmulh v19.8H, v28.8H, v7.H[1] // .....................................*................................................. - // sub v9.8H, v12.8H, v18.8H // .....................................................................*................. - // add v12.8H, v12.8H, v18.8H // ..............................................................*........................ - // mls v26.8H, v23.8H, v7.H[0] // ...............................................*....................................... - // srshr v23.8H, v19.8H, #11 // ............................................*.......................................... - // mul v18.8H, v9.8H, v0.H[0] // .........................................................................*............. - // sqrdmulh v19.8H, v9.8H, v0.H[1] // ...........................................................................*........... - // mls v28.8H, v23.8H, v7.H[0] // ................................................*...................................... - // sub v23.8H, v4.8H, v5.8H // ....................*.................................................................. - // add v4.8H, v4.8H, v5.8H // ................................................................*...................... - // mls v18.8H, v19.8H, v7.H[0] // ...............................................................................*....... - // sub v19.8H, v8.8H, v28.8H // .........................................................*............................. - // add v8.8H, v8.8H, v28.8H // .....................................................*................................. 
- // mul v28.8H, v23.8H, v0.H[0] // .......................*............................................................... - // mul v5.8H, v19.8H, v0.H[0] // .............................................................*......................... - // sqrdmulh v19.8H, v19.8H, v0.H[1] // ............................................................*.......................... - // sqrdmulh v23.8H, v23.8H, v0.H[1] // ........................*.............................................................. - // sub v9.8H, v17.8H, v26.8H // ...................................................*................................... - // add v26.8H, v17.8H, v26.8H // .......................................................................*............... - // mls v5.8H, v19.8H, v7.H[0] // .............................................................................*......... - // mls v28.8H, v23.8H, v7.H[0] // ............................*.......................................................... - // mul v23.8H, v9.8H, v0.H[0] // .......................................................*............................... - // sqrdmulh v17.8H, v9.8H, v0.H[1] // ......................................................*................................ - // str q5, [x0, #256] // .................................................................................*..... - // mul v19.8H, v8.8H, v29.8H // ..........................................................*............................ - // str q18, [x0, #320] // ...................................................................................*... - // mls v23.8H, v17.8H, v7.H[0] // ...........................................................*........................... - // str q28, [x0, #384] // ................................*...................................................... - // sqrdmulh v8.8H, v8.8H, v30.8H // ........................................................*.............................. 
- // mul v18.8H, v12.8H, v29.8H // ..................................................................*.................... - // str q23, [x0, #448] // ...............................................................*....................... - // sqrdmulh v23.8H, v12.8H, v30.8H // .................................................................*..................... - // mls v19.8H, v8.8H, v7.H[0] // ..................................................................................*.... - // mul v8.8H, v4.8H, v29.8H // ...................................................................*................... - // sqrdmulh v12.8H, v4.8H, v30.8H // ....................................................................*.................. - // mls v18.8H, v23.8H, v7.H[0] // ......................................................................*................ - // mul v23.8H, v26.8H, v29.8H // ..............................................................................*........ - // sqrdmulh v26.8H, v26.8H, v30.8H // ................................................................................*...... - // mls v8.8H, v12.8H, v7.H[0] // ........................................................................*.............. - // str q19, [x0], #(16) // .....................................................................................*. - // mls v23.8H, v26.8H, v7.H[0] // ....................................................................................*.. - // str q18, [x0, #48] // ..........................................................................*............ - // str q8, [x0, #112] // ............................................................................*.......... - // str q23, [x0, #176] // ......................................................................................* + // ldr q23, [x0, #0] // *................................................................................ 
+ // sub v19.8H, v22.8H, v28.8H // .*............................................................................... + // add v22.8H, v22.8H, v28.8H // ........................*........................................................ + // sub v28.8H, v23.8H, v24.8H // ...*............................................................................. + // add v23.8H, v23.8H, v24.8H // .........................*....................................................... + // mul v27.8H, v19.8H, v1.H[0] // ...........*..................................................................... + // sqrdmulh v8.8H, v19.8H, v1.H[1] // ....*............................................................................ + // sub v24.8H, v23.8H, v22.8H // ......................................................*.......................... + // add v21.8H, v23.8H, v22.8H // ..............................*.................................................. + // mul v22.8H, v28.8H, v0.H[6] // ......*.......................................................................... + // sqrdmulh v28.8H, v28.8H, v0.H[7] // ............*.................................................................... + // mls v27.8H, v8.8H, v7.H[0] // ...............*................................................................. + // sub v19.8H, v3.8H, v26.8H // ..*.............................................................................. + // add v3.8H, v3.8H, v26.8H // ......................*.......................................................... + // mls v22.8H, v28.8H, v7.H[0] // ................*................................................................ + // mul v28.8H, v19.8H, v1.H[2] // .......*......................................................................... + // add v25.8H, v11.8H, v14.8H // .................*............................................................... 
+ // mul v26.8H, v24.8H, v0.H[2] // .........................................................*....................... + // sqrdmulh v24.8H, v24.8H, v0.H[3] // ..........................................................*...................... + // add v20.8H, v3.8H, v25.8H // .............................*................................................... + // sqrdmulh v19.8H, v19.8H, v1.H[3] // ..........*...................................................................... + // sub v11.8H, v11.8H, v14.8H // .....*........................................................................... + // add v23.8H, v21.8H, v20.8H // .............................................................*................... + // sub v14.8H, v21.8H, v20.8H // .................................*............................................... + // mls v28.8H, v19.8H, v7.H[0] // ..............*.................................................................. + // sub v19.8H, v22.8H, v27.8H // .....................*........................................................... + // add v22.8H, v22.8H, v27.8H // ....................*............................................................ + // mul v27.8H, v11.8H, v1.H[4] // ........*........................................................................ + // sqrdmulh v20.8H, v11.8H, v1.H[5] // .........*....................................................................... + // sub v3.8H, v3.8H, v25.8H // ........................................*........................................ + // mls v26.8H, v24.8H, v7.H[0] // ..............................................................*.................. + // mul v24.8H, v19.8H, v0.H[2] // .......................................*......................................... + // mls v27.8H, v20.8H, v7.H[0] // .............*................................................................... 
+ // sqrdmulh v19.8H, v19.8H, v0.H[3] // ............................*.................................................... + // mul v20.8H, v3.8H, v0.H[4] // ............................................*.................................... + // sqrdmulh v3.8H, v3.8H, v0.H[5] // .................................................*............................... + // sub v11.8H, v28.8H, v27.8H // ..................*.............................................................. + // mls v24.8H, v19.8H, v7.H[0] // ...........................................*..................................... + // add v19.8H, v28.8H, v27.8H // ...................*............................................................. + // mul v28.8H, v11.8H, v0.H[4] // ..................................*.............................................. + // sqrdmulh v27.8H, v11.8H, v0.H[5] // ................................*................................................ + // mls v20.8H, v3.8H, v7.H[0] // .....................................................*........................... + // mul v3.8H, v14.8H, v0.H[0] // .....................................*........................................... + // sqrdmulh v25.8H, v14.8H, v0.H[1] // ....................................*............................................ + // add v8.8H, v22.8H, v19.8H // ..........................................*...................................... + // sub v14.8H, v22.8H, v19.8H // .......................*......................................................... + // mls v28.8H, v27.8H, v7.H[0] // ......................................*.......................................... + // mul v22.8H, v23.8H, v29.8H // .........................................................................*....... + // sqrdmulh v11.8H, v14.8H, v0.H[1] // ..........................*...................................................... 
+ // sqrdmulh v23.8H, v23.8H, v30.8H // .................................................................*............... + // mls v3.8H, v25.8H, v7.H[0] // .........................................*....................................... + // mul v27.8H, v14.8H, v0.H[0] // ...........................*..................................................... + // sub v14.8H, v26.8H, v20.8H // ...................................................................*............. + // add v26.8H, v26.8H, v20.8H // ..................................................................*.............. + // sub v20.8H, v24.8H, v28.8H // ............................................................*.................... + // mls v27.8H, v11.8H, v7.H[0] // ...............................*................................................. + // mul v11.8H, v14.8H, v0.H[0] // ........................................................................*........ + // sqrdmulh v14.8H, v14.8H, v0.H[1] // .......................................................................*......... + // add v28.8H, v24.8H, v28.8H // ...............................................*................................. + // str q27, [x0, #320] // ...................................*............................................. + // mul v24.8H, v20.8H, v0.H[0] // ...............................................................*................. + // sqrdmulh v20.8H, v20.8H, v0.H[1] // ................................................................*................ + // str q3, [x0, #256] // .............................................*................................... + // mls v11.8H, v14.8H, v7.H[0] // ............................................................................*.... + // mls v22.8H, v23.8H, v7.H[0] // .............................................................................*... + // mls v24.8H, v20.8H, v7.H[0] // ....................................................................*............ 
+ // mul v23.8H, v8.8H, v29.8H // ..............................................*.................................. + // str q11, [x0, #384] // ...............................................................................*. + // sqrdmulh v19.8H, v8.8H, v30.8H // ................................................*................................ + // str q24, [x0, #448] // ...........................................................................*..... + // mul v27.8H, v26.8H, v29.8H // ......................................................................*.......... + // str q22, [x0], #(16) // ................................................................................* + // sqrdmulh v24.8H, v26.8H, v30.8H // .....................................................................*........... + // mls v23.8H, v19.8H, v7.H[0] // ....................................................*............................ + // mul v19.8H, v28.8H, v29.8H // ..................................................*.............................. + // sqrdmulh v8.8H, v28.8H, v30.8H // ...................................................*............................. + // mls v27.8H, v24.8H, v7.H[0] // ..........................................................................*...... + // mls v19.8H, v8.8H, v7.H[0] // .......................................................*......................... + // str q23, [x0, #48] // ........................................................*........................ + // str q27, [x0, #112] // ..............................................................................*.. + // str q19, [x0, #176] // ...........................................................*..................... 
pop_stack diff --git a/tests/ntt_kyber/manual/intt_kyber_123_4567_opt_a72.s b/tests/ntt_kyber/manual/intt_kyber_123_4567_opt_a72.s index 67eb850..4e82bc7 100644 --- a/tests/ntt_kyber/manual/intt_kyber_123_4567_opt_a72.s +++ b/tests/ntt_kyber/manual/intt_kyber_123_4567_opt_a72.s @@ -354,619 +354,673 @@ _intt_kyber_123_4567_opt_a72: mov count, #8 .p2align 2 - ldr q0, [x1, #16] // *................................................. - // gap // .................................................. - ldr q29, [x1, #0] // .*................................................ - ldr q26, [x1, #32] // .....*............................................ - ldr q2, [x1, #48] // ....*............................................. - // gap // .................................................. - ldr q13, [x4], #(6*16) // ......*........................................... - ldr q16, [x4, #-64] // ......................*........................... - // gap // .................................................. - ldr q23, [x4, #-80] // ........*......................................... - // gap // .................................................. - // gap // .................................................. - trn2 v1.4S, v29.4S, v0.4S // .......*.......................................... - trn1 v0.4S, v29.4S, v0.4S // .........*........................................ - ldr q18, [x4, #-32] // ................*................................. - trn1 v8.4S, v26.4S, v2.4S // ...........*...................................... - trn2 v3.4S, v26.4S, v2.4S // ..........*....................................... - ldr q21, [x4, #-48] // ..*............................................... - ldr q24, [x4, #-16] // ...*.............................................. - // gap // .................................................. - // gap // .................................................. - ldr q4, [x3], #16 // ............................................*..... 
- // gap // .................................................. - // gap // .................................................. - trn2 v29.2D, v0.2D, v8.2D // .............*.................................... - trn2 v26.2D, v1.2D, v3.2D // ............*..................................... - // gap // .................................................. - trn1 v11.2D, v0.2D, v8.2D // ...............*.................................. - // gap // .................................................. - // gap // .................................................. - trn1 v30.2D, v1.2D, v3.2D // ..............*................................... - // gap // .................................................. - // gap // .................................................. - sub v2.8H, v29.8H, v26.8H // .................*................................ - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - sub v20.8H, v11.8H, v30.8H // ...................*.............................. - // gap // .................................................. - // gap // .................................................. - mul v0.8H, v2.8H, v18.8H // ....................*............................. - add v31.8H, v11.8H, v30.8H // .........................*........................ - // gap // .................................................. - add v18.8H, v29.8H, v26.8H // ..................*............................... - // gap // .................................................. - // gap // .................................................. - sqrdmulh v5.8H, v20.8H, v21.8H // .....................*............................ - // gap // .................................................. 
- // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - add v8.8H, v31.8H, v18.8H // ............................*..................... - sqrdmulh v30.8H, v2.8H, v24.8H // .......................*.......................... - // gap // .................................................. - sub v15.8H, v31.8H, v18.8H // ...........................*...................... - // gap // .................................................. - // gap // .................................................. - mul v17.8H, v20.8H, v16.8H // ........................*......................... - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - mls v17.8H, v5.8H, v7.H[0] // ..........................*....................... - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - mls v0.8H, v30.8H, v7.H[0] // .............................*.................... - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - mul v31.8H, v15.8H, v13.8H // .................................*................ 
- // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - add v24.8H, v17.8H, v0.8H // ................................*................. - sub v21.8H, v17.8H, v0.8H // ...............................*.................. - // gap // .................................................. - sqrdmulh v0.8H, v15.8H, v23.8H // ..............................*................... - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - trn2 v9.4S, v8.4S, v24.4S // .......................................*.......... - sqrdmulh v16.8H, v21.8H, v23.8H // ..................................*............... - // gap // .................................................. - trn1 v20.4S, v8.4S, v24.4S // ....................................*............. - // gap // .................................................. - // gap // .................................................. - mul v26.8H, v21.8H, v13.8H // .....................................*............ - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. 
- mls v31.8H, v0.8H, v7.H[0] // ...................................*.............. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - mls v26.8H, v16.8H, v7.H[0] // ......................................*........... - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - trn1 v23.4S, v31.4S, v26.4S // ........................................*......... - trn2 v27.4S, v31.4S, v26.4S // .........................................*........ - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. 
- trn1 v26.2D, v20.2D, v23.2D // ..........................................*....... - // gap // .................................................. - trn1 v28.2D, v9.2D, v27.2D // ...........................................*...... - trn2 v3.2D, v9.2D, v27.2D // ..............................................*... - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - sub v2.8H, v26.8H, v28.8H // .............................................*.... - add v24.8H, v26.8H, v28.8H // ...............................................*.. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - sqrdmulh v26.8H, v2.8H, v4.H[3] // ................................................*. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - mul v17.8H, v2.8H, v4.H[2] // .................................................* - // gap // .................................................. - // gap // .................................................. + // gap // ........................................................ + ldr q13, [x1, #48] // .*...................................................... 
+ ldr q14, [x1, #32] // *....................................................... + ldr q25, [x1, #0] // ...*.................................................... + ldr q1, [x1, #16] // ..*..................................................... + // gap // ........................................................ + ldr q4, [x4, #80] // .......................*................................ + ldr q6, [x4], #(6*16) // ....*................................................... + // gap // ........................................................ + ldr q11, [x4, #-32] // ...............*........................................ + ldr q10, [x4, #-64] // ............*........................................... + // gap // ........................................................ + trn2 v28.4S, v14.4S, v13.4S // ......*................................................. + trn1 v23.4S, v14.4S, v13.4S // ........*............................................... + ldr q9, [x4, #-48] // ..............*......................................... + trn2 v16.4S, v25.4S, v1.4S // .......*................................................ + trn1 v5.4S, v25.4S, v1.4S // .........*.............................................. + ldr q3, [x4, #-80] // .....*.................................................. + // gap // ........................................................ + // gap // ........................................................ + // gap // ........................................................ + // gap // ........................................................ + // gap // ........................................................ + // gap // ........................................................ + trn1 v14.2D, v5.2D, v23.2D // .............*.......................................... + trn1 v12.2D, v16.2D, v28.2D // ..........*............................................. + // gap // ........................................................ 
+ trn2 v19.2D, v16.2D, v28.2D // ...........*............................................ + // gap // ........................................................ + // gap // ........................................................ + trn2 v22.2D, v5.2D, v23.2D // ................*....................................... + // gap // ........................................................ + // gap // ........................................................ + sub v26.8H, v14.8H, v12.8H // .................*...................................... + add v13.8H, v14.8H, v12.8H // .....................*.................................. + // gap // ........................................................ + // gap // ........................................................ + // gap // ........................................................ + // gap // ........................................................ + sub v27.8H, v22.8H, v19.8H // ..................*..................................... + // gap // ........................................................ + // gap // ........................................................ + add v29.8H, v22.8H, v19.8H // ...................*.................................... + sqrdmulh v5.8H, v26.8H, v9.8H // ......................*................................. + // gap // ........................................................ + // gap // ........................................................ + // gap // ........................................................ + // gap // ........................................................ + sqrdmulh v30.8H, v27.8H, v4.8H // .........................*.............................. + // gap // ........................................................ + // gap // ........................................................ + sub v4.8H, v13.8H, v29.8H // ..........................*............................. + // gap // ........................................................ 
+ // gap // ........................................................ + mul v24.8H, v26.8H, v10.8H // ....................*................................... + // gap // ........................................................ + // gap // ........................................................ + // gap // ........................................................ + // gap // ........................................................ + // gap // ........................................................ + mls v24.8H, v5.8H, v7.H[0] // ...........................*............................ + // gap // ........................................................ + // gap // ........................................................ + // gap // ........................................................ + // gap // ........................................................ + // gap // ........................................................ + mul v10.8H, v27.8H, v11.8H // ............................*........................... + add v27.8H, v13.8H, v29.8H // ........................*............................... + // gap // ........................................................ + // gap // ........................................................ + // gap // ........................................................ + // gap // ........................................................ + mls v10.8H, v30.8H, v7.H[0] // .............................*.......................... + // gap // ........................................................ + // gap // ........................................................ + // gap // ........................................................ + // gap // ........................................................ + // gap // ........................................................ + sqrdmulh v15.8H, v4.8H, v3.8H // ..............................*......................... + // gap // ........................................................ 
+ // gap // ........................................................ + // gap // ........................................................ + // gap // ........................................................ + // gap // ........................................................ + mul v25.8H, v4.8H, v6.8H // ................................*....................... + // gap // ........................................................ + // gap // ........................................................ + sub v4.8H, v24.8H, v10.8H // ...............................*........................ + // gap // ........................................................ + // gap // ........................................................ + add v0.8H, v24.8H, v10.8H // .....................................*.................. + // gap // ........................................................ + // gap // ........................................................ + // gap // ........................................................ + // gap // ........................................................ + // gap // ........................................................ + sqrdmulh v16.8H, v4.8H, v3.8H // .................................*...................... + // gap // ........................................................ + // gap // ........................................................ + trn2 v3.4S, v27.4S, v0.4S // ......................................*................. + // gap // ........................................................ + // gap // ........................................................ + mul v12.8H, v4.8H, v6.8H // ...................................*.................... + ldr q6, [x3], #16 // ............................................*........... + // gap // ........................................................ + // gap // ........................................................ + // gap // ........................................................ 
+ // gap // ........................................................ + mls v25.8H, v15.8H, v7.H[0] // ..................................*..................... + // gap // ........................................................ + // gap // ........................................................ + // gap // ........................................................ + // gap // ........................................................ + // gap // ........................................................ + mls v12.8H, v16.8H, v7.H[0] // ....................................*................... + // gap // ........................................................ + // gap // ........................................................ + // gap // ........................................................ + // gap // ........................................................ + // gap // ........................................................ + trn1 v23.4S, v27.4S, v0.4S // .......................................*................ + // gap // ........................................................ + // gap // ........................................................ + // gap // ........................................................ + // gap // ........................................................ + // gap // ........................................................ + // gap // ........................................................ + // gap // ........................................................ + // gap // ........................................................ + trn2 v19.4S, v25.4S, v12.4S // ........................................*............... + trn1 v18.4S, v25.4S, v12.4S // .........................................*.............. + // gap // ........................................................ + // gap // ........................................................ + // gap // ........................................................ 
+ // gap // ........................................................ + // gap // ........................................................ + // gap // ........................................................ + // gap // ........................................................ + trn2 v29.2D, v3.2D, v19.2D // ..........................................*............. + trn2 v21.2D, v23.2D, v18.2D // ...........................................*............ + // gap // ........................................................ + trn1 v10.2D, v3.2D, v19.2D // .............................................*.......... + trn1 v8.2D, v23.2D, v18.2D // ..............................................*......... + // gap // ........................................................ + // gap // ........................................................ + // gap // ........................................................ + // gap // ........................................................ + add v11.8H, v21.8H, v29.8H // ...............................................*........ + sub v1.8H, v21.8H, v29.8H // ................................................*....... + // gap // ........................................................ + sub v23.8H, v8.8H, v10.8H // .................................................*...... + add v0.8H, v8.8H, v10.8H // ..................................................*..... + // gap // ........................................................ + // gap // ........................................................ + // gap // ........................................................ + // gap // ........................................................ + // gap // ........................................................ + // gap // ........................................................ + sqdmulh v22.8H, v11.8H, v7.H[1] // ...................................................*.... + // gap // ........................................................ 
+ // gap // ........................................................ + // gap // ........................................................ + mul v2.8H, v1.8H, v6.H[4] // .......................................................* + // gap // ........................................................ + // gap // ........................................................ + // gap // ........................................................ + // gap // ........................................................ + // gap // ........................................................ + sqdmulh v12.8H, v0.8H, v7.H[1] // .....................................................*.. + // gap // ........................................................ + // gap // ........................................................ + srshr v3.8H, v22.8H, #11 // ......................................................*. + // gap // ........................................................ + // gap // ........................................................ + sqrdmulh v24.8H, v23.8H, v6.H[3] // ....................................................*... + // gap // ........................................................ + // gap // ........................................................ // original source code - // ldr q12, [x1, #16] // *................................................. - // ldr q28, [x1, #0] // .*................................................ - // ldr q30, [x4, #48] // ............*..................................... - // ldr q6, [x4, #80] // .............*.................................... - // ldr q21, [x1, #48] // ...*.............................................. - // ldr q22, [x1, #32] // ..*............................................... - // ldr q9, [x4], #(6*16) // ....*............................................. - // trn2 v19.4S, v28.4S, v12.4S // .......*.......................................... - // ldr q10, [x4, #-80] // ......*........................................... 
- // trn1 v13.4S, v28.4S, v12.4S // ........*......................................... - // trn2 v23.4S, v22.4S, v21.4S // ...........*...................................... - // trn1 v20.4S, v22.4S, v21.4S // ..........*....................................... - // trn2 v21.2D, v19.2D, v23.2D // ................*................................. - // trn2 v16.2D, v13.2D, v20.2D // ...............*.................................. - // trn1 v1.2D, v19.2D, v23.2D // ..................*............................... - // trn1 v11.2D, v13.2D, v20.2D // .................*................................ - // ldr q23, [x4, #-32] // .........*........................................ - // sub v2.8H, v16.8H, v21.8H // ...................*.............................. - // add v16.8H, v16.8H, v21.8H // .......................*.......................... - // sub v21.8H, v11.8H, v1.8H // ....................*............................. - // mul v26.8H, v2.8H, v23.8H // .....................*............................ - // sqrdmulh v23.8H, v21.8H, v30.8H // ........................*......................... - // ldr q25, [x4, #-64] // .....*............................................ - // sqrdmulh v2.8H, v2.8H, v6.8H // ..........................*....................... - // mul v3.8H, v21.8H, v25.8H // ............................*..................... - // add v21.8H, v11.8H, v1.8H // ......................*........................... - // mls v3.8H, v23.8H, v7.H[0] // .............................*.................... - // sub v6.8H, v21.8H, v16.8H // ...........................*...................... - // add v1.8H, v21.8H, v16.8H // .........................*........................ - // mls v26.8H, v2.8H, v7.H[0] // ..............................*................... - // sqrdmulh v23.8H, v6.8H, v10.8H // ..................................*............... - // sub v16.8H, v3.8H, v26.8H // .................................*................ 
- // add v0.8H, v3.8H, v26.8H // ................................*................. - // mul v21.8H, v6.8H, v9.8H // ...............................*.................. - // sqrdmulh v2.8H, v16.8H, v10.8H // ....................................*............. - // mls v21.8H, v23.8H, v7.H[0] // .......................................*.......... - // trn1 v20.4S, v1.4S, v0.4S // .....................................*............ - // mul v16.8H, v16.8H, v9.8H // ......................................*........... - // mls v16.8H, v2.8H, v7.H[0] // ........................................*......... - // trn2 v2.4S, v1.4S, v0.4S // ...................................*.............. - // trn1 v23.4S, v21.4S, v16.4S // .........................................*........ - // trn2 v8.4S, v21.4S, v16.4S // ..........................................*....... - // trn1 v16.2D, v20.2D, v23.2D // ...........................................*...... - // trn1 v0.2D, v2.2D, v8.2D // ............................................*..... - // ldr q4, [x3], #16 // ..............*................................... - // sub v5.8H, v16.8H, v0.8H // ..............................................*... - // trn2 v3.2D, v2.2D, v8.2D // .............................................*.... - // add v24.8H, v16.8H, v0.8H // ...............................................*.. - // sqrdmulh v26.8H, v5.8H, v4.H[3] // ................................................*. - // mul v17.8H, v5.8H, v4.H[2] // .................................................* + // ldr q5, [x1, #32] // .*...................................................... + // ldr q22, [x1, #48] // *....................................................... + // ldr q12, [x1, #16] // ...*.................................................... + // ldr q17, [x1, #0] // ..*..................................................... + // ldr q25, [x4], #(6*16) // .....*.................................................. 
+ // ldr q16, [x4, #-80] // .............*.......................................... + // trn2 v19.4S, v5.4S, v22.4S // ........*............................................... + // trn2 v23.4S, v17.4S, v12.4S // ...........*............................................ + // trn1 v8.4S, v5.4S, v22.4S // .........*.............................................. + // trn1 v22.4S, v17.4S, v12.4S // ............*........................................... + // trn1 v1.2D, v23.2D, v19.2D // ...............*........................................ + // trn2 v15.2D, v23.2D, v19.2D // ................*....................................... + // ldr q23, [x4, #-64] // .......*................................................ + // trn1 v27.2D, v22.2D, v8.2D // ..............*......................................... + // ldr q29, [x4, #-48] // ..........*............................................. + // ldr q14, [x4, #-32] // ......*................................................. + // trn2 v20.2D, v22.2D, v8.2D // .................*...................................... + // sub v3.8H, v27.8H, v1.8H // ..................*..................................... + // sub v21.8H, v20.8H, v15.8H // ....................*................................... + // add v15.8H, v20.8H, v15.8H // .....................*.................................. + // mul v12.8H, v3.8H, v23.8H // .........................*.............................. + // add v5.8H, v27.8H, v1.8H // ...................*.................................... + // sqrdmulh v23.8H, v3.8H, v29.8H // ......................*................................. + // ldr q28, [x4, #-16] // ....*................................................... + // add v29.8H, v5.8H, v15.8H // ............................*........................... + // sqrdmulh v18.8H, v21.8H, v28.8H // .......................*................................ + // sub v9.8H, v5.8H, v15.8H // ........................*............................... 
+ // mls v12.8H, v23.8H, v7.H[0] // ..........................*............................. + // mul v27.8H, v21.8H, v14.8H // ...........................*............................ + // mls v27.8H, v18.8H, v7.H[0] // .............................*.......................... + // sqrdmulh v1.8H, v9.8H, v16.8H // ..............................*......................... + // sub v28.8H, v12.8H, v27.8H // ................................*....................... + // mul v31.8H, v9.8H, v25.8H // ...............................*........................ + // sqrdmulh v19.8H, v28.8H, v16.8H // ..................................*..................... + // mls v31.8H, v1.8H, v7.H[0] // ......................................*................. + // mul v1.8H, v28.8H, v25.8H // ....................................*................... + // mls v1.8H, v19.8H, v7.H[0] // .......................................*................ + // add v19.8H, v12.8H, v27.8H // .................................*...................... + // trn2 v3.4S, v29.4S, v19.4S // ...................................*.................... + // trn1 v10.4S, v29.4S, v19.4S // ........................................*............... + // trn2 v20.4S, v31.4S, v1.4S // .........................................*.............. + // trn1 v31.4S, v31.4S, v1.4S // ..........................................*............. + // trn2 v30.2D, v3.2D, v20.2D // ...........................................*............ + // trn2 v28.2D, v10.2D, v31.2D // ............................................*........... + // ldr q6, [x3], #16 // .....................................*.................. + // trn1 v17.2D, v3.2D, v20.2D // .............................................*.......... + // trn1 v5.2D, v10.2D, v31.2D // ..............................................*......... + // add v11.8H, v28.8H, v30.8H // ...............................................*........ 
+ // sub v1.8H, v28.8H, v30.8H // ................................................*....... + // sub v23.8H, v5.8H, v17.8H // .................................................*...... + // add v0.8H, v5.8H, v17.8H // ..................................................*..... + // sqdmulh v8.8H, v11.8H, v7.H[1] // ...................................................*.... + // sqrdmulh v24.8H, v23.8H, v6.H[3] // .......................................................* + // sqdmulh v12.8H, v0.8H, v7.H[1] // .....................................................*.. + // srshr v3.8H, v8.8H, #11 // ......................................................*. + // mul v2.8H, v1.8H, v6.H[4] // ....................................................*... sub count, count, #1 layer4567_start: - trn2 v31.2D, v20.2D, v23.2D // ..........................................*.................................. - ldr q12, [x1, #80] // .e........................................................................... - ldr q28, [x1, #64] // e............................................................................ - ldr q30, [x4, #48] // ...............e............................................................. - sqdmulh v2.8H, v24.8H, v7.H[1] // .........................................................*................... - ldr q6, [x4, #80] // .................e........................................................... - ldr q21, [x1, #112] // ...e......................................................................... - // gap // ............................................................................. - ldr q22, [x1, #96] // ..e.......................................................................... - ldr q9, [x4], #(6*16) // ............e................................................................ - mls v17.8H, v26.8H, v7.H[0] // ...................................................*......................... 
- add v15.8H, v31.8H, v3.8H // .....................................................*....................... - trn2 v19.4S, v28.4S, v12.4S // .....e....................................................................... - ldr q10, [x4, #-80] // .............e............................................................... - // gap // ............................................................................. - trn1 v13.4S, v28.4S, v12.4S // ....e........................................................................ - sub v0.8H, v31.8H, v3.8H // ....................................................*........................ - // gap // ............................................................................. - trn2 v23.4S, v22.4S, v21.4S // .......e..................................................................... - sqdmulh v14.8H, v15.8H, v7.H[1] // ............................................................*................ - // gap // ............................................................................. - trn1 v20.4S, v22.4S, v21.4S // ......e...................................................................... - // gap // ............................................................................. - // gap // ............................................................................. - sqrdmulh v8.8H, v0.8H, v4.H[5] // .......................................................*..................... - srshr v2.8H, v2.8H, #11 // ..........................................................*.................. - // gap // ............................................................................. - trn2 v21.2D, v19.2D, v23.2D // .........e................................................................... - // gap // ............................................................................. - // gap // ............................................................................. 
- trn2 v16.2D, v13.2D, v20.2D // ........e.................................................................... - mul v0.8H, v0.8H, v4.H[4] // ......................................................*...................... - // gap // ............................................................................. - trn1 v1.2D, v19.2D, v23.2D // ...........e................................................................. - // gap // ............................................................................. - // gap // ............................................................................. - mls v24.8H, v2.8H, v7.H[0] // ...........................................................*................. - trn1 v11.2D, v13.2D, v20.2D // ..........e.................................................................. - ldr q23, [x4, #-32] // ................e............................................................ - sub v2.8H, v16.8H, v21.8H // .......................e..................................................... - // gap // ............................................................................. - // gap // ............................................................................. - add v16.8H, v16.8H, v21.8H // ........................e.................................................... - mls v0.8H, v8.8H, v7.H[0] // ........................................................*.................... - // gap // ............................................................................. - sub v21.8H, v11.8H, v1.8H // ..................e.......................................................... - // gap // ............................................................................. - // gap // ............................................................................. - mul v26.8H, v2.8H, v23.8H // .........................e................................................... 
- // gap // ............................................................................. - // gap // ............................................................................. - // gap // ............................................................................. - // gap // ............................................................................. - // gap // ............................................................................. - sqrdmulh v23.8H, v21.8H, v30.8H // .....................e....................................................... - // gap // ............................................................................. - ldr q25, [x4, #-64] // ..............e.............................................................. - // gap // ............................................................................. - // gap // ............................................................................. - // gap // ............................................................................. - sqrdmulh v2.8H, v2.8H, v6.8H // ..........................e.................................................. - // gap // ............................................................................. - // gap // ............................................................................. - // gap // ............................................................................. - // gap // ............................................................................. - // gap // ............................................................................. - mul v3.8H, v21.8H, v25.8H // ....................e........................................................ - add v21.8H, v11.8H, v1.8H // ...................e......................................................... - // gap // ............................................................................. 
- sub v11.8H, v17.8H, v0.8H // ....................................................................*........ - // gap // ............................................................................. - // gap // ............................................................................. - mls v3.8H, v23.8H, v7.H[0] // ......................e...................................................... - add v0.8H, v17.8H, v0.8H // .....................................................................*....... - // gap // ............................................................................. - sub v6.8H, v21.8H, v16.8H // ............................e................................................ - // gap // ............................................................................. - // gap // ............................................................................. - add v1.8H, v21.8H, v16.8H // .............................e............................................... - mls v26.8H, v2.8H, v7.H[0] // ...........................e................................................. - // gap // ............................................................................. - str q0, [x1, #16] // ..........................................................................*.. - srshr v0.8H, v14.8H, #11 // .............................................................*............... - // gap // ............................................................................. - sqrdmulh v23.8H, v6.8H, v10.8H // ...............................e............................................. - // gap // ............................................................................. - // gap // ............................................................................. - // gap // ............................................................................. - // gap // ............................................................................. 
- // gap // ............................................................................. - mls v15.8H, v0.8H, v7.H[0] // ..............................................................*.............. - // gap // ............................................................................. - // gap // ............................................................................. - sub v16.8H, v3.8H, v26.8H // .................................e........................................... - // gap // ............................................................................. - // gap // ............................................................................. - add v0.8H, v3.8H, v26.8H // ..................................e.......................................... - // gap // ............................................................................. - mul v21.8H, v6.8H, v9.8H // ..............................e.............................................. - // gap // ............................................................................. - // gap // ............................................................................. - // gap // ............................................................................. - sqrdmulh v2.8H, v16.8H, v10.8H // ....................................e........................................ - // gap // ............................................................................. - // gap // ............................................................................. - sub v14.8H, v24.8H, v15.8H // ...............................................................*............. - // gap // ............................................................................. - // gap // ............................................................................. - add v8.8H, v24.8H, v15.8H // ................................................................*............ 
- mls v21.8H, v23.8H, v7.H[0] // ................................e............................................ - // gap // ............................................................................. - trn1 v20.4S, v1.4S, v0.4S // ......................................e...................................... - // gap // ............................................................................. - // gap // ............................................................................. - mul v16.8H, v16.8H, v9.8H // ...................................e......................................... - // gap // ............................................................................. - // gap // ............................................................................. - str q8, [x1], #(64) // .........................................................................*... - // gap // ............................................................................. - // gap // ............................................................................. - mls v16.8H, v2.8H, v7.H[0] // .....................................e....................................... - trn2 v2.4S, v1.4S, v0.4S // .......................................e..................................... - // gap // ............................................................................. - // gap // ............................................................................. - // gap // ............................................................................. - // gap // ............................................................................. - sqrdmulh v0.8H, v11.8H, v4.H[1] // .......................................................................*..... - // gap // ............................................................................. - // gap // ............................................................................. 
- // gap // ............................................................................. - // gap // ............................................................................. - // gap // ............................................................................. - // gap // ............................................................................. - // gap // ............................................................................. - sqrdmulh v3.8H, v14.8H, v4.H[1] // ..................................................................*.......... - trn1 v23.4S, v21.4S, v16.4S // ........................................e.................................... - // gap // ............................................................................. - // gap // ............................................................................. - mul v1.8H, v11.8H, v4.H[0] // ......................................................................*...... - trn2 v8.4S, v21.4S, v16.4S // .........................................e................................... - // gap // ............................................................................. - // gap // ............................................................................. - // gap // ............................................................................. - // gap // ............................................................................. - mls v1.8H, v0.8H, v7.H[0] // ........................................................................*.... - // gap // ............................................................................. - trn1 v16.2D, v20.2D, v23.2D // ............................................e................................ - trn1 v0.2D, v2.2D, v8.2D // .............................................e............................... - // gap // ............................................................................. 
- // gap // ............................................................................. - mul v9.8H, v14.8H, v4.H[0] // .................................................................*........... - ldr q4, [x3], #16 // ..............................................e.............................. - // gap // ............................................................................. - // gap // ............................................................................. - // gap // ............................................................................. - // gap // ............................................................................. - mls v9.8H, v3.8H, v7.H[0] // ...................................................................*......... - // gap // ............................................................................. - sub v5.8H, v16.8H, v0.8H // ...............................................e............................. - trn2 v3.2D, v2.2D, v8.2D // ...........................................e................................. - str q1, [x1, #-16] // ............................................................................* - // gap // ............................................................................. - add v24.8H, v16.8H, v0.8H // ................................................e............................ - // gap // ............................................................................. - // gap // ............................................................................. - sqrdmulh v26.8H, v5.8H, v4.H[3] // ..................................................e.......................... - // gap // ............................................................................. - // gap // ............................................................................. - // gap // ............................................................................. 
- // gap // ............................................................................. - // gap // ............................................................................. - str q9, [x1, #-32] // ...........................................................................*. - mul v17.8H, v5.8H, v4.H[2] // .................................................e........................... - // gap // ............................................................................. + sqrdmulh v30.8H, v1.8H, v6.H[5] // .......................................................*........................... + ldr q5, [x1, #96] // ..e................................................................................ + ldr q22, [x1, #112] // ...e............................................................................... + srshr v28.8H, v12.8H, #11 // ..........................................................*........................ + ldr q12, [x1, #80] // .e................................................................................. + ldr q17, [x1, #64] // e.................................................................................. + ldr q25, [x4], #(6*16) // ............e...................................................................... + mul v26.8H, v23.8H, v6.H[2] // .................................................*................................. + ldr q16, [x4, #-80] // .............e..................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + trn2 v19.4S, v5.4S, v22.4S // .......e........................................................................... 
+ mls v26.8H, v24.8H, v7.H[0] // ...................................................*............................... + // gap // ................................................................................... + trn2 v23.4S, v17.4S, v12.4S // .....e............................................................................. + // gap // ................................................................................... + // gap // ................................................................................... + mls v2.8H, v30.8H, v7.H[0] // ........................................................*.......................... + trn1 v8.4S, v5.4S, v22.4S // ......e............................................................................ + // gap // ................................................................................... + trn1 v22.4S, v17.4S, v12.4S // ....e.............................................................................. + // gap // ................................................................................... + // gap // ................................................................................... + trn1 v1.2D, v23.2D, v19.2D // ...........e....................................................................... + mls v11.8H, v3.8H, v7.H[0] // ..............................................................*.................... + // gap // ................................................................................... + trn2 v15.2D, v23.2D, v19.2D // .........e......................................................................... + ldr q23, [x4, #-64] // ..............e.................................................................... + // gap // ................................................................................... + trn1 v27.2D, v22.2D, v8.2D // ..........e........................................................................ 
+ sqdmulh v24.8H, v26.8H, v7.H[1] // ...............................................................*................... + ldr q29, [x4, #-48] // ...............e................................................................... + ldr q14, [x4, #-32] // ................e.................................................................. + trn2 v20.2D, v22.2D, v8.2D // ........e.......................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + sqdmulh v19.8H, v2.8H, v7.H[1] // ..................................................................*................ + sub v3.8H, v27.8H, v1.8H // ..................e................................................................ + // gap // ................................................................................... + // gap // ................................................................................... + sub v21.8H, v20.8H, v15.8H // .......................e........................................................... + mls v0.8H, v28.8H, v7.H[0] // ...........................................................*....................... + // gap // ................................................................................... + srshr v22.8H, v24.8H, #11 // ................................................................*.................. + // gap // ................................................................................... + // gap // ................................................................................... + add v15.8H, v20.8H, v15.8H // ........................e.......................................................... 
+ mul v12.8H, v3.8H, v23.8H // ....................e.............................................................. + // gap // ................................................................................... + srshr v19.8H, v19.8H, #11 // ...................................................................*............... + // gap // ................................................................................... + // gap // ................................................................................... + add v5.8H, v27.8H, v1.8H // ...................e............................................................... + sqrdmulh v23.8H, v3.8H, v29.8H // .....................e............................................................. + ldr q28, [x4, #-16] // .................e................................................................. + add v10.8H, v0.8H, v11.8H // ......................................................................*............ + // gap // ................................................................................... + // gap // ................................................................................... + mls v2.8H, v19.8H, v7.H[0] // ....................................................................*.............. + // gap // ................................................................................... + // gap // ................................................................................... + add v29.8H, v5.8H, v15.8H // .............................e..................................................... + // gap // ................................................................................... + // gap // ................................................................................... + str q10, [x1], #(64) // ...............................................................................*... 
+ sqrdmulh v18.8H, v21.8H, v28.8H // ..........................e........................................................ + sub v9.8H, v5.8H, v15.8H // ............................e...................................................... + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + mls v12.8H, v23.8H, v7.H[0] // ......................e............................................................ + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + mul v27.8H, v21.8H, v14.8H // .........................e......................................................... + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + mls v27.8H, v18.8H, v7.H[0] // ...........................e....................................................... + // gap // ................................................................................... 
+ // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + sqrdmulh v1.8H, v9.8H, v16.8H // ...............................e................................................... + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + mls v26.8H, v22.8H, v7.H[0] // .................................................................*................. + // gap // ................................................................................... + // gap // ................................................................................... + sub v28.8H, v12.8H, v27.8H // .................................e................................................. + // gap // ................................................................................... + // gap // ................................................................................... + mul v31.8H, v9.8H, v25.8H // ..............................e.................................................... + // gap // ................................................................................... + // gap // ................................................................................... 
+ // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + sqrdmulh v19.8H, v28.8H, v16.8H // ....................................e.............................................. + // gap // ................................................................................... + // gap // ................................................................................... + add v23.8H, v26.8H, v2.8H // ...........................................................................*....... + // gap // ................................................................................... + // gap // ................................................................................... + mls v31.8H, v1.8H, v7.H[0] // ................................e.................................................. + sub v22.8H, v26.8H, v2.8H // ..........................................................................*........ + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + str q23, [x1, #-48] // ................................................................................*.. + mul v1.8H, v28.8H, v25.8H // ...................................e............................................... + sub v28.8H, v0.8H, v11.8H // .....................................................................*............. + // gap // ................................................................................... 
+ // gap // ................................................................................... + // gap // ................................................................................... + mls v1.8H, v19.8H, v7.H[0] // .....................................e............................................. + add v19.8H, v12.8H, v27.8H // ..................................e................................................ + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + sqrdmulh v23.8H, v28.8H, v6.H[1] // ........................................................................*.......... + // gap // ................................................................................... + // gap // ................................................................................... + trn2 v3.4S, v29.4S, v19.4S // .......................................e........................................... + // gap // ................................................................................... + // gap // ................................................................................... + trn1 v10.4S, v29.4S, v19.4S // ......................................e............................................ + mul v4.8H, v28.8H, v6.H[0] // .......................................................................*........... + // gap // ................................................................................... + trn2 v20.4S, v31.4S, v1.4S // .........................................e......................................... + // gap // ................................................................................... 
+ // gap // ................................................................................... + trn1 v31.4S, v31.4S, v1.4S // ........................................e.......................................... + // gap // ................................................................................... + sqrdmulh v19.8H, v22.8H, v6.H[1] // .............................................................................*..... + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + mls v4.8H, v23.8H, v7.H[0] // .........................................................................*......... + trn2 v30.2D, v3.2D, v20.2D // ...........................................e....................................... + // gap // ................................................................................... + trn2 v28.2D, v10.2D, v31.2D // ..........................................e........................................ + // gap // ................................................................................... + // gap // ................................................................................... + mul v22.8H, v22.8H, v6.H[0] // ............................................................................*...... + ldr q6, [x3], #16 // ..............................................e.................................... + trn1 v17.2D, v3.2D, v20.2D // .............................................e..................................... + trn1 v5.2D, v10.2D, v31.2D // ............................................e...................................... + // gap // ................................................................................... 
+ // gap // ................................................................................... + add v11.8H, v28.8H, v30.8H // .....................................................e............................. + mls v22.8H, v19.8H, v7.H[0] // ..............................................................................*.... + // gap // ................................................................................... + str q4, [x1, #-32] // .................................................................................*. + sub v1.8H, v28.8H, v30.8H // ....................................................e.............................. + // gap // ................................................................................... + sub v23.8H, v5.8H, v17.8H // ...............................................e................................... + // gap // ................................................................................... + // gap // ................................................................................... + add v0.8H, v5.8H, v17.8H // ................................................e.................................. + sqdmulh v8.8H, v11.8H, v7.H[1] // ............................................................e...................... + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + str q22, [x1, #-16] // ..................................................................................* + sqrdmulh v24.8H, v23.8H, v6.H[3] // ..................................................e................................ + // gap // ................................................................................... 
+ // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + sqdmulh v12.8H, v0.8H, v7.H[1] // .........................................................e......................... + // gap // ................................................................................... + // gap // ................................................................................... + srshr v3.8H, v8.8H, #11 // .............................................................e..................... + // gap // ................................................................................... + // gap // ................................................................................... + mul v2.8H, v1.8H, v6.H[4] // ......................................................e............................ + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... // original source code - // ldr q8, [x1, #(16*0)] // .e..........................................................................|.e......................................................................... - // ldr q9, [x1, #(16*1)] // e...........................................................................|e.......................................................................... 
- // ldr q10, [x1, #(16*2)] // ......e.....................................................................|......e.................................................................... - // ldr q11, [x1, #(16*3)] // .....e......................................................................|.....e..................................................................... - // trn1 v25.4s, v8.4s, v9.4s // ............e...............................................................|............e.............................................................. - // trn2 v26.4s, v8.4s, v9.4s // ..........e.................................................................|..........e................................................................ - // trn1 v27.4s, v10.4s, v11.4s // ................e...........................................................|................e.......................................................... - // trn2 v28.4s, v10.4s, v11.4s // ..............e.............................................................|..............e............................................................ - // trn2 v10.2d, v25.2d, v27.2d // ....................e.......................................................|....................e...................................................... - // trn2 v11.2d, v26.2d, v28.2d // ...................e........................................................|...................e....................................................... - // trn1 v8.2d, v25.2d, v27.2d // ........................e...................................................|........................e.................................................. - // trn1 v9.2d, v26.2d, v28.2d // ......................e.....................................................|......................e.................................................... 
- // ldr q0, [x4], #(6*16) // .......e....................................................................|.......e................................................................... - // ldr q4, [x4, #(-6*16 + 1*16)] // ...........e................................................................|...........e............................................................... - // ldr q1, [x4, #(-6*16 + 2*16)] // ................................e...........................................|................................e.......................................... - // ldr q5, [x4, #(-6*16 + 3*16)] // ..e.........................................................................|..e........................................................................ - // ldr q2, [x4, #(-6*16 + 4*16)] // .........................e..................................................|.........................e................................................. - // ldr q6, [x4, #(-6*16 + 5*16)] // ....e.......................................................................|....e...................................................................... - // sub v24.8h, v8.8h, v9.8h // .............................e..............................................|.............................e............................................. - // add v8.8h, v8.8h, v9.8h // ...................................e........................................|...................................e....................................... - // mul v9.8h, v24.8h, v1.8h // ..................................e.........................................|..................................e........................................ - // sqrdmulh v24.8h, v24.8h, v5.8h // ...............................e............................................|...............................e........................................... 
- // mls v9.8h, v24.8h, v7.h[0] // .....................................e......................................|.....................................e..................................... - // sub v24.8h, v10.8h, v11.8h // ..........................e.................................................|..........................e................................................ - // add v10.8h, v10.8h, v11.8h // ...........................e................................................|...........................e............................................... - // mul v11.8h, v24.8h, v2.8h // ..............................e.............................................|..............................e............................................ - // sqrdmulh v24.8h, v24.8h, v6.8h // .................................e..........................................|.................................e......................................... - // mls v11.8h, v24.8h, v7.h[0] // .........................................e..................................|.........................................e................................. - // sub v24.8h, v8.8h, v10.8h // .......................................e....................................|.......................................e................................... - // add v8.8h, v8.8h, v10.8h // ........................................e...................................|........................................e.................................. - // mul v10.8h, v24.8h, v0.8h // ................................................e...........................|................................................e.......................... - // sqrdmulh v24.8h, v24.8h, v4.8h // ............................................e...............................|............................................e.............................. 
- // mls v10.8h, v24.8h, v7.h[0] // ....................................................e.......................|....................................................e...................... - // sub v24.8h, v9.8h, v11.8h // ..............................................e.............................|..............................................e............................ - // add v9.8h, v9.8h, v11.8h // ...............................................e............................|...............................................e........................... - // mul v11.8h, v24.8h, v0.8h // ......................................................e.....................|......................................................e.................... - // sqrdmulh v24.8h, v24.8h, v4.8h // .................................................e..........................|.................................................e......................... - // mls v11.8h, v24.8h, v7.h[0] // ........................................................e...................|........................................................e.................. - // trn1 v25.4s, v8.4s, v9.4s // .....................................................e......................|.....................................................e..................... - // trn2 v26.4s, v8.4s, v9.4s // .........................................................e..................|.........................................................e................. - // trn1 v27.4s, v10.4s, v11.4s // ............................................................e...............|............................................................e.............. - // trn2 v28.4s, v10.4s, v11.4s // ..............................................................e.............|..............................................................e............ 
- // trn2 v10.2d, v25.2d, v27.2d // ............................................................................*........................................................................... - // trn2 v11.2d, v26.2d, v28.2d // ......................................................................e.....|......................................................................e.... - // trn1 v8.2d, v25.2d, v27.2d // ................................................................e...........|................................................................e.......... - // trn1 v9.2d, v26.2d, v28.2d // .................................................................e..........|.................................................................e......... - // ldr q0, [x3], #16 // ...................................................................e........|...................................................................e....... - // sub v24.8h, v8.8h, v9.8h // .....................................................................e......|.....................................................................e..... - // add v8.8h, v8.8h, v9.8h // ........................................................................e...|........................................................................e.. - // mul v9.8h, v24.8h, v0.h[2] // ...........................................................................e|........................................................................... - // sqrdmulh v24.8h, v24.8h, v0.h[3] // .........................................................................e..|.........................................................................e. - // mls v9.8h, v24.8h, v7.h[0] // ........*...................................................................|........*.................................................................. 
- // sub v24.8h, v10.8h, v11.8h // .............*..............................................................|.............*............................................................. - // add v10.8h, v10.8h, v11.8h // .........*..................................................................|.........*................................................................. - // mul v11.8h, v24.8h, v0.h[4] // .....................*......................................................|.....................*..................................................... - // sqrdmulh v24.8h, v24.8h, v0.h[5] // .................*..........................................................|.................*......................................................... - // mls v11.8h, v24.8h, v7.h[0] // ............................*...............................................|............................*.............................................. - // sqdmulh v25.8h, v8.8h, v7.h[1] // ...*........................................................................|...*....................................................................... - // srshr v25.8h, v25.8h, #11 // ..................*.........................................................|..................*........................................................ - // mls v8.8h, v25.8h, v7.h[0] // .......................*....................................................|.......................*................................................... - // sqdmulh v25.8h, v10.8h, v7.h[1] // ...............*............................................................|...............*........................................................... - // srshr v25.8h, v25.8h, #11 // ...........................................*................................|...........................................*............................... 
- // mls v10.8h, v25.8h, v7.h[0] // .............................................*..............................|.............................................*............................. - // sub v24.8h, v8.8h, v10.8h // ..................................................*.........................|..................................................*........................ - // add v8.8h, v8.8h, v10.8h // ...................................................*........................|...................................................*....................... - // mul v10.8h, v24.8h, v0.h[0] // ..................................................................*.........|..................................................................*........ - // sqrdmulh v24.8h, v24.8h, v0.h[1] // ...........................................................*................|...........................................................*............... - // mls v10.8h, v24.8h, v7.h[0] // ....................................................................*.......|....................................................................*...... - // sub v24.8h, v9.8h, v11.8h // ....................................*.......................................|....................................*...................................... - // add v9.8h, v9.8h, v11.8h // ......................................*.....................................|......................................*.................................... - // mul v11.8h, v24.8h, v0.h[0] // .............................................................*..............|.............................................................*............. - // sqrdmulh v24.8h, v24.8h, v0.h[1] // ..........................................................*.................|..........................................................*................ 
- // mls v11.8h, v24.8h, v7.h[0] // ...............................................................*............|...............................................................*........... - // str q8, [x1], #(64) // .......................................................*....................|.......................................................*................... - // str q9, [x1, #(-64 + 16*1)] // ..........................................*.................................|..........................................*................................ - // str q10, [x1, #(-64 + 16*2)] // ..........................................................................*.|..........................................................................* - // str q11, [x1, #(-64 + 16*3)] // .......................................................................*....|.......................................................................*... + // ldr q8, [x1, #(16*0)] // ....e.............................................................................|....e......................................................................... + // ldr q9, [x1, #(16*1)] // ...e..............................................................................|...e.......................................................................... + // ldr q10, [x1, #(16*2)] // e.................................................................................|e............................................................................. + // ldr q11, [x1, #(16*3)] // .e................................................................................|.e............................................................................ + // trn1 v25.4s, v8.4s, v9.4s // .............e....................................................................|.............e................................................................ 
+ // trn2 v26.4s, v8.4s, v9.4s // ..........e.......................................................................|..........e................................................................... + // trn1 v27.4s, v10.4s, v11.4s // ............e.....................................................................|............e................................................................. + // trn2 v28.4s, v10.4s, v11.4s // ........e.........................................................................|........e..................................................................... + // trn2 v10.2d, v25.2d, v27.2d // ......................e...........................................................|......................e....................................................... + // trn2 v11.2d, v26.2d, v28.2d // ................e.................................................................|................e............................................................. + // trn1 v8.2d, v25.2d, v27.2d // ..................e...............................................................|..................e........................................................... + // trn1 v9.2d, v26.2d, v28.2d // ..............e...................................................................|..............e............................................................... + // ldr q0, [x4], #(6*16) // .....e............................................................................|.....e........................................................................ + // ldr q4, [x4, #(-6*16 + 1*16)] // .......e..........................................................................|.......e...................................................................... + // ldr q1, [x4, #(-6*16 + 2*16)] // .................e................................................................|.................e............................................................ 
+ // ldr q5, [x4, #(-6*16 + 3*16)] // ....................e.............................................................|....................e......................................................... + // ldr q2, [x4, #(-6*16 + 4*16)] // .....................e............................................................|.....................e........................................................ + // ldr q6, [x4, #(-6*16 + 5*16)] // .................................e................................................|.................................e............................................ + // sub v24.8h, v8.8h, v9.8h // ........................e.........................................................|........................e..................................................... + // add v8.8h, v8.8h, v9.8h // ...............................e..................................................|...............................e.............................................. + // mul v9.8h, v24.8h, v1.8h // .............................e....................................................|.............................e................................................ + // sqrdmulh v24.8h, v24.8h, v5.8h // ................................e.................................................|................................e............................................. + // mls v9.8h, v24.8h, v7.h[0] // ........................................e.........................................|........................................e..................................... + // sub v24.8h, v10.8h, v11.8h // .........................e........................................................|.........................e.................................................... + // add v10.8h, v10.8h, v11.8h // ............................e.....................................................|............................e................................................. 
+ // mul v11.8h, v24.8h, v2.8h // .........................................e........................................|.........................................e.................................... + // sqrdmulh v24.8h, v24.8h, v6.8h // ......................................e...........................................|......................................e....................................... + // mls v11.8h, v24.8h, v7.h[0] // ..........................................e.......................................|..........................................e................................... + // sub v24.8h, v8.8h, v10.8h // .......................................e..........................................|.......................................e...................................... + // add v8.8h, v8.8h, v10.8h // ....................................e.............................................|....................................e......................................... + // mul v10.8h, v24.8h, v0.8h // ..............................................e...................................|..............................................e............................... + // sqrdmulh v24.8h, v24.8h, v4.8h // ...........................................e......................................|...........................................e.................................. + // mls v10.8h, v24.8h, v7.h[0] // .................................................e................................|.................................................e............................ + // sub v24.8h, v9.8h, v11.8h // .............................................e....................................|.............................................e................................ + // add v9.8h, v9.8h, v11.8h // .......................................................e..........................|.......................................................e...................... 
+ // mul v11.8h, v24.8h, v0.8h // ....................................................e.............................|....................................................e......................... + // sqrdmulh v24.8h, v24.8h, v4.8h // ...............................................e..................................|...............................................e.............................. + // mls v11.8h, v24.8h, v7.h[0] // ......................................................e...........................|......................................................e....................... + // trn1 v25.4s, v8.4s, v9.4s // ..........................................................e.......................|..........................................................e................... + // trn2 v26.4s, v8.4s, v9.4s // .........................................................e........................|.........................................................e.................... + // trn1 v27.4s, v10.4s, v11.4s // .............................................................e....................|.............................................................e................ + // trn2 v28.4s, v10.4s, v11.4s // ............................................................e.....................|............................................................e................. + // trn2 v10.2d, v25.2d, v27.2d // .................................................................e................|.................................................................e............ + // trn2 v11.2d, v26.2d, v28.2d // ................................................................e.................|................................................................e............. + // trn1 v8.2d, v25.2d, v27.2d // .....................................................................e............|.....................................................................e........ 
+ // trn1 v9.2d, v26.2d, v28.2d // ....................................................................e.............|....................................................................e......... + // ldr q0, [x3], #16 // ...................................................................e..............|...................................................................e.......... + // sub v24.8h, v8.8h, v9.8h // ..........................................................................e.......|..........................................................................e... + // add v8.8h, v8.8h, v9.8h // ...........................................................................e......|...........................................................................e.. + // mul v9.8h, v24.8h, v0.h[2] // ......*...........................................................................|......*....................................................................... + // sqrdmulh v24.8h, v24.8h, v0.h[3] // ..............................................................................e...|.............................................................................. + // mls v9.8h, v24.8h, v7.h[0] // .........*........................................................................|.........*.................................................................... + // sub v24.8h, v10.8h, v11.8h // .........................................................................e........|.........................................................................e.... + // add v10.8h, v10.8h, v11.8h // ......................................................................e...........|......................................................................e....... + // mul v11.8h, v24.8h, v0.h[4] // .................................................................................e|.............................................................................. 
+ // sqrdmulh v24.8h, v24.8h, v0.h[5] // ..................................................................................*.............................................................................. + // mls v11.8h, v24.8h, v7.h[0] // ...........*......................................................................|...........*.................................................................. + // sqdmulh v25.8h, v8.8h, v7.h[1] // ...............................................................................e..|.............................................................................. + // srshr v25.8h, v25.8h, #11 // ..*...............................................................................|..*........................................................................... + // mls v8.8h, v25.8h, v7.h[0] // ..........................*.......................................................|..........................*................................................... + // sqdmulh v25.8h, v10.8h, v7.h[1] // ............................................................................e.....|............................................................................e. + // srshr v25.8h, v25.8h, #11 // ................................................................................e.|.............................................................................. + // mls v10.8h, v25.8h, v7.h[0] // ...............*..................................................................|...............*.............................................................. + // sqdmulh v25.8h, v9.8h, v7.h[1] // ...................*..............................................................|...................*.......................................................... + // srshr v25.8h, v25.8h, #11 // ...........................*......................................................|...........................*.................................................. 
+ // mls v9.8h, v25.8h, v7.h[0] // ............................................*.....................................|............................................*................................. + // sqdmulh v25.8h, v11.8h, v7.h[1] // .......................*..........................................................|.......................*...................................................... + // srshr v25.8h, v25.8h, #11 // ..............................*...................................................|..............................*............................................... + // mls v11.8h, v25.8h, v7.h[0] // ...................................*..............................................|...................................*.......................................... + // sub v24.8h, v8.8h, v10.8h // .....................................................*............................|.....................................................*........................ + // add v8.8h, v8.8h, v10.8h // ..................................*...............................................|..................................*........................................... + // mul v10.8h, v24.8h, v0.h[0] // ...........................................................*......................|...........................................................*.................. + // sqrdmulh v24.8h, v24.8h, v0.h[1] // ........................................................*.........................|........................................................*..................... + // mls v10.8h, v24.8h, v7.h[0] // ...............................................................*..................|...............................................................*.............. + // sub v24.8h, v9.8h, v11.8h // ..................................................*...............................|..................................................*........................... 
+ // add v9.8h, v9.8h, v11.8h // ................................................*.................................|................................................*............................. + // mul v11.8h, v24.8h, v0.h[0] // ..................................................................*...............|..................................................................*........... + // sqrdmulh v24.8h, v24.8h, v0.h[1] // ..............................................................*...................|..............................................................*............... + // mls v11.8h, v24.8h, v7.h[0] // .......................................................................*..........|.......................................................................*...... + // str q8, [x1], #(64) // .....................................*............................................|.....................................*........................................ + // str q9, [x1, #(-64 + 16*1)] // ...................................................*..............................|...................................................*.......................... + // str q10, [x1, #(-64 + 16*2)] // ........................................................................*.........|........................................................................*..... + // str q11, [x1, #(-64 + 16*3)] // .............................................................................*....|.............................................................................* sub count, count, #1 cbnz count, layer4567_start - mls v17.8H, v26.8H, v7.H[0] // ..*........................ - trn2 v8.2D, v20.2D, v23.2D // *.......................... + sqrdmulh v8.8H, v1.8H, v6.H[5] // *.......................... + srshr v22.8H, v12.8H, #11 // .*......................... // gap // ........................... // gap // ........................... // gap // ........................... 
// gap // ........................... - sqdmulh v26.8H, v24.8H, v7.H[1] // .*......................... + mul v4.8H, v23.8H, v6.H[2] // ..*........................ // gap // ........................... // gap // ........................... - sub v13.8H, v8.8H, v3.8H // ....*...................... // gap // ........................... // gap // ........................... - add v19.8H, v8.8H, v3.8H // ...*....................... // gap // ........................... + mls v4.8H, v24.8H, v7.H[0] // ...*....................... // gap // ........................... // gap // ........................... // gap // ........................... // gap // ........................... - sqrdmulh v31.8H, v13.8H, v4.H[5] // ......*.................... // gap // ........................... + mls v2.8H, v8.8H, v7.H[0] // ....*...................... // gap // ........................... - srshr v9.8H, v26.8H, #11 // .......*................... // gap // ........................... // gap // ........................... - sqdmulh v26.8H, v19.8H, v7.H[1] // .....*..................... // gap // ........................... // gap // ........................... // gap // ........................... // gap // ........................... // gap // ........................... - mul v6.8H, v13.8H, v4.H[4] // ........*.................. + sqdmulh v8.8H, v4.8H, v7.H[1] // ......*.................... // gap // ........................... // gap // ........................... // gap // ........................... // gap // ........................... // gap // ........................... - mls v6.8H, v31.8H, v7.H[0] // ..........*................ + sqdmulh v17.8H, v2.8H, v7.H[1] // .......*................... // gap // ........................... // gap // ........................... - srshr v0.8H, v26.8H, #11 // ..............*............ // gap // ........................... // gap // ........................... - mls v24.8H, v9.8H, v7.H[0] // .........*................. 
// gap // ........................... + mls v0.8H, v22.8H, v7.H[0] // ........*.................. // gap // ........................... // gap // ........................... + srshr v21.8H, v8.8H, #11 // .........*................. // gap // ........................... // gap // ........................... - mls v19.8H, v0.8H, v7.H[0] // ...............*........... + mls v11.8H, v3.8H, v7.H[0] // .....*..................... // gap // ........................... // gap // ........................... - sub v9.8H, v17.8H, v6.8H // ...........*............... + srshr v8.8H, v17.8H, #11 // ..........*................ // gap // ........................... // gap // ........................... - add v18.8H, v17.8H, v6.8H // ............*.............. + mls v4.8H, v21.8H, v7.H[0] // ..............*............ // gap // ........................... // gap // ........................... // gap // ........................... // gap // ........................... // gap // ........................... - mul v6.8H, v9.8H, v4.H[0] // .....................*..... + mls v2.8H, v8.8H, v7.H[0] // ............*.............. // gap // ........................... // gap // ........................... - str q18, [x1, #16] // .............*............. - sub v12.8H, v24.8H, v19.8H // ................*.......... + sub v8.8H, v0.8H, v11.8H // ..................*........ // gap // ........................... - sqrdmulh v25.8H, v9.8H, v4.H[1] // ...................*....... - add v14.8H, v24.8H, v19.8H // .................*......... // gap // ........................... // gap // ........................... // gap // ........................... // gap // ........................... - sqrdmulh v8.8H, v12.8H, v4.H[1] // ....................*...... // gap // ........................... // gap // ........................... - str q14, [x1], #(64) // ..................*........ // gap // ........................... + sqrdmulh v1.8H, v8.8H, v6.H[1] // ...................*....... 
// gap // ........................... - mul v9.8H, v12.8H, v4.H[0] // .......................*... // gap // ........................... + sub v25.8H, v4.8H, v2.8H // ................*.......... // gap // ........................... // gap // ........................... + mul v13.8H, v8.8H, v6.H[0] // ....................*...... // gap // ........................... // gap // ........................... - mls v6.8H, v25.8H, v7.H[0] // ......................*.... // gap // ........................... + add v8.8H, v0.8H, v11.8H // ...........*............... // gap // ........................... + sqrdmulh v21.8H, v25.8H, v6.H[1] // .....................*..... // gap // ........................... // gap // ........................... // gap // ........................... - mls v9.8H, v8.8H, v7.H[0] // ........................*.. // gap // ........................... // gap // ........................... + str q8, [x1], #(64) // .............*............. + add v12.8H, v4.8H, v2.8H // ...............*........... + mul v8.8H, v25.8H, v6.H[0] // .......................*... // gap // ........................... // gap // ........................... // gap // ........................... + mls v13.8H, v1.8H, v7.H[0] // ......................*.... // gap // ........................... // gap // ........................... + str q12, [x1, #-48] // .................*......... // gap // ........................... - str q6, [x1, #-16] // .........................*. // gap // ........................... + mls v8.8H, v21.8H, v7.H[0] // ........................*.. // gap // ........................... // gap // ........................... // gap // ........................... // gap // ........................... - str q9, [x1, #-32] // ..........................* + // gap // ........................... + // gap // ........................... + // gap // ........................... + // gap // ........................... + str q13, [x1, #-32] // .........................*. 
+ // gap // ........................... + // gap // ........................... + // gap // ........................... + // gap // ........................... + // gap // ........................... + str q8, [x1, #-16] // ..........................* // gap // ........................... // gap // ........................... // original source code - // trn2 v31.2D, v20.2D, v23.2D // .*......................... - // sqdmulh v2.8H, v24.8H, v7.H[1] // ..*........................ - // mls v17.8H, v26.8H, v7.H[0] // *.......................... - // add v15.8H, v31.8H, v3.8H // ....*...................... - // sub v0.8H, v31.8H, v3.8H // ...*....................... - // sqdmulh v14.8H, v15.8H, v7.H[1] // .......*................... - // sqrdmulh v8.8H, v0.8H, v4.H[5] // .....*..................... - // srshr v2.8H, v2.8H, #11 // ......*.................... - // mul v0.8H, v0.8H, v4.H[4] // ........*.................. - // mls v24.8H, v2.8H, v7.H[0] // ...........*............... - // mls v0.8H, v8.8H, v7.H[0] // .........*................. - // sub v11.8H, v17.8H, v0.8H // .............*............. - // add v0.8H, v17.8H, v0.8H // ..............*............ - // str q0, [x1, #16] // ................*.......... - // srshr v0.8H, v14.8H, #11 // ..........*................ - // mls v15.8H, v0.8H, v7.H[0] // ............*.............. - // sub v14.8H, v24.8H, v15.8H // .................*......... - // add v8.8H, v24.8H, v15.8H // ...................*....... - // str q8, [x1], #(64) // .....................*..... - // sqrdmulh v0.8H, v11.8H, v4.H[1] // ..................*........ - // sqrdmulh v3.8H, v14.8H, v4.H[1] // ....................*...... - // mul v1.8H, v11.8H, v4.H[0] // ...............*........... - // mls v1.8H, v0.8H, v7.H[0] // .......................*... - // mul v9.8H, v14.8H, v4.H[0] // ......................*.... - // mls v9.8H, v3.8H, v7.H[0] // ........................*.. - // str q1, [x1, #-16] // .........................*. 
- // str q9, [x1, #-32] // ..........................* + // sqrdmulh v30.8H, v1.8H, v6.H[5] // *.......................... + // srshr v28.8H, v12.8H, #11 // .*......................... + // mul v26.8H, v23.8H, v6.H[2] // ..*........................ + // mls v26.8H, v24.8H, v7.H[0] // ...*....................... + // mls v2.8H, v30.8H, v7.H[0] // ....*...................... + // mls v11.8H, v3.8H, v7.H[0] // .........*................. + // sqdmulh v24.8H, v26.8H, v7.H[1] // .....*..................... + // sqdmulh v19.8H, v2.8H, v7.H[1] // ......*.................... + // mls v0.8H, v28.8H, v7.H[0] // .......*................... + // srshr v22.8H, v24.8H, #11 // ........*.................. + // srshr v19.8H, v19.8H, #11 // ..........*................ + // add v10.8H, v0.8H, v11.8H // .................*......... + // mls v2.8H, v19.8H, v7.H[0] // ............*.............. + // str q10, [x1], #(64) // ...................*....... + // mls v26.8H, v22.8H, v7.H[0] // ...........*............... + // add v23.8H, v26.8H, v2.8H // ....................*...... + // sub v22.8H, v26.8H, v2.8H // ...............*........... + // str q23, [x1, #-48] // .......................*... + // sub v28.8H, v0.8H, v11.8H // .............*............. + // sqrdmulh v23.8H, v28.8H, v6.H[1] // ..............*............ + // mul v4.8H, v28.8H, v6.H[0] // ................*.......... + // sqrdmulh v19.8H, v22.8H, v6.H[1] // ..................*........ + // mls v4.8H, v23.8H, v7.H[0] // ......................*.... + // mul v22.8H, v22.8H, v6.H[0] // .....................*..... + // mls v22.8H, v19.8H, v7.H[0] // ........................*.. + // str q4, [x1, #-32] // .........................*. + // str q22, [x1, #-16] // ..........................* // --------------------------------------------------------------------- @@ -985,860 +1039,800 @@ layer4567_start: .p2align 2 - ldr q9, [x0, #448] // ...*.......... - ldr q14, [x0, #384] // .*............ - // gap // .............. 
- // gap // .............. - // gap // .............. - // gap // .............. - // gap // .............. - ldr q8, [x0, #64] // *............. - // gap // .............. - ldr q12, [x0, #0] // ......*....... - // gap // .............. - // gap // .............. - sub v15.8H, v14.8H, v9.8H // ....*......... - // gap // .............. - // gap // .............. - add v20.8H, v14.8H, v9.8H // .........*.... - ldr q25, [x0, #192] // .....*........ - // gap // .............. - // gap // .............. - // gap // .............. - // gap // .............. - mul v11.8H, v15.8H, v1.H[4] // ...........*.. - add v3.8H, v12.8H, v8.8H // .............* - // gap // .............. - // gap // .............. - sub v8.8H, v12.8H, v8.8H // ........*..... - // gap // .............. - sqrdmulh v14.8H, v15.8H, v1.H[5] // .......*...... - // gap // .............. - // gap // .............. - ldr q10, [x0, #256] // ..*........... - // gap // .............. - // gap // .............. - ldr q17, [x0, #128] // ..........*... - sqrdmulh v19.8H, v8.8H, v0.H[7] // ............*. - // gap // .............. + ldr q23, [x0, #384] // *................. + ldr q19, [x0, #448] // .*................ + // gap // .................. + ldr q2, [x0, #320] // ..*............... + // gap // .................. + // gap // .................. + ldr q5, [x0, #256] // ....*............. + // gap // .................. + // gap // .................. + ldr q4, [x0, #128] // ......*........... + // gap // .................. + // gap // .................. + add v24.8H, v23.8H, v19.8H // ............*..... + sub v23.8H, v23.8H, v19.8H // .....*............ + ldr q19, [x0, #192] // .......*.......... + ldr q20, [x0, #0] // ..........*....... + // gap // .................. + // gap // .................. + ldr q26, [x0, #64] // ...*.............. + add v17.8H, v5.8H, v2.8H // ........*......... + // gap // .................. + sqrdmulh v13.8H, v23.8H, v1.H[5] // ...........*...... + // gap // .................. 
+ // gap // .................. + sub v9.8H, v4.8H, v19.8H // .........*........ + // gap // .................. + // gap // .................. + add v27.8H, v4.8H, v19.8H // ..............*... + mul v11.8H, v23.8H, v1.H[4] // .............*.... + // gap // .................. + add v25.8H, v20.8H, v26.8H // ................*. + // gap // .................. + // gap // .................. + sub v23.8H, v20.8H, v26.8H // .................* + sqrdmulh v16.8H, v9.8H, v1.H[1] // ...............*.. + // gap // .................. // original source code - // ldr q18, [x0, #64] // ..*........... - // ldr q27, [x0, #384] // .*............ - // ldr q10, [x0, #256] // ...........*.. - // ldr q17, [x0, #448] // *............. - // sub v12.8H, v27.8H, v17.8H // ....*......... - // ldr q25, [x0, #192] // ......*....... - // ldr q16, [x0, #0] // ...*.......... - // sqrdmulh v14.8H, v12.8H, v1.H[5] // ..........*... - // sub v8.8H, v16.8H, v18.8H // .........*.... - // add v20.8H, v27.8H, v17.8H // .....*........ - // ldr q17, [x0, #128] // ............*. - // mul v11.8H, v12.8H, v1.H[4] // .......*...... - // sqrdmulh v19.8H, v8.8H, v0.H[7] // .............* - // add v3.8H, v16.8H, v18.8H // ........*..... + // ldr q14, [x0, #384] // *................. + // ldr q26, [x0, #448] // .*................ + // ldr q2, [x0, #320] // ..*............... + // ldr q28, [x0, #64] // .........*........ + // ldr q5, [x0, #256] // ...*.............. + // sub v8.8H, v14.8H, v26.8H // ......*........... + // ldr q3, [x0, #128] // ....*............. + // ldr q20, [x0, #192] // .......*.......... + // add v17.8H, v5.8H, v2.8H // ..........*....... + // sub v9.8H, v3.8H, v20.8H // ............*..... + // ldr q23, [x0, #0] // ........*......... + // sqrdmulh v13.8H, v8.8H, v1.H[5] // ...........*...... + // add v24.8H, v14.8H, v26.8H // .....*............ + // mul v11.8H, v8.8H, v1.H[4] // ..............*... + // add v27.8H, v3.8H, v20.8H // .............*.... 
+ // sqrdmulh v16.8H, v9.8H, v1.H[1] // .................* + // add v25.8H, v23.8H, v28.8H // ...............*.. + // sub v23.8H, v23.8H, v28.8H // ................*. sub count, count, #1 layer123_start: - sub v26.8H, v17.8H, v25.8H // .............*................................................................................ - // gap // .............................................................................................. - ldr q5, [x0, #320] // .....*........................................................................................ - ldr q18, [x0, #80] // .e............................................................................................ - add v4.8H, v17.8H, v25.8H // ..............*............................................................................... - mls v11.8H, v14.8H, v7.H[0] // ...........................*.................................................................. - // gap // .............................................................................................. - ldr q27, [x0, #400] // ......e....................................................................................... - // gap // .............................................................................................. - mul v17.8H, v26.8H, v1.H[0] // ...............*.............................................................................. - // gap // .............................................................................................. - // gap // .............................................................................................. - sub v21.8H, v10.8H, v5.8H // ..................*........................................................................... - // gap // .............................................................................................. - // gap // .............................................................................................. 
- add v23.8H, v10.8H, v5.8H // ...................*.......................................................................... - mul v10.8H, v8.8H, v0.H[6] // ..........*................................................................................... - // gap // .............................................................................................. - // gap // .............................................................................................. - // gap // .............................................................................................. - // gap // .............................................................................................. - mls v10.8H, v19.8H, v7.H[0] // ............*................................................................................. - // gap // .............................................................................................. - // gap // .............................................................................................. - sub v28.8H, v23.8H, v20.8H // ......................................*....................................................... - // gap // .............................................................................................. - // gap // .............................................................................................. - add v13.8H, v23.8H, v20.8H // .......................................*...................................................... - mul v16.8H, v21.8H, v1.H[2] // ....................*......................................................................... - // gap // .............................................................................................. - add v20.8H, v3.8H, v4.8H // .............................*................................................................ - // gap // .............................................................................................. 
- // gap // .............................................................................................. - sub v4.8H, v3.8H, v4.8H // ............................*................................................................. - sqrdmulh v12.8H, v28.8H, v0.H[5] // .........................................*.................................................... - // gap // .............................................................................................. - // gap // .............................................................................................. - // gap // .............................................................................................. - // gap // .............................................................................................. - sqrdmulh v26.8H, v26.8H, v1.H[1] // ................*............................................................................. - // gap // .............................................................................................. - // gap // .............................................................................................. - // gap // .............................................................................................. - // gap // .............................................................................................. - // gap // .............................................................................................. - sqdmulh v6.8H, v20.8H, v7.H[1] // ................................................*............................................. - // gap // .............................................................................................. - // gap // .............................................................................................. - // gap // .............................................................................................. 
- // gap // .............................................................................................. - // gap // .............................................................................................. - sqrdmulh v21.8H, v21.8H, v1.H[3] // .....................*........................................................................ - // gap // .............................................................................................. - // gap // .............................................................................................. - // gap // .............................................................................................. - // gap // .............................................................................................. - // gap // .............................................................................................. - mls v17.8H, v26.8H, v7.H[0] // .................*............................................................................ - // gap // .............................................................................................. - // gap // .............................................................................................. - srshr v25.8H, v6.8H, #11 // .................................................*............................................ - // gap // .............................................................................................. - // gap // .............................................................................................. - mul v26.8H, v4.8H, v0.H[2] // ..............................*............................................................... - // gap // .............................................................................................. - // gap // .............................................................................................. 
- // gap // .............................................................................................. - // gap // .............................................................................................. - // gap // .............................................................................................. - mls v16.8H, v21.8H, v7.H[0] // ......................*....................................................................... - // gap // .............................................................................................. - // gap // .............................................................................................. - sub v21.8H, v10.8H, v17.8H // .................................*............................................................ - // gap // .............................................................................................. - // gap // .............................................................................................. - mul v3.8H, v28.8H, v0.H[4] // ........................................*..................................................... - add v14.8H, v10.8H, v17.8H // ..................................*........................................................... - // gap // .............................................................................................. - // gap // .............................................................................................. - // gap // .............................................................................................. - ldr q10, [x0, #272] // ....e......................................................................................... - mul v17.8H, v21.8H, v0.H[2] // ...................................*.......................................................... - // gap // .............................................................................................. 
- // gap // .............................................................................................. - sub v8.8H, v16.8H, v11.8H // ...........................................*.................................................. - // gap // .............................................................................................. - // gap // .............................................................................................. - sqrdmulh v22.8H, v21.8H, v0.H[3] // ....................................*......................................................... - add v16.8H, v16.8H, v11.8H // ............................................*................................................. - // gap // .............................................................................................. - // gap // .............................................................................................. - // gap // .............................................................................................. - // gap // .............................................................................................. - mul v21.8H, v8.8H, v0.H[4] // .............................................*................................................ - // gap // .............................................................................................. - // gap // .............................................................................................. - sub v19.8H, v14.8H, v16.8H // ...........................................................*.................................. - // gap // .............................................................................................. - // gap // .............................................................................................. - add v16.8H, v14.8H, v16.8H // ............................................................*................................. 
- sqrdmulh v31.8H, v8.8H, v0.H[5] // ..............................................*............................................... - // gap // .............................................................................................. - // gap // .............................................................................................. - // gap // .............................................................................................. - // gap // .............................................................................................. - // gap // .............................................................................................. - // gap // .............................................................................................. - sqdmulh v11.8H, v13.8H, v7.H[1] // ...................................................*.......................................... - // gap // .............................................................................................. - // gap // .............................................................................................. - // gap // .............................................................................................. - mls v20.8H, v25.8H, v7.H[0] // ..................................................*........................................... - // gap // .............................................................................................. - // gap // .............................................................................................. - // gap // .............................................................................................. - // gap // .............................................................................................. - // gap // .............................................................................................. 
- sqrdmulh v4.8H, v4.8H, v0.H[3] // ...............................*.............................................................. - // gap // .............................................................................................. - // gap // .............................................................................................. - srshr v2.8H, v11.8H, #11 // ....................................................*......................................... - // gap // .............................................................................................. - // gap // .............................................................................................. - mls v3.8H, v12.8H, v7.H[0] // ..........................................*................................................... - // gap // .............................................................................................. - // gap // .............................................................................................. - // gap // .............................................................................................. - // gap // .............................................................................................. - // gap // .............................................................................................. - mls v13.8H, v2.8H, v7.H[0] // .....................................................*........................................ - // gap // .............................................................................................. - // gap // .............................................................................................. - // gap // .............................................................................................. - // gap // .............................................................................................. 
- // gap // .............................................................................................. - sqrdmulh v11.8H, v19.8H, v0.H[1] // ..............................................................*............................... - // gap // .............................................................................................. - // gap // .............................................................................................. - // gap // .............................................................................................. - // gap // .............................................................................................. - // gap // .............................................................................................. - mls v26.8H, v4.8H, v7.H[0] // ................................*............................................................. - // gap // .............................................................................................. - // gap // .............................................................................................. - // gap // .............................................................................................. - // gap // .............................................................................................. - // gap // .............................................................................................. - mls v21.8H, v31.8H, v7.H[0] // ...............................................*.............................................. - // gap // .............................................................................................. - // gap // .............................................................................................. - sub v2.8H, v20.8H, v13.8H // ......................................................*....................................... 
- // gap // .............................................................................................. - // gap // .............................................................................................. - add v23.8H, v20.8H, v13.8H // .......................................................*...................................... - mls v17.8H, v22.8H, v7.H[0] // .....................................*........................................................ - // gap // .............................................................................................. - sub v20.8H, v26.8H, v3.8H // ................................................................*............................. - // gap // .............................................................................................. - // gap // .............................................................................................. - add v26.8H, v26.8H, v3.8H // .................................................................*............................ - mul v25.8H, v19.8H, v0.H[0] // .............................................................*................................ - // gap // .............................................................................................. - // gap // .............................................................................................. - // gap // .............................................................................................. - // gap // .............................................................................................. - mul v5.8H, v2.8H, v0.H[0] // ........................................................*..................................... - // gap // .............................................................................................. - // gap // .............................................................................................. 
- sub v9.8H, v17.8H, v21.8H // .....................................................................*........................ - // gap // .............................................................................................. - // gap // .............................................................................................. - add v21.8H, v17.8H, v21.8H // ......................................................................*....................... - // gap // .............................................................................................. - sqrdmulh v6.8H, v2.8H, v0.H[1] // .........................................................*.................................... - // gap // .............................................................................................. - ldr q17, [x0, #464] // .......e...................................................................................... - // gap // .............................................................................................. - sqrdmulh v19.8H, v16.8H, v30.8H // ..................................................................................*........... - // gap // .............................................................................................. - // gap // .............................................................................................. - // gap // .............................................................................................. - // gap // .............................................................................................. - // gap // .............................................................................................. - // gap // .............................................................................................. - sqrdmulh v8.8H, v26.8H, v30.8H // .....................................................................................*........ 
- // gap // .............................................................................................. - sub v12.8H, v27.8H, v17.8H // .......................e...................................................................... - // gap // .............................................................................................. - // gap // .............................................................................................. - mls v5.8H, v6.8H, v7.H[0] // ..........................................................*................................... - // gap // .............................................................................................. - // gap // .............................................................................................. - // gap // .............................................................................................. - // gap // .............................................................................................. - // gap // .............................................................................................. - mls v25.8H, v11.8H, v7.H[0] // ...............................................................*.............................. - // gap // .............................................................................................. - // gap // .............................................................................................. - // gap // .............................................................................................. - // gap // .............................................................................................. - // gap // .............................................................................................. - mul v4.8H, v20.8H, v0.H[0] // ..................................................................*........................... 
- // gap // .............................................................................................. - // gap // .............................................................................................. - str q5, [x0, #256] // ..........................................................................*................... - // gap // .............................................................................................. - // gap // .............................................................................................. - sqrdmulh v20.8H, v20.8H, v0.H[1] // ...................................................................*.......................... - // gap // .............................................................................................. - // gap // .............................................................................................. - str q25, [x0, #320] // ...........................................................................*.................. - // gap // .............................................................................................. - // gap // .............................................................................................. - ldr q25, [x0, #208] // ...e.......................................................................................... - mul v11.8H, v9.8H, v0.H[0] // .......................................................................*...................... - // gap // .............................................................................................. - // gap // .............................................................................................. - // gap // .............................................................................................. - // gap // .............................................................................................. 
- sqrdmulh v2.8H, v9.8H, v0.H[1] // ........................................................................*..................... - // gap // .............................................................................................. - // gap // .............................................................................................. - // gap // .............................................................................................. - // gap // .............................................................................................. - // gap // .............................................................................................. - mls v4.8H, v20.8H, v7.H[0] // ....................................................................*......................... - // gap // .............................................................................................. - // gap // .............................................................................................. - // gap // .............................................................................................. - // gap // .............................................................................................. - // gap // .............................................................................................. - mul v20.8H, v23.8H, v29.8H // ..............................................................................*............... - // gap // .............................................................................................. - // gap // .............................................................................................. - // gap // .............................................................................................. - // gap // .............................................................................................. - // gap // .............................................................................................. 
- // gap // .............................................................................................. - mls v11.8H, v2.8H, v7.H[0] // .........................................................................*.................... - // gap // .............................................................................................. - str q4, [x0, #384] // ............................................................................*................. - // gap // .............................................................................................. - // gap // .............................................................................................. - // gap // .............................................................................................. - mul v2.8H, v16.8H, v29.8H // .................................................................................*............ - // gap // .............................................................................................. - // gap // .............................................................................................. - // gap // .............................................................................................. - // gap // .............................................................................................. - mls v2.8H, v19.8H, v7.H[0] // ...................................................................................*.......... - // gap // .............................................................................................. - // gap // .............................................................................................. - // gap // .............................................................................................. - str q11, [x0, #448] // .............................................................................*................ 
- // gap // .............................................................................................. - // gap // .............................................................................................. - sqrdmulh v9.8H, v23.8H, v30.8H // ...............................................................................*.............. - // gap // .............................................................................................. - // gap // .............................................................................................. - // gap // .............................................................................................. - // gap // .............................................................................................. - mul v23.8H, v26.8H, v29.8H // ....................................................................................*......... - // gap // .............................................................................................. - // gap // .............................................................................................. - str q2, [x0, #64] // ...........................................................................................*.. - // gap // .............................................................................................. - // gap // .............................................................................................. - sqrdmulh v16.8H, v21.8H, v30.8H // ........................................................................................*..... - // gap // .............................................................................................. - // gap // .............................................................................................. - // gap // .............................................................................................. 
- // gap // .............................................................................................. - // gap // .............................................................................................. - mul v2.8H, v21.8H, v29.8H // .......................................................................................*...... - // gap // .............................................................................................. - // gap // .............................................................................................. - // gap // .............................................................................................. - // gap // .............................................................................................. - // gap // .............................................................................................. - mls v20.8H, v9.8H, v7.H[0] // ................................................................................*............. - // gap // .............................................................................................. - // gap // .............................................................................................. - // gap // .............................................................................................. - // gap // .............................................................................................. - // gap // .............................................................................................. - mls v2.8H, v16.8H, v7.H[0] // .........................................................................................*.... - ldr q16, [x0, #16] // e............................................................................................. - // gap // .............................................................................................. - // gap // .............................................................................................. 
- // gap // .............................................................................................. - // gap // .............................................................................................. - mls v23.8H, v8.8H, v7.H[0] // ......................................................................................*....... - // gap // .............................................................................................. - // gap // .............................................................................................. - str q20, [x0], #(16) // ..........................................................................................*... - // gap // .............................................................................................. - // gap // .............................................................................................. - sqrdmulh v14.8H, v12.8H, v1.H[5] // ..........................e................................................................... - sub v8.8H, v16.8H, v18.8H // ........e..................................................................................... - // gap // .............................................................................................. - add v20.8H, v27.8H, v17.8H // ........................e..................................................................... - ldr q17, [x0, #128] // ..e........................................................................................... - // gap // .............................................................................................. - mul v11.8H, v12.8H, v1.H[4] // .........................e.................................................................... - // gap // .............................................................................................. - // gap // .............................................................................................. 
- // gap // .............................................................................................. - str q2, [x0, #176] // .............................................................................................* - // gap // .............................................................................................. - sqrdmulh v19.8H, v8.8H, v0.H[7] // ...........e.................................................................................. - str q23, [x0, #112] // ............................................................................................*. - add v3.8H, v16.8H, v18.8H // .........e.................................................................................... + mls v11.8H, v13.8H, v7.H[0] // ...........................*............................................................ + sub v28.8H, v5.8H, v2.8H // ..................*..................................................................... + // gap // ........................................................................................ + ldr q14, [x0, #400] // ......e................................................................................. + // gap // ........................................................................................ + sub v5.8H, v25.8H, v27.8H // ............................*........................................................... + mul v19.8H, v23.8H, v0.H[6] // ..........*............................................................................. + add v27.8H, v25.8H, v27.8H // .............................*.......................................................... + // gap // ........................................................................................ + add v20.8H, v17.8H, v24.8H // .......................................*................................................ + // gap // ........................................................................................ 
+ // gap // ........................................................................................ + sub v6.8H, v17.8H, v24.8H // ......................................*................................................. + // gap // ........................................................................................ + sqrdmulh v31.8H, v28.8H, v1.H[3] // .....................*.................................................................. + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + sqrdmulh v22.8H, v23.8H, v0.H[7] // ...........*............................................................................ + // gap // ........................................................................................ + sub v4.8H, v27.8H, v20.8H // ................................................*....................................... + add v27.8H, v27.8H, v20.8H // .................................................*...................................... + // gap // ........................................................................................ + // gap // ........................................................................................ + mul v13.8H, v6.8H, v0.H[4] // ........................................*............................................... + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ 
+ // gap // ........................................................................................ + mul v28.8H, v28.8H, v1.H[2] // ....................*................................................................... + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + mul v20.8H, v9.8H, v1.H[0] // ...............*........................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + mls v20.8H, v16.8H, v7.H[0] // .................*...................................................................... + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ 
+ // gap // ........................................................................................ + // gap // ........................................................................................ + mul v24.8H, v5.8H, v0.H[2] // ..............................*......................................................... + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + mls v19.8H, v22.8H, v7.H[0] // ............*........................................................................... + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + mls v28.8H, v31.8H, v7.H[0] // ......................*................................................................. + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ 
+ sqrdmulh v22.8H, v6.8H, v0.H[5] // .........................................*.............................................. + // gap // ........................................................................................ + // gap // ........................................................................................ + sub v26.8H, v19.8H, v20.8H // .................................*...................................................... + // gap // ........................................................................................ + // gap // ........................................................................................ + add v18.8H, v19.8H, v20.8H // ..................................*..................................................... + sqrdmulh v19.8H, v5.8H, v0.H[3] // ...............................*........................................................ + // gap // ........................................................................................ + add v23.8H, v28.8H, v11.8H // ............................................*........................................... + // gap // ........................................................................................ + // gap // ........................................................................................ + sub v10.8H, v28.8H, v11.8H // ...........................................*............................................ + sqrdmulh v11.8H, v4.8H, v0.H[1] // ...................................................*.................................... + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ 
+ add v6.8H, v18.8H, v23.8H // ......................................................*................................. + // gap // ........................................................................................ + mul v8.8H, v26.8H, v0.H[2] // ...................................*.................................................... + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + sqrdmulh v28.8H, v26.8H, v0.H[3] // ....................................*................................................... + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + ldr q26, [x0, #464] // .......e................................................................................ + // gap // ........................................................................................ + mls v13.8H, v22.8H, v7.H[0] // ..........................................*............................................. + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ 
+ mul v22.8H, v4.8H, v0.H[0] // ..................................................*..................................... + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + mls v24.8H, v19.8H, v7.H[0] // ................................*....................................................... + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + mls v8.8H, v28.8H, v7.H[0] // .....................................*.................................................. + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ 
+ mul v28.8H, v10.8H, v0.H[4] // .............................................*.......................................... + // gap // ........................................................................................ + // gap // ........................................................................................ + add v5.8H, v24.8H, v13.8H // ...........................................................*............................ + // gap // ........................................................................................ + // gap // ........................................................................................ + mls v22.8H, v11.8H, v7.H[0] // ....................................................*................................... + // gap // ........................................................................................ + // gap // ........................................................................................ + sub v3.8H, v24.8H, v13.8H // ..........................................................*............................. + // gap // ........................................................................................ + // gap // ........................................................................................ + sub v24.8H, v18.8H, v23.8H // .....................................................*.................................. + sqrdmulh v23.8H, v10.8H, v0.H[5] // ..............................................*......................................... + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ 
+ sqrdmulh v25.8H, v3.8H, v0.H[1] // .............................................................*.......................... + // gap // ........................................................................................ + // gap // ........................................................................................ + str q22, [x0, #256] // ....................................................................*................... + // gap // ........................................................................................ + // gap // ........................................................................................ + mul v22.8H, v3.8H, v0.H[0] // ............................................................*........................... + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + mls v28.8H, v23.8H, v7.H[0] // ...............................................*........................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ 
+ mls v22.8H, v25.8H, v7.H[0] // ..............................................................*......................... + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + mul v3.8H, v27.8H, v29.8H // ........................................................................*............... + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + sqrdmulh v27.8H, v27.8H, v30.8H // .........................................................................*.............. + // gap // ........................................................................................ + // gap // ........................................................................................ + str q22, [x0, #384] // ......................................................................*................. + // gap // ........................................................................................ + // gap // ........................................................................................ 
+ mul v22.8H, v6.8H, v29.8H // ...........................................................................*............ + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + sub v10.8H, v8.8H, v28.8H // ...............................................................*........................ + sqrdmulh v19.8H, v6.8H, v30.8H // ............................................................................*........... + add v28.8H, v8.8H, v28.8H // ................................................................*....................... + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + mls v3.8H, v27.8H, v7.H[0] // ..........................................................................*............. + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ 
+ // gap // ........................................................................................ + // gap // ........................................................................................ + sqrdmulh v23.8H, v24.8H, v0.H[1] // ........................................................*............................... + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + sqrdmulh v20.8H, v10.8H, v0.H[1] // ..................................................................*..................... + // gap // ........................................................................................ + // gap // ........................................................................................ + str q3, [x0], #(16) // ....................................................................................*... + // gap // ........................................................................................ + // gap // ........................................................................................ + mul v11.8H, v24.8H, v0.H[0] // .......................................................*................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ 
+ mls v11.8H, v23.8H, v7.H[0] // .........................................................*.............................. + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + sqrdmulh v23.8H, v5.8H, v30.8H // ...............................................................................*........ + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + mul v27.8H, v5.8H, v29.8H // ..............................................................................*......... + // gap // ........................................................................................ + // gap // ........................................................................................ + str q11, [x0, #304] // .....................................................................*.................. + // gap // ........................................................................................ + // gap // ........................................................................................ 
+ mul v24.8H, v10.8H, v0.H[0] // .................................................................*...................... + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + sqrdmulh v8.8H, v28.8H, v30.8H // ..................................................................................*..... + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + mls v27.8H, v23.8H, v7.H[0] // ................................................................................*....... + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ 
+ mul v23.8H, v28.8H, v29.8H // .................................................................................*...... + ldr q2, [x0, #320] // .....e.................................................................................. + // gap // ........................................................................................ + // gap // ........................................................................................ + ldr q28, [x0, #64] // .e...................................................................................... + // gap // ........................................................................................ + mls v23.8H, v8.8H, v7.H[0] // ...................................................................................*.... + // gap // ........................................................................................ + ldr q5, [x0, #256] // ....e................................................................................... + str q27, [x0, #112] // ......................................................................................*. + sub v8.8H, v14.8H, v26.8H // .......................e................................................................ + ldr q3, [x0, #128] // ..e..................................................................................... + mls v24.8H, v20.8H, v7.H[0] // ...................................................................*.................... + ldr q20, [x0, #192] // ...e.................................................................................... + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ 
+ mls v22.8H, v19.8H, v7.H[0] // .............................................................................*.......... + // gap // ........................................................................................ + add v17.8H, v5.8H, v2.8H // ...................e.................................................................... + str q23, [x0, #176] // .......................................................................................* + // gap // ........................................................................................ + // gap // ........................................................................................ + sub v9.8H, v3.8H, v20.8H // .............e.......................................................................... + ldr q23, [x0, #0] // e....................................................................................... + sqrdmulh v13.8H, v8.8H, v1.H[5] // ..........................e............................................................. + str q24, [x0, #432] // .......................................................................*................ + add v24.8H, v14.8H, v26.8H // ........................e............................................................... + // gap // ........................................................................................ + mul v11.8H, v8.8H, v1.H[4] // .........................e.............................................................. + add v27.8H, v3.8H, v20.8H // ..............e......................................................................... + // gap // ........................................................................................ + str q22, [x0, #48] // .....................................................................................*.. + // gap // ........................................................................................ 
+ // gap // ........................................................................................ + sqrdmulh v16.8H, v9.8H, v1.H[1] // ................e....................................................................... + add v25.8H, v23.8H, v28.8H // .........e.............................................................................. + // gap // ........................................................................................ + sub v23.8H, v23.8H, v28.8H // ........e............................................................................... + // gap // ........................................................................................ + // gap // ........................................................................................ // original source code - // ldr q8, [x0, #0] // ................................................................................e...........|.................................................................................e.......... - // ldr q9, [x0, #(1*(512/8))] // e...........................................................................................|.e.......................................................................................... - // ldr q10, [x0, #(2*(512/8))] // ......................................................................................e.....|.......................................................................................e.... - // ldr q11, [x0, #(3*(512/8))] // ...............................................................e............................|................................................................e........................... - // ldr q12, [x0, #(4*(512/8))] // .........................e..................................................................|..........................e................................................................. 
- // ldr q13, [x0, #(5*(512/8))] // ............................................................................................|*........................................................................................... - // ldr q14, [x0, #(6*(512/8))] // ...e........................................................................................|....e....................................................................................... - // ldr q15, [x0, #(7*(512/8))] // .....................................................e......................................|......................................................e..................................... - // sub v24.8h, v8.8h, v9.8h // ....................................................................................e.......|.....................................................................................e...... - // add v8.8h, v8.8h, v9.8h // ...........................................................................................e|............................................................................................ - // mul v9.8h, v24.8h, v0.h[6] // .......*....................................................................................|........*................................................................................... - // sqrdmulh v24.8h, v24.8h, v0.h[7] // .........................................................................................e..|..........................................................................................e. - // mls v9.8h, v24.8h, v7.h[0] // ........*...................................................................................|.........*.................................................................................. - // sub v24.8h, v10.8h, v11.8h // ............................................................................................*............................................................................................ 
- // add v10.8h, v10.8h, v11.8h // .*..........................................................................................|..*......................................................................................... - // mul v11.8h, v24.8h, v1.h[0] // ....*.......................................................................................|.....*...................................................................................... - // sqrdmulh v24.8h, v24.8h, v1.h[1] // ...............*............................................................................|................*........................................................................... - // mls v11.8h, v24.8h, v7.h[0] // ..................*.........................................................................|...................*........................................................................ - // sub v24.8h, v12.8h, v13.8h // .....*......................................................................................|......*..................................................................................... - // add v12.8h, v12.8h, v13.8h // ......*.....................................................................................|.......*.................................................................................... - // mul v13.8h, v24.8h, v1.h[2] // ...........*................................................................................|............*............................................................................... - // sqrdmulh v24.8h, v24.8h, v1.h[3] // .................*..........................................................................|..................*......................................................................... 
- // mls v13.8h, v24.8h, v7.h[0] // .....................*......................................................................|......................*..................................................................... - // sub v24.8h, v14.8h, v15.8h // ........................................................e...................................|.........................................................e.................................. - // add v14.8h, v14.8h, v15.8h // .....................................................................................e......|......................................................................................e..... - // mul v15.8h, v24.8h, v1.h[4] // .......................................................................................e....|........................................................................................e... - // sqrdmulh v24.8h, v24.8h, v1.h[5] // ...................................................................................e........|....................................................................................e....... - // mls v15.8h, v24.8h, v7.h[0] // ..*.........................................................................................|...*........................................................................................ - // sub v24.8h, v8.8h, v10.8h // .............*..............................................................................|..............*............................................................................. - // add v8.8h, v8.8h, v10.8h // ............*...............................................................................|.............*.............................................................................. - // mul v10.8h, v24.8h, v0.h[2] // ....................*.......................................................................|.....................*...................................................................... 
- // sqrdmulh v24.8h, v24.8h, v0.h[3] // ....................................*.......................................................|.....................................*...................................................... - // mls v10.8h, v24.8h, v7.h[0] // .........................................*..................................................|..........................................*................................................. - // sub v24.8h, v9.8h, v11.8h // ......................*.....................................................................|.......................*.................................................................... - // add v9.8h, v9.8h, v11.8h // ........................*...................................................................|.........................*.................................................................. - // mul v11.8h, v24.8h, v0.h[2] // ..........................*.................................................................|...........................*................................................................ - // sqrdmulh v24.8h, v24.8h, v0.h[3] // ............................*...............................................................|.............................*.............................................................. - // mls v11.8h, v24.8h, v7.h[0] // .............................................*..............................................|..............................................*............................................. - // sub v24.8h, v12.8h, v14.8h // .........*..................................................................................|..........*................................................................................. 
- // add v12.8h, v12.8h, v14.8h // ..........*.................................................................................|...........*................................................................................ - // mul v14.8h, v24.8h, v0.h[4] // .......................*....................................................................|........................*................................................................... - // sqrdmulh v24.8h, v24.8h, v0.h[5] // ..............*.............................................................................|...............*............................................................................ - // mls v14.8h, v24.8h, v7.h[0] // ......................................*.....................................................|.......................................*.................................................... - // sub v24.8h, v13.8h, v15.8h // ...........................*................................................................|............................*............................................................... - // add v13.8h, v13.8h, v15.8h // .............................*..............................................................|..............................*............................................................. - // mul v15.8h, v24.8h, v0.h[4] // ..............................*.............................................................|...............................*............................................................ - // sqrdmulh v24.8h, v24.8h, v0.h[5] // .................................*..........................................................|..................................*......................................................... 
- // mls v15.8h, v24.8h, v7.h[0] // ..........................................*.................................................|...........................................*................................................ - // sqdmulh v25.8h, v8.8h, v7.h[1] // ................*...........................................................................|.................*.......................................................................... - // srshr v25.8h, v25.8h, #11 // ...................*........................................................................|....................*....................................................................... - // mls v8.8h, v25.8h, v7.h[0] // ...................................*........................................................|....................................*....................................................... - // sqdmulh v25.8h, v12.8h, v7.h[1] // ..................................*.........................................................|...................................*........................................................ - // srshr v25.8h, v25.8h, #11 // .....................................*......................................................|......................................*..................................................... - // mls v12.8h, v25.8h, v7.h[0] // .......................................*....................................................|........................................*................................................... - // sub v24.8h, v8.8h, v12.8h // ...........................................*................................................|............................................*............................................... - // add v8.8h, v8.8h, v12.8h // ............................................*...............................................|.............................................*.............................................. 
- // mul v12.8h, v24.8h, v0.h[0] // .................................................*..........................................|..................................................*......................................... - // sqrdmulh v24.8h, v24.8h, v0.h[1] // ....................................................*.......................................|.....................................................*...................................... - // mls v12.8h, v24.8h, v7.h[0] // .........................................................*..................................|..........................................................*................................. - // sub v24.8h, v9.8h, v13.8h // ...............................*............................................................|................................*........................................................... - // add v9.8h, v9.8h, v13.8h // ................................*...........................................................|.................................*.......................................................... - // mul v13.8h, v24.8h, v0.h[0] // ................................................*...........................................|.................................................*.......................................... - // sqrdmulh v24.8h, v24.8h, v0.h[1] // ........................................*...................................................|.........................................*.................................................. - // mls v13.8h, v24.8h, v7.h[0] // ..........................................................*.................................|...........................................................*................................ 
- // sub v24.8h, v10.8h, v14.8h // ..............................................*.............................................|...............................................*............................................ - // add v10.8h, v10.8h, v14.8h // ...............................................*............................................|................................................*........................................... - // mul v14.8h, v24.8h, v0.h[0] // ...........................................................*................................|............................................................*............................... - // sqrdmulh v24.8h, v24.8h, v0.h[1] // .............................................................*..............................|..............................................................*............................. - // mls v14.8h, v24.8h, v7.h[0] // ..................................................................*.........................|...................................................................*........................ - // sub v24.8h, v11.8h, v15.8h // ..................................................*.........................................|...................................................*........................................ - // add v11.8h, v11.8h, v15.8h // ...................................................*........................................|....................................................*....................................... - // mul v15.8h, v24.8h, v0.h[0] // ................................................................*...........................|.................................................................*.......................... 
- // sqrdmulh v24.8h, v24.8h, v0.h[1] // .................................................................*..........................|..................................................................*......................... - // mls v15.8h, v24.8h, v7.h[0] // ....................................................................*.......................|.....................................................................*...................... - // str q12, [x0, #(4*(512/8))] // ............................................................*...............................|.............................................................*.............................. - // str q13, [x0, #(5*(512/8))] // ..............................................................*.............................|...............................................................*............................ - // str q14, [x0, #(6*(512/8))] // .....................................................................*......................|......................................................................*..................... - // str q15, [x0, #(7*(512/8))] // ........................................................................*...................|.........................................................................*.................. - // mul v12.8h, v8.8h, v29.8h // ...................................................................*........................|....................................................................*....................... - // sqrdmulh v8.8h, v8.8h, v30.8h // .........................................................................*..................|..........................................................................*................. 
- // mls v12.8h, v8.8h, v7.h[0] // ..............................................................................*.............|...............................................................................*............ - // mul v13.8h, v9.8h, v29.8h // ......................................................................*.....................|.......................................................................*.................... - // sqrdmulh v9.8h, v9.8h, v30.8h // ......................................................*.....................................|.......................................................*.................................... - // mls v13.8h, v9.8h, v7.h[0] // .......................................................................*....................|........................................................................*................... - // mul v14.8h, v10.8h, v29.8h // ..........................................................................*.................|...........................................................................*................ - // sqrdmulh v10.8h, v10.8h, v30.8h // .......................................................*....................................|........................................................*................................... - // mls v14.8h, v10.8h, v7.h[0] // .................................................................................*..........|..................................................................................*......... - // mul v15.8h, v11.8h, v29.8h // .............................................................................*..............|..............................................................................*............. 
- // sqrdmulh v11.8h, v11.8h, v30.8h // ............................................................................*...............|.............................................................................*.............. - // mls v15.8h, v11.8h, v7.h[0] // ...............................................................................*............|................................................................................*........... - // str q12, [x0], #(16) // ..................................................................................*.........|...................................................................................*........ - // str q13, [x0, #(-16 + 1*(512/8))] // ...........................................................................*................|............................................................................*............... - // str q14, [x0, #(-16 + 2*(512/8))] // ..........................................................................................*.|...........................................................................................* - // str q15, [x0, #(-16 + 3*(512/8))] // ........................................................................................*...|.........................................................................................*.. + // ldr q8, [x0, #0] // ............................................................................e.........|.............................................................................e...... + // ldr q9, [x0, #(1*(512/8))] // ................................................................e.....................|.................................................................e.................. + // ldr q10, [x0, #(2*(512/8))] // .....................................................................e................|......................................................................e............. 
+ // ldr q11, [x0, #(3*(512/8))] // .......................................................................e..............|........................................................................e........... + // ldr q12, [x0, #(4*(512/8))] // ..................................................................e...................|...................................................................e................ + // ldr q13, [x0, #(5*(512/8))] // ...............................................................e......................|................................................................e................... + // ldr q14, [x0, #(6*(512/8))] // e.....................................................................................|.e.................................................................................. + // ldr q15, [x0, #(7*(512/8))] // ...........................e..........................................................|............................e....................................................... + // sub v24.8h, v8.8h, v9.8h // .....................................................................................e|.................................................................................... + // add v8.8h, v8.8h, v9.8h // ....................................................................................e.|.................................................................................... + // mul v9.8h, v24.8h, v0.h[6] // ..*...................................................................................|...*................................................................................ + // sqrdmulh v24.8h, v24.8h, v0.h[7] // .......*..............................................................................|........*........................................................................... 
+ // mls v9.8h, v24.8h, v7.h[0] // ...............*......................................................................|................*................................................................... + // sub v24.8h, v10.8h, v11.8h // ...........................................................................e..........|............................................................................e....... + // add v10.8h, v10.8h, v11.8h // .................................................................................e....|..................................................................................e. + // mul v11.8h, v24.8h, v1.h[0] // ............*.........................................................................|.............*...................................................................... + // sqrdmulh v24.8h, v24.8h, v1.h[1] // ...................................................................................e..|.................................................................................... + // mls v11.8h, v24.8h, v7.h[0] // .............*........................................................................|..............*..................................................................... + // sub v24.8h, v12.8h, v13.8h // ......................................................................................|*................................................................................... + // add v12.8h, v12.8h, v13.8h // .........................................................................e............|..........................................................................e......... + // mul v13.8h, v24.8h, v1.h[2] // ...........*..........................................................................|............*....................................................................... 
+ // sqrdmulh v24.8h, v24.8h, v1.h[3] // ......*...............................................................................|.......*............................................................................ + // mls v13.8h, v24.8h, v7.h[0] // ................*.....................................................................|.................*.................................................................. + // sub v24.8h, v14.8h, v15.8h // ....................................................................e.................|.....................................................................e.............. + // add v14.8h, v14.8h, v15.8h // ...............................................................................e......|................................................................................e... + // mul v15.8h, v24.8h, v1.h[4] // ................................................................................e.....|.................................................................................e.. + // sqrdmulh v24.8h, v24.8h, v1.h[5] // .............................................................................e........|..............................................................................e..... + // mls v15.8h, v24.8h, v7.h[0] // ......................................................................................*.................................................................................... + // sub v24.8h, v8.8h, v10.8h // .*....................................................................................|..*................................................................................. + // add v8.8h, v8.8h, v10.8h // ...*..................................................................................|....*............................................................................... 
+ // mul v10.8h, v24.8h, v0.h[2] // ..............*.......................................................................|...............*.................................................................... + // sqrdmulh v24.8h, v24.8h, v0.h[3] // ....................*.................................................................|.....................*.............................................................. + // mls v10.8h, v24.8h, v7.h[0] // ..............................*.......................................................|...............................*.................................................... + // sub v24.8h, v9.8h, v11.8h // ..................*...................................................................|...................*................................................................ + // add v9.8h, v9.8h, v11.8h // ...................*..................................................................|....................*............................................................... + // mul v11.8h, v24.8h, v0.h[2] // .........................*............................................................|..........................*......................................................... + // sqrdmulh v24.8h, v24.8h, v0.h[3] // ..........................*...........................................................|...........................*........................................................ + // mls v11.8h, v24.8h, v7.h[0] // ...............................*......................................................|................................*................................................... + // sub v24.8h, v12.8h, v14.8h // .....*................................................................................|......*............................................................................. 
+ // add v12.8h, v12.8h, v14.8h // ....*.................................................................................|.....*.............................................................................. + // mul v14.8h, v24.8h, v0.h[4] // ..........*...........................................................................|...........*........................................................................ + // sqrdmulh v24.8h, v24.8h, v0.h[5] // .................*....................................................................|..................*................................................................. + // mls v14.8h, v24.8h, v7.h[0] // ............................*.........................................................|.............................*...................................................... + // sub v24.8h, v13.8h, v15.8h // ......................*...............................................................|.......................*............................................................ + // add v13.8h, v13.8h, v15.8h // .....................*................................................................|......................*............................................................. + // mul v15.8h, v24.8h, v0.h[4] // ................................*.....................................................|.................................*.................................................. + // sqrdmulh v24.8h, v24.8h, v0.h[5] // .....................................*................................................|......................................*............................................. + // mls v15.8h, v24.8h, v7.h[0] // .........................................*............................................|..........................................*......................................... 
+ // sub v24.8h, v8.8h, v12.8h // ........*.............................................................................|.........*.......................................................................... + // add v8.8h, v8.8h, v12.8h // .........*............................................................................|..........*......................................................................... + // mul v12.8h, v24.8h, v0.h[0] // .............................*........................................................|..............................*..................................................... + // sqrdmulh v24.8h, v24.8h, v0.h[1] // .......................*..............................................................|........................*........................................................... + // mls v12.8h, v24.8h, v7.h[0] // ..................................*...................................................|...................................*................................................ + // sub v24.8h, v9.8h, v13.8h // ....................................*.................................................|.....................................*.............................................. + // add v9.8h, v9.8h, v13.8h // ........................*.............................................................|.........................*.......................................................... + // mul v13.8h, v24.8h, v0.h[0] // ......................................................*...............................|.......................................................*............................ + // sqrdmulh v24.8h, v24.8h, v0.h[1] // ...................................................*..................................|....................................................*............................... 
+ // mls v13.8h, v24.8h, v7.h[0] // .......................................................*..............................|........................................................*........................... + // sub v24.8h, v10.8h, v14.8h // ...................................*..................................................|....................................*............................................... + // add v10.8h, v10.8h, v14.8h // .................................*....................................................|..................................*................................................. + // mul v14.8h, v24.8h, v0.h[0] // ........................................*.............................................|.........................................*.......................................... + // sqrdmulh v24.8h, v24.8h, v0.h[1] // ......................................*...............................................|.......................................*............................................ + // mls v14.8h, v24.8h, v7.h[0] // ..........................................*...........................................|...........................................*........................................ + // sub v24.8h, v11.8h, v15.8h // ...............................................*......................................|................................................*................................... + // add v11.8h, v11.8h, v15.8h // .................................................*....................................|..................................................*................................. + // mul v15.8h, v24.8h, v0.h[0] // ...........................................................*..........................|............................................................*....................... 
+ // sqrdmulh v24.8h, v24.8h, v0.h[1] // ....................................................*.................................|.....................................................*.............................. + // mls v15.8h, v24.8h, v7.h[0] // ......................................................................*...............|.......................................................................*............ + // str q12, [x0, #(4*(512/8))] // .......................................*..............................................|........................................*........................................... + // str q13, [x0, #(5*(512/8))] // ..........................................................*...........................|...........................................................*........................ + // str q14, [x0, #(6*(512/8))] // .............................................*........................................|..............................................*..................................... + // str q15, [x0, #(7*(512/8))] // ..............................................................................*.......|...............................................................................*.... + // mul v12.8h, v8.8h, v29.8h // ...........................................*..........................................|............................................*....................................... + // sqrdmulh v8.8h, v8.8h, v30.8h // ............................................*.........................................|.............................................*...................................... + // mls v12.8h, v8.8h, v7.h[0] // ..................................................*...................................|...................................................*................................ 
+ // mul v13.8h, v9.8h, v29.8h // ..............................................*.......................................|...............................................*.................................... + // sqrdmulh v9.8h, v9.8h, v30.8h // ................................................*.....................................|.................................................*.................................. + // mls v13.8h, v9.8h, v7.h[0] // ........................................................................*.............|.........................................................................*.......... + // mul v14.8h, v10.8h, v29.8h // .........................................................*............................|..........................................................*......................... + // sqrdmulh v10.8h, v10.8h, v30.8h // ........................................................*.............................|.........................................................*.......................... + // mls v14.8h, v10.8h, v7.h[0] // .............................................................*........................|..............................................................*..................... + // mul v15.8h, v11.8h, v29.8h // ..............................................................*.......................|...............................................................*.................... + // sqrdmulh v11.8h, v11.8h, v30.8h // ............................................................*.........................|.............................................................*...................... + // mls v15.8h, v11.8h, v7.h[0] // .................................................................*....................|..................................................................*................. 
+ // str q12, [x0], #(16) // .....................................................*................................|......................................................*............................. + // str q13, [x0, #(-16 + 1*(512/8))] // ..................................................................................*...|...................................................................................* + // str q14, [x0, #(-16 + 2*(512/8))] // ...................................................................*..................|....................................................................*............... + // str q15, [x0, #(-16 + 3*(512/8))] // ..........................................................................*...........|...........................................................................*........ sub count, count, #1 cbnz count, layer123_start - sub v15.8H, v17.8H, v25.8H // *............................................................................... - ldr q2, [x0, #320] // .*.............................................................................. - mul v21.8H, v8.8H, v0.H[6] // .......*........................................................................ - add v31.8H, v17.8H, v25.8H // ..*............................................................................. - // gap // ................................................................................ - // gap // ................................................................................ - mls v21.8H, v19.8H, v7.H[0] // ........*....................................................................... - // gap // ................................................................................ - // gap // ................................................................................ - // gap // ................................................................................ 
- // gap // ................................................................................ - // gap // ................................................................................ - mul v4.8H, v15.8H, v1.H[0] // ....*........................................................................... - sub v13.8H, v10.8H, v2.8H // .....*.......................................................................... - // gap // ................................................................................ - add v26.8H, v3.8H, v31.8H // ............*................................................................... - // gap // ................................................................................ - // gap // ................................................................................ - sqrdmulh v23.8H, v15.8H, v1.H[1] // ...............*................................................................ - add v2.8H, v10.8H, v2.8H // ......*......................................................................... - // gap // ................................................................................ - // gap // ................................................................................ - // gap // ................................................................................ - // gap // ................................................................................ - sqdmulh v17.8H, v26.8H, v7.H[1] // ................*............................................................... - // gap // ................................................................................ - // gap // ................................................................................ - sub v25.8H, v2.8H, v20.8H // .........*...................................................................... - // gap // ................................................................................ 
- // gap // ................................................................................ - mul v16.8H, v13.8H, v1.H[2] // ...........*.................................................................... - add v20.8H, v2.8H, v20.8H // ..........*..................................................................... - // gap // ................................................................................ - // gap // ................................................................................ - // gap // ................................................................................ - // gap // ................................................................................ - mls v4.8H, v23.8H, v7.H[0] // ..................*............................................................. - // gap // ................................................................................ - // gap // ................................................................................ - srshr v17.8H, v17.8H, #11 // ...................*............................................................ - // gap // ................................................................................ - // gap // ................................................................................ - sqrdmulh v2.8H, v13.8H, v1.H[3] // .................*.............................................................. - // gap // ................................................................................ - // gap // ................................................................................ - // gap // ................................................................................ - // gap // ................................................................................ - // gap // ................................................................................ - mls v26.8H, v17.8H, v7.H[0] // ..................................*............................................. 
- // gap // ................................................................................ - // gap // ................................................................................ - sub v13.8H, v21.8H, v4.8H // ......................*......................................................... - // gap // ................................................................................ - // gap // ................................................................................ - sqrdmulh v17.8H, v25.8H, v0.H[5] // ..............*................................................................. - // gap // ................................................................................ - add v21.8H, v21.8H, v4.8H // ........................*....................................................... - // gap // ................................................................................ - // gap // ................................................................................ - // gap // ................................................................................ - mul v25.8H, v25.8H, v0.H[4] // .......................*........................................................ - // gap // ................................................................................ - // gap // ................................................................................ - // gap // ................................................................................ - // gap // ................................................................................ - // gap // ................................................................................ - mls v11.8H, v14.8H, v7.H[0] // ...*............................................................................ - // gap // ................................................................................ - // gap // ................................................................................ 
- // gap // ................................................................................ - // gap // ................................................................................ - // gap // ................................................................................ - mls v16.8H, v2.8H, v7.H[0] // .....................*.......................................................... - sub v2.8H, v3.8H, v31.8H // .............*.................................................................. - // gap // ................................................................................ - // gap // ................................................................................ - // gap // ................................................................................ - // gap // ................................................................................ - mul v3.8H, v13.8H, v0.H[2] // .........................*...................................................... - // gap // ................................................................................ - // gap // ................................................................................ - // gap // ................................................................................ - // gap // ................................................................................ - // gap // ................................................................................ - sqrdmulh v13.8H, v13.8H, v0.H[3] // ...........................*.................................................... - // gap // ................................................................................ - // gap // ................................................................................ - sub v23.8H, v16.8H, v11.8H // ..........................*..................................................... - // gap // ................................................................................ 
- // gap // ................................................................................ - add v16.8H, v16.8H, v11.8H // ............................*................................................... - mul v11.8H, v2.8H, v0.H[2] // ....................*........................................................... - // gap // ................................................................................ - // gap // ................................................................................ - // gap // ................................................................................ - // gap // ................................................................................ - // gap // ................................................................................ - // gap // ................................................................................ - mul v10.8H, v23.8H, v0.H[4] // .............................*.................................................. - // gap // ................................................................................ - // gap // ................................................................................ - sub v4.8H, v21.8H, v16.8H // ..............................*................................................. - add v16.8H, v21.8H, v16.8H // ...............................*................................................ - // gap // ................................................................................ - sqdmulh v21.8H, v20.8H, v7.H[1] // .................................*.............................................. - // gap // ................................................................................ - // gap // ................................................................................ - // gap // ................................................................................ 
- sqrdmulh v23.8H, v23.8H, v0.H[5] // ................................*............................................... - // gap // ................................................................................ - // gap // ................................................................................ - // gap // ................................................................................ - // gap // ................................................................................ - // gap // ................................................................................ - // gap // ................................................................................ - // gap // ................................................................................ - sqrdmulh v2.8H, v2.8H, v0.H[3] // ...................................*............................................ - srshr v21.8H, v21.8H, #11 // ....................................*........................................... - // gap // ................................................................................ - // gap // ................................................................................ - mls v25.8H, v17.8H, v7.H[0] // .....................................*.......................................... - // gap // ................................................................................ - // gap // ................................................................................ - // gap // ................................................................................ - // gap // ................................................................................ - // gap // ................................................................................ - mls v20.8H, v21.8H, v7.H[0] // ......................................*......................................... - // gap // ................................................................................ 
- // gap // ................................................................................ - // gap // ................................................................................ - // gap // ................................................................................ - // gap // ................................................................................ - mls v11.8H, v2.8H, v7.H[0] // ........................................*....................................... - // gap // ................................................................................ - // gap // ................................................................................ - // gap // ................................................................................ - // gap // ................................................................................ - // gap // ................................................................................ - mls v10.8H, v23.8H, v7.H[0] // .........................................*...................................... - // gap // ................................................................................ - // gap // ................................................................................ - sub v21.8H, v26.8H, v20.8H // ..........................................*..................................... - // gap // ................................................................................ - // gap // ................................................................................ - add v26.8H, v26.8H, v20.8H // ...........................................*.................................... - // gap // ................................................................................ - sqrdmulh v20.8H, v4.8H, v0.H[1] // .......................................*........................................ - sub v2.8H, v11.8H, v25.8H // .............................................*.................................. 
- // gap // ................................................................................ - // gap // ................................................................................ - mls v3.8H, v13.8H, v7.H[0] // ............................................*................................... - add v17.8H, v11.8H, v25.8H // ..............................................*................................. - // gap // ................................................................................ - // gap // ................................................................................ - // gap // ................................................................................ - // gap // ................................................................................ - mul v23.8H, v4.8H, v0.H[0] // ...............................................*................................ - // gap // ................................................................................ - // gap // ................................................................................ - // gap // ................................................................................ - // gap // ................................................................................ - // gap // ................................................................................ - mul v4.8H, v21.8H, v0.H[0] // ................................................*............................... - // gap // ................................................................................ - // gap // ................................................................................ - sub v25.8H, v3.8H, v10.8H // .................................................*.............................. - // gap // ................................................................................ - // gap // ................................................................................ 
- add v11.8H, v3.8H, v10.8H // ..................................................*............................. - sqrdmulh v21.8H, v21.8H, v0.H[1] // ...................................................*............................ - // gap // ................................................................................ - // gap // ................................................................................ - // gap // ................................................................................ - // gap // ................................................................................ - // gap // ................................................................................ - // gap // ................................................................................ - sqrdmulh v13.8H, v16.8H, v30.8H // ....................................................*........................... - // gap // ................................................................................ - // gap // ................................................................................ - // gap // ................................................................................ - sqrdmulh v3.8H, v17.8H, v30.8H // .....................................................*.......................... - // gap // ................................................................................ - // gap // ................................................................................ - // gap // ................................................................................ - // gap // ................................................................................ - // gap // ................................................................................ - mls v4.8H, v21.8H, v7.H[0] // ......................................................*......................... - // gap // ................................................................................ 
- // gap // ................................................................................ - // gap // ................................................................................ - // gap // ................................................................................ - // gap // ................................................................................ - mls v23.8H, v20.8H, v7.H[0] // .......................................................*........................ - // gap // ................................................................................ - // gap // ................................................................................ - // gap // ................................................................................ - // gap // ................................................................................ - // gap // ................................................................................ - // gap // ................................................................................ - // gap // ................................................................................ - mul v21.8H, v2.8H, v0.H[0] // ........................................................*....................... - str q4, [x0, #256] // .........................................................*...................... - // gap // ................................................................................ - // gap // ................................................................................ - sqrdmulh v2.8H, v2.8H, v0.H[1] // ..........................................................*..................... - // gap // ................................................................................ - // gap // ................................................................................ - str q23, [x0, #320] // ...........................................................*.................... 
- // gap // ................................................................................ - // gap // ................................................................................ - // gap // ................................................................................ - // gap // ................................................................................ - mul v23.8H, v25.8H, v0.H[0] // ............................................................*................... - // gap // ................................................................................ - // gap // ................................................................................ - // gap // ................................................................................ - sqrdmulh v20.8H, v25.8H, v0.H[1] // .............................................................*.................. - // gap // ................................................................................ - // gap // ................................................................................ - // gap // ................................................................................ - // gap // ................................................................................ - // gap // ................................................................................ - mls v21.8H, v2.8H, v7.H[0] // ..............................................................*................. - // gap // ................................................................................ - // gap // ................................................................................ - // gap // ................................................................................ - // gap // ................................................................................ - // gap // ................................................................................ 
- // gap // ................................................................................ - // gap // ................................................................................ - mul v2.8H, v26.8H, v29.8H // ...............................................................*................ - // gap // ................................................................................ - // gap // ................................................................................ - // gap // ................................................................................ - mls v23.8H, v20.8H, v7.H[0] // ................................................................*............... - // gap // ................................................................................ - // gap // ................................................................................ - str q21, [x0, #384] // .................................................................*.............. - // gap // ................................................................................ - // gap // ................................................................................ - // gap // ................................................................................ - // gap // ................................................................................ - mul v16.8H, v16.8H, v29.8H // ..................................................................*............. - // gap // ................................................................................ - // gap // ................................................................................ - // gap // ................................................................................ - mls v16.8H, v13.8H, v7.H[0] // ...................................................................*............ - // gap // ................................................................................ 
- // gap // ................................................................................ - str q23, [x0, #448] // ....................................................................*........... - // gap // ................................................................................ - // gap // ................................................................................ - sqrdmulh v23.8H, v26.8H, v30.8H // .....................................................................*.......... - // gap // ................................................................................ - // gap // ................................................................................ - // gap // ................................................................................ - // gap // ................................................................................ - // gap // ................................................................................ - mul v21.8H, v17.8H, v29.8H // ......................................................................*......... - // gap // ................................................................................ - // gap // ................................................................................ - str q16, [x0, #64] // .......................................................................*........ - // gap // ................................................................................ - // gap // ................................................................................ - // gap // ................................................................................ - // gap // ................................................................................ - sqrdmulh v16.8H, v11.8H, v30.8H // ........................................................................*....... - // gap // ................................................................................ 
- // gap // ................................................................................ - // gap // ................................................................................ - mul v26.8H, v11.8H, v29.8H // .........................................................................*...... - // gap // ................................................................................ - // gap // ................................................................................ - // gap // ................................................................................ - // gap // ................................................................................ - // gap // ................................................................................ - mls v2.8H, v23.8H, v7.H[0] // ..........................................................................*..... - // gap // ................................................................................ - // gap // ................................................................................ - // gap // ................................................................................ - // gap // ................................................................................ - // gap // ................................................................................ - mls v26.8H, v16.8H, v7.H[0] // ...........................................................................*.... - // gap // ................................................................................ - // gap // ................................................................................ - // gap // ................................................................................ - // gap // ................................................................................ - // gap // ................................................................................ 
- mls v21.8H, v3.8H, v7.H[0] // ............................................................................*... - // gap // ................................................................................ - // gap // ................................................................................ - str q2, [x0], #(16) // .............................................................................*.. - // gap // ................................................................................ - // gap // ................................................................................ - // gap // ................................................................................ - // gap // ................................................................................ - // gap // ................................................................................ - str q26, [x0, #176] // ..............................................................................*. - // gap // ................................................................................ - // gap // ................................................................................ - // gap // ................................................................................ - // gap // ................................................................................ - // gap // ................................................................................ - str q21, [x0, #112] // ...............................................................................* - // gap // ................................................................................ - // gap // ................................................................................ + mls v11.8H, v13.8H, v7.H[0] // *..................................................................... + sub v28.8H, v17.8H, v24.8H // ......*............................................................... 
+ // gap // ...................................................................... + add v31.8H, v25.8H, v27.8H // ....*................................................................. + // gap // ...................................................................... + // gap // ...................................................................... + sub v10.8H, v25.8H, v27.8H // ..*................................................................... + mul v12.8H, v9.8H, v1.H[0] // .............*........................................................ + // gap // ...................................................................... + add v24.8H, v17.8H, v24.8H // .....*................................................................ + // gap // ...................................................................... + // gap // ...................................................................... + mul v14.8H, v28.8H, v0.H[4] // ...........*.......................................................... + sub v3.8H, v5.8H, v2.8H // .*.................................................................... + // gap // ...................................................................... + // gap // ...................................................................... + // gap // ...................................................................... + // gap // ...................................................................... + sqrdmulh v8.8H, v28.8H, v0.H[5] // ..................*................................................... + add v19.8H, v31.8H, v24.8H // ..........*........................................................... + // gap // ...................................................................... + sub v15.8H, v31.8H, v24.8H // .........*............................................................ + // gap // ...................................................................... 
+ // gap // ...................................................................... + sqrdmulh v22.8H, v10.8H, v0.H[3] // .....................*................................................ + // gap // ...................................................................... + // gap // ...................................................................... + // gap // ...................................................................... + // gap // ...................................................................... + // gap // ...................................................................... + mul v28.8H, v10.8H, v0.H[2] // ...............*...................................................... + // gap // ...................................................................... + // gap // ...................................................................... + // gap // ...................................................................... + // gap // ...................................................................... + // gap // ...................................................................... + mls v14.8H, v8.8H, v7.H[0] // ............................*......................................... + // gap // ...................................................................... + // gap // ...................................................................... + // gap // ...................................................................... + // gap // ...................................................................... + // gap // ...................................................................... + mls v28.8H, v22.8H, v7.H[0] // ..............................*....................................... + // gap // ...................................................................... + // gap // ...................................................................... + // gap // ...................................................................... 
+ // gap // ...................................................................... + // gap // ...................................................................... + sqrdmulh v2.8H, v23.8H, v0.H[7] // ........*............................................................. + // gap // ...................................................................... + // gap // ...................................................................... + // gap // ...................................................................... + // gap // ...................................................................... + // gap // ...................................................................... + mls v12.8H, v16.8H, v7.H[0] // ..............*....................................................... + // gap // ...................................................................... + // gap // ...................................................................... + add v27.8H, v28.8H, v14.8H // .................................*.................................... + // gap // ...................................................................... + // gap // ...................................................................... + sub v18.8H, v28.8H, v14.8H // ...................................*.................................. + mul v28.8H, v23.8H, v0.H[6] // ...*.................................................................. + // gap // ...................................................................... + // gap // ...................................................................... + // gap // ...................................................................... + // gap // ...................................................................... + sqrdmulh v23.8H, v3.8H, v1.H[3] // .......*.............................................................. + // gap // ...................................................................... 
+ // gap // ...................................................................... + // gap // ...................................................................... + // gap // ...................................................................... + // gap // ...................................................................... + mls v28.8H, v2.8H, v7.H[0] // ................*..................................................... + // gap // ...................................................................... + // gap // ...................................................................... + // gap // ...................................................................... + // gap // ...................................................................... + // gap // ...................................................................... + mul v6.8H, v3.8H, v1.H[2] // ............*......................................................... + // gap // ...................................................................... + // gap // ...................................................................... + // gap // ...................................................................... + // gap // ...................................................................... + // gap // ...................................................................... + mls v6.8H, v23.8H, v7.H[0] // .................*.................................................... + // gap // ...................................................................... + // gap // ...................................................................... + // gap // ...................................................................... + // gap // ...................................................................... + add v17.8H, v28.8H, v12.8H // ....................*................................................. 
+ mul v23.8H, v27.8H, v29.8H // .........................................................*............ + sub v28.8H, v28.8H, v12.8H // ...................*.................................................. + // gap // ...................................................................... + // gap // ...................................................................... + // gap // ...................................................................... + // gap // ...................................................................... + sqrdmulh v10.8H, v18.8H, v0.H[1] // ......................................*............................... + // gap // ...................................................................... + // gap // ...................................................................... + add v16.8H, v6.8H, v11.8H // ......................*............................................... + // gap // ...................................................................... + // gap // ...................................................................... + mul v22.8H, v18.8H, v0.H[0] // ........................................*............................. + sub v9.8H, v6.8H, v11.8H // .......................*.............................................. + // gap // ...................................................................... + // gap // ...................................................................... + // gap // ...................................................................... + // gap // ...................................................................... + add v6.8H, v17.8H, v16.8H // .........................*............................................ + sqrdmulh v20.8H, v27.8H, v30.8H // ........................................................*............. + // gap // ...................................................................... 
+ sub v14.8H, v17.8H, v16.8H // ....................................*................................. + // gap // ...................................................................... + // gap // ...................................................................... + // gap // ...................................................................... + // gap // ...................................................................... + mls v22.8H, v10.8H, v7.H[0] // ..........................................*........................... + // gap // ...................................................................... + // gap // ...................................................................... + // gap // ...................................................................... + // gap // ...................................................................... + // gap // ...................................................................... + sqrdmulh v2.8H, v14.8H, v0.H[1] // ...................................................*.................. + // gap // ...................................................................... + // gap // ...................................................................... + // gap // ...................................................................... + mul v11.8H, v6.8H, v29.8H // ..............................................*....................... + // gap // ...................................................................... + // gap // ...................................................................... + str q22, [x0, #384] // .............................................*........................ + // gap // ...................................................................... + // gap // ...................................................................... + mls v23.8H, v20.8H, v7.H[0] // .............................................................*........ 
+ // gap // ...................................................................... + // gap // ...................................................................... + // gap // ...................................................................... + // gap // ...................................................................... + // gap // ...................................................................... + mul v5.8H, v15.8H, v0.H[0] // .............................*........................................ + // gap // ...................................................................... + // gap // ...................................................................... + // gap // ...................................................................... + // gap // ...................................................................... + // gap // ...................................................................... + sqrdmulh v22.8H, v28.8H, v0.H[3] // ...........................*.......................................... + // gap // ...................................................................... + // gap // ...................................................................... + str q23, [x0, #128] // ................................................................*..... + // gap // ...................................................................... + // gap // ...................................................................... + mul v3.8H, v28.8H, v0.H[2] // ..........................*........................................... + // gap // ...................................................................... + // gap // ...................................................................... + // gap // ...................................................................... + // gap // ...................................................................... + // gap // ...................................................................... 
+ sqrdmulh v26.8H, v6.8H, v30.8H // ................................................*..................... + // gap // ...................................................................... + // gap // ...................................................................... + // gap // ...................................................................... + // gap // ...................................................................... + // gap // ...................................................................... + mls v3.8H, v22.8H, v7.H[0] // ...............................*...................................... + // gap // ...................................................................... + // gap // ...................................................................... + // gap // ...................................................................... + // gap // ...................................................................... + // gap // ...................................................................... + sqrdmulh v17.8H, v9.8H, v0.H[5] // .....................................*................................ + // gap // ...................................................................... + // gap // ...................................................................... + // gap // ...................................................................... + // gap // ...................................................................... + // gap // ...................................................................... + sqrdmulh v28.8H, v15.8H, v0.H[1] // ........................*............................................. + // gap // ...................................................................... + // gap // ...................................................................... + // gap // ...................................................................... 
+ // gap // ...................................................................... + // gap // ...................................................................... + mul v23.8H, v9.8H, v0.H[4] // ................................*..................................... + // gap // ...................................................................... + // gap // ...................................................................... + // gap // ...................................................................... + // gap // ...................................................................... + // gap // ...................................................................... + mls v23.8H, v17.8H, v7.H[0] // .........................................*............................ + // gap // ...................................................................... + // gap // ...................................................................... + // gap // ...................................................................... + // gap // ...................................................................... + // gap // ...................................................................... + mls v11.8H, v26.8H, v7.H[0] // ..................................................................*... + // gap // ...................................................................... + // gap // ...................................................................... + // gap // ...................................................................... + // gap // ...................................................................... + // gap // ...................................................................... + mls v5.8H, v28.8H, v7.H[0] // ..................................*................................... + // gap // ...................................................................... + // gap // ...................................................................... 
+ sub v24.8H, v3.8H, v23.8H // ...............................................*...................... + // gap // ...................................................................... + // gap // ...................................................................... + add v31.8H, v3.8H, v23.8H // .................................................*.................... + mul v3.8H, v14.8H, v0.H[0] // ......................................................*............... + // gap // ...................................................................... + str q11, [x0, #64] // .....................................................................* + // gap // ...................................................................... + // gap // ...................................................................... + sqrdmulh v4.8H, v24.8H, v0.H[1] // ....................................................*................. + // gap // ...................................................................... + // gap // ...................................................................... + str q5, [x0, #256] // .......................................*.............................. + // gap // ...................................................................... + // gap // ...................................................................... + mul v8.8H, v24.8H, v0.H[0] // ...........................................................*.......... + // gap // ...................................................................... + // gap // ...................................................................... + // gap // ...................................................................... + // gap // ...................................................................... + // gap // ...................................................................... + sqrdmulh v27.8H, v19.8H, v30.8H // ............................................*......................... 
+ // gap // ...................................................................... + // gap // ...................................................................... + // gap // ...................................................................... + // gap // ...................................................................... + // gap // ...................................................................... + mls v8.8H, v4.8H, v7.H[0] // .................................................................*.... + // gap // ...................................................................... + // gap // ...................................................................... + // gap // ...................................................................... + // gap // ...................................................................... + // gap // ...................................................................... + // gap // ...................................................................... + // gap // ...................................................................... + mls v3.8H, v2.8H, v7.H[0] // .......................................................*.............. + // gap // ...................................................................... + // gap // ...................................................................... + // gap // ...................................................................... + // gap // ...................................................................... + // gap // ...................................................................... + sqrdmulh v25.8H, v31.8H, v30.8H // ............................................................*......... + str q8, [x0, #448] // ....................................................................*. + // gap // ...................................................................... + // gap // ...................................................................... 
+ mul v12.8H, v31.8H, v29.8H // ..............................................................*....... + // gap // ...................................................................... + // gap // ...................................................................... + str q3, [x0, #320] // ..........................................................*........... + // gap // ...................................................................... + // gap // ...................................................................... + mul v22.8H, v19.8H, v29.8H // ...........................................*.......................... + // gap // ...................................................................... + // gap // ...................................................................... + // gap // ...................................................................... + // gap // ...................................................................... + // gap // ...................................................................... + mls v12.8H, v25.8H, v7.H[0] // ...............................................................*...... + // gap // ...................................................................... + // gap // ...................................................................... + // gap // ...................................................................... + // gap // ...................................................................... + // gap // ...................................................................... + mls v22.8H, v27.8H, v7.H[0] // ..................................................*................... + // gap // ...................................................................... + // gap // ...................................................................... + // gap // ...................................................................... 
+ // gap // ...................................................................... + // gap // ...................................................................... + // gap // ...................................................................... + // gap // ...................................................................... + // gap // ...................................................................... + str q12, [x0, #192] // ...................................................................*.. + // gap // ...................................................................... + // gap // ...................................................................... + // gap // ...................................................................... + // gap // ...................................................................... + // gap // ...................................................................... + str q22, [x0], #(16) // .....................................................*................ + // gap // ...................................................................... + // gap // ...................................................................... // original source code - // sub v26.8H, v17.8H, v25.8H // *............................................................................... - // ldr q5, [x0, #320] // .*.............................................................................. - // add v4.8H, v17.8H, v25.8H // ...*............................................................................ - // mls v11.8H, v14.8H, v7.H[0] // ......................*......................................................... - // mul v17.8H, v26.8H, v1.H[0] // .....*.......................................................................... - // sub v21.8H, v10.8H, v5.8H // ......*......................................................................... 
- // add v23.8H, v10.8H, v5.8H // .........*...................................................................... - // mul v10.8H, v8.8H, v0.H[6] // ..*............................................................................. - // mls v10.8H, v19.8H, v7.H[0] // ....*........................................................................... - // sub v28.8H, v23.8H, v20.8H // ...........*.................................................................... - // add v13.8H, v23.8H, v20.8H // .............*.................................................................. - // mul v16.8H, v21.8H, v1.H[2] // ............*................................................................... - // add v20.8H, v3.8H, v4.8H // .......*........................................................................ - // sub v4.8H, v3.8H, v4.8H // ........................*....................................................... - // sqrdmulh v12.8H, v28.8H, v0.H[5] // ...................*............................................................ - // sqrdmulh v26.8H, v26.8H, v1.H[1] // ........*....................................................................... - // sqdmulh v6.8H, v20.8H, v7.H[1] // ..........*..................................................................... - // sqrdmulh v21.8H, v21.8H, v1.H[3] // ................*............................................................... - // mls v17.8H, v26.8H, v7.H[0] // ..............*................................................................. - // srshr v25.8H, v6.8H, #11 // ...............*................................................................ - // mul v26.8H, v4.8H, v0.H[2] // .............................*.................................................. - // mls v16.8H, v21.8H, v7.H[0] // .......................*........................................................ - // sub v21.8H, v10.8H, v17.8H // ..................*............................................................. 
- // mul v3.8H, v28.8H, v0.H[4] // .....................*.......................................................... - // add v14.8H, v10.8H, v17.8H // ....................*........................................................... - // mul v17.8H, v21.8H, v0.H[2] // .........................*...................................................... - // sub v8.8H, v16.8H, v11.8H // ...........................*.................................................... - // sqrdmulh v22.8H, v21.8H, v0.H[3] // ..........................*..................................................... - // add v16.8H, v16.8H, v11.8H // ............................*................................................... - // mul v21.8H, v8.8H, v0.H[4] // ..............................*................................................. - // sub v19.8H, v14.8H, v16.8H // ...............................*................................................ - // add v16.8H, v14.8H, v16.8H // ................................*............................................... - // sqrdmulh v31.8H, v8.8H, v0.H[5] // ..................................*............................................. - // sqdmulh v11.8H, v13.8H, v7.H[1] // .................................*.............................................. - // mls v20.8H, v25.8H, v7.H[0] // .................*.............................................................. - // sqrdmulh v4.8H, v4.8H, v0.H[3] // ...................................*............................................ - // srshr v2.8H, v11.8H, #11 // ....................................*........................................... - // mls v3.8H, v12.8H, v7.H[0] // .....................................*.......................................... - // mls v13.8H, v2.8H, v7.H[0] // ......................................*......................................... - // sqrdmulh v11.8H, v19.8H, v0.H[1] // ...........................................*.................................... 
- // mls v26.8H, v4.8H, v7.H[0] // .......................................*........................................ - // mls v21.8H, v31.8H, v7.H[0] // ........................................*....................................... - // sub v2.8H, v20.8H, v13.8H // .........................................*...................................... - // add v23.8H, v20.8H, v13.8H // ..........................................*..................................... - // mls v17.8H, v22.8H, v7.H[0] // .............................................*.................................. - // sub v20.8H, v26.8H, v3.8H // ............................................*................................... - // add v26.8H, v26.8H, v3.8H // ..............................................*................................. - // mul v25.8H, v19.8H, v0.H[0] // ...............................................*................................ - // mul v5.8H, v2.8H, v0.H[0] // ................................................*............................... - // sub v9.8H, v17.8H, v21.8H // .................................................*.............................. - // add v21.8H, v17.8H, v21.8H // ..................................................*............................. - // sqrdmulh v6.8H, v2.8H, v0.H[1] // ...................................................*............................ - // sqrdmulh v19.8H, v16.8H, v30.8H // ....................................................*........................... - // sqrdmulh v8.8H, v26.8H, v30.8H // .....................................................*.......................... - // mls v5.8H, v6.8H, v7.H[0] // ......................................................*......................... - // mls v25.8H, v11.8H, v7.H[0] // .......................................................*........................ - // mul v4.8H, v20.8H, v0.H[0] // ........................................................*....................... 
- // str q5, [x0, #256] // .........................................................*...................... - // sqrdmulh v20.8H, v20.8H, v0.H[1] // ..........................................................*..................... - // str q25, [x0, #320] // ...........................................................*.................... - // mul v11.8H, v9.8H, v0.H[0] // ............................................................*................... - // sqrdmulh v2.8H, v9.8H, v0.H[1] // .............................................................*.................. - // mls v4.8H, v20.8H, v7.H[0] // ..............................................................*................. - // mul v20.8H, v23.8H, v29.8H // ...............................................................*................ - // mls v11.8H, v2.8H, v7.H[0] // ................................................................*............... - // str q4, [x0, #384] // .................................................................*.............. - // mul v2.8H, v16.8H, v29.8H // ..................................................................*............. - // mls v2.8H, v19.8H, v7.H[0] // ...................................................................*............ - // str q11, [x0, #448] // ....................................................................*........... - // sqrdmulh v9.8H, v23.8H, v30.8H // .....................................................................*.......... - // mul v23.8H, v26.8H, v29.8H // ......................................................................*......... - // str q2, [x0, #64] // .......................................................................*........ - // sqrdmulh v16.8H, v21.8H, v30.8H // ........................................................................*....... - // mul v2.8H, v21.8H, v29.8H // .........................................................................*...... 
- // mls v20.8H, v9.8H, v7.H[0] // ..........................................................................*..... - // mls v2.8H, v16.8H, v7.H[0] // ...........................................................................*.... - // mls v23.8H, v8.8H, v7.H[0] // ............................................................................*... - // str q20, [x0], #(16) // .............................................................................*.. - // str q2, [x0, #176] // ..............................................................................*. - // str q23, [x0, #112] // ...............................................................................* + // mls v11.8H, v13.8H, v7.H[0] // *..................................................................... + // sub v28.8H, v5.8H, v2.8H // .......*.............................................................. + // sub v5.8H, v25.8H, v27.8H // ...*.................................................................. + // mul v19.8H, v23.8H, v0.H[6] // ...................*.................................................. + // add v27.8H, v25.8H, v27.8H // ..*................................................................... + // add v20.8H, v17.8H, v24.8H // .....*................................................................ + // sub v6.8H, v17.8H, v24.8H // .*.................................................................... + // sqrdmulh v31.8H, v28.8H, v1.H[3] // ....................*................................................. + // sqrdmulh v22.8H, v23.8H, v0.H[7] // ...............*...................................................... + // sub v4.8H, v27.8H, v20.8H // ..........*........................................................... + // add v27.8H, v27.8H, v20.8H // .........*............................................................ + // mul v13.8H, v6.8H, v0.H[4] // ......*............................................................... 
+ // mul v28.8H, v28.8H, v1.H[2] // ......................*............................................... + // mul v20.8H, v9.8H, v1.H[0] // ....*................................................................. + // mls v20.8H, v16.8H, v7.H[0] // ................*..................................................... + // mul v24.8H, v5.8H, v0.H[2] // ............*......................................................... + // mls v19.8H, v22.8H, v7.H[0] // .....................*................................................ + // mls v28.8H, v31.8H, v7.H[0] // .......................*.............................................. + // sqrdmulh v22.8H, v6.8H, v0.H[5] // ........*............................................................. + // sub v26.8H, v19.8H, v20.8H // ..........................*........................................... + // add v18.8H, v19.8H, v20.8H // ........................*............................................. + // sqrdmulh v19.8H, v5.8H, v0.H[3] // ...........*.......................................................... + // add v23.8H, v28.8H, v11.8H // ............................*......................................... + // sub v10.8H, v28.8H, v11.8H // ..............................*....................................... + // sqrdmulh v11.8H, v4.8H, v0.H[1] // ..............................................*....................... + // add v6.8H, v18.8H, v23.8H // ...............................*...................................... + // mul v8.8H, v26.8H, v0.H[2] // ..........................................*........................... + // sqrdmulh v28.8H, v26.8H, v0.H[3] // ........................................*............................. + // mls v13.8H, v22.8H, v7.H[0] // .............*........................................................ + // mul v22.8H, v4.8H, v0.H[0] // .......................................*.............................. 
+ // mls v24.8H, v19.8H, v7.H[0] // ..............*....................................................... + // mls v8.8H, v28.8H, v7.H[0] // ............................................*......................... + // mul v28.8H, v10.8H, v0.H[4] // ...............................................*...................... + // add v5.8H, v24.8H, v13.8H // .................*.................................................... + // mls v22.8H, v11.8H, v7.H[0] // ..................................................*................... + // sub v3.8H, v24.8H, v13.8H // ..................*................................................... + // sub v24.8H, v18.8H, v23.8H // .................................*.................................... + // sqrdmulh v23.8H, v10.8H, v0.H[5] // .............................................*........................ + // sqrdmulh v25.8H, v3.8H, v0.H[1] // ...........................*.......................................... + // str q22, [x0, #256] // ........................................................*............. + // mul v22.8H, v3.8H, v0.H[0] // .............................*........................................ + // mls v28.8H, v23.8H, v7.H[0] // ................................................*..................... + // mls v22.8H, v25.8H, v7.H[0] // ..................................*................................... + // mul v3.8H, v27.8H, v29.8H // .................................................................*.... + // sqrdmulh v27.8H, v27.8H, v30.8H // ..........................................................*........... + // str q22, [x0, #384] // .....................................*................................ + // mul v22.8H, v6.8H, v29.8H // ....................................*................................. + // sub v10.8H, v8.8H, v28.8H // ...................................................*.................. 
+ // sqrdmulh v19.8H, v6.8H, v30.8H // ...........................................*.......................... + // add v28.8H, v8.8H, v28.8H // ....................................................*................. + // mls v3.8H, v27.8H, v7.H[0] // ...................................................................*.. + // sqrdmulh v23.8H, v24.8H, v0.H[1] // ...................................*.................................. + // sqrdmulh v20.8H, v10.8H, v0.H[1] // .......................................................*.............. + // str q3, [x0], #(16) // .....................................................................* + // mul v11.8H, v24.8H, v0.H[0] // .....................................................*................ + // mls v11.8H, v23.8H, v7.H[0] // ............................................................*......... + // sqrdmulh v23.8H, v5.8H, v30.8H // ................................*..................................... + // mul v27.8H, v5.8H, v29.8H // .........................*............................................ + // str q11, [x0, #304] // ................................................................*..... + // mul v24.8H, v10.8H, v0.H[0] // .........................................................*............ + // sqrdmulh v8.8H, v28.8H, v30.8H // .............................................................*........ + // mls v27.8H, v23.8H, v7.H[0] // ......................................*............................... + // mul v23.8H, v28.8H, v29.8H // ...............................................................*...... + // mls v23.8H, v8.8H, v7.H[0] // ..................................................................*... + // str q27, [x0, #112] // .........................................*............................ + // mls v24.8H, v20.8H, v7.H[0] // ...........................................................*.......... 
+ // mls v22.8H, v19.8H, v7.H[0] // .................................................*.................... + // str q23, [x0, #176] // ....................................................................*. + // str q24, [x0, #432] // ..............................................................*....... + // str q22, [x0, #48] // ......................................................*............... pop_stack diff --git a/tests/ntt_kyber/manual/intt_kyber_123_4567_opt_m1_firestorm.s b/tests/ntt_kyber/manual/intt_kyber_123_4567_opt_m1_firestorm.s index e46bc5d..d3209bc 100644 --- a/tests/ntt_kyber/manual/intt_kyber_123_4567_opt_m1_firestorm.s +++ b/tests/ntt_kyber/manual/intt_kyber_123_4567_opt_m1_firestorm.s @@ -354,748 +354,840 @@ _intt_kyber_123_4567_opt_m1_firestorm: mov count, #8 .p2align 2 - ldr q29, [x1, #16] // ...*................................... - ldr q13, [x1, #0] // ....*.................................. - ldr q3, [x1, #32] // .*..................................... - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - ldr q8, [x1, #48] // *...................................... - ldr q20, [x4, #80] // ......*................................ - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - ldr q28, [x4, #16] // .........*............................. - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... 
- // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - ldr q1, [x4, #48] // .................*..................... - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - trn1 v30.4S, v13.4S, v29.4S // ..........*............................ - trn2 v29.4S, v13.4S, v29.4S // ...........*........................... - ldr q13, [x4, #64] // .....*................................. - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - trn2 v10.4S, v3.4S, v8.4S // .......*............................... - trn1 v9.4S, v3.4S, v8.4S // ........*.............................. - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - trn1 v26.2D, v29.2D, v10.2D // ...............*....................... 
- trn2 v24.2D, v30.2D, v9.2D // ............*.......................... - trn2 v21.2D, v29.2D, v10.2D // ..............*........................ - trn1 v10.2D, v30.2D, v9.2D // .............*......................... - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - ldr q29, [x4, #32] // ................*...................... - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - add v15.8H, v24.8H, v21.8H // ....................*.................. - sub v4.8H, v24.8H, v21.8H // ..................*.................... - ldr q24, [x4], #(6*16) // ..*.................................... - sub v2.8H, v10.8H, v26.8H // ...................*................... - add v26.8H, v10.8H, v26.8H // .....................*................. - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - sqrdmulh v31.8H, v2.8H, v1.8H // .........................*............. - mul v11.8H, v4.8H, v13.8H // ......................*................ 
- sqrdmulh v20.8H, v4.8H, v20.8H // .......................*............... - mul v10.8H, v2.8H, v29.8H // ........................*.............. - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - add v0.8H, v26.8H, v15.8H // ...................................*... - sub v18.8H, v26.8H, v15.8H // ..........................*............ - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - mls v11.8H, v20.8H, v7.H[0] // ...........................*........... - mls v10.8H, v31.8H, v7.H[0] // ..............................*........ - mul v29.8H, v18.8H, v24.8H // ............................*.......... - sqrdmulh v3.8H, v18.8H, v28.8H // .............................*......... - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... 
- // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - sub v25.8H, v10.8H, v11.8H // ................................*...... - add v11.8H, v10.8H, v11.8H // ....................................*.. - mls v29.8H, v3.8H, v7.H[0] // ...............................*....... - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - trn1 v8.4S, v0.4S, v11.4S // ......................................* - sqrdmulh v31.8H, v25.8H, v28.8H // .................................*..... - mul v26.8H, v25.8H, v24.8H // ..................................*.... - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... 
- // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - mls v26.8H, v31.8H, v7.H[0] // .....................................*. - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... + ldr q27, [x4, #48] // ...........*............................... + ldr q15, [x1, #0] // ....*...................................... + ldr q29, [x4, #32] // ........*.................................. + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + ldr q24, [x1, #16] // .....*..................................... + ldr q16, [x1, #48] // ..*........................................ + ldr q21, [x1, #32] // .*......................................... + // gap // ........................................... 
+ // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + ldr q5, [x3], #16 // ..........................................* + ldr q31, [x4, #64] // ......*.................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + trn1 v28.4S, v15.4S, v24.4S // .........*................................. + trn1 v25.4S, v21.4S, v16.4S // ..........*................................ + trn2 v19.4S, v21.4S, v16.4S // ............*.............................. + ldr q21, [x4, #80] // ...*....................................... + trn2 v26.4S, v15.4S, v24.4S // .............*............................. 
+ // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + trn1 v9.2D, v26.2D, v19.2D // ................*.......................... + trn2 v12.2D, v26.2D, v19.2D // .................*......................... + trn1 v13.2D, v28.2D, v25.2D // ..............*............................ + trn2 v2.2D, v28.2D, v25.2D // ...............*........................... + ldr q25, [x4, #16] // .......*................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + ldr q22, [x4], #(6*16) // *.......................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + sub v1.8H, v13.8H, v9.8H // ...................*....................... + add v0.8H, v13.8H, v9.8H // ....................*...................... + sub v11.8H, v2.8H, v12.8H // ..................*........................ + add v17.8H, v2.8H, v12.8H // ...........................*............... + // gap // ........................................... 
+ // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + sqrdmulh v21.8H, v11.8H, v21.8H // ........................*.................. + mul v19.8H, v1.8H, v29.8H // .......................*................... + mul v31.8H, v11.8H, v31.8H // ......................*.................... + sqrdmulh v28.8H, v1.8H, v27.8H // .....................*..................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + add v12.8H, v0.8H, v17.8H // ............................*.............. + sub v0.8H, v0.8H, v17.8H // .............................*............. + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... 
+ // gap // ........................................... + // gap // ........................................... + sqrdmulh v13.8H, v0.8H, v25.8H // ................................*.......... + mls v31.8H, v21.8H, v7.H[0] // ..........................*................ + mls v19.8H, v28.8H, v7.H[0] // .........................*................. + mul v8.8H, v0.8H, v22.8H // ...............................*........... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + mls v8.8H, v13.8H, v7.H[0] // .....................................*..... + sub v4.8H, v19.8H, v31.8H // ..............................*............ + add v13.8H, v19.8H, v31.8H // ...................................*....... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... 
+ // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + sqrdmulh v14.8H, v4.8H, v25.8H // .................................*......... + mul v21.8H, v4.8H, v22.8H // ..................................*........ + trn2 v23.4S, v12.4S, v13.4S // .......................................*... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + mls v21.8H, v14.8H, v7.H[0] // ....................................*...... 
+ // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + trn2 v24.4S, v8.4S, v21.4S // .........................................*. + trn1 v0.4S, v8.4S, v21.4S // ........................................*.. + trn1 v8.4S, v12.4S, v13.4S // ......................................*.... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... // original source code - // ldr q23, [x1, #48] // ...*................................... - // ldr q3, [x1, #32] // ..*.................................... 
- // ldr q18, [x4], #(6*16) // ...................*................... - // ldr q1, [x1, #16] // *...................................... - // ldr q9, [x1, #0] // .*..................................... - // ldr q31, [x4, #-32] // .........*............................. - // ldr q5, [x4, #-16] // ....*.................................. - // trn2 v20.4S, v3.4S, v23.4S // ..........*............................ - // trn1 v30.4S, v3.4S, v23.4S // ...........*........................... - // ldr q13, [x4, #-80] // .....*................................. - // trn1 v4.4S, v9.4S, v1.4S // .......*............................... - // trn2 v9.4S, v9.4S, v1.4S // ........*.............................. - // trn2 v3.2D, v4.2D, v30.2D // .............*......................... - // trn1 v24.2D, v4.2D, v30.2D // ...............*....................... - // trn2 v29.2D, v9.2D, v20.2D // ..............*........................ - // trn1 v27.2D, v9.2D, v20.2D // ............*.......................... - // ldr q4, [x4, #-64] // ................*...................... - // ldr q9, [x4, #-48] // ......*................................ - // sub v25.8H, v3.8H, v29.8H // ..................*.................... - // sub v30.8H, v24.8H, v27.8H // ....................*.................. - // add v19.8H, v3.8H, v29.8H // .................*..................... - // add v11.8H, v24.8H, v27.8H // .....................*................. - // mul v10.8H, v25.8H, v31.8H // .......................*............... - // sqrdmulh v8.8H, v25.8H, v5.8H // ........................*.............. - // mul v3.8H, v30.8H, v4.8H // .........................*............. - // sqrdmulh v4.8H, v30.8H, v9.8H // ......................*................ - // sub v5.8H, v11.8H, v19.8H // ...........................*........... - // mls v10.8H, v8.8H, v7.H[0] // ............................*.......... - // mul v29.8H, v5.8H, v18.8H // ..............................*........ 
- // sqrdmulh v21.8H, v5.8H, v13.8H // ...............................*....... - // mls v3.8H, v4.8H, v7.H[0] // .............................*......... - // mls v29.8H, v21.8H, v7.H[0] // ..................................*.... - // sub v4.8H, v3.8H, v10.8H // ................................*...... - // sqrdmulh v2.8H, v4.8H, v13.8H // ....................................*.. - // mul v26.8H, v4.8H, v18.8H // .....................................*. - // add v0.8H, v11.8H, v19.8H // ..........................*............ - // add v11.8H, v3.8H, v10.8H // .................................*..... - // mls v26.8H, v2.8H, v7.H[0] // ......................................* - // trn1 v8.4S, v0.4S, v11.4S // ...................................*... + // ldr q13, [x4], #(6*16) // ..................*........................ + // ldr q27, [x1, #32] // .....*..................................... + // ldr q1, [x1, #48] // ....*...................................... + // ldr q4, [x4, #-16] // ...........*............................... + // ldr q17, [x1, #0] // .*......................................... + // ldr q29, [x1, #16] // ...*....................................... + // ldr q2, [x4, #-32] // .......*................................... + // ldr q21, [x4, #-80] // .................*......................... + // ldr q28, [x4, #-64] // ..*........................................ + // trn1 v3.4S, v17.4S, v29.4S // ........*.................................. + // trn1 v26.4S, v27.4S, v1.4S // .........*................................. + // ldr q25, [x4, #-48] // *.......................................... + // trn2 v20.4S, v27.4S, v1.4S // ..........*................................ + // trn2 v29.4S, v17.4S, v29.4S // ............*.............................. + // trn1 v8.2D, v3.2D, v26.2D // ...............*........................... + // trn2 v3.2D, v3.2D, v26.2D // ................*.......................... 
+ // trn1 v1.2D, v29.2D, v20.2D // .............*............................. + // trn2 v20.2D, v29.2D, v20.2D // ..............*............................ + // sub v27.8H, v3.8H, v20.8H // .....................*..................... + // sub v12.8H, v8.8H, v1.8H // ...................*....................... + // add v18.8H, v8.8H, v1.8H // ....................*...................... + // sqrdmulh v17.8H, v12.8H, v25.8H // ..........................*................ + // mul v14.8H, v27.8H, v2.8H // .........................*................. + // mul v9.8H, v12.8H, v28.8H // ........................*.................. + // sqrdmulh v31.8H, v27.8H, v4.8H // .......................*................... + // mls v9.8H, v17.8H, v7.H[0] // ...............................*........... + // mls v14.8H, v31.8H, v7.H[0] // ..............................*............ + // add v31.8H, v3.8H, v20.8H // ......................*.................... + // add v25.8H, v18.8H, v31.8H // ...........................*............... + // sub v29.8H, v18.8H, v31.8H // ............................*.............. + // sub v28.8H, v9.8H, v14.8H // ..................................*........ + // mul v27.8H, v29.8H, v13.8H // ................................*.......... + // sqrdmulh v3.8H, v29.8H, v21.8H // .............................*............. + // sqrdmulh v26.8H, v28.8H, v21.8H // ....................................*...... + // mul v1.8H, v28.8H, v13.8H // .....................................*..... + // add v18.8H, v9.8H, v14.8H // ...................................*....... + // mls v1.8H, v26.8H, v7.H[0] // .......................................*... + // mls v27.8H, v3.8H, v7.H[0] // .................................*......... + // trn1 v8.4S, v25.4S, v18.4S // ..........................................* + // trn2 v23.4S, v25.4S, v18.4S // ......................................*.... + // trn1 v0.4S, v27.4S, v1.4S // .........................................*. 
+ // trn2 v24.4S, v27.4S, v1.4S // ........................................*.. + // ldr q5, [x3], #16 // ......*.................................... sub count, count, #1 layer4567_start: - ldr q15, [x3], #16 // ..............................................*.............................. - trn2 v2.4S, v0.4S, v11.4S // .......................................*..................................... - trn1 v19.4S, v29.4S, v26.4S // ........................................*.................................... - ldr q23, [x1, #112] // ...e......................................................................... - // gap // ............................................................................. - // gap // ............................................................................. - ldr q3, [x1, #96] // ..e.......................................................................... - trn2 v0.4S, v29.4S, v26.4S // .........................................*................................... - ldr q18, [x4], #(6*16) // ............e................................................................ - ldr q1, [x1, #80] // .e........................................................................... - ldr q9, [x1, #64] // e............................................................................ - // gap // ............................................................................. - // gap // ............................................................................. - // gap // ............................................................................. - // gap // ............................................................................. - // gap // ............................................................................. - trn1 v21.2D, v2.2D, v0.2D // .............................................*............................... - trn2 v2.2D, v2.2D, v0.2D // ...........................................*................................. 
- trn2 v16.2D, v8.2D, v19.2D // ..........................................*.................................. - trn1 v0.2D, v8.2D, v19.2D // ............................................*................................ - ldr q31, [x4, #-32] // ................e............................................................ - // gap // ............................................................................. - // gap // ............................................................................. - // gap // ............................................................................. - ldr q5, [x4, #-16] // .................e........................................................... - // gap // ............................................................................. - // gap // ............................................................................. - // gap // ............................................................................. - // gap // ............................................................................. - // gap // ............................................................................. - // gap // ............................................................................. - // gap // ............................................................................. - trn2 v20.4S, v3.4S, v23.4S // .......e..................................................................... - trn1 v30.4S, v3.4S, v23.4S // ......e...................................................................... - add v17.8H, v0.8H, v21.8H // ................................................*............................ - add v26.8H, v16.8H, v2.8H // .....................................................*....................... - ldr q13, [x4, #-80] // .............e............................................................... - // gap // ............................................................................. 
- // gap // ............................................................................. - // gap // ............................................................................. - trn1 v4.4S, v9.4S, v1.4S // ....e........................................................................ - trn2 v9.4S, v9.4S, v1.4S // .....e....................................................................... - sub v21.8H, v0.8H, v21.8H // ...............................................*............................. - // gap // ............................................................................. - // gap // ............................................................................. - // gap // ............................................................................. - // gap // ............................................................................. - // gap // ............................................................................. - sub v14.8H, v16.8H, v2.8H // ....................................................*........................ - sqdmulh v2.8H, v17.8H, v7.H[1] // .........................................................*................... - sqdmulh v0.8H, v26.8H, v7.H[1] // ............................................................*................ - // gap // ............................................................................. - // gap // ............................................................................. - // gap // ............................................................................. - // gap // ............................................................................. - // gap // ............................................................................. - trn2 v3.2D, v4.2D, v30.2D // ........e.................................................................... - trn1 v24.2D, v4.2D, v30.2D // ..........e.................................................................. 
- trn2 v29.2D, v9.2D, v20.2D // .........e................................................................... - trn1 v27.2D, v9.2D, v20.2D // ...........e................................................................. - ldr q4, [x4, #-64] // ..............e.............................................................. - ldr q9, [x4, #-48] // ...............e............................................................. - // gap // ............................................................................. - // gap // ............................................................................. - sqrdmulh v16.8H, v14.8H, v15.H[5] // .......................................................*..................... - mul v20.8H, v21.8H, v15.H[2] // .................................................*........................... - sqrdmulh v21.8H, v21.8H, v15.H[3] // ..................................................*.......................... - // gap // ............................................................................. - // gap // ............................................................................. - // gap // ............................................................................. - // gap // ............................................................................. - // gap // ............................................................................. - srshr v2.8H, v2.8H, #11 // ..........................................................*.................. - srshr v0.8H, v0.8H, #11 // .............................................................*............... - sub v25.8H, v3.8H, v29.8H // .......................e..................................................... - sub v30.8H, v24.8H, v27.8H // ..................e.......................................................... - // gap // ............................................................................. 
- // gap // ............................................................................. - // gap // ............................................................................. - // gap // ............................................................................. - mul v23.8H, v14.8H, v15.H[4] // ......................................................*...................... - add v19.8H, v3.8H, v29.8H // ........................e.................................................... - add v11.8H, v24.8H, v27.8H // ...................e......................................................... - // gap // ............................................................................. - // gap // ............................................................................. - // gap // ............................................................................. - // gap // ............................................................................. - // gap // ............................................................................. - mul v10.8H, v25.8H, v31.8H // .........................e................................................... - sqrdmulh v8.8H, v25.8H, v5.8H // ..........................e.................................................. - mul v3.8H, v30.8H, v4.8H // ....................e........................................................ - sqrdmulh v4.8H, v30.8H, v9.8H // .....................e....................................................... - // gap // ............................................................................. - // gap // ............................................................................. - // gap // ............................................................................. - // gap // ............................................................................. - sub v5.8H, v11.8H, v19.8H // ............................e................................................ 
- mls v26.8H, v0.8H, v7.H[0] // ..............................................................*.............. - mls v17.8H, v2.8H, v7.H[0] // ...........................................................*................. - // gap // ............................................................................. - // gap // ............................................................................. - // gap // ............................................................................. - // gap // ............................................................................. - // gap // ............................................................................. - mls v23.8H, v16.8H, v7.H[0] // ........................................................*.................... - mls v20.8H, v21.8H, v7.H[0] // ...................................................*......................... - // gap // ............................................................................. - // gap // ............................................................................. - // gap // ............................................................................. - // gap // ............................................................................. - // gap // ............................................................................. - // gap // ............................................................................. - mls v10.8H, v8.8H, v7.H[0] // ...........................e................................................. - mul v29.8H, v5.8H, v18.8H // ..............................e.............................................. - sqrdmulh v21.8H, v5.8H, v13.8H // ...............................e............................................. - mls v3.8H, v4.8H, v7.H[0] // ......................e...................................................... - // gap // ............................................................................. 
- // gap // ............................................................................. - // gap // ............................................................................. - // gap // ............................................................................. - add v2.8H, v17.8H, v26.8H // ................................................................*............ - sub v16.8H, v17.8H, v26.8H // ...............................................................*............. - // gap // ............................................................................. - // gap // ............................................................................. - // gap // ............................................................................. - // gap // ............................................................................. - // gap // ............................................................................. - // gap // ............................................................................. - sub v0.8H, v20.8H, v23.8H // ....................................................................*........ - add v9.8H, v20.8H, v23.8H // .....................................................................*....... - // gap // ............................................................................. - // gap // ............................................................................. - // gap // ............................................................................. - // gap // ............................................................................. - // gap // ............................................................................. - // gap // ............................................................................. - mls v29.8H, v21.8H, v7.H[0] // ................................e............................................ 
- str q2, [x1], #(64) // .........................................................................*... - mul v21.8H, v16.8H, v15.H[0] // .................................................................*........... - sqrdmulh v16.8H, v16.8H, v15.H[1] // ..................................................................*.......... - sub v4.8H, v3.8H, v10.8H // .................................e........................................... - // gap // ............................................................................. - // gap // ............................................................................. - // gap // ............................................................................. - mul v31.8H, v0.8H, v15.H[0] // ......................................................................*...... - sqrdmulh v0.8H, v0.8H, v15.H[1] // .......................................................................*..... - str q9, [x1, #-48] // ..........................................................................*.. - // gap // ............................................................................. - // gap // ............................................................................. - // gap // ............................................................................. - // gap // ............................................................................. - // gap // ............................................................................. - sqrdmulh v2.8H, v4.8H, v13.8H // ....................................e........................................ - mul v26.8H, v4.8H, v18.8H // ...................................e......................................... - // gap // ............................................................................. - // gap // ............................................................................. - // gap // ............................................................................. 
- // gap // ............................................................................. - // gap // ............................................................................. - // gap // ............................................................................. - mls v21.8H, v16.8H, v7.H[0] // ...................................................................*......... - // gap // ............................................................................. - // gap // ............................................................................. - // gap // ............................................................................. - // gap // ............................................................................. - // gap // ............................................................................. - // gap // ............................................................................. - // gap // ............................................................................. - mls v31.8H, v0.8H, v7.H[0] // ........................................................................*.... - // gap // ............................................................................. - // gap // ............................................................................. - // gap // ............................................................................. - // gap // ............................................................................. - // gap // ............................................................................. - // gap // ............................................................................. - // gap // ............................................................................. - add v0.8H, v11.8H, v19.8H // .............................e............................................... - add v11.8H, v3.8H, v10.8H // ..................................e.......................................... 
- mls v26.8H, v2.8H, v7.H[0] // .....................................e....................................... - // gap // ............................................................................. - // gap // ............................................................................. - // gap // ............................................................................. - // gap // ............................................................................. - // gap // ............................................................................. - str q21, [x1, #-32] // ...........................................................................*. - // gap // ............................................................................. - // gap // ............................................................................. - // gap // ............................................................................. - // gap // ............................................................................. - // gap // ............................................................................. - // gap // ............................................................................. - // gap // ............................................................................. - str q31, [x1, #-16] // ............................................................................* - trn1 v8.4S, v0.4S, v11.4S // ......................................e...................................... - // gap // ............................................................................. - // gap // ............................................................................. - // gap // ............................................................................. - // gap // ............................................................................. - // gap // ............................................................................. 
- // gap // ............................................................................. + ldr q13, [x4], #(6*16) // ............e...................................................................... + trn1 v18.2D, v8.2D, v0.2D // ............................................*...................................... + trn2 v11.2D, v8.2D, v0.2D // ..........................................*........................................ + ldr q27, [x1, #96] // ..e................................................................................ + ldr q1, [x1, #112] // ...e............................................................................... + // gap // ................................................................................... + trn1 v10.2D, v23.2D, v24.2D // .............................................*..................................... + trn2 v24.2D, v23.2D, v24.2D // ...........................................*....................................... + ldr q4, [x4, #-16] // .................e................................................................. + ldr q17, [x1, #64] // e.................................................................................. + ldr q29, [x1, #80] // .e................................................................................. + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + sub v19.8H, v11.8H, v24.8H // ....................................................*.............................. 
+ add v22.8H, v11.8H, v24.8H // .....................................................*............................. + sub v15.8H, v18.8H, v10.8H // ...............................................*................................... + add v10.8H, v18.8H, v10.8H // ................................................*.................................. + ldr q2, [x4, #-32] // ................e.................................................................. + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + ldr q21, [x4, #-80] // .............e..................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + sqrdmulh v23.8H, v15.8H, v5.H[3] // ..................................................*................................ + mul v6.8H, v15.8H, v5.H[2] // .................................................*................................. + sqrdmulh v9.8H, v19.8H, v5.H[5] // .......................................................*........................... 
+ mul v24.8H, v19.8H, v5.H[4] // ......................................................*............................ + ldr q28, [x4, #-64] // ..............e.................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + trn1 v3.4S, v17.4S, v29.4S // ....e.............................................................................. + trn1 v26.4S, v27.4S, v1.4S // ......e............................................................................ + ldr q25, [x4, #-48] // ...............e................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + sqdmulh v0.8H, v10.8H, v7.H[1] // .........................................................*......................... + trn2 v20.4S, v27.4S, v1.4S // .......e........................................................................... + trn2 v29.4S, v17.4S, v29.4S // .....e............................................................................. + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... 
+ // gap // ................................................................................... + // gap // ................................................................................... + mls v6.8H, v23.8H, v7.H[0] // ...................................................*............................... + trn1 v8.2D, v3.2D, v26.2D // ..........e........................................................................ + mls v24.8H, v9.8H, v7.H[0] // ........................................................*.......................... + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + srshr v16.8H, v0.8H, #11 // ..........................................................*........................ + trn2 v3.2D, v3.2D, v26.2D // ........e.......................................................................... + trn1 v1.2D, v29.2D, v20.2D // ...........e....................................................................... + trn2 v20.2D, v29.2D, v20.2D // .........e......................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + sqdmulh v0.8H, v22.8H, v7.H[1] // ............................................................*...................... 
+ // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + sub v27.8H, v3.8H, v20.8H // .......................e........................................................... + sqdmulh v29.8H, v24.8H, v7.H[1] // ..................................................................*................ + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + sub v12.8H, v8.8H, v1.8H // ..................e................................................................ + sqdmulh v30.8H, v6.8H, v7.H[1] // ...............................................................*................... + mls v10.8H, v16.8H, v7.H[0] // ...........................................................*....................... + add v18.8H, v8.8H, v1.8H // ...................e............................................................... + // gap // ................................................................................... + // gap // ................................................................................... 
+ // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + sqrdmulh v17.8H, v12.8H, v25.8H // .....................e............................................................. + mul v14.8H, v27.8H, v2.8H // .........................e......................................................... + mul v9.8H, v12.8H, v28.8H // ....................e.............................................................. + sqrdmulh v31.8H, v27.8H, v4.8H // ..........................e........................................................ + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + srshr v23.8H, v30.8H, #11 // ................................................................*.................. + srshr v26.8H, v0.8H, #11 // .............................................................*..................... + srshr v0.8H, v29.8H, #11 // ...................................................................*............... + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... 
+ // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + mls v9.8H, v17.8H, v7.H[0] // ......................e............................................................ + mls v14.8H, v31.8H, v7.H[0] // ...........................e....................................................... + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + add v31.8H, v3.8H, v20.8H // ........................e.......................................................... + mls v24.8H, v0.8H, v7.H[0] // ....................................................................*.............. 
+ mls v22.8H, v26.8H, v7.H[0] // ..............................................................*.................... + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + mls v6.8H, v23.8H, v7.H[0] // .................................................................*................. + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + add v25.8H, v18.8H, v31.8H // .............................e..................................................... + sub v29.8H, v18.8H, v31.8H // ............................e...................................................... + sub v28.8H, v9.8H, v14.8H // .................................e................................................. + // gap // ................................................................................... + // gap // ................................................................................... 
+ // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + add v16.8H, v6.8H, v24.8H // ...........................................................................*....... + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + add v2.8H, v10.8H, v22.8H // ......................................................................*............ + sub v10.8H, v10.8H, v22.8H // .....................................................................*............. + sub v24.8H, v6.8H, v24.8H // ..........................................................................*........ + mul v27.8H, v29.8H, v13.8H // ..............................e.................................................... + sqrdmulh v3.8H, v29.8H, v21.8H // ...............................e................................................... + sqrdmulh v26.8H, v28.8H, v21.8H // ....................................e.............................................. + mul v1.8H, v28.8H, v13.8H // ...................................e............................................... + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... 
+ str q16, [x1, #16] // ................................................................................*.. + mul v22.8H, v24.8H, v5.H[0] // ............................................................................*...... + sqrdmulh v11.8H, v24.8H, v5.H[1] // .............................................................................*..... + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + sqrdmulh v0.8H, v10.8H, v5.H[1] // ........................................................................*.......... + mul v10.8H, v10.8H, v5.H[0] // .......................................................................*........... + str q2, [x1], #(64) // ...............................................................................*... + add v18.8H, v9.8H, v14.8H // ..................................e................................................ + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + mls v1.8H, v26.8H, v7.H[0] // .....................................e............................................. + mls v27.8H, v3.8H, v7.H[0] // ................................e.................................................. 
+ // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + mls v22.8H, v11.8H, v7.H[0] // ..............................................................................*.... + mls v10.8H, v0.8H, v7.H[0] // .........................................................................*......... + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + trn1 v8.4S, v25.4S, v18.4S // ......................................e............................................ + trn2 v23.4S, v25.4S, v18.4S // .......................................e........................................... + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... 
+ // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + trn1 v0.4S, v27.4S, v1.4S // ........................................e.......................................... + trn2 v24.4S, v27.4S, v1.4S // .........................................e......................................... + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + str q22, [x1, #-16] // ..................................................................................* + str q10, [x1, #-32] // .................................................................................*. + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + ldr q5, [x3], #16 // ..............................................e.................................... 
// original source code - // ldr q8, [x1, #(16*0)] // .....e....................................................................|.......e................................................................... - // ldr q9, [x1, #(16*1)] // ....e.....................................................................|......e.................................................................... - // ldr q10, [x1, #(16*2)] // .e........................................................................|...e....................................................................... - // ldr q11, [x1, #(16*3)] // e.........................................................................|..e........................................................................ - // trn1 v25.4s, v8.4s, v9.4s // .................e........................................................|...................e....................................................... - // trn2 v26.4s, v8.4s, v9.4s // ..................e.......................................................|....................e...................................................... - // trn1 v27.4s, v10.4s, v11.4s // .............e............................................................|...............e........................................................... - // trn2 v28.4s, v10.4s, v11.4s // ............e.............................................................|..............e............................................................ - // trn2 v10.2d, v25.2d, v27.2d // .......................e..................................................|.........................e................................................. - // trn2 v11.2d, v26.2d, v28.2d // .........................e................................................|...........................e............................................... 
- // trn1 v8.2d, v25.2d, v27.2d // ........................e.................................................|..........................e................................................ - // trn1 v9.2d, v26.2d, v28.2d // ..........................e...............................................|............................e.............................................. - // ldr q0, [x4], #(6*16) // ...e......................................................................|.....e..................................................................... - // ldr q4, [x4, #(-6*16 + 1*16)] // ................e.........................................................|..................e........................................................ - // ldr q1, [x4, #(-6*16 + 2*16)] // ...........................e..............................................|.............................e............................................. - // ldr q5, [x4, #(-6*16 + 3*16)] // ............................e.............................................|..............................e............................................ - // ldr q2, [x4, #(-6*16 + 4*16)] // ..........e...............................................................|............e.............................................................. - // ldr q6, [x4, #(-6*16 + 5*16)] // ...........e..............................................................|.............e............................................................. - // sub v24.8h, v8.8h, v9.8h // ...................................e......................................|.....................................e..................................... - // add v8.8h, v8.8h, v9.8h // ......................................e...................................|........................................e.................................. 
- // mul v9.8h, v24.8h, v1.8h // .........................................e................................|...........................................e............................... - // sqrdmulh v24.8h, v24.8h, v5.8h // ..........................................e...............................|............................................e.............................. - // mls v9.8h, v24.8h, v7.h[0] // ...................................................e......................|.....................................................e..................... - // sub v24.8h, v10.8h, v11.8h // ..................................e.......................................|....................................e...................................... - // add v10.8h, v10.8h, v11.8h // .....................................e....................................|.......................................e................................... - // mul v11.8h, v24.8h, v2.8h // .......................................e..................................|.........................................e................................. - // sqrdmulh v24.8h, v24.8h, v6.8h // ........................................e.................................|..........................................e................................ - // mls v11.8h, v24.8h, v7.h[0] // ................................................e.........................|..................................................e........................ - // sub v24.8h, v8.8h, v10.8h // ...........................................e..............................|.............................................e............................. - // add v8.8h, v8.8h, v10.8h // ....................................................................e.....|......................................................................e.... 
- // mul v10.8h, v24.8h, v0.8h // .................................................e........................|...................................................e....................... - // sqrdmulh v24.8h, v24.8h, v4.8h // ..................................................e.......................|....................................................e...................... - // mls v10.8h, v24.8h, v7.h[0] // ........................................................e.................|..........................................................e................ - // sub v24.8h, v9.8h, v11.8h // ............................................................e.............|..............................................................e............ - // add v9.8h, v9.8h, v11.8h // .....................................................................e....|.......................................................................e... - // mul v11.8h, v24.8h, v0.8h // .................................................................e........|...................................................................e....... - // sqrdmulh v24.8h, v24.8h, v4.8h // ................................................................e.........|..................................................................e........ - // mls v11.8h, v24.8h, v7.h[0] // ......................................................................e...|........................................................................e.. - // trn1 v25.4s, v8.4s, v9.4s // .........................................................................e|........................................................................... - // trn2 v26.4s, v8.4s, v9.4s // ..........................................................................|*.......................................................................... 
- // trn1 v27.4s, v10.4s, v11.4s // ..........................................................................|.*......................................................................... - // trn2 v28.4s, v10.4s, v11.4s // ..*.......................................................................|....*...................................................................... - // trn2 v10.2d, v25.2d, v27.2d // ........*.................................................................|..........*................................................................ - // trn2 v11.2d, v26.2d, v28.2d // .......*..................................................................|.........*................................................................. - // trn1 v8.2d, v25.2d, v27.2d // .........*................................................................|...........*............................................................... - // trn1 v9.2d, v26.2d, v28.2d // ......*...................................................................|........*.................................................................. - // ldr q0, [x3], #16 // ..........................................................................*........................................................................... - // sub v24.8h, v8.8h, v9.8h // ...................*......................................................|.....................*..................................................... - // add v8.8h, v8.8h, v9.8h // ..............*...........................................................|................*.......................................................... - // mul v9.8h, v24.8h, v0.h[2] // ..............................*...........................................|................................*.......................................... 
- // sqrdmulh v24.8h, v24.8h, v0.h[3] // ...............................*..........................................|.................................*......................................... - // mls v9.8h, v24.8h, v7.h[0] // ...............................................*..........................|.................................................*......................... - // sub v24.8h, v10.8h, v11.8h // ....................*.....................................................|......................*.................................................... - // add v10.8h, v10.8h, v11.8h // ...............*..........................................................|.................*......................................................... - // mul v11.8h, v24.8h, v0.h[4] // ....................................*.....................................|......................................*.................................... - // sqrdmulh v24.8h, v24.8h, v0.h[5] // .............................*............................................|...............................*........................................... - // mls v11.8h, v24.8h, v7.h[0] // ..............................................*...........................|................................................*.......................... - // sqdmulh v25.8h, v8.8h, v7.h[1] // .....................*....................................................|.......................*................................................... - // srshr v25.8h, v25.8h, #11 // ................................*.........................................|..................................*........................................ - // mls v8.8h, v25.8h, v7.h[0] // .............................................*............................|...............................................*........................... 
- // sqdmulh v25.8h, v10.8h, v7.h[1] // ......................*...................................................|........................*.................................................. - // srshr v25.8h, v25.8h, #11 // .................................*........................................|...................................*....................................... - // mls v10.8h, v25.8h, v7.h[0] // ............................................*.............................|..............................................*............................ - // sub v24.8h, v8.8h, v10.8h // .....................................................*....................|.......................................................*................... - // add v8.8h, v8.8h, v10.8h // ....................................................*.....................|......................................................*.................... - // mul v10.8h, v24.8h, v0.h[0] // ..........................................................*...............|............................................................*.............. - // sqrdmulh v24.8h, v24.8h, v0.h[1] // ...........................................................*..............|.............................................................*............. - // mls v10.8h, v24.8h, v7.h[0] // ..................................................................*.......|....................................................................*...... - // sub v24.8h, v9.8h, v11.8h // ......................................................*...................|........................................................*.................. - // add v9.8h, v9.8h, v11.8h // .......................................................*..................|.........................................................*................. 
- // mul v11.8h, v24.8h, v0.h[0] // .............................................................*............|...............................................................*........... - // sqrdmulh v24.8h, v24.8h, v0.h[1] // ..............................................................*...........|................................................................*.......... - // mls v11.8h, v24.8h, v7.h[0] // ...................................................................*......|.....................................................................*..... - // str q8, [x1], #(64) // .........................................................*................|...........................................................*............... - // str q9, [x1, #(-64 + 16*1)] // ...............................................................*..........|.................................................................*......... - // str q10, [x1, #(-64 + 16*2)] // .......................................................................*..|.........................................................................*. - // str q11, [x1, #(-64 + 16*3)] // ........................................................................*.|..........................................................................* + // ldr q8, [x1, #(16*0)] // ........e..........................................................................|.......e......................................................................... + // ldr q9, [x1, #(16*1)] // .........e.........................................................................|........e........................................................................ + // ldr q10, [x1, #(16*2)] // ...e...............................................................................|..e.............................................................................. 
+ // ldr q11, [x1, #(16*3)] // ....e..............................................................................|...e............................................................................. + // trn1 v25.4s, v8.4s, v9.4s // .....................e.............................................................|....................e............................................................ + // trn2 v26.4s, v8.4s, v9.4s // ..........................e........................................................|.........................e....................................................... + // trn1 v27.4s, v10.4s, v11.4s // ......................e............................................................|.....................e........................................................... + // trn2 v28.4s, v10.4s, v11.4s // .........................e.........................................................|........................e........................................................ + // trn2 v10.2d, v25.2d, v27.2d // ...............................e...................................................|..............................e.................................................. + // trn2 v11.2d, v26.2d, v28.2d // .................................e.................................................|................................e................................................ + // trn1 v8.2d, v25.2d, v27.2d // ............................e......................................................|...........................e..................................................... + // trn1 v9.2d, v26.2d, v28.2d // ................................e..................................................|...............................e................................................. 
+ // ldr q0, [x4], #(6*16) // e..................................................................................e................................................................................. + // ldr q4, [x4, #(-6*16 + 1*16)] // ...............e...................................................................|..............e.................................................................. + // ldr q1, [x4, #(-6*16 + 2*16)] // ....................e..............................................................|...................e............................................................. + // ldr q5, [x4, #(-6*16 + 3*16)] // .......................e...........................................................|......................e.......................................................... + // ldr q2, [x4, #(-6*16 + 4*16)] // ..............e....................................................................|.............e................................................................... + // ldr q6, [x4, #(-6*16 + 5*16)] // .......e...........................................................................|......e.......................................................................... + // sub v24.8h, v8.8h, v9.8h // .....................................e.............................................|....................................e............................................ + // add v8.8h, v8.8h, v9.8h // ........................................e..........................................|.......................................e......................................... + // mul v9.8h, v24.8h, v1.8h // ...........................................e.......................................|..........................................e...................................... 
+ // sqrdmulh v24.8h, v24.8h, v5.8h // .........................................e.........................................|........................................e........................................ + // mls v9.8h, v24.8h, v7.h[0] // ................................................e..................................|...............................................e................................. + // sub v24.8h, v10.8h, v11.8h // ...................................e...............................................|..................................e.............................................. + // add v10.8h, v10.8h, v11.8h // ..................................................e................................|.................................................e............................... + // mul v11.8h, v24.8h, v2.8h // ..........................................e........................................|.........................................e....................................... + // sqrdmulh v24.8h, v24.8h, v6.8h // ............................................e......................................|...........................................e..................................... + // mls v11.8h, v24.8h, v7.h[0] // .................................................e.................................|................................................e................................ + // sub v24.8h, v8.8h, v10.8h // .......................................................e...........................|......................................................e.......................... + // add v8.8h, v8.8h, v10.8h // ......................................................e............................|.....................................................e........................... 
+ // mul v10.8h, v24.8h, v0.8h // .............................................................e.....................|............................................................e.................... + // sqrdmulh v24.8h, v24.8h, v4.8h // ..............................................................e....................|.............................................................e................... + // mls v10.8h, v24.8h, v7.h[0] // .........................................................................e.........|........................................................................e........ + // sub v24.8h, v9.8h, v11.8h // ........................................................e..........................|.......................................................e......................... + // add v9.8h, v9.8h, v11.8h // .......................................................................e...........|......................................................................e.......... + // mul v11.8h, v24.8h, v0.8h // ................................................................e..................|...............................................................e................. + // sqrdmulh v24.8h, v24.8h, v4.8h // ...............................................................e...................|..............................................................e.................. + // mls v11.8h, v24.8h, v7.h[0] // ........................................................................e..........|.......................................................................e......... + // trn1 v25.4s, v8.4s, v9.4s // ............................................................................e......|...........................................................................e..... 
+ // trn2 v26.4s, v8.4s, v9.4s // .............................................................................e.....|............................................................................e.... + // trn1 v27.4s, v10.4s, v11.4s // ..............................................................................e....|.............................................................................e... + // trn2 v28.4s, v10.4s, v11.4s // ...............................................................................e...|..............................................................................e.. + // trn2 v10.2d, v25.2d, v27.2d // ..*................................................................................|.*............................................................................... + // trn2 v11.2d, v26.2d, v28.2d // ......*............................................................................|.....*........................................................................... + // trn1 v8.2d, v25.2d, v27.2d // .*.................................................................................|*................................................................................ + // trn1 v9.2d, v26.2d, v28.2d // .....*.............................................................................|....*............................................................................ + // ldr q0, [x3], #16 // ..................................................................................e|................................................................................. + // sub v24.8h, v8.8h, v9.8h // ............*......................................................................|...........*..................................................................... + // add v8.8h, v8.8h, v9.8h // .............*.....................................................................|............*.................................................................... 
+ // mul v9.8h, v24.8h, v0.h[2] // .................*.................................................................|................*................................................................ + // sqrdmulh v24.8h, v24.8h, v0.h[3] // ................*..................................................................|...............*................................................................. + // mls v9.8h, v24.8h, v7.h[0] // ...........................*.......................................................|..........................*...................................................... + // sub v24.8h, v10.8h, v11.8h // ..........*........................................................................|.........*....................................................................... + // add v10.8h, v10.8h, v11.8h // ...........*.......................................................................|..........*...................................................................... + // mul v11.8h, v24.8h, v0.h[4] // ...................*...............................................................|..................*.............................................................. + // sqrdmulh v24.8h, v24.8h, v0.h[5] // ..................*................................................................|.................*............................................................... + // mls v11.8h, v24.8h, v7.h[0] // .............................*.....................................................|............................*.................................................... + // sqdmulh v25.8h, v8.8h, v7.h[1] // ........................*..........................................................|.......................*......................................................... 
+ // srshr v25.8h, v25.8h, #11 // ..............................*....................................................|.............................*................................................... + // mls v8.8h, v25.8h, v7.h[0] // .......................................*...........................................|......................................*.......................................... + // sqdmulh v25.8h, v10.8h, v7.h[1] // ..................................*................................................|.................................*............................................... + // srshr v25.8h, v25.8h, #11 // ..............................................*....................................|.............................................*................................... + // mls v10.8h, v25.8h, v7.h[0] // ....................................................*..............................|...................................................*............................. + // sqdmulh v25.8h, v9.8h, v7.h[1] // ......................................*............................................|.....................................*........................................... + // srshr v25.8h, v25.8h, #11 // .............................................*.....................................|............................................*.................................... + // mls v9.8h, v25.8h, v7.h[0] // .....................................................*.............................|....................................................*............................ + // sqdmulh v25.8h, v11.8h, v7.h[1] // ....................................*..............................................|...................................*............................................. 
+ // srshr v25.8h, v25.8h, #11 // ...............................................*...................................|..............................................*.................................. + // mls v11.8h, v25.8h, v7.h[0] // ...................................................*...............................|..................................................*.............................. + // sub v24.8h, v8.8h, v10.8h // ...........................................................*.......................|..........................................................*...................... + // add v8.8h, v8.8h, v10.8h // ..........................................................*........................|.........................................................*....................... + // mul v10.8h, v24.8h, v0.h[0] // .....................................................................*.............|....................................................................*............ + // sqrdmulh v24.8h, v24.8h, v0.h[1] // ....................................................................*..............|...................................................................*............. + // mls v10.8h, v24.8h, v7.h[0] // ...........................................................................*.......|..........................................................................*...... + // sub v24.8h, v9.8h, v11.8h // ............................................................*......................|...........................................................*..................... + // add v9.8h, v9.8h, v11.8h // .........................................................*.........................|........................................................*........................ 
+ // mul v11.8h, v24.8h, v0.h[0] // ..................................................................*................|.................................................................*............... + // sqrdmulh v24.8h, v24.8h, v0.h[1] // ...................................................................*...............|..................................................................*.............. + // mls v11.8h, v24.8h, v7.h[0] // ..........................................................................*........|.........................................................................*....... + // str q8, [x1], #(64) // ......................................................................*............|.....................................................................*........... + // str q9, [x1, #(-64 + 16*1)] // .................................................................*.................|................................................................*................ + // str q10, [x1, #(-64 + 16*2)] // .................................................................................*.|................................................................................* + // str q11, [x1, #(-64 + 16*3)] // ................................................................................*..|...............................................................................*. sub count, count, #1 cbnz count, layer4567_start - ldr q12, [x3], #16 // *..................................... - trn2 v22.4S, v0.4S, v11.4S // .*.................................... - trn1 v20.4S, v29.4S, v26.4S // ..*................................... - trn2 v18.4S, v29.4S, v26.4S // ...*.................................. - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... 
- // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - trn1 v14.2D, v22.2D, v18.2D // ....*................................. - trn2 v2.2D, v22.2D, v18.2D // .....*................................ - trn1 v9.2D, v8.2D, v20.2D // .......*.............................. - trn2 v1.2D, v8.2D, v20.2D // ......*............................... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - add v18.8H, v9.8H, v14.8H // ........*............................. - add v28.8H, v1.8H, v2.8H // .........*............................ - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - sub v17.8H, v9.8H, v14.8H // ..........*........................... - sub v20.8H, v1.8H, v2.8H // ...........*.......................... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... 
- // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - sqdmulh v5.8H, v18.8H, v7.H[1] // ............*......................... - sqdmulh v26.8H, v28.8H, v7.H[1] // .............*........................ - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - sqrdmulh v15.8H, v20.8H, v12.H[5] // ..............*....................... - mul v13.8H, v17.8H, v12.H[2] // ...............*...................... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - mul v23.8H, v20.8H, v12.H[4] // ...................*.................. - sqrdmulh v20.8H, v17.8H, v12.H[3] // ................*..................... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - srshr v25.8H, v5.8H, #11 // .................*.................... - srshr v2.8H, v26.8H, #11 // ..................*................... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... 
- // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - mls v23.8H, v15.8H, v7.H[0] // ......................*............... - mls v13.8H, v20.8H, v7.H[0] // .......................*.............. - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - mls v18.8H, v25.8H, v7.H[0] // .....................*................ - // gap // ...................................... - mls v28.8H, v2.8H, v7.H[0] // ....................*................. - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - sub v2.8H, v13.8H, v23.8H // ..........................*........... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... 
- // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - sub v8.8H, v18.8H, v28.8H // .........................*............ - add v20.8H, v13.8H, v23.8H // ...........................*.......... - add v14.8H, v18.8H, v28.8H // ........................*............. - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - sqrdmulh v25.8H, v2.8H, v12.H[1] // ................................*..... - mul v2.8H, v2.8H, v12.H[0] // ...............................*...... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - sqrdmulh v21.8H, v8.8H, v12.H[1] // ..............................*....... - mul v8.8H, v8.8H, v12.H[0] // .............................*........ - str q14, [x1], #(64) // ............................*......... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - str q20, [x1, #-48] // .................................*.... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... 
- mls v2.8H, v25.8H, v7.H[0] // ...................................*.. - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - mls v8.8H, v21.8H, v7.H[0] // ..................................*... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - str q2, [x1, #-16] // .....................................* - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - str q8, [x1, #-32] // ....................................*. - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... 
- // gap // ...................................... - // gap // ...................................... - // gap // ...................................... + trn2 v1.2D, v8.2D, v0.2D // .*...................................... + trn1 v14.2D, v8.2D, v0.2D // *....................................... + trn2 v17.2D, v23.2D, v24.2D // ...*.................................... + trn1 v0.2D, v23.2D, v24.2D // ..*..................................... + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + sub v12.8H, v1.8H, v17.8H // ....*................................... + sub v9.8H, v14.8H, v0.8H // ......*................................. + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + add v16.8H, v1.8H, v17.8H // .....*.................................. + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ 
+ sqrdmulh v11.8H, v12.8H, v5.H[5] // ..........*............................. + mul v19.8H, v12.8H, v5.H[4] // ...........*............................ + sqrdmulh v25.8H, v9.8H, v5.H[3] // ........*............................... + mul v30.8H, v9.8H, v5.H[2] // .........*.............................. + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + add v12.8H, v14.8H, v0.8H // .......*................................ + sqdmulh v2.8H, v16.8H, v7.H[1] // ................*....................... + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + mls v19.8H, v11.8H, v7.H[0] // ..............*......................... + mls v30.8H, v25.8H, v7.H[0] // .............*.......................... + sqdmulh v25.8H, v12.8H, v7.H[1] // ............*........................... + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + srshr v15.8H, v2.8H, #11 // .....................*.................. 
+ // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + srshr v31.8H, v25.8H, #11 // ...............*........................ + sqdmulh v22.8H, v19.8H, v7.H[1] // .................*...................... + sqdmulh v29.8H, v30.8H, v7.H[1] // ..................*..................... + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + mls v16.8H, v15.8H, v7.H[0] // ........................*............... + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ 
+ // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + mls v12.8H, v31.8H, v7.H[0] // ...................*.................... + srshr v25.8H, v22.8H, #11 // ......................*................. + srshr v20.8H, v29.8H, #11 // ....................*................... + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + mls v19.8H, v25.8H, v7.H[0] // .......................*................ + mls v30.8H, v20.8H, v7.H[0] // .........................*.............. + add v3.8H, v12.8H, v16.8H // ...........................*............ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ 
+ sub v2.8H, v12.8H, v16.8H // ............................*........... + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + str q3, [x1], #(64) // ...................................*.... + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + sub v21.8H, v30.8H, v19.8H // .............................*.......... + add v12.8H, v30.8H, v19.8H // ..........................*............. + mul v8.8H, v2.8H, v5.H[0] // ..................................*..... + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + sqrdmulh v9.8H, v2.8H, v5.H[1] // .................................*...... + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + mul v31.8H, v21.8H, v5.H[0] // ...............................*........ + sqrdmulh v4.8H, v21.8H, v5.H[1] // ................................*....... 
+ str q12, [x1, #-48] // ..............................*......... + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + mls v8.8H, v9.8H, v7.H[0] // .....................................*.. + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + mls v31.8H, v4.8H, v7.H[0] // ....................................*... + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ 
+ // gap // ........................................ + // gap // ........................................ + str q8, [x1, #-32] // .......................................* + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + str q31, [x1, #-16] // ......................................*. + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ // original source code - // ldr q15, [x3], #16 // *..................................... - // trn2 v2.4S, v0.4S, v11.4S // .*.................................... - // trn1 v19.4S, v29.4S, v26.4S // ..*................................... - // trn2 v0.4S, v29.4S, v26.4S // ...*.................................. - // trn1 v21.2D, v2.2D, v0.2D // ....*................................. - // trn2 v2.2D, v2.2D, v0.2D // .....*................................ - // trn2 v16.2D, v8.2D, v19.2D // .......*.............................. - // trn1 v0.2D, v8.2D, v19.2D // ......*............................... - // add v17.8H, v0.8H, v21.8H // ........*............................. - // add v26.8H, v16.8H, v2.8H // .........*............................ - // sub v21.8H, v0.8H, v21.8H // ..........*........................... - // sub v14.8H, v16.8H, v2.8H // ...........*.......................... - // sqdmulh v2.8H, v17.8H, v7.H[1] // ............*......................... 
- // sqdmulh v0.8H, v26.8H, v7.H[1] // .............*........................ - // sqrdmulh v16.8H, v14.8H, v15.H[5] // ..............*....................... - // mul v20.8H, v21.8H, v15.H[2] // ...............*...................... - // sqrdmulh v21.8H, v21.8H, v15.H[3] // .................*.................... - // srshr v2.8H, v2.8H, #11 // ..................*................... - // srshr v0.8H, v0.8H, #11 // ...................*.................. - // mul v23.8H, v14.8H, v15.H[4] // ................*..................... - // mls v26.8H, v0.8H, v7.H[0] // .......................*.............. - // mls v17.8H, v2.8H, v7.H[0] // ......................*............... - // mls v23.8H, v16.8H, v7.H[0] // ....................*................. - // mls v20.8H, v21.8H, v7.H[0] // .....................*................ - // add v2.8H, v17.8H, v26.8H // ...........................*.......... - // sub v16.8H, v17.8H, v26.8H // .........................*............ - // sub v0.8H, v20.8H, v23.8H // ........................*............. - // add v9.8H, v20.8H, v23.8H // ..........................*........... - // str q2, [x1], #(64) // ................................*..... - // mul v21.8H, v16.8H, v15.H[0] // ...............................*...... - // sqrdmulh v16.8H, v16.8H, v15.H[1] // ..............................*....... - // mul v31.8H, v0.8H, v15.H[0] // .............................*........ - // sqrdmulh v0.8H, v0.8H, v15.H[1] // ............................*......... - // str q9, [x1, #-48] // .................................*.... - // mls v21.8H, v16.8H, v7.H[0] // ...................................*.. - // mls v31.8H, v0.8H, v7.H[0] // ..................................*... - // str q21, [x1, #-32] // .....................................* - // str q31, [x1, #-16] // ....................................*. + // trn1 v18.2D, v8.2D, v0.2D // .*...................................... 
+ // trn2 v11.2D, v8.2D, v0.2D // *....................................... + // trn1 v10.2D, v23.2D, v24.2D // ...*.................................... + // trn2 v24.2D, v23.2D, v24.2D // ..*..................................... + // sub v19.8H, v11.8H, v24.8H // ....*................................... + // add v22.8H, v11.8H, v24.8H // ......*................................. + // sub v15.8H, v18.8H, v10.8H // .....*.................................. + // add v10.8H, v18.8H, v10.8H // ...........*............................ + // sqrdmulh v23.8H, v15.8H, v5.H[3] // .........*.............................. + // mul v6.8H, v15.8H, v5.H[2] // ..........*............................. + // sqrdmulh v9.8H, v19.8H, v5.H[5] // .......*................................ + // mul v24.8H, v19.8H, v5.H[4] // ........*............................... + // sqdmulh v0.8H, v10.8H, v7.H[1] // ...............*........................ + // mls v6.8H, v23.8H, v7.H[0] // ..............*......................... + // mls v24.8H, v9.8H, v7.H[0] // .............*.......................... + // srshr v16.8H, v0.8H, #11 // .................*...................... + // sqdmulh v0.8H, v22.8H, v7.H[1] // ............*........................... + // sqdmulh v29.8H, v24.8H, v7.H[1] // ..................*..................... + // sqdmulh v30.8H, v6.8H, v7.H[1] // ...................*.................... + // mls v10.8H, v16.8H, v7.H[0] // .....................*.................. + // srshr v23.8H, v30.8H, #11 // .......................*................ + // srshr v26.8H, v0.8H, #11 // ................*....................... + // srshr v0.8H, v29.8H, #11 // ......................*................. + // mls v24.8H, v0.8H, v7.H[0] // ........................*............... + // mls v22.8H, v26.8H, v7.H[0] // ....................*................... + // mls v6.8H, v23.8H, v7.H[0] // .........................*.............. + // add v16.8H, v6.8H, v24.8H // ..............................*......... 
+ // add v2.8H, v10.8H, v22.8H // ..........................*............. + // sub v10.8H, v10.8H, v22.8H // ...........................*............ + // sub v24.8H, v6.8H, v24.8H // .............................*.......... + // str q16, [x1, #16] // ...................................*.... + // mul v22.8H, v24.8H, v5.H[0] // .................................*...... + // sqrdmulh v11.8H, v24.8H, v5.H[1] // ..................................*..... + // sqrdmulh v0.8H, v10.8H, v5.H[1] // ................................*....... + // mul v10.8H, v10.8H, v5.H[0] // ...............................*........ + // str q2, [x1], #(64) // ............................*........... + // mls v22.8H, v11.8H, v7.H[0] // .....................................*.. + // mls v10.8H, v0.8H, v7.H[0] // ....................................*... + // str q22, [x1, #-16] // .......................................* + // str q10, [x1, #-32] // ......................................*. // --------------------------------------------------------------------- @@ -1114,614 +1206,570 @@ layer4567_start: .p2align 2 - ldr q16, [x0, #192] // ..*........................................................ - ldr q2, [x0, #128] // .*......................................................... - ldr q23, [x0, #448] // *.......................................................... - // gap // ........................................................... - // gap // ........................................................... - // gap // ........................................................... - // gap // ........................................................... - // gap // ........................................................... - ldr q21, [x0, #384] // ...*....................................................... - ldr q26, [x0, #320] // ....*...................................................... - ldr q20, [x0, #256] // .....*..................................................... 
- // gap // ........................................................... - // gap // ........................................................... - // gap // ........................................................... - // gap // ........................................................... - // gap // ........................................................... - ldr q17, [x0, #64] // ......*.................................................... - ldr q4, [x0, #0] // .......*................................................... - // gap // ........................................................... - // gap // ........................................................... - // gap // ........................................................... - // gap // ........................................................... - // gap // ........................................................... - // gap // ........................................................... - // gap // ........................................................... - // gap // ........................................................... - // gap // ........................................................... - // gap // ........................................................... - // gap // ........................................................... - // gap // ........................................................... - // gap // ........................................................... - // gap // ........................................................... - sub v25.8H, v2.8H, v16.8H // ........*.................................................. - add v2.8H, v2.8H, v16.8H // ..........*................................................ - // gap // ........................................................... - // gap // ........................................................... - // gap // ........................................................... 
- // gap // ........................................................... - // gap // ........................................................... - // gap // ........................................................... - sub v16.8H, v21.8H, v23.8H // ...........*............................................... - add v23.8H, v21.8H, v23.8H // ............*.............................................. - add v21.8H, v20.8H, v26.8H // .........*................................................. - sub v26.8H, v20.8H, v26.8H // ...............*........................................... - // gap // ........................................................... - // gap // ........................................................... - // gap // ........................................................... - // gap // ........................................................... - add v20.8H, v4.8H, v17.8H // ...................*....................................... - sub v17.8H, v4.8H, v17.8H // .......................*................................... - sqrdmulh v4.8H, v25.8H, v1.H[1] // .............*............................................. - mul v25.8H, v25.8H, v1.H[0] // ..............*............................................ - // gap // ........................................................... - // gap // ........................................................... - // gap // ........................................................... - // gap // ........................................................... - sub v13.8H, v21.8H, v23.8H // ....................*...................................... - sqrdmulh v11.8H, v16.8H, v1.H[5] // ................*.......................................... - mul v16.8H, v16.8H, v1.H[4] // .................*......................................... - add v6.8H, v21.8H, v23.8H // ..................*........................................ - // gap // ........................................................... 
- // gap // ........................................................... - // gap // ........................................................... - // gap // ........................................................... - sub v23.8H, v20.8H, v2.8H // ..........................*................................ - add v28.8H, v20.8H, v2.8H // .........................*................................. - mul v2.8H, v26.8H, v1.H[2] // .....................*..................................... - sqrdmulh v21.8H, v26.8H, v1.H[3] // ......................*.................................... - // gap // ........................................................... - // gap // ........................................................... - // gap // ........................................................... - // gap // ........................................................... - sqrdmulh v26.8H, v17.8H, v0.H[7] // ............................*.............................. - mul v20.8H, v17.8H, v0.H[6] // .............................*............................. - sqdmulh v17.8H, v6.8H, v7.H[1] // ........................*.................................. - mul v3.8H, v13.8H, v0.H[4] // ...........................*............................... - // gap // ........................................................... - // gap // ........................................................... - // gap // ........................................................... - // gap // ........................................................... - sqrdmulh v13.8H, v13.8H, v0.H[5] // .................................*......................... - sqrdmulh v10.8H, v23.8H, v0.H[3] // ..............................*............................ - mul v23.8H, v23.8H, v0.H[2] // ...............................*........................... - sqdmulh v19.8H, v28.8H, v7.H[1] // ...................................*....................... 
- // gap // ........................................................... - // gap // ........................................................... - // gap // ........................................................... - // gap // ........................................................... - mls v25.8H, v4.8H, v7.H[0] // .....................................*..................... - mls v16.8H, v11.8H, v7.H[0] // ....................................*...................... - mls v2.8H, v21.8H, v7.H[0] // ................................*.......................... - // gap // ........................................................... - // gap // ........................................................... - // gap // ........................................................... - // gap // ........................................................... - // gap // ........................................................... - mls v20.8H, v26.8H, v7.H[0] // ......................................*.................... - srshr v21.8H, v17.8H, #11 // ..................................*........................ - // gap // ........................................................... - // gap // ........................................................... - // gap // ........................................................... - // gap // ........................................................... - // gap // ........................................................... - // gap // ........................................................... - mls v3.8H, v13.8H, v7.H[0] // .......................................*................... - mls v23.8H, v10.8H, v7.H[0] // ........................................*.................. - srshr v26.8H, v19.8H, #11 // ..........................................*................ - // gap // ........................................................... - // gap // ........................................................... 
- // gap // ........................................................... - // gap // ........................................................... - // gap // ........................................................... - sub v17.8H, v2.8H, v16.8H // ...........................................*............... - add v19.8H, v2.8H, v16.8H // .............................................*............. - // gap // ........................................................... - // gap // ........................................................... - // gap // ........................................................... - // gap // ........................................................... - // gap // ........................................................... - // gap // ........................................................... - mls v6.8H, v21.8H, v7.H[0] // .........................................*................. - sub v2.8H, v20.8H, v25.8H // ..............................................*............ - add v27.8H, v20.8H, v25.8H // ............................................*.............. - // gap // ........................................................... - // gap // ........................................................... - // gap // ........................................................... - // gap // ........................................................... - // gap // ........................................................... - sub v16.8H, v23.8H, v3.8H // .................................................*......... - add v23.8H, v23.8H, v3.8H // ..................................................*........ - mul v25.8H, v17.8H, v0.H[4] // ...............................................*........... - sqrdmulh v3.8H, v17.8H, v0.H[5] // ................................................*.......... - // gap // ........................................................... - // gap // ........................................................... 
- // gap // ........................................................... - // gap // ........................................................... - mls v28.8H, v26.8H, v7.H[0] // ....................................................*...... - sqrdmulh v4.8H, v2.8H, v0.H[3] // .....................................................*..... - mul v21.8H, v2.8H, v0.H[2] // ......................................................*.... - sub v13.8H, v27.8H, v19.8H // ...................................................*....... - // gap // ........................................................... - // gap // ........................................................... - // gap // ........................................................... - // gap // ........................................................... - mul v20.8H, v16.8H, v0.H[0] // .......................................................*... - sqrdmulh v16.8H, v16.8H, v0.H[1] // ........................................................*.. - mul v2.8H, v23.8H, v29.8H // .........................................................*. - sqrdmulh v17.8H, v23.8H, v30.8H // ..........................................................* - // gap // ........................................................... - // gap // ........................................................... - // gap // ........................................................... - // gap // ........................................................... + ldr q23, [x0, #128] // ..*................................................. + ldr q19, [x0, #192] // .*.................................................. + ldr q22, [x0, #0] // *................................................... + // gap // .................................................... + // gap // .................................................... + // gap // .................................................... + // gap // .................................................... 
+ // gap // .................................................... + ldr q28, [x0, #64] // ....*............................................... + ldr q27, [x0, #384] // ...*................................................ + ldr q24, [x0, #448] // .....*.............................................. + // gap // .................................................... + // gap // .................................................... + // gap // .................................................... + // gap // .................................................... + // gap // .................................................... + ldr q3, [x0, #320] // ......*............................................. + ldr q26, [x0, #256] // .......*............................................ + // gap // .................................................... + // gap // .................................................... + // gap // .................................................... + // gap // .................................................... + // gap // .................................................... + // gap // .................................................... + // gap // .................................................... + // gap // .................................................... + // gap // .................................................... + // gap // .................................................... + // gap // .................................................... + // gap // .................................................... + // gap // .................................................... + // gap // .................................................... + sub v20.8H, v23.8H, v19.8H // .........*.......................................... + add v19.8H, v23.8H, v19.8H // ........*........................................... + // gap // .................................................... + // gap // .................................................... 
+ // gap // .................................................... + // gap // .................................................... + // gap // .................................................... + // gap // .................................................... + sub v23.8H, v22.8H, v28.8H // ..........*......................................... + add v22.8H, v22.8H, v28.8H // ...........*........................................ + sub v28.8H, v27.8H, v24.8H // .............*...................................... + add v27.8H, v27.8H, v24.8H // ............*....................................... + // gap // .................................................... + // gap // .................................................... + // gap // .................................................... + // gap // .................................................... + sub v24.8H, v26.8H, v3.8H // ...............*.................................... + add v3.8H, v26.8H, v3.8H // .................*.................................. + mul v26.8H, v20.8H, v1.H[0] // ..............*..................................... + sqrdmulh v20.8H, v20.8H, v1.H[1] // ...................*................................ + // gap // .................................................... + // gap // .................................................... + // gap // .................................................... + // gap // .................................................... + sqrdmulh v11.8H, v23.8H, v0.H[7] // ....................*............................... + mul v23.8H, v23.8H, v0.H[6] // .....................*.............................. + sqrdmulh v14.8H, v28.8H, v1.H[5] // ................*................................... + mul v28.8H, v28.8H, v1.H[4] // ..................*................................. + // gap // .................................................... + // gap // .................................................... 
+ // gap // .................................................... + // gap // .................................................... + sqrdmulh v25.8H, v24.8H, v1.H[3] // ......................*............................. + mul v24.8H, v24.8H, v1.H[2] // .......................*............................ + sub v5.8H, v3.8H, v27.8H // ........................*........................... + sub v10.8H, v22.8H, v19.8H // .........................*.......................... + // gap // .................................................... + // gap // .................................................... + // gap // .................................................... + // gap // .................................................... + add v19.8H, v22.8H, v19.8H // ............................*....................... + add v22.8H, v3.8H, v27.8H // .............................*...................... + mls v26.8H, v20.8H, v7.H[0] // ...........................*........................ + // gap // .................................................... + // gap // .................................................... + // gap // .................................................... + // gap // .................................................... + // gap // .................................................... + mls v23.8H, v11.8H, v7.H[0] // ...............................*.................... + mls v28.8H, v14.8H, v7.H[0] // ..........................*......................... + sqrdmulh v3.8H, v5.8H, v0.H[5] // ................................*................... + sqrdmulh v20.8H, v10.8H, v0.H[3] // .................................*.................. + // gap // .................................................... + // gap // .................................................... + // gap // .................................................... + // gap // .................................................... 
+ mls v24.8H, v25.8H, v7.H[0] // ..............................*..................... + mul v13.8H, v5.8H, v0.H[4] // ...................................*................ + mul v16.8H, v10.8H, v0.H[2] // ..................................*................. + add v27.8H, v19.8H, v22.8H // ....................................*............... + // gap // .................................................... + // gap // .................................................... + // gap // .................................................... + // gap // .................................................... + sub v25.8H, v19.8H, v22.8H // ...........................................*........ + // gap // .................................................... + // gap // .................................................... + // gap // .................................................... + // gap // .................................................... + // gap // .................................................... + // gap // .................................................... + // gap // .................................................... + sub v11.8H, v23.8H, v26.8H // ......................................*............. + add v15.8H, v23.8H, v26.8H // ........................................*........... + sqrdmulh v22.8H, v27.8H, v30.8H // .................................................*.. + mul v19.8H, v27.8H, v29.8H // ..................................................*. + // gap // .................................................... + // gap // .................................................... + // gap // .................................................... + // gap // .................................................... + sub v23.8H, v24.8H, v28.8H // .......................................*............ + add v27.8H, v24.8H, v28.8H // .....................................*.............. 
+ mls v13.8H, v3.8H, v7.H[0] // .........................................*.......... + mls v16.8H, v20.8H, v7.H[0] // ..........................................*......... + // gap // .................................................... + // gap // .................................................... + // gap // .................................................... + // gap // .................................................... + mul v4.8H, v11.8H, v0.H[2] // .............................................*...... + sqrdmulh v20.8H, v11.8H, v0.H[3] // ..............................................*..... + mul v5.8H, v25.8H, v0.H[0] // ................................................*... + // gap // .................................................... + // gap // .................................................... + // gap // .................................................... + // gap // .................................................... + // gap // .................................................... + mul v10.8H, v23.8H, v0.H[4] // ............................................*....... + sqrdmulh v3.8H, v23.8H, v0.H[5] // ...............................................*.... + add v23.8H, v15.8H, v27.8H // ...................................................* + // gap // .................................................... + // gap // .................................................... + // gap // .................................................... + // gap // .................................................... + // gap // .................................................... // original source code - // ldr q18, [x0, #448] // ..*........................................................ - // ldr q5, [x0, #128] // .*......................................................... - // ldr q31, [x0, #192] // *.......................................................... - // ldr q19, [x0, #384] // ...*....................................................... 
- // ldr q6, [x0, #320] // ....*...................................................... - // ldr q24, [x0, #256] // .....*..................................................... - // ldr q11, [x0, #64] // ......*.................................................... - // ldr q12, [x0, #0] // .......*................................................... - // sub v8.8H, v5.8H, v31.8H // ........*.................................................. - // add v28.8H, v24.8H, v6.8H // ............*.............................................. - // add v22.8H, v5.8H, v31.8H // .........*................................................. - // sub v13.8H, v19.8H, v18.8H // ..........*................................................ - // add v23.8H, v19.8H, v18.8H // ...........*............................................... - // sqrdmulh v19.8H, v8.8H, v1.H[1] // ................*.......................................... - // mul v8.8H, v8.8H, v1.H[0] // .................*......................................... - // sub v27.8H, v24.8H, v6.8H // .............*............................................. - // sqrdmulh v20.8H, v13.8H, v1.H[5] // ...................*....................................... - // mul v5.8H, v13.8H, v1.H[4] // ....................*...................................... - // add v6.8H, v28.8H, v23.8H // .....................*..................................... - // add v24.8H, v12.8H, v11.8H // ..............*............................................ - // sub v18.8H, v28.8H, v23.8H // ..................*........................................ - // mul v9.8H, v27.8H, v1.H[2] // ........................*.................................. - // sqrdmulh v15.8H, v27.8H, v1.H[3] // .........................*................................. - // sub v11.8H, v12.8H, v11.8H // ...............*........................................... - // sqdmulh v31.8H, v6.8H, v7.H[1] // ............................*.............................. 
- // add v28.8H, v24.8H, v22.8H // .......................*................................... - // sub v22.8H, v24.8H, v22.8H // ......................*.................................... - // mul v13.8H, v18.8H, v0.H[4] // .............................*............................. - // sqrdmulh v25.8H, v11.8H, v0.H[7] // ..........................*................................ - // mul v12.8H, v11.8H, v0.H[6] // ...........................*............................... - // sqrdmulh v10.8H, v22.8H, v0.H[3] // ...............................*........................... - // mul v24.8H, v22.8H, v0.H[2] // ................................*.......................... - // mls v9.8H, v15.8H, v7.H[0] // ....................................*...................... - // sqrdmulh v21.8H, v18.8H, v0.H[5] // ..............................*............................ - // srshr v23.8H, v31.8H, #11 // ......................................*.................... - // sqdmulh v18.8H, v28.8H, v7.H[1] // .................................*......................... - // mls v5.8H, v20.8H, v7.H[0] // ...................................*....................... - // mls v8.8H, v19.8H, v7.H[0] // ..................................*........................ - // mls v12.8H, v25.8H, v7.H[0] // .....................................*..................... - // mls v13.8H, v21.8H, v7.H[0] // .......................................*................... - // mls v24.8H, v10.8H, v7.H[0] // ........................................*.................. - // mls v6.8H, v23.8H, v7.H[0] // ............................................*.............. - // srshr v21.8H, v18.8H, #11 // .........................................*................. - // sub v20.8H, v9.8H, v5.8H // ..........................................*................ - // add v27.8H, v12.8H, v8.8H // ..............................................*............ 
- // add v19.8H, v9.8H, v5.8H // ...........................................*............... - // sub v26.8H, v12.8H, v8.8H // .............................................*............. - // mul v25.8H, v20.8H, v0.H[4] // .................................................*......... - // sqrdmulh v3.8H, v20.8H, v0.H[5] // ..................................................*........ - // sub v2.8H, v24.8H, v13.8H // ...............................................*........... - // add v17.8H, v24.8H, v13.8H // ................................................*.......... - // sub v13.8H, v27.8H, v19.8H // ......................................................*.... - // mls v28.8H, v21.8H, v7.H[0] // ...................................................*....... - // sqrdmulh v4.8H, v26.8H, v0.H[3] // ....................................................*...... - // mul v21.8H, v26.8H, v0.H[2] // .....................................................*..... - // mul v20.8H, v2.8H, v0.H[0] // .......................................................*... - // sqrdmulh v16.8H, v2.8H, v0.H[1] // ........................................................*.. - // mul v2.8H, v17.8H, v29.8H // .........................................................*. - // sqrdmulh v17.8H, v17.8H, v30.8H // ..........................................................* + // ldr q21, [x0, #0] // ..*................................................. + // ldr q9, [x0, #192] // .*.................................................. + // ldr q16, [x0, #128] // *................................................... + // ldr q15, [x0, #384] // ....*............................................... + // ldr q8, [x0, #64] // ...*................................................ + // ldr q25, [x0, #448] // .....*.............................................. + // ldr q18, [x0, #320] // ......*............................................. + // ldr q6, [x0, #256] // .......*............................................ 
+ // add v13.8H, v16.8H, v9.8H // .........*.......................................... + // sub v16.8H, v16.8H, v9.8H // ........*........................................... + // sub v2.8H, v21.8H, v8.8H // ..........*......................................... + // add v17.8H, v21.8H, v8.8H // ...........*........................................ + // add v11.8H, v15.8H, v25.8H // .............*...................................... + // sub v27.8H, v15.8H, v25.8H // ............*....................................... + // mul v15.8H, v16.8H, v1.H[0] // ................*................................... + // sub v9.8H, v6.8H, v18.8H // ..............*..................................... + // sqrdmulh v28.8H, v27.8H, v1.H[5] // ....................*............................... + // add v18.8H, v6.8H, v18.8H // ...............*.................................... + // mul v6.8H, v27.8H, v1.H[4] // .....................*.............................. + // sqrdmulh v5.8H, v16.8H, v1.H[1] // .................*.................................. + // sqrdmulh v16.8H, v2.8H, v0.H[7] // ..................*................................. + // mul v23.8H, v2.8H, v0.H[6] // ...................*................................ + // sqrdmulh v27.8H, v9.8H, v1.H[3] // ......................*............................. + // mul v2.8H, v9.8H, v1.H[2] // .......................*............................ + // sub v25.8H, v18.8H, v11.8H // ........................*........................... + // sub v9.8H, v17.8H, v13.8H // .........................*.......................... + // mls v6.8H, v28.8H, v7.H[0] // ..............................*..................... + // mls v15.8H, v5.8H, v7.H[0] // ............................*....................... + // add v5.8H, v17.8H, v13.8H // ..........................*......................... + // add v4.8H, v18.8H, v11.8H // ...........................*........................ 
+ // mls v2.8H, v27.8H, v7.H[0] // .................................*.................. + // mls v23.8H, v16.8H, v7.H[0] // .............................*...................... + // sqrdmulh v11.8H, v25.8H, v0.H[5] // ...............................*.................... + // sqrdmulh v19.8H, v9.8H, v0.H[3] // ................................*................... + // mul v16.8H, v9.8H, v0.H[2] // ...................................*................ + // mul v13.8H, v25.8H, v0.H[4] // ..................................*................. + // add v24.8H, v5.8H, v4.8H // ....................................*............... + // add v27.8H, v2.8H, v6.8H // ...........................................*........ + // sub v14.8H, v23.8H, v15.8H // ......................................*............. + // sub v28.8H, v2.8H, v6.8H // ..........................................*......... + // add v15.8H, v23.8H, v15.8H // .......................................*............ + // mls v13.8H, v11.8H, v7.H[0] // ............................................*....... + // mls v16.8H, v19.8H, v7.H[0] // .............................................*...... + // sub v25.8H, v5.8H, v4.8H // .....................................*.............. + // mul v10.8H, v28.8H, v0.H[4] // .................................................*.. + // mul v4.8H, v14.8H, v0.H[2] // ..............................................*..... + // sqrdmulh v20.8H, v14.8H, v0.H[3] // ...............................................*.... + // sqrdmulh v3.8H, v28.8H, v0.H[5] // ..................................................*. + // mul v5.8H, v25.8H, v0.H[0] // ................................................*... + // sqrdmulh v22.8H, v24.8H, v30.8H // ........................................*........... + // mul v19.8H, v24.8H, v29.8H // .........................................*.......... 
+ // add v23.8H, v15.8H, v27.8H // ...................................................* sub count, count, #1 layer123_start: - ldr q18, [x0, #464] // .......e...................................................................................... - // gap // .............................................................................................. - // gap // .............................................................................................. - add v26.8H, v27.8H, v19.8H // ............................................................*................................. - sqrdmulh v23.8H, v13.8H, v0.H[1] // ..............................................................*............................... - ldr q5, [x0, #144] // ..e........................................................................................... - ldr q31, [x0, #208] // ...e.......................................................................................... - mul v27.8H, v13.8H, v0.H[0] // .............................................................*................................ - add v10.8H, v28.8H, v6.8H // .......................................................*...................................... - sub v13.8H, v28.8H, v6.8H // ......................................................*....................................... - ldr q19, [x0, #400] // ......e....................................................................................... - ldr q6, [x0, #336] // .....e........................................................................................ - // gap // .............................................................................................. - ldr q24, [x0, #272] // ....e......................................................................................... - mls v25.8H, v3.8H, v7.H[0] // ...............................................*.............................................. 
- mls v21.8H, v4.8H, v7.H[0] // .....................................*........................................................ - mul v14.8H, v26.8H, v29.8H // .................................................................................*............ - sqrdmulh v3.8H, v26.8H, v30.8H // ..................................................................................*........... - ldr q11, [x0, #80] // .e............................................................................................ - ldr q12, [x0, #16] // e............................................................................................. - // gap // .............................................................................................. - // gap // .............................................................................................. - mls v20.8H, v16.8H, v7.H[0] // ....................................................................*......................... - mls v2.8H, v17.8H, v7.H[0] // ......................................................................................*....... - mul v4.8H, v10.8H, v29.8H // ..............................................................................*............... - // gap // .............................................................................................. - mul v26.8H, v13.8H, v0.H[0] // ........................................................*..................................... - // gap // .............................................................................................. - // gap // .............................................................................................. - // gap // .............................................................................................. - // gap // .............................................................................................. 
- mls v27.8H, v23.8H, v7.H[0] // ...............................................................*.............................. - sqrdmulh v17.8H, v10.8H, v30.8H // ...............................................................................*.............. - sub v8.8H, v5.8H, v31.8H // .............e................................................................................ - sqrdmulh v10.8H, v13.8H, v0.H[1] // .........................................................*.................................... - // gap // .............................................................................................. - // gap // .............................................................................................. - // gap // .............................................................................................. - // gap // .............................................................................................. - add v16.8H, v21.8H, v25.8H // ......................................................................*....................... - str q20, [x0, #384] // ............................................................................*................. - str q2, [x0, #128] // ............................................................................................*. - add v28.8H, v24.8H, v6.8H // ...................e.......................................................................... - add v22.8H, v5.8H, v31.8H // ..............e............................................................................... - // gap // .............................................................................................. - // gap // .............................................................................................. - sub v13.8H, v19.8H, v18.8H // .......................e...................................................................... 
- add v23.8H, v19.8H, v18.8H // ........................e..................................................................... - sqrdmulh v19.8H, v8.8H, v1.H[1] // ................e............................................................................. - mul v8.8H, v8.8H, v1.H[0] // ...............e.............................................................................. - // gap // .............................................................................................. - str q27, [x0, #320] // ...........................................................................*.................. - // gap // .............................................................................................. - // gap // .............................................................................................. - sub v27.8H, v24.8H, v6.8H // ..................e........................................................................... - sqrdmulh v2.8H, v16.8H, v30.8H // ........................................................................................*..... - sqrdmulh v20.8H, v13.8H, v1.H[5] // ..........................e................................................................... - mul v5.8H, v13.8H, v1.H[4] // .........................e.................................................................... - add v6.8H, v28.8H, v23.8H // .......................................e...................................................... - // gap // .............................................................................................. - // gap // .............................................................................................. - // gap // .............................................................................................. - // gap // .............................................................................................. 
- add v24.8H, v12.8H, v11.8H // .........e.................................................................................... - // gap // .............................................................................................. - sub v18.8H, v28.8H, v23.8H // ......................................e....................................................... - // gap // .............................................................................................. - // gap // .............................................................................................. - // gap // .............................................................................................. - mul v9.8H, v27.8H, v1.H[2] // ....................e......................................................................... - sqrdmulh v15.8H, v27.8H, v1.H[3] // .....................e........................................................................ - sub v11.8H, v12.8H, v11.8H // ........e..................................................................................... - // gap // .............................................................................................. - // gap // .............................................................................................. - // gap // .............................................................................................. - // gap // .............................................................................................. - mls v26.8H, v10.8H, v7.H[0] // ..........................................................*................................... - sqdmulh v31.8H, v6.8H, v7.H[1] // ...................................................e.......................................... - add v28.8H, v24.8H, v22.8H // .............................e................................................................ 
- sub v22.8H, v24.8H, v22.8H // ............................e................................................................. - // gap // .............................................................................................. - // gap // .............................................................................................. - // gap // .............................................................................................. - // gap // .............................................................................................. - sub v27.8H, v21.8H, v25.8H // .....................................................................*........................ - mul v13.8H, v18.8H, v0.H[4] // ........................................e..................................................... - sqrdmulh v25.8H, v11.8H, v0.H[7] // ...........e.................................................................................. - mul v12.8H, v11.8H, v0.H[6] // ..........e................................................................................... - // gap // .............................................................................................. - // gap // .............................................................................................. - sqrdmulh v10.8H, v22.8H, v0.H[3] // ...............................e.............................................................. - mul v24.8H, v22.8H, v0.H[2] // ..............................e............................................................... - // gap // .............................................................................................. - // gap // .............................................................................................. - mls v9.8H, v15.8H, v7.H[0] // ......................e....................................................................... 
- sqrdmulh v21.8H, v18.8H, v0.H[5] // .........................................e.................................................... - srshr v23.8H, v31.8H, #11 // ....................................................e......................................... - mul v16.8H, v16.8H, v29.8H // .......................................................................................*...... - // gap // .............................................................................................. - // gap // .............................................................................................. - // gap // .............................................................................................. - // gap // .............................................................................................. - sqdmulh v18.8H, v28.8H, v7.H[1] // ................................................e............................................. - mls v5.8H, v20.8H, v7.H[0] // ...........................e.................................................................. - str q26, [x0, #256] // ..........................................................................*................... - sqrdmulh v26.8H, v27.8H, v0.H[1] // ........................................................................*..................... - // gap // .............................................................................................. - // gap // .............................................................................................. - // gap // .............................................................................................. - mls v8.8H, v19.8H, v7.H[0] // .................e............................................................................ - mul v11.8H, v27.8H, v0.H[0] // .......................................................................*...................... 
- mls v12.8H, v25.8H, v7.H[0] // ............e................................................................................. - mls v4.8H, v17.8H, v7.H[0] // ................................................................................*............. - // gap // .............................................................................................. - mls v13.8H, v21.8H, v7.H[0] // ..........................................e................................................... - mls v24.8H, v10.8H, v7.H[0] // ................................e............................................................. - // gap // .............................................................................................. - // gap // .............................................................................................. - // gap // .............................................................................................. - mls v14.8H, v3.8H, v7.H[0] // ...................................................................................*.......... - mls v6.8H, v23.8H, v7.H[0] // .....................................................e........................................ - mls v16.8H, v2.8H, v7.H[0] // .........................................................................................*.... - // gap // .............................................................................................. - // gap // .............................................................................................. - // gap // .............................................................................................. - // gap // .............................................................................................. - srshr v21.8H, v18.8H, #11 // .................................................e............................................ - sub v20.8H, v9.8H, v5.8H // ...........................................e.................................................. 
- add v27.8H, v12.8H, v8.8H // ..................................e........................................................... - // gap // .............................................................................................. - mls v11.8H, v26.8H, v7.H[0] // .........................................................................*.................... - // gap // .............................................................................................. - // gap // .............................................................................................. - // gap // .............................................................................................. - add v19.8H, v9.8H, v5.8H // ............................................e................................................. - sub v26.8H, v12.8H, v8.8H // .................................e............................................................ - mul v25.8H, v20.8H, v0.H[4] // .............................................e................................................ - sqrdmulh v3.8H, v20.8H, v0.H[5] // ..............................................e............................................... - sub v2.8H, v24.8H, v13.8H // ................................................................e............................. - // gap // .............................................................................................. - // gap // .............................................................................................. - // gap // .............................................................................................. - // gap // .............................................................................................. - add v17.8H, v24.8H, v13.8H // .................................................................e............................ 
- str q16, [x0, #192] // .............................................................................................* - sub v13.8H, v27.8H, v19.8H // ...........................................................e.................................. - str q4, [x0], #(16) // ..........................................................................................*... - // gap // .............................................................................................. - // gap // .............................................................................................. - mls v28.8H, v21.8H, v7.H[0] // ..................................................e........................................... - sqrdmulh v4.8H, v26.8H, v0.H[3] // ....................................e......................................................... - mul v21.8H, v26.8H, v0.H[2] // ...................................e.......................................................... - mul v20.8H, v2.8H, v0.H[0] // ..................................................................e........................... - // gap // .............................................................................................. - sqrdmulh v16.8H, v2.8H, v0.H[1] // ...................................................................e.......................... - mul v2.8H, v17.8H, v29.8H // ....................................................................................e......... - sqrdmulh v17.8H, v17.8H, v30.8H // .....................................................................................e........ - // gap // .............................................................................................. - str q11, [x0, #432] // .............................................................................*................ - str q14, [x0, #48] // ...........................................................................................*.. 
+ sub v26.8H, v16.8H, v13.8H // ..........................................................*............................. + // gap // ........................................................................................ + ldr q21, [x0, #16] // e....................................................................................... + ldr q9, [x0, #208] // ...e.................................................................................... + sqrdmulh v11.8H, v25.8H, v0.H[1] // ...................................................*.................................... + sub v27.8H, v15.8H, v27.8H // .....................................................*.................................. + add v28.8H, v16.8H, v13.8H // ...........................................................*............................ + ldr q16, [x0, #144] // ..e..................................................................................... + ldr q15, [x0, #400] // ......e................................................................................. + // gap // ........................................................................................ + ldr q8, [x0, #80] // .e...................................................................................... + ldr q25, [x0, #464] // .......e................................................................................ + mls v4.8H, v20.8H, v7.H[0] // .....................................*.................................................. + mls v10.8H, v3.8H, v7.H[0] // ...............................................*........................................ + mul v31.8H, v23.8H, v29.8H // ...........................................................................*............ + sqrdmulh v17.8H, v23.8H, v30.8H // ............................................................................*........... + sqrdmulh v14.8H, v26.8H, v0.H[1] // .............................................................*.......................... 
+ mul v12.8H, v26.8H, v0.H[0] // ............................................................*........................... + mul v26.8H, v27.8H, v0.H[0] // .......................................................*................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + ldr q18, [x0, #336] // .....e.................................................................................. + ldr q6, [x0, #272] // ....e................................................................................... + sqrdmulh v20.8H, v27.8H, v0.H[1] // ........................................................*............................... + // gap // ........................................................................................ + mls v19.8H, v22.8H, v7.H[0] // ..........................................................................*............. + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + mul v3.8H, v28.8H, v29.8H // ..............................................................................*......... + sqrdmulh v24.8H, v28.8H, v30.8H // ...............................................................................*........ + mls v5.8H, v11.8H, v7.H[0] // ....................................................*................................... + add v13.8H, v16.8H, v9.8H // ..............e......................................................................... + mls v31.8H, v17.8H, v7.H[0] // .............................................................................*.......... 
+ // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + sub v16.8H, v16.8H, v9.8H // .............e.......................................................................... + add v23.8H, v4.8H, v10.8H // ................................................................*....................... + sub v2.8H, v21.8H, v8.8H // ........e............................................................................... + add v17.8H, v21.8H, v8.8H // .........e.............................................................................. + add v11.8H, v15.8H, v25.8H // ........................e............................................................... + // gap // ........................................................................................ + sub v27.8H, v15.8H, v25.8H // .......................e................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + mul v15.8H, v16.8H, v1.H[0] // ...............e........................................................................ + sub v9.8H, v6.8H, v18.8H // ..................e..................................................................... + str q19, [x0], #(16) // ....................................................................................*... + // gap // ........................................................................................ 
+ // gap // ........................................................................................ + str q5, [x0, #240] // ....................................................................*................... + mul v22.8H, v23.8H, v29.8H // .................................................................................*...... + sqrdmulh v25.8H, v23.8H, v30.8H // ..................................................................................*..... + sqrdmulh v28.8H, v27.8H, v1.H[5] // ..........................e............................................................. + add v18.8H, v6.8H, v18.8H // ...................e.................................................................... + mul v6.8H, v27.8H, v1.H[4] // .........................e.............................................................. + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + sqrdmulh v5.8H, v16.8H, v1.H[1] // ................e....................................................................... + sqrdmulh v16.8H, v2.8H, v0.H[7] // ...........e............................................................................ + mul v23.8H, v2.8H, v0.H[6] // ..........e............................................................................. + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ 
+ // gap // ........................................................................................ + sqrdmulh v27.8H, v9.8H, v1.H[3] // .....................e.................................................................. + mul v2.8H, v9.8H, v1.H[2] // ....................e................................................................... + mls v22.8H, v25.8H, v7.H[0] // ...................................................................................*.... + sub v25.8H, v18.8H, v11.8H // ......................................e................................................. + sub v9.8H, v17.8H, v13.8H // ............................e........................................................... + sub v19.8H, v4.8H, v10.8H // ...............................................................*........................ + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + mls v6.8H, v28.8H, v7.H[0] // ...........................e............................................................ + mls v15.8H, v5.8H, v7.H[0] // .................e...................................................................... + add v5.8H, v17.8H, v13.8H // .............................e.......................................................... + add v4.8H, v18.8H, v11.8H // .......................................e................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ 
+ // gap // ........................................................................................ + // gap // ........................................................................................ + sqrdmulh v28.8H, v19.8H, v0.H[1] // ..................................................................*..................... + mul v18.8H, v19.8H, v0.H[0] // .................................................................*...................... + mls v2.8H, v27.8H, v7.H[0] // ......................e................................................................. + mls v23.8H, v16.8H, v7.H[0] // ............e........................................................................... + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + sqrdmulh v11.8H, v25.8H, v0.H[5] // .........................................e.............................................. + sqrdmulh v19.8H, v9.8H, v0.H[3] // ...............................e........................................................ + mul v16.8H, v9.8H, v0.H[2] // ..............................e......................................................... + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + mul v13.8H, v25.8H, v0.H[4] // ........................................e............................................... 
+ mls v3.8H, v24.8H, v7.H[0] // ................................................................................*....... + mls v12.8H, v14.8H, v7.H[0] // ..............................................................*......................... + mls v26.8H, v20.8H, v7.H[0] // .........................................................*.............................. + add v24.8H, v5.8H, v4.8H // .................................................e...................................... + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + add v27.8H, v2.8H, v6.8H // ............................................e........................................... + mls v18.8H, v28.8H, v7.H[0] // ...................................................................*.................... + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + str q22, [x0, #176] // .......................................................................................* + sub v14.8H, v23.8H, v15.8H // .................................e...................................................... + sub v28.8H, v2.8H, v6.8H // ...........................................e............................................ + add v15.8H, v23.8H, v15.8H // ..................................e..................................................... 
+ mls v13.8H, v11.8H, v7.H[0] // ..........................................e............................................. + mls v16.8H, v19.8H, v7.H[0] // ................................e....................................................... + sub v25.8H, v5.8H, v4.8H // ................................................e....................................... + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + str q31, [x0, #48] // .....................................................................................*.. + // gap // ........................................................................................ + str q3, [x0, #112] // ......................................................................................*. + str q12, [x0, #368] // ......................................................................*................. + mul v10.8H, v28.8H, v0.H[4] // .............................................e.......................................... + // gap // ........................................................................................ + mul v4.8H, v14.8H, v0.H[2] // ...................................e.................................................... + sqrdmulh v20.8H, v14.8H, v0.H[3] // ....................................e................................................... + sqrdmulh v3.8H, v28.8H, v0.H[5] // ..............................................e......................................... + str q18, [x0, #432] // .......................................................................*................ + mul v5.8H, v25.8H, v0.H[0] // ..................................................e..................................... 
+ sqrdmulh v22.8H, v24.8H, v30.8H // .........................................................................e.............. + mul v19.8H, v24.8H, v29.8H // ........................................................................e............... + str q26, [x0, #304] // .....................................................................*.................. + // gap // ........................................................................................ + // gap // ........................................................................................ + add v23.8H, v15.8H, v27.8H // ......................................................e................................. // original source code - // ldr q8, [x0, #0] // ................e.............................................................................|...............e............................................................................. - // ldr q9, [x0, #(1*(512/8))] // ...............e..............................................................................|..............e.............................................................................. - // ldr q10, [x0, #(2*(512/8))] // ...e..........................................................................................|..e.......................................................................................... - // ldr q11, [x0, #(3*(512/8))] // ....e.........................................................................................|...e......................................................................................... - // ldr q12, [x0, #(4*(512/8))] // ..........e...................................................................................|.........e................................................................................... 
- // ldr q13, [x0, #(5*(512/8))] // .........e....................................................................................|........e.................................................................................... - // ldr q14, [x0, #(6*(512/8))] // ........e.....................................................................................|.......e..................................................................................... - // ldr q15, [x0, #(7*(512/8))] // e.............................................................................................e............................................................................................. - // sub v24.8h, v8.8h, v9.8h // ............................................e.................................................|...........................................e................................................. - // add v8.8h, v8.8h, v9.8h // ........................................e.....................................................|.......................................e..................................................... - // mul v9.8h, v24.8h, v0.h[6] // ....................................................e.........................................|...................................................e......................................... - // sqrdmulh v24.8h, v24.8h, v0.h[7] // ...................................................e..........................................|..................................................e.......................................... - // mls v9.8h, v24.8h, v7.h[0] // .................................................................e............................|................................................................e............................ 
- // sub v24.8h, v10.8h, v11.8h // .......................e......................................................................|......................e...................................................................... - // add v10.8h, v10.8h, v11.8h // .............................e................................................................|............................e................................................................ - // mul v11.8h, v24.8h, v1.h[0] // .................................e............................................................|................................e............................................................ - // sqrdmulh v24.8h, v24.8h, v1.h[1] // ................................e.............................................................|...............................e............................................................. - // mls v11.8h, v24.8h, v7.h[0] // ...............................................................e..............................|..............................................................e.............................. - // sub v24.8h, v12.8h, v13.8h // ...................................e..........................................................|..................................e.......................................................... - // add v12.8h, v12.8h, v13.8h // ............................e.................................................................|...........................e................................................................. - // mul v13.8h, v24.8h, v1.h[2] // ..........................................e...................................................|.........................................e................................................... 
- // sqrdmulh v24.8h, v24.8h, v1.h[3] // ...........................................e..................................................|..........................................e.................................................. - // mls v13.8h, v24.8h, v7.h[0] // .......................................................e......................................|......................................................e...................................... - // sub v24.8h, v14.8h, v15.8h // ..............................e...............................................................|.............................e............................................................... - // add v14.8h, v14.8h, v15.8h // ...............................e..............................................................|..............................e.............................................................. - // mul v15.8h, v24.8h, v1.h[4] // ......................................e.......................................................|.....................................e....................................................... - // sqrdmulh v24.8h, v24.8h, v1.h[5] // .....................................e........................................................|....................................e........................................................ - // mls v15.8h, v24.8h, v7.h[0] // ............................................................e.................................|...........................................................e................................. - // sub v24.8h, v8.8h, v10.8h // ................................................e.............................................|...............................................e............................................. 
- // add v8.8h, v8.8h, v10.8h // ...............................................e..............................................|..............................................e.............................................. - // mul v10.8h, v24.8h, v0.h[2] // ......................................................e.......................................|.....................................................e....................................... - // sqrdmulh v24.8h, v24.8h, v0.h[3] // .....................................................e........................................|....................................................e........................................ - // mls v10.8h, v24.8h, v7.h[0] // ....................................................................e.........................|...................................................................e......................... - // sub v24.8h, v9.8h, v11.8h // .............................................................................e................|............................................................................e................ - // add v9.8h, v9.8h, v11.8h // ..........................................................................e...................|.........................................................................e................... - // mul v11.8h, v24.8h, v0.h[2] // .......................................................................................e......|......................................................................................e...... - // sqrdmulh v24.8h, v24.8h, v0.h[3] // ......................................................................................e.......|.....................................................................................e....... 
- // mls v11.8h, v24.8h, v7.h[0] // ............*.................................................................................|...........*................................................................................. - // sub v24.8h, v12.8h, v14.8h // .........................................e....................................................|........................................e.................................................... - // add v12.8h, v12.8h, v14.8h // .......................................e......................................................|......................................e...................................................... - // mul v14.8h, v24.8h, v0.h[4] // ..................................................e...........................................|.................................................e........................................... - // sqrdmulh v24.8h, v24.8h, v0.h[5] // ........................................................e.....................................|.......................................................e..................................... - // mls v14.8h, v24.8h, v7.h[0] // ...................................................................e..........................|..................................................................e.......................... - // sub v24.8h, v13.8h, v15.8h // .........................................................................e....................|........................................................................e.................... - // add v13.8h, v13.8h, v15.8h // ............................................................................e.................|...........................................................................e................. 
- // mul v15.8h, v24.8h, v0.h[4] // ..............................................................................e...............|.............................................................................e............... - // sqrdmulh v24.8h, v24.8h, v0.h[5] // ...............................................................................e..............|..............................................................................e.............. - // mls v15.8h, v24.8h, v7.h[0] // ...........*..................................................................................|..........*.................................................................................. - // sqdmulh v25.8h, v8.8h, v7.h[1] // ...........................................................e..................................|..........................................................e.................................. - // srshr v25.8h, v25.8h, #11 // ........................................................................e.....................|.......................................................................e..................... - // mls v8.8h, v25.8h, v7.h[0] // .....................................................................................e........|....................................................................................e........ - // sqdmulh v25.8h, v12.8h, v7.h[1] // ..............................................e...............................................|.............................................e............................................... - // srshr v25.8h, v25.8h, #11 // .........................................................e....................................|........................................................e.................................... 
- // mls v12.8h, v25.8h, v7.h[0] // ......................................................................e.......................|.....................................................................e....................... - // sub v24.8h, v8.8h, v12.8h // .......*......................................................................................|......*...................................................................................... - // add v8.8h, v8.8h, v12.8h // ......*.......................................................................................|.....*....................................................................................... - // mul v12.8h, v24.8h, v0.h[0] // ....................*.........................................................................|...................*......................................................................... - // sqrdmulh v24.8h, v24.8h, v0.h[1] // ........................*.....................................................................|.......................*..................................................................... - // mls v12.8h, v24.8h, v7.h[0] // .............................................*................................................|............................................*................................................ - // sub v24.8h, v9.8h, v13.8h // ...................................................................................e..........|..................................................................................e.......... - // add v9.8h, v9.8h, v13.8h // .*............................................................................................|*............................................................................................ 
- // mul v13.8h, v24.8h, v0.h[0] // .....*........................................................................................|....*........................................................................................ - // sqrdmulh v24.8h, v24.8h, v0.h[1] // ..*...........................................................................................|.*........................................................................................... - // mls v13.8h, v24.8h, v7.h[0] // .....................*........................................................................|....................*........................................................................ - // sub v24.8h, v10.8h, v14.8h // ................................................................................e.............|...............................................................................e............. - // add v10.8h, v10.8h, v14.8h // .................................................................................e............|................................................................................e............ - // mul v14.8h, v24.8h, v0.h[0] // ........................................................................................e.....|.......................................................................................e..... - // sqrdmulh v24.8h, v24.8h, v0.h[1] // .........................................................................................e....|........................................................................................e.... - // mls v14.8h, v24.8h, v7.h[0] // .................*............................................................................|................*............................................................................ 
- // sub v24.8h, v11.8h, v15.8h // .................................................*............................................|................................................*............................................ - // add v11.8h, v11.8h, v15.8h // .........................*....................................................................|........................*.................................................................... - // mul v15.8h, v24.8h, v0.h[0] // ................................................................*.............................|...............................................................*............................. - // sqrdmulh v24.8h, v24.8h, v0.h[1] // ..............................................................*...............................|.............................................................*............................... - // mls v15.8h, v24.8h, v7.h[0] // ...........................................................................*..................|..........................................................................*.................. - // str q12, [x0, #(4*(512/8))] // .............................................................*................................|............................................................*................................ - // str q13, [x0, #(5*(512/8))] // ..................................*...........................................................|.................................*........................................................... - // str q14, [x0, #(6*(512/8))] // ..........................*...................................................................|.........................*................................................................... 
- // str q15, [x0, #(7*(512/8))] // ............................................................................................*.|...........................................................................................*. - // mul v12.8h, v8.8h, v29.8h // ...................*..........................................................................|..................*.......................................................................... - // sqrdmulh v8.8h, v8.8h, v30.8h // ......................*.......................................................................|.....................*....................................................................... - // mls v12.8h, v8.8h, v7.h[0] // ..................................................................*...........................|.................................................................*........................... - // mul v13.8h, v9.8h, v29.8h // .............*................................................................................|............*................................................................................ - // sqrdmulh v9.8h, v9.8h, v30.8h // ..............*...............................................................................|.............*............................................................................... - // mls v13.8h, v9.8h, v7.h[0] // .....................................................................*........................|....................................................................*........................ - // mul v14.8h, v10.8h, v29.8h // ..........................................................................................e...|.........................................................................................e... 
- // sqrdmulh v10.8h, v10.8h, v30.8h // ...........................................................................................e..|..........................................................................................e.. - // mls v14.8h, v10.8h, v7.h[0] // ..................*...........................................................................|.................*........................................................................... - // mul v15.8h, v11.8h, v29.8h // ..........................................................*...................................|.........................................................*................................... - // sqrdmulh v11.8h, v11.8h, v30.8h // ....................................*.........................................................|...................................*......................................................... - // mls v15.8h, v11.8h, v7.h[0] // .......................................................................*......................|......................................................................*...................... - // str q12, [x0], #(16) // ....................................................................................*.........|...................................................................................*......... - // str q13, [x0, #(-16 + 1*(512/8))] // .............................................................................................*|............................................................................................* - // str q14, [x0, #(-16 + 2*(512/8))] // ...........................*..................................................................|..........................*.................................................................. 
- // str q15, [x0, #(-16 + 3*(512/8))] // ..................................................................................*...........|.................................................................................*........... + // ldr q8, [x0, #0] // e......................................................................................|e..................................................................................... + // ldr q9, [x0, #(1*(512/8))] // .......e...............................................................................|.......e.............................................................................. + // ldr q10, [x0, #(2*(512/8))] // .....e.................................................................................|.....e................................................................................ + // ldr q11, [x0, #(3*(512/8))] // .e.....................................................................................|.e.................................................................................... + // ldr q12, [x0, #(4*(512/8))] // .................e.....................................................................|.................e.................................................................... + // ldr q13, [x0, #(5*(512/8))] // ................e......................................................................|................e..................................................................... + // ldr q14, [x0, #(6*(512/8))] // ......e................................................................................|......e............................................................................... + // ldr q15, [x0, #(7*(512/8))] // ........e..............................................................................|........e............................................................................. 
+ // sub v24.8h, v8.8h, v9.8h // ...........................e...........................................................|...........................e.......................................................... + // add v8.8h, v8.8h, v9.8h // ............................e..........................................................|............................e......................................................... + // mul v9.8h, v24.8h, v0.h[6] // ..........................................e............................................|..........................................e........................................... + // sqrdmulh v24.8h, v24.8h, v0.h[7] // .........................................e.............................................|.........................................e............................................ + // mls v9.8h, v24.8h, v7.h[0] // ........................................................e..............................|........................................................e............................. + // sub v24.8h, v10.8h, v11.8h // .........................e.............................................................|.........................e............................................................ + // add v10.8h, v10.8h, v11.8h // .......................e...............................................................|.......................e.............................................................. + // mul v11.8h, v24.8h, v1.h[0] // ...............................e.......................................................|...............................e...................................................... + // sqrdmulh v24.8h, v24.8h, v1.h[1] // ........................................e..............................................|........................................e............................................. 
+ // mls v11.8h, v24.8h, v7.h[0] // ..................................................e....................................|..................................................e................................... + // sub v24.8h, v12.8h, v13.8h // ................................e......................................................|................................e..................................................... + // add v12.8h, v12.8h, v13.8h // ......................................e................................................|......................................e............................................... + // mul v13.8h, v24.8h, v1.h[2] // ............................................e..........................................|............................................e......................................... + // sqrdmulh v24.8h, v24.8h, v1.h[3] // ...........................................e...........................................|...........................................e.......................................... + // mls v13.8h, v24.8h, v7.h[0] // .......................................................e...............................|.......................................................e.............................. + // sub v24.8h, v14.8h, v15.8h // ..............................e........................................................|..............................e....................................................... + // add v14.8h, v14.8h, v15.8h // .............................e.........................................................|.............................e........................................................ + // mul v15.8h, v24.8h, v1.h[4] // .......................................e...............................................|.......................................e.............................................. 
+ // sqrdmulh v24.8h, v24.8h, v1.h[5] // .....................................e.................................................|.....................................e................................................ + // mls v15.8h, v24.8h, v7.h[0] // .................................................e.....................................|.................................................e.................................... + // sub v24.8h, v8.8h, v10.8h // ...............................................e.......................................|...............................................e...................................... + // add v8.8h, v8.8h, v10.8h // ...................................................e...................................|...................................................e.................................. + // mul v10.8h, v24.8h, v0.h[2] // ...........................................................e...........................|...........................................................e.......................... + // sqrdmulh v24.8h, v24.8h, v0.h[3] // ..........................................................e............................|..........................................................e........................... + // mls v10.8h, v24.8h, v7.h[0] // ........................................................................e..............|........................................................................e............. + // sub v24.8h, v9.8h, v11.8h // ....................................................................e..................|....................................................................e................. + // add v9.8h, v9.8h, v11.8h // ......................................................................e................|......................................................................e............... 
+ // mul v11.8h, v24.8h, v0.h[2] // ..............................................................................e........|..............................................................................e....... + // sqrdmulh v24.8h, v24.8h, v0.h[3] // ...............................................................................e.......|...............................................................................e...... + // mls v11.8h, v24.8h, v7.h[0] // .........*.............................................................................|.........*............................................................................ + // sub v24.8h, v12.8h, v14.8h // ..............................................e........................................|..............................................e....................................... + // add v12.8h, v12.8h, v14.8h // ....................................................e..................................|....................................................e................................. + // mul v14.8h, v24.8h, v0.h[4] // ............................................................e..........................|............................................................e......................... + // sqrdmulh v24.8h, v24.8h, v0.h[5] // .........................................................e.............................|.........................................................e............................ + // mls v14.8h, v24.8h, v7.h[0] // .......................................................................e...............|.......................................................................e.............. + // sub v24.8h, v13.8h, v15.8h // .....................................................................e.................|.....................................................................e................ 
+ // add v13.8h, v13.8h, v15.8h // .................................................................e.....................|.................................................................e.................... + // mul v15.8h, v24.8h, v0.h[4] // .............................................................................e.........|.............................................................................e........ + // sqrdmulh v24.8h, v24.8h, v0.h[5] // ................................................................................e......|................................................................................e..... + // mls v15.8h, v24.8h, v7.h[0] // ..........*............................................................................|..........*........................................................................... + // sub v24.8h, v8.8h, v12.8h // .........................................................................e.............|.........................................................................e............ + // add v8.8h, v8.8h, v12.8h // ................................................................e......................|................................................................e..................... + // mul v12.8h, v24.8h, v0.h[0] // ..................................................................................e....|..................................................................................e... + // sqrdmulh v24.8h, v24.8h, v0.h[1] // ..*....................................................................................|..*................................................................................... + // mls v12.8h, v24.8h, v7.h[0] // ......................*................................................................|......................*............................................................... 
+ // sub v24.8h, v9.8h, v13.8h // ...*...................................................................................|...*.................................................................................. + // add v9.8h, v9.8h, v13.8h // ......................................................................................e|...................................................................................... + // mul v13.8h, v24.8h, v0.h[0] // ...............*.......................................................................|...............*...................................................................... + // sqrdmulh v24.8h, v24.8h, v0.h[1] // ..................*....................................................................|..................*................................................................... + // mls v13.8h, v24.8h, v7.h[0] // ...............................................................*.......................|...............................................................*...................... + // sub v24.8h, v10.8h, v14.8h // .......................................................................................*...................................................................................... + // add v10.8h, v10.8h, v14.8h // ....*..................................................................................|....*................................................................................. + // mul v14.8h, v24.8h, v0.h[0] // ..............*........................................................................|..............*....................................................................... + // sqrdmulh v24.8h, v24.8h, v0.h[1] // .............*.........................................................................|.............*........................................................................ 
+ // mls v14.8h, v24.8h, v7.h[0] // ..............................................................*........................|..............................................................*....................... + // sub v24.8h, v11.8h, v15.8h // ................................................*......................................|................................................*..................................... + // add v11.8h, v11.8h, v15.8h // ..........................*............................................................|..........................*........................................................... + // mul v15.8h, v24.8h, v0.h[0] // ......................................................*................................|......................................................*............................... + // sqrdmulh v24.8h, v24.8h, v0.h[1] // .....................................................*.................................|.....................................................*................................ + // mls v15.8h, v24.8h, v7.h[0] // ..................................................................*....................|..................................................................*................... + // str q12, [x0, #(4*(512/8))] // ..................................*....................................................|..................................*................................................... + // str q13, [x0, #(5*(512/8))] // .....................................................................................*.|.....................................................................................* + // str q14, [x0, #(6*(512/8))] // ............................................................................*..........|............................................................................*......... 
+ // str q15, [x0, #(7*(512/8))] // .................................................................................*.....|.................................................................................*.... + // mul v12.8h, v8.8h, v29.8h // ....................................................................................e..|....................................................................................e. + // sqrdmulh v8.8h, v8.8h, v30.8h // ...................................................................................e...|...................................................................................e.. + // mls v12.8h, v8.8h, v7.h[0] // ...................*...................................................................|...................*.................................................................. + // mul v13.8h, v9.8h, v29.8h // ...........*...........................................................................|...........*.......................................................................... + // sqrdmulh v9.8h, v9.8h, v30.8h // ............*..........................................................................|............*......................................................................... + // mls v13.8h, v9.8h, v7.h[0] // ........................*..............................................................|........................*............................................................. + // mul v14.8h, v10.8h, v29.8h // ....................*..................................................................|....................*................................................................. + // sqrdmulh v10.8h, v10.8h, v30.8h // .....................*.................................................................|.....................*................................................................ 
+ // mls v14.8h, v10.8h, v7.h[0] // .............................................................*.........................|.............................................................*........................ + // mul v15.8h, v11.8h, v29.8h // ...................................*...................................................|...................................*.................................................. + // sqrdmulh v11.8h, v11.8h, v30.8h // ....................................*..................................................|....................................*................................................. + // mls v15.8h, v11.8h, v7.h[0] // .............................................*.........................................|.............................................*........................................ + // str q12, [x0], #(16) // .................................*.....................................................|.................................*.................................................... + // str q13, [x0, #(-16 + 1*(512/8))] // ..........................................................................*............|..........................................................................*........... + // str q14, [x0, #(-16 + 2*(512/8))] // ...........................................................................*...........|...........................................................................*.......... + // str q15, [x0, #(-16 + 3*(512/8))] // ...................................................................*...................|...................................................................*.................. sub count, count, #1 cbnz count, layer123_start - mls v21.8H, v4.8H, v7.H[0] // ......*............................ - mls v25.8H, v3.8H, v7.H[0] // .....*............................. - sub v31.8H, v28.8H, v6.8H // ....*.............................. 
- add v22.8H, v28.8H, v6.8H // ...*............................... - // gap // ................................... - // gap // ................................... - // gap // ................................... - // gap // ................................... - mul v23.8H, v13.8H, v0.H[0] // ..*................................ - sqrdmulh v8.8H, v13.8H, v0.H[1] // .*................................. - mls v2.8H, v17.8H, v7.H[0] // ..........*........................ - // gap // ................................... - // gap // ................................... - // gap // ................................... - // gap // ................................... - // gap // ................................... - sqrdmulh v11.8H, v22.8H, v30.8H // ..............*.................... - mul v9.8H, v22.8H, v29.8H // ...........*....................... - sqrdmulh v17.8H, v31.8H, v0.H[1] // ...............*................... - add v18.8H, v27.8H, v19.8H // *.................................. - // gap // ................................... - // gap // ................................... - // gap // ................................... - // gap // ................................... - sub v22.8H, v21.8H, v25.8H // ......................*............ - add v21.8H, v21.8H, v25.8H // ................*.................. - mul v19.8H, v31.8H, v0.H[0] // ............*...................... - mls v20.8H, v16.8H, v7.H[0] // .........*......................... - // gap // ................................... - // gap // ................................... - // gap // ................................... - // gap // ................................... - mls v23.8H, v8.8H, v7.H[0] // .............*..................... - mul v6.8H, v18.8H, v29.8H // .......*........................... - sqrdmulh v26.8H, v18.8H, v30.8H // ........*.......................... - str q2, [x0, #128] // ..................*................ - // gap // ................................... 
- // gap // ................................... - // gap // ................................... - // gap // ................................... - sqrdmulh v2.8H, v21.8H, v30.8H // ....................*.............. - mul v28.8H, v21.8H, v29.8H // .......................*........... - mul v13.8H, v22.8H, v0.H[0] // ..........................*........ - sqrdmulh v27.8H, v22.8H, v0.H[1] // .........................*......... - // gap // ................................... - // gap // ................................... - // gap // ................................... - // gap // ................................... - mls v9.8H, v11.8H, v7.H[0] // ...........................*....... - mls v19.8H, v17.8H, v7.H[0] // .....................*............. - str q20, [x0, #384] // .................*................. - // gap // ................................... - // gap // ................................... - // gap // ................................... - // gap // ................................... - // gap // ................................... - str q23, [x0, #320] // ...................*............... - mls v6.8H, v26.8H, v7.H[0] // ............................*...... - // gap // ................................... - // gap // ................................... - // gap // ................................... - // gap // ................................... - // gap // ................................... - // gap // ................................... - mls v28.8H, v2.8H, v7.H[0] // .............................*..... - mls v13.8H, v27.8H, v7.H[0] // ..............................*.... - // gap // ................................... - // gap // ................................... - // gap // ................................... - // gap // ................................... - // gap // ................................... - // gap // ................................... - str q9, [x0], #(16) // ................................*.. 
- str q19, [x0, #240] // ........................*.......... - // gap // ................................... - // gap // ................................... - // gap // ................................... - // gap // ................................... - // gap // ................................... - // gap // ................................... - str q6, [x0, #48] // ..................................* - // gap // ................................... - // gap // ................................... - // gap // ................................... - // gap // ................................... - // gap // ................................... - // gap // ................................... - // gap // ................................... - str q28, [x0, #176] // ...............................*... - str q13, [x0, #432] // .................................*. - // gap // ................................... - // gap // ................................... - // gap // ................................... - // gap // ................................... - // gap // ................................... - // gap // ................................... + mls v19.8H, v22.8H, v7.H[0] // ............*....................... + sub v24.8H, v15.8H, v27.8H // ..*................................. + mls v10.8H, v3.8H, v7.H[0] // .....*.............................. + mls v4.8H, v20.8H, v7.H[0] // ....*............................... + // gap // .................................... + // gap // .................................... + // gap // .................................... + // gap // .................................... + add v17.8H, v16.8H, v13.8H // ...*................................ + sqrdmulh v6.8H, v23.8H, v30.8H // .......*............................ + sub v21.8H, v16.8H, v13.8H // *................................... + mul v26.8H, v23.8H, v29.8H // ......*............................. + // gap // .................................... 
+ // gap // .................................... + // gap // .................................... + // gap // .................................... + sqrdmulh v16.8H, v25.8H, v0.H[1] // .*.................................. + mul v2.8H, v24.8H, v0.H[0] // ..........*......................... + sqrdmulh v20.8H, v24.8H, v0.H[1] // ...........*........................ + // gap // .................................... + // gap // .................................... + // gap // .................................... + // gap // .................................... + // gap // .................................... + str q19, [x0], #(16) // ..................*................. + sqrdmulh v15.8H, v21.8H, v0.H[1] // ........*........................... + mul v31.8H, v21.8H, v0.H[0] // .........*.......................... + add v13.8H, v4.8H, v10.8H // .................*.................. + sub v4.8H, v4.8H, v10.8H // .......................*............ + // gap // .................................... + // gap // .................................... + // gap // .................................... + mls v26.8H, v6.8H, v7.H[0] // ................*................... + mul v27.8H, v17.8H, v29.8H // .............*...................... + sqrdmulh v17.8H, v17.8H, v30.8H // ..............*..................... + // gap // .................................... + // gap // .................................... + // gap // .................................... + // gap // .................................... + // gap // .................................... + mul v24.8H, v13.8H, v29.8H // ....................*............... + sqrdmulh v3.8H, v13.8H, v30.8H // .....................*.............. + sqrdmulh v25.8H, v4.8H, v0.H[1] // ........................*........... + mul v12.8H, v4.8H, v0.H[0] // .........................*.......... + // gap // .................................... + // gap // .................................... + // gap // .................................... 
+ // gap // .................................... + mls v31.8H, v15.8H, v7.H[0] // ...........................*........ + mls v5.8H, v16.8H, v7.H[0] // ...............*.................... + mls v2.8H, v20.8H, v7.H[0] // ............................*....... + // gap // .................................... + // gap // .................................... + // gap // .................................... + // gap // .................................... + // gap // .................................... + mls v27.8H, v17.8H, v7.H[0] // ..........................*......... + str q26, [x0, #48] // ...............................*.... + // gap // .................................... + // gap // .................................... + // gap // .................................... + // gap // .................................... + // gap // .................................... + // gap // .................................... + mls v24.8H, v3.8H, v7.H[0] // ......................*............. + mls v12.8H, v25.8H, v7.H[0] // .............................*...... + // gap // .................................... + // gap // .................................... + // gap // .................................... + // gap // .................................... + // gap // .................................... + // gap // .................................... + str q31, [x0, #368] // .................................*.. + str q5, [x0, #240] // ...................*................ + // gap // .................................... + // gap // .................................... + // gap // .................................... + // gap // .................................... + // gap // .................................... + // gap // .................................... + str q27, [x0, #112] // ................................*... + str q2, [x0, #304] // ...................................* + // gap // .................................... + // gap // .................................... 
+ // gap // .................................... + // gap // .................................... + // gap // .................................... + // gap // .................................... + str q24, [x0, #176] // ..............................*..... + str q12, [x0, #432] // ..................................*. + // gap // .................................... + // gap // .................................... + // gap // .................................... + // gap // .................................... + // gap // .................................... + // gap // .................................... // original source code - // add v26.8H, v27.8H, v19.8H // ..........*........................ - // sqrdmulh v23.8H, v13.8H, v0.H[1] // .....*............................. - // mul v27.8H, v13.8H, v0.H[0] // ....*.............................. - // add v10.8H, v28.8H, v6.8H // ...*............................... - // sub v13.8H, v28.8H, v6.8H // ..*................................ - // mls v25.8H, v3.8H, v7.H[0] // .*................................. - // mls v21.8H, v4.8H, v7.H[0] // *.................................. - // mul v14.8H, v26.8H, v29.8H // ................*.................. - // sqrdmulh v3.8H, v26.8H, v30.8H // .................*................. - // mls v20.8H, v16.8H, v7.H[0] // ..............*.................... - // mls v2.8H, v17.8H, v7.H[0] // ......*............................ - // mul v4.8H, v10.8H, v29.8H // ........*.......................... - // mul v26.8H, v13.8H, v0.H[0] // .............*..................... - // mls v27.8H, v23.8H, v7.H[0] // ...............*................... - // sqrdmulh v17.8H, v10.8H, v30.8H // .......*........................... - // sqrdmulh v10.8H, v13.8H, v0.H[1] // .........*......................... - // add v16.8H, v21.8H, v25.8H // ............*...................... - // str q20, [x0, #384] // .........................*......... - // str q2, [x0, #128] // ..................*................ 
- // str q27, [x0, #320] // ..........................*........ - // sqrdmulh v2.8H, v16.8H, v30.8H // ...................*............... - // mls v26.8H, v10.8H, v7.H[0] // ........................*.......... - // sub v27.8H, v21.8H, v25.8H // ...........*....................... - // mul v16.8H, v16.8H, v29.8H // ....................*.............. - // str q26, [x0, #256] // ...............................*... - // sqrdmulh v26.8H, v27.8H, v0.H[1] // ......................*............ - // mul v11.8H, v27.8H, v0.H[0] // .....................*............. - // mls v4.8H, v17.8H, v7.H[0] // .......................*........... - // mls v14.8H, v3.8H, v7.H[0] // ...........................*....... - // mls v16.8H, v2.8H, v7.H[0] // ............................*...... - // mls v11.8H, v26.8H, v7.H[0] // .............................*..... - // str q16, [x0, #192] // .................................*. - // str q4, [x0], #(16) // ..............................*.... - // str q11, [x0, #432] // ..................................* - // str q14, [x0, #48] // ................................*.. + // sub v26.8H, v16.8H, v13.8H // ......*............................. + // sqrdmulh v11.8H, v25.8H, v0.H[1] // ........*........................... + // sub v27.8H, v15.8H, v27.8H // .*.................................. + // add v28.8H, v16.8H, v13.8H // ....*............................... + // mls v4.8H, v20.8H, v7.H[0] // ...*................................ + // mls v10.8H, v3.8H, v7.H[0] // ..*................................. + // mul v31.8H, v23.8H, v29.8H // .......*............................ + // sqrdmulh v17.8H, v23.8H, v30.8H // .....*.............................. + // sqrdmulh v14.8H, v26.8H, v0.H[1] // ............*....................... + // mul v12.8H, v26.8H, v0.H[0] // .............*...................... + // mul v26.8H, v27.8H, v0.H[0] // .........*.......................... + // sqrdmulh v20.8H, v27.8H, v0.H[1] // ..........*......................... 
+ // mls v19.8H, v22.8H, v7.H[0] // *................................... + // mul v3.8H, v28.8H, v29.8H // .................*.................. + // sqrdmulh v24.8H, v28.8H, v30.8H // ..................*................. + // mls v5.8H, v11.8H, v7.H[0] // ........................*........... + // mls v31.8H, v17.8H, v7.H[0] // ................*................... + // add v23.8H, v4.8H, v10.8H // ..............*..................... + // str q19, [x0], #(16) // ...........*........................ + // str q5, [x0, #240] // ...............................*.... + // mul v22.8H, v23.8H, v29.8H // ...................*................ + // sqrdmulh v25.8H, v23.8H, v30.8H // ....................*............... + // mls v22.8H, v25.8H, v7.H[0] // ............................*....... + // sub v19.8H, v4.8H, v10.8H // ...............*.................... + // sqrdmulh v28.8H, v19.8H, v0.H[1] // .....................*.............. + // mul v18.8H, v19.8H, v0.H[0] // ......................*............. + // mls v3.8H, v24.8H, v7.H[0] // ..........................*......... + // mls v12.8H, v14.8H, v7.H[0] // .......................*............ + // mls v26.8H, v20.8H, v7.H[0] // .........................*.......... + // mls v18.8H, v28.8H, v7.H[0] // .............................*...... + // str q22, [x0, #176] // ..................................*. + // str q31, [x0, #48] // ...........................*........ + // str q3, [x0, #112] // ................................*... + // str q12, [x0, #368] // ..............................*..... + // str q18, [x0, #432] // ...................................* + // str q26, [x0, #304] // .................................*.. 
pop_stack diff --git a/tests/ntt_kyber/manual/intt_kyber_123_4567_opt_m1_icestorm.s b/tests/ntt_kyber/manual/intt_kyber_123_4567_opt_m1_icestorm.s index 82bdd3f..bbadddf 100644 --- a/tests/ntt_kyber/manual/intt_kyber_123_4567_opt_m1_icestorm.s +++ b/tests/ntt_kyber/manual/intt_kyber_123_4567_opt_m1_icestorm.s @@ -354,500 +354,548 @@ _intt_kyber_123_4567_opt_m1_icestorm: mov count, #8 .p2align 2 - ldr q0, [x1, #32] // .*............................................... - ldr q2, [x1, #48] // *................................................ + ldr q19, [x1, #16] // ...*............................................. + ldr q1, [x1, #0] // ..*.............................................. // gap // ................................................. // gap // ................................................. - ldr q16, [x1, #0] // ..*.............................................. - ldr q23, [x1, #16] // ...*............................................. // gap // ................................................. + ldr q0, [x1, #32] // .*............................................... // gap // ................................................. - ldr q21, [x4, #64] // ....*............................................ - ldr q26, [x4], #(6*16) // .....*........................................... + ldr q22, [x1, #48] // *................................................ // gap // ................................................. // gap // ................................................. - ldr q20, [x4, #-16] // ......*.......................................... - ldr q17, [x4, #-48] // .........*....................................... // gap // ................................................. // gap // ................................................. - trn1 v30.4S, v0.4S, v2.4S // .......*......................................... - trn2 v0.4S, v0.4S, v2.4S // ........*........................................ 
- ldr q8, [x4, #-64] // ..........*...................................... - ldr q4, [x4, #-80] // .............*................................... - trn1 v25.4S, v16.4S, v23.4S // ...........*..................................... - trn2 v16.4S, v16.4S, v23.4S // ............*.................................... - ldr q15, [x3], #16 // ......................................*.......... + ldr q6, [x3], #16 // .......................................*......... // gap // ................................................. // gap // ................................................. // gap // ................................................. + ldr q11, [x4, #48] // ........*........................................ + trn1 v27.4S, v1.4S, v19.4S // ...........*..................................... + trn2 v24.4S, v1.4S, v19.4S // .........*....................................... // gap // ................................................. + trn2 v17.4S, v0.4S, v22.4S // .......*......................................... // gap // ................................................. - trn2 v11.2D, v16.2D, v0.2D // ...............*................................. - trn2 v23.2D, v25.2D, v30.2D // ..............*.................................. + ldr q19, [x4, #80] // .................*............................... + trn1 v8.4S, v0.4S, v22.4S // ......*.......................................... // gap // ................................................. // gap // ................................................. - trn1 v30.2D, v25.2D, v30.2D // .................*............................... - trn1 v0.2D, v16.2D, v0.2D // ................*................................ + ldr q0, [x4, #32] // .....*........................................... // gap // ................................................. + trn1 v4.2D, v27.2D, v8.2D // ...............*................................. // gap // ................................................. 
- sub v16.8H, v23.8H, v11.8H // ...................*............................. - add v23.8H, v23.8H, v11.8H // ..................*.............................. // gap // ................................................. + trn1 v13.2D, v24.2D, v17.2D // ............*.................................... + ldr q2, [x4, #64] // ....*............................................ + trn2 v22.2D, v27.2D, v8.2D // ................*................................ // gap // ................................................. - sub v25.8H, v30.8H, v0.8H // .....................*........................... - add v0.8H, v30.8H, v0.8H // ....................*............................ + trn2 v12.2D, v24.2D, v17.2D // .............*................................... // gap // ................................................. // gap // ................................................. - mul v21.8H, v16.8H, v21.8H // .......................*......................... - sqrdmulh v20.8H, v16.8H, v20.8H // ......................*.......................... // gap // ................................................. + sub v5.8H, v4.8H, v13.8H // ..................*.............................. + add v28.8H, v4.8H, v13.8H // ....................*............................ // gap // ................................................. // gap // ................................................. - sqrdmulh v17.8H, v25.8H, v17.8H // ........................*........................ + sub v17.8H, v22.8H, v12.8H // ...................*............................. // gap // ................................................. - mul v2.8H, v25.8H, v8.8H // .........................*....................... - sub v16.8H, v0.8H, v23.8H // ..........................*...................... - add v0.8H, v0.8H, v23.8H // .....................................*........... + mul v23.8H, v5.8H, v0.8H // ......................*.......................... 
// gap // ................................................. + sqrdmulh v10.8H, v5.8H, v11.8H // ........................*........................ + mul v30.8H, v17.8H, v2.8H // .......................*......................... // gap // ................................................. - mls v21.8H, v20.8H, v7.H[0] // ...........................*..................... // gap // ................................................. + sqrdmulh v19.8H, v17.8H, v19.8H // .....................*........................... + add v16.8H, v22.8H, v12.8H // .........................*....................... + ldr q2, [x4, #16] // ..........*...................................... // gap // ................................................. // gap // ................................................. - sqrdmulh v8.8H, v16.8H, v4.8H // ..............................*.................. + mls v23.8H, v10.8H, v7.H[0] // ..........................*...................... // gap // ................................................. // gap // ................................................. - mls v2.8H, v17.8H, v7.H[0] // ............................*.................... - mul v23.8H, v16.8H, v26.8H // .............................*................... // gap // ................................................. + mls v30.8H, v19.8H, v7.H[0] // ...........................*..................... + sub v11.8H, v28.8H, v16.8H // ............................*.................... // gap // ................................................. + ldr q19, [x4], #(6*16) // ..............*.................................. // gap // ................................................. // gap // ................................................. // gap // ................................................. // gap // ................................................. + sqrdmulh v22.8H, v11.8H, v2.8H // .............................*................... // gap // ................................................. 
// gap // ................................................. // gap // ................................................. // gap // ................................................. - sub v16.8H, v2.8H, v21.8H // ...............................*................. // gap // ................................................. - add v2.8H, v2.8H, v21.8H // ..................................*.............. // gap // ................................................. - mls v23.8H, v8.8H, v7.H[0] // ...................................*............. - mul v21.8H, v16.8H, v26.8H // ................................*................ + sub v8.8H, v23.8H, v30.8H // ..............................*.................. + mul v21.8H, v11.8H, v19.8H // ...............................*................. // gap // ................................................. // gap // ................................................. - sqrdmulh v16.8H, v16.8H, v4.8H // .................................*............... // gap // ................................................. // gap // ................................................. // gap // ................................................. + mul v3.8H, v8.8H, v19.8H // .................................*............... + sqrdmulh v13.8H, v8.8H, v2.8H // ................................*................ // gap // ................................................. - trn2 v26.4S, v0.4S, v2.4S // .......................................*......... - trn1 v0.4S, v0.4S, v2.4S // ........................................*........ // gap // ................................................. // gap // ................................................. // gap // ................................................. + add v8.8H, v23.8H, v30.8H // ...................................*............. + add v19.8H, v28.8H, v16.8H // ..................................*.............. // gap // ................................................. 
// gap // ................................................. - mls v21.8H, v16.8H, v7.H[0] // ....................................*............ + mls v21.8H, v22.8H, v7.H[0] // .....................................*........... // gap // ................................................. // gap // ................................................. + mls v3.8H, v13.8H, v7.H[0] // ....................................*............ + trn1 v5.4S, v19.4S, v8.4S // ......................................*.......... // gap // ................................................. // gap // ................................................. // gap // ................................................. + trn2 v8.4S, v19.4S, v8.4S // ........................................*........ // gap // ................................................. // gap // ................................................. // gap // ................................................. - trn1 v16.4S, v23.4S, v21.4S // ..........................................*...... + trn1 v13.4S, v21.4S, v3.4S // ..........................................*...... // gap // ................................................. // gap // ................................................. - trn2 v2.4S, v23.4S, v21.4S // .........................................*....... + trn2 v30.4S, v21.4S, v3.4S // .........................................*....... // gap // ................................................. // gap // ................................................. // gap // ................................................. // gap // ................................................. - trn1 v17.2D, v0.2D, v16.2D // ............................................*.... + trn2 v14.2D, v8.2D, v30.2D // ............................................*.... // gap // ................................................. // gap // ................................................. 
- trn1 v23.2D, v26.2D, v2.2D // ...........................................*..... - trn2 v21.2D, v26.2D, v2.2D // .............................................*... - trn2 v20.2D, v0.2D, v16.2D // ..............................................*.. + trn2 v19.2D, v5.2D, v13.2D // ...........................................*..... + trn1 v22.2D, v8.2D, v30.2D // .............................................*... + trn1 v28.2D, v5.2D, v13.2D // ..............................................*.. // gap // ................................................. // gap // ................................................. - sub v26.8H, v17.8H, v23.8H // ................................................* + add v3.8H, v19.8H, v14.8H // ...............................................*. // gap // ................................................. // gap // ................................................. - add v4.8H, v17.8H, v23.8H // ...............................................*. + sub v19.8H, v19.8H, v14.8H // ................................................* // original source code - // ldr q16, [x1, #48] // .*............................................... - // ldr q1, [x1, #32] // *................................................ - // ldr q19, [x1, #0] // ..*.............................................. - // ldr q23, [x1, #16] // ...*............................................. - // ldr q29, [x4, #64] // ....*............................................ - // ldr q14, [x4], #(6*16) // .....*........................................... - // ldr q28, [x4, #-16] // ......*.......................................... - // trn1 v10.4S, v1.4S, v16.4S // ........*........................................ - // trn2 v26.4S, v1.4S, v16.4S // .........*....................................... - // ldr q1, [x4, #-48] // .......*......................................... - // ldr q3, [x4, #-64] // ..........*...................................... 
- // trn1 v8.4S, v19.4S, v23.4S // ............*.................................... - // trn2 v24.4S, v19.4S, v23.4S // .............*................................... - // ldr q11, [x4, #-80] // ...........*..................................... - // trn2 v25.2D, v8.2D, v10.2D // ................*................................ - // trn2 v16.2D, v24.2D, v26.2D // ...............*................................. - // trn1 v13.2D, v24.2D, v26.2D // ..................*.............................. - // trn1 v5.2D, v8.2D, v10.2D // .................*............................... - // add v2.8H, v25.8H, v16.8H // ....................*............................ - // sub v16.8H, v25.8H, v16.8H // ...................*............................. - // add v19.8H, v5.8H, v13.8H // ......................*.......................... - // sub v25.8H, v5.8H, v13.8H // .....................*........................... - // sqrdmulh v26.8H, v16.8H, v28.8H // ........................*........................ - // mul v10.8H, v16.8H, v29.8H // .......................*......................... - // sqrdmulh v16.8H, v25.8H, v1.8H // .........................*....................... - // mul v13.8H, v25.8H, v3.8H // ..........................*...................... - // sub v23.8H, v19.8H, v2.8H // ...........................*..................... - // mls v10.8H, v26.8H, v7.H[0] // .............................*................... - // mls v13.8H, v16.8H, v7.H[0] // ...............................*................. - // mul v18.8H, v23.8H, v14.8H // ................................*................ - // sqrdmulh v17.8H, v23.8H, v11.8H // ..............................*.................. - // sub v26.8H, v13.8H, v10.8H // .................................*............... - // mul v23.8H, v26.8H, v14.8H // ....................................*............ - // sqrdmulh v25.8H, v26.8H, v11.8H // .....................................*........... 
- // add v11.8H, v13.8H, v10.8H // ..................................*.............. - // mls v18.8H, v17.8H, v7.H[0] // ...................................*............. - // mls v23.8H, v25.8H, v7.H[0] // ........................................*........ - // add v2.8H, v19.8H, v2.8H // ............................*.................... - // ldr q15, [x3], #16 // ..............*.................................. - // trn2 v26.4S, v2.4S, v11.4S // ......................................*.......... - // trn1 v30.4S, v2.4S, v11.4S // .......................................*......... - // trn2 v14.4S, v18.4S, v23.4S // ..........................................*...... - // trn1 v17.4S, v18.4S, v23.4S // .........................................*....... - // trn1 v23.2D, v26.2D, v14.2D // ............................................*.... - // trn1 v2.2D, v30.2D, v17.2D // ...........................................*..... - // trn2 v21.2D, v26.2D, v14.2D // .............................................*... - // trn2 v20.2D, v30.2D, v17.2D // ..............................................*.. - // add v4.8H, v2.8H, v23.8H // ................................................* - // sub v26.8H, v2.8H, v23.8H // ...............................................*. + // ldr q30, [x1, #48] // ...*............................................. + // ldr q12, [x1, #32] // ..*.............................................. + // ldr q26, [x1, #0] // .*............................................... + // ldr q21, [x1, #16] // *................................................ + // ldr q29, [x4, #64] // ..............*.................................. + // ldr q2, [x4, #32] // ...........*..................................... + // trn1 v13.4S, v12.4S, v30.4S // ..........*...................................... + // trn2 v19.4S, v12.4S, v30.4S // ........*........................................ + // ldr q14, [x4, #48] // .....*........................................... 
+ // trn2 v9.4S, v26.4S, v21.4S // .......*......................................... + // ldr q31, [x4, #16] // .........................*....................... + // trn1 v11.4S, v26.4S, v21.4S // ......*.......................................... + // trn1 v5.2D, v9.2D, v19.2D // .............*................................... + // trn2 v8.2D, v9.2D, v19.2D // ................*................................ + // ldr q9, [x4], #(6*16) // .............................*................... + // trn1 v12.2D, v11.2D, v13.2D // ............*.................................... + // trn2 v11.2D, v11.2D, v13.2D // ...............*................................. + // ldr q15, [x4, #-16] // .........*....................................... + // sub v19.8H, v12.8H, v5.8H // .................*............................... + // sub v26.8H, v11.8H, v8.8H // ...................*............................. + // add v5.8H, v12.8H, v5.8H // ..................*.............................. + // sqrdmulh v0.8H, v26.8H, v15.8H // .......................*......................... + // mul v10.8H, v19.8H, v2.8H // ....................*............................ + // mul v29.8H, v26.8H, v29.8H // ......................*.......................... + // sqrdmulh v14.8H, v19.8H, v14.8H // .....................*........................... + // add v11.8H, v11.8H, v8.8H // ........................*........................ + // mls v10.8H, v14.8H, v7.H[0] // ..........................*...................... + // mls v29.8H, v0.8H, v7.H[0] // ...........................*..................... + // sub v0.8H, v5.8H, v11.8H // ............................*.................... + // sqrdmulh v23.8H, v0.8H, v31.8H // ..............................*.................. + // sub v16.8H, v10.8H, v29.8H // ...............................*................. + // mul v25.8H, v0.8H, v9.8H // ................................*................ 
+ // sqrdmulh v28.8H, v16.8H, v31.8H // ..................................*.............. + // mul v20.8H, v16.8H, v9.8H // .................................*............... + // add v5.8H, v5.8H, v11.8H // ....................................*............ + // add v14.8H, v10.8H, v29.8H // ...................................*............. + // mls v20.8H, v28.8H, v7.H[0] // ......................................*.......... + // mls v25.8H, v23.8H, v7.H[0] // .....................................*........... + // trn1 v29.4S, v5.4S, v14.4S // .......................................*......... + // ldr q6, [x3], #16 // ....*............................................ + // trn2 v26.4S, v5.4S, v14.4S // ........................................*........ + // trn2 v19.4S, v25.4S, v20.4S // ..........................................*...... + // trn1 v20.4S, v25.4S, v20.4S // .........................................*....... + // trn2 v27.2D, v29.2D, v20.2D // ............................................*.... + // trn2 v0.2D, v26.2D, v19.2D // ...........................................*..... + // trn1 v22.2D, v26.2D, v19.2D // .............................................*... + // trn1 v28.2D, v29.2D, v20.2D // ..............................................*.. + // add v3.8H, v27.8H, v0.8H // ...............................................*. + // sub v19.8H, v27.8H, v0.8H // ................................................* sub count, count, #1 layer4567_start: - sub v11.8H, v20.8H, v21.8H // ....................................................*........................ - add v30.8H, v20.8H, v21.8H // .....................................................*....................... - ldr q16, [x1, #112] // ...e......................................................................... - ldr q1, [x1, #96] // ..e.......................................................................... 
- sqrdmulh v17.8H, v26.8H, v15.H[3] // ..................................................*.......................... - ldr q19, [x1, #64] // e............................................................................ - sqdmulh v2.8H, v4.8H, v7.H[1] // .........................................................*................... - ldr q23, [x1, #80] // .e........................................................................... - mul v20.8H, v11.8H, v15.H[4] // ......................................................*...................... - mul v22.8H, v26.8H, v15.H[2] // .................................................*........................... - ldr q29, [x4, #64] // ................e............................................................ - ldr q14, [x4], #(6*16) // ............e................................................................ - sqrdmulh v21.8H, v11.8H, v15.H[5] // .......................................................*..................... - ldr q28, [x4, #-16] // .................e........................................................... - sqdmulh v0.8H, v30.8H, v7.H[1] // ............................................................*................ - // gap // ............................................................................. - trn1 v10.4S, v1.4S, v16.4S // ......e...................................................................... - trn2 v26.4S, v1.4S, v16.4S // .......e..................................................................... - ldr q1, [x4, #-48] // ...............e............................................................. - // gap // ............................................................................. - ldr q3, [x4, #-64] // ..............e.............................................................. - trn1 v8.4S, v19.4S, v23.4S // ....e........................................................................ 
- trn2 v24.4S, v19.4S, v23.4S // .....e....................................................................... - // gap // ............................................................................. - srshr v23.8H, v2.8H, #11 // ..........................................................*.................. - srshr v0.8H, v0.8H, #11 // .............................................................*............... - ldr q11, [x4, #-80] // .............e............................................................... - // gap // ............................................................................. - // gap // ............................................................................. - // gap // ............................................................................. - trn2 v25.2D, v8.2D, v10.2D // ........e.................................................................... - trn2 v16.2D, v24.2D, v26.2D // .........e................................................................... - trn1 v13.2D, v24.2D, v26.2D // ...........e................................................................. - trn1 v5.2D, v8.2D, v10.2D // ..........e.................................................................. - // gap // ............................................................................. - // gap // ............................................................................. - add v2.8H, v25.8H, v16.8H // ........................e.................................................... - sub v16.8H, v25.8H, v16.8H // .......................e..................................................... - // gap // ............................................................................. - // gap // ............................................................................. - add v19.8H, v5.8H, v13.8H // ...................e......................................................... 
- sub v25.8H, v5.8H, v13.8H // ..................e.......................................................... - // gap // ............................................................................. - // gap // ............................................................................. - sqrdmulh v26.8H, v16.8H, v28.8H // ..........................e.................................................. - mul v10.8H, v16.8H, v29.8H // .........................e................................................... - // gap // ............................................................................. - // gap // ............................................................................. - sqrdmulh v16.8H, v25.8H, v1.8H // .....................e....................................................... - mul v13.8H, v25.8H, v3.8H // ....................e........................................................ - // gap // ............................................................................. - // gap // ............................................................................. - mls v4.8H, v23.8H, v7.H[0] // ...........................................................*................. - mls v30.8H, v0.8H, v7.H[0] // ..............................................................*.............. - // gap // ............................................................................. - // gap // ............................................................................. - sub v23.8H, v19.8H, v2.8H // ............................e................................................ - // gap // ............................................................................. - // gap // ............................................................................. - mls v10.8H, v26.8H, v7.H[0] // ...........................e................................................. 
- mls v13.8H, v16.8H, v7.H[0] // ......................e...................................................... - // gap // ............................................................................. - mls v22.8H, v17.8H, v7.H[0] // ...................................................*......................... - // gap // ............................................................................. - mul v18.8H, v23.8H, v14.8H // ..............................e.............................................. - // gap // ............................................................................. - // gap // ............................................................................. - add v0.8H, v4.8H, v30.8H // ................................................................*............ - sqrdmulh v17.8H, v23.8H, v11.8H // ...............................e............................................. - sub v16.8H, v4.8H, v30.8H // ...............................................................*............. - // gap // ............................................................................. - // gap // ............................................................................. - str q0, [x1], #(64) // .........................................................................*... - mls v20.8H, v21.8H, v7.H[0] // ........................................................*.................... - sub v26.8H, v13.8H, v10.8H // .................................e........................................... - // gap // ............................................................................. - mul v3.8H, v16.8H, v15.H[0] // .................................................................*........... - sqrdmulh v0.8H, v16.8H, v15.H[1] // ..................................................................*.......... - // gap // ............................................................................. 
- // gap // ............................................................................. - mul v23.8H, v26.8H, v14.8H // ...................................e......................................... - // gap // ............................................................................. - // gap // ............................................................................. - sqrdmulh v25.8H, v26.8H, v11.8H // ....................................e........................................ - add v11.8H, v13.8H, v10.8H // ..................................e.......................................... - mls v18.8H, v17.8H, v7.H[0] // ................................e............................................ - // gap // ............................................................................. - // gap // ............................................................................. - mls v3.8H, v0.8H, v7.H[0] // ...................................................................*......... - sub v31.8H, v22.8H, v20.8H // ....................................................................*........ - // gap // ............................................................................. - // gap // ............................................................................. - mls v23.8H, v25.8H, v7.H[0] // .....................................e....................................... - add v2.8H, v19.8H, v2.8H // .............................e............................................... - // gap // ............................................................................. - // gap // ............................................................................. - mul v16.8H, v31.8H, v15.H[0] // ......................................................................*...... - // gap // ............................................................................. 
- sqrdmulh v0.8H, v31.8H, v15.H[1] // .......................................................................*..... - ldr q15, [x3], #16 // ..............................................e.............................. - str q3, [x1, #-32] // ...........................................................................*. - trn2 v26.4S, v2.4S, v11.4S // .......................................e..................................... - trn1 v30.4S, v2.4S, v11.4S // ......................................e...................................... - // gap // ............................................................................. - trn2 v14.4S, v18.4S, v23.4S // .........................................e................................... - trn1 v17.4S, v18.4S, v23.4S // ........................................e.................................... - // gap // ............................................................................. - // gap // ............................................................................. - add v4.8H, v22.8H, v20.8H // .....................................................................*....... - // gap // ............................................................................. - // gap // ............................................................................. - mls v16.8H, v0.8H, v7.H[0] // ........................................................................*.... - trn1 v23.2D, v26.2D, v14.2D // .............................................e............................... - trn1 v2.2D, v30.2D, v17.2D // ............................................e................................ - // gap // ............................................................................. - // gap // ............................................................................. - str q4, [x1, #-48] // ..........................................................................*.. 
- trn2 v21.2D, v26.2D, v14.2D // ...........................................e................................. - trn2 v20.2D, v30.2D, v17.2D // ..........................................e.................................. - // gap // ............................................................................. - add v4.8H, v2.8H, v23.8H // ................................................e............................ - sub v26.8H, v2.8H, v23.8H // ...............................................e............................. - str q16, [x1, #-16] // ............................................................................* - // gap // ............................................................................. + add v25.8H, v28.8H, v22.8H // ................................................*.................................. + ldr q30, [x1, #112] // ...e............................................................................... + sub v23.8H, v28.8H, v22.8H // ...............................................*................................... + ldr q12, [x1, #96] // ..e................................................................................ + sqrdmulh v20.8H, v19.8H, v6.H[5] // .......................................................*........................... + ldr q26, [x1, #64] // e.................................................................................. + ldr q21, [x1, #80] // .e................................................................................. + mul v24.8H, v19.8H, v6.H[4] // ......................................................*............................ + ldr q29, [x4, #64] // ................e.................................................................. + sqdmulh v22.8H, v3.8H, v7.H[1] // ............................................................*...................... + mul v4.8H, v23.8H, v6.H[2] // .................................................*................................. 
+ // gap // ................................................................................... + ldr q2, [x4, #32] // ..............e.................................................................... + sqdmulh v28.8H, v25.8H, v7.H[1] // .........................................................*......................... + // gap // ................................................................................... + sqrdmulh v23.8H, v23.8H, v6.H[3] // ..................................................*................................ + trn1 v13.4S, v12.4S, v30.4S // ......e............................................................................ + trn2 v19.4S, v12.4S, v30.4S // .......e........................................................................... + ldr q14, [x4, #48] // ...............e................................................................... + // gap // ................................................................................... + trn2 v9.4S, v26.4S, v21.4S // .....e............................................................................. + ldr q31, [x4, #16] // .............e..................................................................... + // gap // ................................................................................... + srshr v22.8H, v22.8H, #11 // .............................................................*..................... + trn1 v11.4S, v26.4S, v21.4S // ....e.............................................................................. + // gap // ................................................................................... + srshr v28.8H, v28.8H, #11 // ..........................................................*........................ + // gap // ................................................................................... + trn1 v5.2D, v9.2D, v19.2D // ...........e....................................................................... 
+ trn2 v8.2D, v9.2D, v19.2D // .........e......................................................................... + ldr q9, [x4], #(6*16) // ............e...................................................................... + // gap // ................................................................................... + trn1 v12.2D, v11.2D, v13.2D // ..........e........................................................................ + trn2 v11.2D, v11.2D, v13.2D // ........e.......................................................................... + // gap // ................................................................................... + ldr q15, [x4, #-16] // .................e................................................................. + mls v3.8H, v22.8H, v7.H[0] // ..............................................................*.................... + mls v25.8H, v28.8H, v7.H[0] // ...........................................................*....................... + // gap // ................................................................................... + // gap // ................................................................................... + sub v19.8H, v12.8H, v5.8H // ..................e................................................................ + sub v26.8H, v11.8H, v8.8H // .......................e........................................................... + // gap // ................................................................................... + // gap // ................................................................................... + add v5.8H, v12.8H, v5.8H // ...................e............................................................... + mls v4.8H, v23.8H, v7.H[0] // ...................................................*............................... + // gap // ................................................................................... 
+ // gap // ................................................................................... + sqrdmulh v0.8H, v26.8H, v15.8H // ..........................e........................................................ + mul v10.8H, v19.8H, v2.8H // ....................e.............................................................. + // gap // ................................................................................... + // gap // ................................................................................... + mul v29.8H, v26.8H, v29.8H // .........................e......................................................... + // gap // ................................................................................... + // gap // ................................................................................... + mls v24.8H, v20.8H, v7.H[0] // ........................................................*.......................... + sqdmulh v23.8H, v4.8H, v7.H[1] // ...............................................................*................... + sub v22.8H, v25.8H, v3.8H // .....................................................................*............. + // gap // ................................................................................... + // gap // ................................................................................... + sqrdmulh v14.8H, v19.8H, v14.8H // .....................e............................................................. + add v27.8H, v25.8H, v3.8H // ......................................................................*............ + // gap // ................................................................................... + // gap // ................................................................................... + sqrdmulh v3.8H, v22.8H, v6.H[1] // ........................................................................*.......... 
+ // gap // ................................................................................... + // gap // ................................................................................... + sqdmulh v19.8H, v24.8H, v7.H[1] // ..................................................................*................ + srshr v23.8H, v23.8H, #11 // ................................................................*.................. + str q27, [x1], #(64) // ...............................................................................*... + add v11.8H, v11.8H, v8.8H // ........................e.......................................................... + // gap // ................................................................................... + mls v10.8H, v14.8H, v7.H[0] // ......................e............................................................ + mls v29.8H, v0.8H, v7.H[0] // ...........................e....................................................... + // gap // ................................................................................... + // gap // ................................................................................... + sub v0.8H, v5.8H, v11.8H // ............................e...................................................... + // gap // ................................................................................... + // gap // ................................................................................... + srshr v19.8H, v19.8H, #11 // ...................................................................*............... + mls v4.8H, v23.8H, v7.H[0] // .................................................................*................. + mul v27.8H, v22.8H, v6.H[0] // .......................................................................*........... + // gap // ................................................................................... 
+ // gap // ................................................................................... + sqrdmulh v23.8H, v0.8H, v31.8H // ...............................e................................................... + sub v16.8H, v10.8H, v29.8H // .................................e................................................. + // gap // ................................................................................... + // gap // ................................................................................... + mul v25.8H, v0.8H, v9.8H // ..............................e.................................................... + // gap // ................................................................................... + // gap // ................................................................................... + mls v24.8H, v19.8H, v7.H[0] // ....................................................................*.............. + sqrdmulh v28.8H, v16.8H, v31.8H // ....................................e.............................................. + mul v20.8H, v16.8H, v9.8H // ...................................e............................................... + // gap // ................................................................................... + // gap // ................................................................................... + mls v27.8H, v3.8H, v7.H[0] // .........................................................................*......... + add v5.8H, v5.8H, v11.8H // .............................e..................................................... + // gap // ................................................................................... + // gap // ................................................................................... + add v14.8H, v10.8H, v29.8H // ..................................e................................................ 
+ // gap // ................................................................................... + // gap // ................................................................................... + sub v19.8H, v4.8H, v24.8H // ..........................................................................*........ + mls v20.8H, v28.8H, v7.H[0] // .....................................e............................................. + mls v25.8H, v23.8H, v7.H[0] // ................................e.................................................. + // gap // ................................................................................... + // gap // ................................................................................... + str q27, [x1, #-32] // .................................................................................*. + // gap // ................................................................................... + mul v23.8H, v19.8H, v6.H[0] // ............................................................................*...... + sqrdmulh v22.8H, v19.8H, v6.H[1] // .............................................................................*..... + trn1 v29.4S, v5.4S, v14.4S // ......................................e............................................ + // gap // ................................................................................... + ldr q6, [x3], #16 // ..............................................e.................................... + trn2 v26.4S, v5.4S, v14.4S // .......................................e........................................... + trn2 v19.4S, v25.4S, v20.4S // .........................................e......................................... + trn1 v20.4S, v25.4S, v20.4S // ........................................e.......................................... + // gap // ................................................................................... 
+ // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + add v11.8H, v4.8H, v24.8H // ...........................................................................*....... + mls v23.8H, v22.8H, v7.H[0] // ..............................................................................*.... + trn2 v27.2D, v29.2D, v20.2D // ..........................................e........................................ + trn2 v0.2D, v26.2D, v19.2D // ...........................................e....................................... + // gap // ................................................................................... + // gap // ................................................................................... + trn1 v22.2D, v26.2D, v19.2D // .............................................e..................................... + trn1 v28.2D, v29.2D, v20.2D // ............................................e...................................... + // gap // ................................................................................... + str q11, [x1, #-48] // ................................................................................*.. + // gap // ................................................................................... + str q23, [x1, #-16] // ..................................................................................* + add v3.8H, v27.8H, v0.8H // .....................................................e............................. + sub v19.8H, v27.8H, v0.8H // ....................................................e.............................. 
// original source code - // ldr q8, [x1, #(16*0)] // ...e.......................................................................|....e....................................................................... - // ldr q9, [x1, #(16*1)] // .....e.....................................................................|......e..................................................................... - // ldr q10, [x1, #(16*2)] // .e.........................................................................|..e......................................................................... - // ldr q11, [x1, #(16*3)] // e..........................................................................|.e.......................................................................... - // trn1 v25.4s, v8.4s, v9.4s // .................e.........................................................|..................e......................................................... - // trn2 v26.4s, v8.4s, v9.4s // ..................e........................................................|...................e........................................................ - // trn1 v27.4s, v10.4s, v11.4s // .............e.............................................................|..............e............................................................. - // trn2 v28.4s, v10.4s, v11.4s // ..............e............................................................|...............e............................................................ - // trn2 v10.2d, v25.2d, v27.2d // ......................e....................................................|.......................e.................................................... - // trn2 v11.2d, v26.2d, v28.2d // .......................e...................................................|........................e................................................... 
- // trn1 v8.2d, v25.2d, v27.2d // .........................e.................................................|..........................e................................................. - // trn1 v9.2d, v26.2d, v28.2d // ........................e..................................................|.........................e.................................................. - // ldr q0, [x4], #(6*16) // .........e.................................................................|..........e................................................................. - // ldr q4, [x4, #(-6*16 + 1*16)] // .....................e.....................................................|......................e..................................................... - // ldr q1, [x4, #(-6*16 + 2*16)] // ................e..........................................................|.................e.......................................................... - // ldr q5, [x4, #(-6*16 + 3*16)] // ...............e...........................................................|................e........................................................... - // ldr q2, [x4, #(-6*16 + 4*16)] // ........e..................................................................|.........e.................................................................. - // ldr q6, [x4, #(-6*16 + 5*16)] // ...........e...............................................................|............e............................................................... - // sub v24.8h, v8.8h, v9.8h // .............................e.............................................|..............................e............................................. - // add v8.8h, v8.8h, v9.8h // ............................e..............................................|.............................e.............................................. 
- // mul v9.8h, v24.8h, v1.8h // .................................e.........................................|..................................e......................................... - // sqrdmulh v24.8h, v24.8h, v5.8h // ................................e..........................................|.................................e.......................................... - // mls v9.8h, v24.8h, v7.h[0] // ......................................e....................................|.......................................e.................................... - // sub v24.8h, v10.8h, v11.8h // ...........................e...............................................|............................e............................................... - // add v10.8h, v10.8h, v11.8h // ..........................e................................................|...........................e................................................ - // mul v11.8h, v24.8h, v2.8h // ...............................e...........................................|................................e........................................... - // sqrdmulh v24.8h, v24.8h, v6.8h // ..............................e............................................|...............................e............................................ - // mls v11.8h, v24.8h, v7.h[0] // .....................................e.....................................|......................................e..................................... - // sub v24.8h, v8.8h, v10.8h // ....................................e......................................|.....................................e...................................... - // add v8.8h, v8.8h, v10.8h // ........................................................e..................|.........................................................e.................. 
- // mul v10.8h, v24.8h, v0.8h // ........................................e..................................|.........................................e.................................. - // sqrdmulh v24.8h, v24.8h, v4.8h // ..........................................e................................|...........................................e................................ - // mls v10.8h, v24.8h, v7.h[0] // ....................................................e......................|.....................................................e...................... - // sub v24.8h, v9.8h, v11.8h // ..............................................e............................|...............................................e............................ - // add v9.8h, v9.8h, v11.8h // ...................................................e.......................|....................................................e....................... - // mul v11.8h, v24.8h, v0.8h // .................................................e.........................|..................................................e......................... - // sqrdmulh v24.8h, v24.8h, v4.8h // ..................................................e........................|...................................................e........................ - // mls v11.8h, v24.8h, v7.h[0] // .......................................................e...................|........................................................e................... - // trn1 v25.4s, v8.4s, v9.4s // ..............................................................e............|...............................................................e............ - // trn2 v26.4s, v8.4s, v9.4s // .............................................................e.............|..............................................................e............. 
- // trn1 v27.4s, v10.4s, v11.4s // ................................................................e..........|.................................................................e.......... - // trn2 v28.4s, v10.4s, v11.4s // ...............................................................e...........|................................................................e........... - // trn2 v10.2d, v25.2d, v27.2d // .......................................................................e...|........................................................................e... - // trn2 v11.2d, v26.2d, v28.2d // ......................................................................e....|.......................................................................e.... - // trn1 v8.2d, v25.2d, v27.2d // ....................................................................e......|.....................................................................e...... - // trn1 v9.2d, v26.2d, v28.2d // ...................................................................e.......|....................................................................e....... - // ldr q0, [x3], #16 // ...........................................................e...............|............................................................e............... - // sub v24.8h, v8.8h, v9.8h // .........................................................................e.|..........................................................................e. - // add v8.8h, v8.8h, v9.8h // ........................................................................e..|.........................................................................e.. - // mul v9.8h, v24.8h, v0.h[2] // .......*...................................................................|........*................................................................... 
- // sqrdmulh v24.8h, v24.8h, v0.h[3] // ..*........................................................................|...*........................................................................ - // mls v9.8h, v24.8h, v7.h[0] // .......................................*...................................|........................................*................................... - // sub v24.8h, v10.8h, v11.8h // ...........................................................................*............................................................................ - // add v10.8h, v10.8h, v11.8h // ...........................................................................|*........................................................................... - // mul v11.8h, v24.8h, v0.h[4] // ......*....................................................................|.......*.................................................................... - // sqrdmulh v24.8h, v24.8h, v0.h[5] // ..........*................................................................|...........*................................................................ - // mls v11.8h, v24.8h, v7.h[0] // .............................................*.............................|..............................................*............................. - // sqdmulh v25.8h, v8.8h, v7.h[1] // ....*......................................................................|.....*...................................................................... - // srshr v25.8h, v25.8h, #11 // ...................*.......................................................|....................*....................................................... - // mls v8.8h, v25.8h, v7.h[0] // ..................................*........................................|...................................*........................................ 
- // sqdmulh v25.8h, v10.8h, v7.h[1] // ............*..............................................................|.............*.............................................................. - // srshr v25.8h, v25.8h, #11 // ....................*......................................................|.....................*...................................................... - // mls v10.8h, v25.8h, v7.h[0] // ...................................*.......................................|....................................*....................................... - // sub v24.8h, v8.8h, v10.8h // ...........................................*...............................|............................................*............................... - // add v8.8h, v8.8h, v10.8h // .........................................*.................................|..........................................*................................. - // mul v10.8h, v24.8h, v0.h[0] // ...............................................*...........................|................................................*........................... - // sqrdmulh v24.8h, v24.8h, v0.h[1] // ................................................*..........................|.................................................*.......................... - // mls v10.8h, v24.8h, v7.h[0] // .....................................................*.....................|......................................................*..................... - // sub v24.8h, v9.8h, v11.8h // ......................................................*....................|.......................................................*.................... - // add v9.8h, v9.8h, v11.8h // .................................................................*.........|..................................................................*......... 
- // mul v11.8h, v24.8h, v0.h[0] // .........................................................*.................|..........................................................*................. - // sqrdmulh v24.8h, v24.8h, v0.h[1] // ..........................................................*................|...........................................................*................ - // mls v11.8h, v24.8h, v7.h[0] // ..................................................................*........|...................................................................*........ - // str q8, [x1], #(64) // ............................................*..............................|.............................................*.............................. - // str q9, [x1, #(-64 + 16*1)] // .....................................................................*.....|......................................................................*..... - // str q10, [x1, #(-64 + 16*2)] // ............................................................*..............|.............................................................*.............. - // str q11, [x1, #(-64 + 16*3)] // ..........................................................................*|...........................................................................* + // ldr q8, [x1, #(16*0)] // ....e.............................................................................|....e........................................................................... + // ldr q9, [x1, #(16*1)] // .....e............................................................................|.....e.......................................................................... + // ldr q10, [x1, #(16*2)] // ..e...............................................................................|..e............................................................................. 
+ // ldr q11, [x1, #(16*3)] // e.................................................................................|e............................................................................... + // trn1 v25.4s, v8.4s, v9.4s // ...................e..............................................................|...................e............................................................ + // trn2 v26.4s, v8.4s, v9.4s // ................e.................................................................|................e............................................................... + // trn1 v27.4s, v10.4s, v11.4s // .............e....................................................................|.............e.................................................................. + // trn2 v28.4s, v10.4s, v11.4s // ..............e...................................................................|..............e................................................................. + // trn2 v10.2d, v25.2d, v27.2d // .........................e........................................................|.........................e...................................................... + // trn2 v11.2d, v26.2d, v28.2d // ......................e...........................................................|......................e......................................................... + // trn1 v8.2d, v25.2d, v27.2d // ........................e.........................................................|........................e....................................................... + // trn1 v9.2d, v26.2d, v28.2d // .....................e............................................................|.....................e.......................................................... + // ldr q0, [x4], #(6*16) // .......................e..........................................................|.......................e........................................................ 
+ // ldr q4, [x4, #(-6*16 + 1*16)] // .................e................................................................|.................e.............................................................. + // ldr q1, [x4, #(-6*16 + 2*16)] // ..........e.......................................................................|..........e..................................................................... + // ldr q5, [x4, #(-6*16 + 3*16)] // ...............e..................................................................|...............e................................................................ + // ldr q2, [x4, #(-6*16 + 4*16)] // .......e..........................................................................|.......e........................................................................ + // ldr q6, [x4, #(-6*16 + 5*16)] // ..........................e.......................................................|..........................e..................................................... + // sub v24.8h, v8.8h, v9.8h // .............................e....................................................|.............................e.................................................. + // add v8.8h, v8.8h, v9.8h // ...............................e..................................................|...............................e................................................ + // mul v9.8h, v24.8h, v1.8h // ..................................e...............................................|..................................e............................................. + // sqrdmulh v24.8h, v24.8h, v5.8h // .......................................e..........................................|.......................................e........................................ 
+ // mls v9.8h, v24.8h, v7.h[0] // ..............................................e...................................|..............................................e................................. + // sub v24.8h, v10.8h, v11.8h // ..............................e...................................................|..............................e................................................. + // add v10.8h, v10.8h, v11.8h // .............................................e....................................|.............................................e.................................. + // mul v11.8h, v24.8h, v2.8h // ...................................e..............................................|...................................e............................................ + // sqrdmulh v24.8h, v24.8h, v6.8h // .................................e................................................|.................................e.............................................. + // mls v11.8h, v24.8h, v7.h[0] // ...............................................e..................................|...............................................e................................ + // sub v24.8h, v8.8h, v10.8h // ................................................e.................................|................................................e............................... + // add v8.8h, v8.8h, v10.8h // ...........................................................e......................|...........................................................e.................... + // mul v10.8h, v24.8h, v0.8h // ......................................................e...........................|......................................................e......................... + // sqrdmulh v24.8h, v24.8h, v4.8h // ....................................................e.............................|....................................................e........................... 
+ // mls v10.8h, v24.8h, v7.h[0] // ...............................................................e..................|...............................................................e................ + // sub v24.8h, v9.8h, v11.8h // .....................................................e............................|.....................................................e.......................... + // add v9.8h, v9.8h, v11.8h // ............................................................e.....................|............................................................e................... + // mul v11.8h, v24.8h, v0.8h // .........................................................e........................|.........................................................e...................... + // sqrdmulh v24.8h, v24.8h, v4.8h // ........................................................e.........................|........................................................e....................... + // mls v11.8h, v24.8h, v7.h[0] // ..............................................................e...................|..............................................................e................. + // trn1 v25.4s, v8.4s, v9.4s // ...................................................................e..............|...................................................................e............ + // trn2 v26.4s, v8.4s, v9.4s // .....................................................................e............|.....................................................................e.......... + // trn1 v27.4s, v10.4s, v11.4s // .......................................................................e..........|.......................................................................e........ + // trn2 v28.4s, v10.4s, v11.4s // ......................................................................e...........|......................................................................e......... 
+ // trn2 v10.2d, v25.2d, v27.2d // ..........................................................................e.......|..........................................................................e..... + // trn2 v11.2d, v26.2d, v28.2d // ...........................................................................e......|...........................................................................e.... + // trn1 v8.2d, v25.2d, v27.2d // .............................................................................e....|.............................................................................e.. + // trn1 v9.2d, v26.2d, v28.2d // ............................................................................e.....|............................................................................e... + // ldr q0, [x3], #16 // ....................................................................e.............|....................................................................e........... + // sub v24.8h, v8.8h, v9.8h // .*................................................................................|.*.............................................................................. + // add v8.8h, v8.8h, v9.8h // ..................................................................................*................................................................................ + // mul v9.8h, v24.8h, v0.h[2] // .........*........................................................................|.........*...................................................................... + // sqrdmulh v24.8h, v24.8h, v0.h[3] // ............*.....................................................................|............*................................................................... + // mls v9.8h, v24.8h, v7.h[0] // ................................*.................................................|................................*............................................... 
+ // sub v24.8h, v10.8h, v11.8h // .................................................................................e|................................................................................ + // add v10.8h, v10.8h, v11.8h // ................................................................................e.|................................................................................ + // mul v11.8h, v24.8h, v0.h[4] // ......*...........................................................................|......*......................................................................... + // sqrdmulh v24.8h, v24.8h, v0.h[5] // ...*..............................................................................|...*............................................................................ + // mls v11.8h, v24.8h, v7.h[0] // ....................................*.............................................|....................................*........................................... + // sqdmulh v25.8h, v8.8h, v7.h[1] // ...........*......................................................................|...........*.................................................................... + // srshr v25.8h, v25.8h, #11 // ....................*.............................................................|....................*........................................................... + // mls v8.8h, v25.8h, v7.h[0] // ............................*.....................................................|............................*................................................... + // sqdmulh v25.8h, v10.8h, v7.h[1] // ........*.........................................................................|........*....................................................................... 
+ // srshr v25.8h, v25.8h, #11 // ..................*...............................................................|..................*............................................................. + // mls v10.8h, v25.8h, v7.h[0] // ...........................*......................................................|...........................*.................................................... + // sqdmulh v25.8h, v9.8h, v7.h[1] // .....................................*............................................|.....................................*.......................................... + // srshr v25.8h, v25.8h, #11 // ...........................................*......................................|...........................................*.................................... + // mls v9.8h, v25.8h, v7.h[0] // ..................................................*...............................|..................................................*............................. + // sqdmulh v25.8h, v11.8h, v7.h[1] // ..........................................*.......................................|..........................................*..................................... + // srshr v25.8h, v25.8h, #11 // .................................................*................................|.................................................*.............................. + // mls v11.8h, v25.8h, v7.h[0] // .......................................................*..........................|.......................................................*........................ + // sub v24.8h, v8.8h, v10.8h // ......................................*...........................................|......................................*......................................... + // add v8.8h, v8.8h, v10.8h // ........................................*.........................................|........................................*....................................... 
+ // mul v10.8h, v24.8h, v0.h[0] // ...................................................*..............................|...................................................*............................ + // sqrdmulh v24.8h, v24.8h, v0.h[1] // .........................................*........................................|.........................................*...................................... + // mls v10.8h, v24.8h, v7.h[0] // ..........................................................*.......................|..........................................................*..................... + // sub v24.8h, v9.8h, v11.8h // .............................................................*....................|.............................................................*.................. + // add v9.8h, v9.8h, v11.8h // ........................................................................*.........|........................................................................*....... + // mul v11.8h, v24.8h, v0.h[0] // .................................................................*................|.................................................................*.............. + // sqrdmulh v24.8h, v24.8h, v0.h[1] // ..................................................................*...............|..................................................................*............. + // mls v11.8h, v24.8h, v7.h[0] // .........................................................................*........|.........................................................................*...... + // str q8, [x1], #(64) // ............................................*.....................................|............................................*................................... + // str q9, [x1, #(-64 + 16*1)] // ..............................................................................*...|..............................................................................*. 
+ // str q10, [x1, #(-64 + 16*2)] // ................................................................*.................|................................................................*............... + // str q11, [x1, #(-64 + 16*3)] // ...............................................................................*..|...............................................................................* sub count, count, #1 cbnz count, layer4567_start - sub v0.8H, v20.8H, v21.8H // *........................... - add v2.8H, v20.8H, v21.8H // .*.......................... - // gap // ............................ - // gap // ............................ - sqdmulh v16.8H, v4.8H, v7.H[1] // ...*........................ - sqrdmulh v23.8H, v26.8H, v15.H[3] // ..*......................... - // gap // ............................ - // gap // ............................ - mul v21.8H, v26.8H, v15.H[2] // .....*...................... - sqdmulh v26.8H, v2.8H, v7.H[1] // .......*.................... - // gap // ............................ - // gap // ............................ - mul v20.8H, v0.8H, v15.H[4] // ....*....................... - sqrdmulh v0.8H, v0.8H, v15.H[5] // ......*..................... - // gap // ............................ - // gap // ............................ - srshr v16.8H, v16.8H, #11 // ........*................... - // gap // ............................ - // gap // ............................ - // gap // ............................ - mls v21.8H, v23.8H, v7.H[0] // ............*............... - srshr v23.8H, v26.8H, #11 // .........*.................. - // gap // ............................ - // gap // ............................ - mls v20.8H, v0.8H, v7.H[0] // ................*........... - // gap // ............................ - // gap // ............................ - // gap // ............................ - mls v4.8H, v16.8H, v7.H[0] // ..........*................. - // gap // ............................ 
- // gap // ............................ - // gap // ............................ - mls v2.8H, v23.8H, v7.H[0] // ...........*................ - // gap // ............................ - // gap // ............................ - // gap // ............................ - sub v16.8H, v21.8H, v20.8H // ....................*....... - add v0.8H, v21.8H, v20.8H // ........................*... - // gap // ............................ - // gap // ............................ - // gap // ............................ - // gap // ............................ - // gap // ............................ - // gap // ............................ - sub v23.8H, v4.8H, v2.8H // ..............*............. - add v2.8H, v4.8H, v2.8H // .............*.............. - str q0, [x1, #16] // ..........................*. - // gap // ............................ - mul v0.8H, v16.8H, v15.H[0] // .....................*...... - sqrdmulh v16.8H, v16.8H, v15.H[1] // ......................*..... - // gap // ............................ - // gap // ............................ - mul v21.8H, v23.8H, v15.H[0] // .................*.......... - sqrdmulh v23.8H, v23.8H, v15.H[1] // ..................*......... - str q2, [x1], #(64) // ...............*............ - // gap // ............................ - // gap // ............................ - // gap // ............................ - // gap // ............................ - // gap // ............................ - mls v0.8H, v16.8H, v7.H[0] // .........................*.. - // gap // ............................ - // gap // ............................ - // gap // ............................ - mls v21.8H, v23.8H, v7.H[0] // ...................*........ - // gap // ............................ - // gap // ............................ - // gap // ............................ - // gap // ............................ - // gap // ............................ - // gap // ............................ - // gap // ............................ 
- str q0, [x1, #-16] // ...........................* - // gap // ............................ - // gap // ............................ - // gap // ............................ - str q21, [x1, #-32] // .......................*.... - // gap // ............................ - // gap // ............................ - // gap // ............................ + sub v27.8H, v28.8H, v22.8H // .*................................ + // gap // .................................. + mul v13.8H, v19.8H, v6.H[4] // ...*.............................. + // gap // .................................. + sqrdmulh v19.8H, v19.8H, v6.H[5] // ..*............................... + // gap // .................................. + // gap // .................................. + // gap // .................................. + mul v1.8H, v27.8H, v6.H[2] // .....*............................ + sqrdmulh v5.8H, v27.8H, v6.H[3] // .......*.......................... + // gap // .................................. + // gap // .................................. + // gap // .................................. + // gap // .................................. + add v28.8H, v28.8H, v22.8H // *................................. + // gap // .................................. + // gap // .................................. + mls v13.8H, v19.8H, v7.H[0] // .............*.................... + // gap // .................................. + sqdmulh v26.8H, v3.8H, v7.H[1] // ....*............................. + mls v1.8H, v5.8H, v7.H[0] // ............*..................... + // gap // .................................. + // gap // .................................. + // gap // .................................. + // gap // .................................. + sqdmulh v22.8H, v28.8H, v7.H[1] // ......*........................... + // gap // .................................. + // gap // .................................. + sqdmulh v31.8H, v13.8H, v7.H[1] // ..................*............... 
+ // gap // .................................. + srshr v30.8H, v26.8H, #11 // ........*......................... + // gap // .................................. + sqdmulh v19.8H, v1.8H, v7.H[1] // ..............*................... + // gap // .................................. + // gap // .................................. + // gap // .................................. + srshr v0.8H, v22.8H, #11 // .........*........................ + // gap // .................................. + // gap // .................................. + // gap // .................................. + // gap // .................................. + // gap // .................................. + // gap // .................................. + mls v3.8H, v30.8H, v7.H[0] // ..........*....................... + srshr v30.8H, v19.8H, #11 // ...................*.............. + srshr v19.8H, v31.8H, #11 // .....................*............ + // gap // .................................. + // gap // .................................. + mls v28.8H, v0.8H, v7.H[0] // ...........*...................... + // gap // .................................. + // gap // .................................. + // gap // .................................. + // gap // .................................. + // gap // .................................. + // gap // .................................. + // gap // .................................. + mls v1.8H, v30.8H, v7.H[0] // ......................*........... + // gap // .................................. + mls v13.8H, v19.8H, v7.H[0] // ........................*......... + // gap // .................................. + sub v19.8H, v28.8H, v3.8H // ...............*.................. + // gap // .................................. + // gap // .................................. + // gap // .................................. + // gap // .................................. + // gap // .................................. + // gap // .................................. 
+ // gap // .................................. + sub v23.8H, v1.8H, v13.8H // ..........................*....... + sqrdmulh v30.8H, v19.8H, v6.H[1] // .................*................ + // gap // .................................. + // gap // .................................. + mul v4.8H, v19.8H, v6.H[0] // .......................*.......... + // gap // .................................. + // gap // .................................. + // gap // .................................. + mul v27.8H, v23.8H, v6.H[0] // ............................*..... + sqrdmulh v21.8H, v23.8H, v6.H[1] // .............................*.... + // gap // .................................. + // gap // .................................. + add v3.8H, v28.8H, v3.8H // ................*................. + // gap // .................................. + // gap // .................................. + // gap // .................................. + mls v4.8H, v30.8H, v7.H[0] // .........................*........ + add v20.8H, v1.8H, v13.8H // ..............................*... + // gap // .................................. + // gap // .................................. + mls v27.8H, v21.8H, v7.H[0] // ...............................*.. + str q3, [x1], #(64) // ....................*............. + // gap // .................................. + // gap // .................................. + str q20, [x1, #-48] // ................................*. + // gap // .................................. + // gap // .................................. + // gap // .................................. + str q4, [x1, #-32] // ...........................*...... + // gap // .................................. + // gap // .................................. + // gap // .................................. + str q27, [x1, #-16] // .................................* + // gap // .................................. + // gap // .................................. + // gap // .................................. 
// original source code - // sub v11.8H, v20.8H, v21.8H // *........................... - // add v30.8H, v20.8H, v21.8H // .*.......................... - // sqrdmulh v17.8H, v26.8H, v15.H[3] // ...*........................ - // sqdmulh v2.8H, v4.8H, v7.H[1] // ..*......................... - // mul v20.8H, v11.8H, v15.H[4] // ......*..................... - // mul v22.8H, v26.8H, v15.H[2] // ....*....................... - // sqrdmulh v21.8H, v11.8H, v15.H[5] // .......*.................... - // sqdmulh v0.8H, v30.8H, v7.H[1] // .....*...................... - // srshr v23.8H, v2.8H, #11 // ........*................... - // srshr v0.8H, v0.8H, #11 // ..........*................. - // mls v4.8H, v23.8H, v7.H[0] // ............*............... - // mls v30.8H, v0.8H, v7.H[0] // .............*.............. - // mls v22.8H, v17.8H, v7.H[0] // .........*.................. - // add v0.8H, v4.8H, v30.8H // .................*.......... - // sub v16.8H, v4.8H, v30.8H // ................*........... - // str q0, [x1], #(64) // .......................*.... - // mls v20.8H, v21.8H, v7.H[0] // ...........*................ - // mul v3.8H, v16.8H, v15.H[0] // .....................*...... - // sqrdmulh v0.8H, v16.8H, v15.H[1] // ......................*..... - // mls v3.8H, v0.8H, v7.H[0] // .........................*.. - // sub v31.8H, v22.8H, v20.8H // ..............*............. - // mul v16.8H, v31.8H, v15.H[0] // ...................*........ - // sqrdmulh v0.8H, v31.8H, v15.H[1] // ....................*....... - // str q3, [x1, #-32] // ...........................* - // add v4.8H, v22.8H, v20.8H // ...............*............ - // mls v16.8H, v0.8H, v7.H[0] // ........................*... - // str q4, [x1, #-48] // ..................*......... - // str q16, [x1, #-16] // ..........................*. + // add v25.8H, v28.8H, v22.8H // .....*............................ + // sub v23.8H, v28.8H, v22.8H // *................................. 
+ // sqrdmulh v20.8H, v19.8H, v6.H[5] // ..*............................... + // mul v24.8H, v19.8H, v6.H[4] // .*................................ + // sqdmulh v22.8H, v3.8H, v7.H[1] // .......*.......................... + // mul v4.8H, v23.8H, v6.H[2] // ...*.............................. + // sqdmulh v28.8H, v25.8H, v7.H[1] // .........*........................ + // sqrdmulh v23.8H, v23.8H, v6.H[3] // ....*............................. + // srshr v22.8H, v22.8H, #11 // ...........*...................... + // srshr v28.8H, v28.8H, #11 // .............*.................... + // mls v3.8H, v22.8H, v7.H[0] // ..............*................... + // mls v25.8H, v28.8H, v7.H[0] // .................*................ + // mls v4.8H, v23.8H, v7.H[0] // ........*......................... + // mls v24.8H, v20.8H, v7.H[0] // ......*........................... + // sqdmulh v23.8H, v4.8H, v7.H[1] // ............*..................... + // sub v22.8H, v25.8H, v3.8H // ....................*............. + // add v27.8H, v25.8H, v3.8H // ..........................*....... + // sqrdmulh v3.8H, v22.8H, v6.H[1] // ......................*........... + // sqdmulh v19.8H, v24.8H, v7.H[1] // ..........*....................... + // srshr v23.8H, v23.8H, #11 // ...............*.................. + // str q27, [x1], #(64) // ..............................*... + // srshr v19.8H, v19.8H, #11 // ................*................. + // mls v4.8H, v23.8H, v7.H[0] // ..................*............... + // mul v27.8H, v22.8H, v6.H[0] // .......................*.......... + // mls v24.8H, v19.8H, v7.H[0] // ...................*.............. + // mls v27.8H, v3.8H, v7.H[0] // ...........................*...... + // sub v19.8H, v4.8H, v24.8H // .....................*............ + // str q27, [x1, #-32] // ................................*. + // mul v23.8H, v19.8H, v6.H[0] // ........................*......... + // sqrdmulh v22.8H, v19.8H, v6.H[1] // .........................*........ 
+ // add v11.8H, v4.8H, v24.8H // ............................*..... + // mls v23.8H, v22.8H, v7.H[0] // .............................*.... + // str q11, [x1, #-48] // ...............................*.. + // str q23, [x1, #-16] // .................................* // --------------------------------------------------------------------- @@ -866,554 +914,526 @@ layer4567_start: .p2align 2 - ldr q28, [x0, #64] // .*...................... - ldr q2, [x0, #0] // *....................... - // gap // ........................ - // gap // ........................ - ldr q17, [x0, #128] // .....*.................. - ldr q19, [x0, #192] // ....*................... - // gap // ........................ - // gap // ........................ - ldr q11, [x0, #448] // ...........*............ - ldr q3, [x0, #320] // ..*..................... - // gap // ........................ - // gap // ........................ - ldr q5, [x0, #384] // ..............*......... - // gap // ........................ - // gap // ........................ - // gap // ........................ - add v12.8H, v2.8H, v28.8H // ...............*........ - sub v2.8H, v2.8H, v28.8H // ......*................. - ldr q15, [x0, #256] // ...*.................... - // gap // ........................ - add v10.8H, v17.8H, v19.8H // ........*............... - sub v31.8H, v17.8H, v19.8H // .........*.............. - // gap // ........................ - // gap // ........................ - sqrdmulh v8.8H, v2.8H, v0.H[7] // .......*................ - mul v14.8H, v2.8H, v0.H[6] // ..........*............. - // gap // ........................ - // gap // ........................ - sqrdmulh v22.8H, v31.8H, v1.H[1] // ................*....... - add v25.8H, v12.8H, v10.8H // ..................*..... - // gap // ........................ - // gap // ........................ - mul v20.8H, v31.8H, v1.H[0] // ............*........... - sub v18.8H, v15.8H, v3.8H // .............*.......... - // gap // ........................ 
- // gap // ........................ - sqdmulh v2.8H, v25.8H, v7.H[1] // ......................*. - sub v9.8H, v12.8H, v10.8H // .................*...... - // gap // ........................ - // gap // ........................ - sqrdmulh v6.8H, v18.8H, v1.H[3] // .....................*.. - mls v14.8H, v8.8H, v7.H[0] // ...................*.... - // gap // ........................ - // gap // ........................ - mls v20.8H, v22.8H, v7.H[0] // ....................*... - // gap // ........................ - // gap // ........................ - sqrdmulh v8.8H, v9.8H, v0.H[3] // .......................* + ldr q10, [x0, #256] // *.................................... + ldr q23, [x0, #320] // .....*............................... + // gap // ..................................... + // gap // ..................................... + ldr q21, [x0, #384] // ......*.............................. + ldr q3, [x0, #448] // ..*.................................. + // gap // ..................................... + // gap // ..................................... + ldr q6, [x0, #0] // ....*................................ + ldr q9, [x0, #64] // .*................................... + // gap // ..................................... + // gap // ..................................... + ldr q15, [x0, #192] // ...*................................. + ldr q25, [x0, #128] // .......*............................. + // gap // ..................................... + // gap // ..................................... + sub v17.8H, v10.8H, v23.8H // .........*........................... + // gap // ..................................... + // gap // ..................................... + // gap // ..................................... + sub v22.8H, v21.8H, v3.8H // ............*........................ + add v19.8H, v21.8H, v3.8H // ..........................*.......... + // gap // ..................................... + // gap // ..................................... 
+ sqrdmulh v27.8H, v17.8H, v1.H[3] // .............*....................... + sub v13.8H, v6.8H, v9.8H // ........*............................ + // gap // ..................................... + // gap // ..................................... + sub v2.8H, v25.8H, v15.8H // ..............*...................... + mul v24.8H, v22.8H, v1.H[4] // ................*.................... + // gap // ..................................... + // gap // ..................................... + // gap // ..................................... + // gap // ..................................... + sqrdmulh v31.8H, v22.8H, v1.H[5] // ...............*..................... + mul v22.8H, v13.8H, v0.H[6] // ...........*......................... + mul v28.8H, v17.8H, v1.H[2] // .................*................... + sqrdmulh v3.8H, v2.8H, v1.H[1] // ...................*................. + // gap // ..................................... + // gap // ..................................... + mul v16.8H, v2.8H, v1.H[0] // ......................*.............. + sqrdmulh v13.8H, v13.8H, v0.H[7] // ..........*.......................... + // gap // ..................................... + // gap // ..................................... + // gap // ..................................... + mls v24.8H, v31.8H, v7.H[0] // ....................*................ + // gap // ..................................... + // gap // ..................................... + // gap // ..................................... + mls v28.8H, v27.8H, v7.H[0] // .....................*............... + // gap // ..................................... + // gap // ..................................... + mls v22.8H, v13.8H, v7.H[0] // ..................*.................. + mls v16.8H, v3.8H, v7.H[0] // ........................*............ + // gap // ..................................... + // gap // ..................................... + add v23.8H, v10.8H, v23.8H // .............................*....... 
+ // gap // ..................................... + // gap // ..................................... + // gap // ..................................... + add v5.8H, v28.8H, v24.8H // .......................*............. + sub v12.8H, v28.8H, v24.8H // .........................*........... + // gap // ..................................... + // gap // ..................................... + add v31.8H, v22.8H, v16.8H // ...........................*......... + sub v20.8H, v23.8H, v19.8H // ................................*.... + // gap // ..................................... + // gap // ..................................... + add v13.8H, v23.8H, v19.8H // ..................................*.. + // gap // ..................................... + // gap // ..................................... + // gap // ..................................... + add v10.8H, v31.8H, v5.8H // ............................*........ + sub v27.8H, v31.8H, v5.8H // .................................*... + // gap // ..................................... + // gap // ..................................... + mul v14.8H, v12.8H, v0.H[4] // ....................................* + // gap // ..................................... + // gap // ..................................... + sqrdmulh v11.8H, v20.8H, v0.H[5] // ...................................*. + mul v4.8H, v10.8H, v29.8H // ..............................*...... + sqrdmulh v19.8H, v10.8H, v30.8H // ...............................*..... + // gap // ..................................... + // gap // ..................................... // original source code - // ldr q31, [x0, #0] // .*...................... - // ldr q28, [x0, #64] // *....................... - // ldr q3, [x0, #320] // .....*.................. - // ldr q15, [x0, #256] // .........*.............. - // ldr q10, [x0, #192] // ...*.................... - // ldr q21, [x0, #128] // ..*..................... - // sub v11.8H, v31.8H, v28.8H // ........*............... 
- // sqrdmulh v4.8H, v11.8H, v0.H[7] // ............*........... - // add v27.8H, v21.8H, v10.8H // ..........*............. - // sub v6.8H, v21.8H, v10.8H // ...........*............ - // mul v14.8H, v11.8H, v0.H[6] // .............*.......... - // ldr q11, [x0, #448] // ....*................... - // mul v20.8H, v6.8H, v1.H[0] // ................*....... - // sub v18.8H, v15.8H, v3.8H // .................*...... - // ldr q5, [x0, #384] // ......*................. - // add v12.8H, v31.8H, v28.8H // .......*................ - // sqrdmulh v2.8H, v6.8H, v1.H[1] // ..............*......... - // sub v9.8H, v12.8H, v27.8H // ...................*.... - // add v25.8H, v12.8H, v27.8H // ...............*........ - // mls v14.8H, v4.8H, v7.H[0] // .....................*.. - // mls v20.8H, v2.8H, v7.H[0] // ......................*. - // sqrdmulh v6.8H, v18.8H, v1.H[3] // ....................*... - // sqdmulh v2.8H, v25.8H, v7.H[1] // ..................*..... - // sqrdmulh v8.8H, v9.8H, v0.H[3] // .......................* + // ldr q10, [x0, #256] // *.................................... + // ldr q9, [x0, #64] // .....*............................... + // ldr q17, [x0, #448] // ...*................................. + // ldr q15, [x0, #192] // ......*.............................. + // ldr q6, [x0, #0] // ....*................................ + // ldr q28, [x0, #320] // .*................................... + // ldr q2, [x0, #384] // ..*.................................. + // ldr q25, [x0, #128] // .......*............................. + // sub v22.8H, v6.8H, v9.8H // ............*........................ + // sub v4.8H, v10.8H, v28.8H // ........*............................ + // sqrdmulh v8.8H, v22.8H, v0.H[7] // ....................*................ + // mul v22.8H, v22.8H, v0.H[6] // ................*.................... + // sub v23.8H, v2.8H, v17.8H // .........*........................... + // sqrdmulh v12.8H, v4.8H, v1.H[3] // ...........*......................... 
+ // sub v26.8H, v25.8H, v15.8H // .............*....................... + // sqrdmulh v24.8H, v23.8H, v1.H[5] // ...............*..................... + // mul v31.8H, v23.8H, v1.H[4] // ..............*...................... + // mul v21.8H, v4.8H, v1.H[2] // .................*................... + // mls v22.8H, v8.8H, v7.H[0] // .......................*............. + // sqrdmulh v8.8H, v26.8H, v1.H[1] // ..................*.................. + // mls v31.8H, v24.8H, v7.H[0] // .....................*............... + // mls v21.8H, v12.8H, v7.H[0] // ......................*.............. + // mul v16.8H, v26.8H, v1.H[0] // ...................*................. + // add v27.8H, v21.8H, v31.8H // ..........................*.......... + // mls v16.8H, v8.8H, v7.H[0] // ........................*............ + // sub v12.8H, v21.8H, v31.8H // ...........................*......... + // add v31.8H, v2.8H, v17.8H // ..........*.......................... + // add v21.8H, v22.8H, v16.8H // ............................*........ + // add v23.8H, v21.8H, v27.8H // ...............................*..... + // add v26.8H, v10.8H, v28.8H // .........................*........... + // mul v4.8H, v23.8H, v29.8H // ...................................*. + // sqrdmulh v19.8H, v23.8H, v30.8H // ....................................* + // sub v20.8H, v26.8H, v31.8H // .............................*....... + // sub v27.8H, v21.8H, v27.8H // ................................*.... + // add v13.8H, v26.8H, v31.8H // ..............................*...... + // sqrdmulh v11.8H, v20.8H, v0.H[5] // ..................................*.. + // mul v14.8H, v12.8H, v0.H[4] // .................................*... sub count, count, #1 layer123_start: - add v17.8H, v15.8H, v3.8H // ...................*.......................................................................... - mul v22.8H, v18.8H, v1.H[2] // ....................*......................................................................... 
- ldr q31, [x0, #16] // e............................................................................................. - ldr q28, [x0, #80] // .e............................................................................................ - add v23.8H, v5.8H, v11.8H // ........................*..................................................................... - ldr q3, [x0, #336] // .....e........................................................................................ - ldr q15, [x0, #272] // ....e......................................................................................... - sub v18.8H, v14.8H, v20.8H // .................................*............................................................ - ldr q10, [x0, #208] // ...e.......................................................................................... - ldr q21, [x0, #144] // ..e........................................................................................... - sub v4.8H, v5.8H, v11.8H // .......................*...................................................................... - mul v19.8H, v9.8H, v0.H[2] // ..............................*............................................................... - mls v22.8H, v6.8H, v7.H[0] // ......................*....................................................................... - add v6.8H, v17.8H, v23.8H // .......................................*...................................................... - // gap // .............................................................................................. - // gap // .............................................................................................. - sub v11.8H, v31.8H, v28.8H // ........e..................................................................................... - // gap // .............................................................................................. 
- // gap // .............................................................................................. - sqrdmulh v24.8H, v18.8H, v0.H[3] // ....................................*......................................................... - sub v27.8H, v17.8H, v23.8H // ......................................*....................................................... - sqdmulh v16.8H, v6.8H, v7.H[1] // ...................................................*.......................................... - // gap // .............................................................................................. - // gap // .............................................................................................. - // gap // .............................................................................................. - // gap // .............................................................................................. - mul v13.8H, v4.8H, v1.H[4] // .........................*.................................................................... - sqrdmulh v26.8H, v4.8H, v1.H[5] // ..........................*................................................................... - mul v17.8H, v18.8H, v0.H[2] // ...................................*.......................................................... - // gap // .............................................................................................. - // gap // .............................................................................................. - mul v12.8H, v27.8H, v0.H[4] // ........................................*..................................................... - srshr v16.8H, v16.8H, #11 // ....................................................*......................................... - // gap // .............................................................................................. - // gap // .............................................................................................. 
- srshr v2.8H, v2.8H, #11 // .................................................*............................................ - mls v13.8H, v26.8H, v7.H[0] // ...........................*.................................................................. - sqrdmulh v23.8H, v27.8H, v0.H[5] // .........................................*.................................................... - // gap // .............................................................................................. - // gap // .............................................................................................. - mls v19.8H, v8.8H, v7.H[0] // ................................*............................................................. - sqrdmulh v4.8H, v11.8H, v0.H[7] // ...........e.................................................................................. - // gap // .............................................................................................. - // gap // .............................................................................................. - mls v25.8H, v2.8H, v7.H[0] // ..................................................*........................................... - // gap // .............................................................................................. - // gap // .............................................................................................. - mls v6.8H, v16.8H, v7.H[0] // .....................................................*........................................ - mls v12.8H, v23.8H, v7.H[0] // ..........................................*................................................... - mls v17.8H, v24.8H, v7.H[0] // .....................................*........................................................ - // gap // .............................................................................................. 
- // gap // .............................................................................................. - add v26.8H, v22.8H, v13.8H // ............................................*................................................. - add v27.8H, v21.8H, v10.8H // ..............e............................................................................... - // gap // .............................................................................................. - // gap // .............................................................................................. - sub v18.8H, v25.8H, v6.8H // ......................................................*....................................... - add v25.8H, v25.8H, v6.8H // .......................................................*...................................... - // gap // .............................................................................................. - // gap // .............................................................................................. - add v8.8H, v19.8H, v12.8H // .................................................................*............................ - sub v23.8H, v19.8H, v12.8H // ................................................................*............................. - // gap // .............................................................................................. - // gap // .............................................................................................. - mul v5.8H, v18.8H, v0.H[0] // ........................................................*..................................... - // gap // .............................................................................................. - // gap // .............................................................................................. - sqrdmulh v2.8H, v25.8H, v30.8H // ...............................................................................*.............. 
- mul v25.8H, v25.8H, v29.8H // ..............................................................................*............... - // gap // .............................................................................................. - // gap // .............................................................................................. - sqrdmulh v16.8H, v8.8H, v30.8H // .....................................................................................*........ - sub v6.8H, v21.8H, v10.8H // .............e................................................................................ - // gap // .............................................................................................. - // gap // .............................................................................................. - sqrdmulh v9.8H, v23.8H, v0.H[1] // ...................................................................*.......................... - add v24.8H, v14.8H, v20.8H // ..................................*........................................................... - // gap // .............................................................................................. - // gap // .............................................................................................. - mul v19.8H, v8.8H, v29.8H // ....................................................................................*......... - mls v25.8H, v2.8H, v7.H[0] // ................................................................................*............. - // gap // .............................................................................................. - // gap // .............................................................................................. - sub v12.8H, v22.8H, v13.8H // ...........................................*.................................................. 
- sqrdmulh v13.8H, v18.8H, v0.H[1] // .........................................................*.................................... - // gap // .............................................................................................. - // gap // .............................................................................................. - sub v22.8H, v24.8H, v26.8H // ...........................................................*.................................. - sqrdmulh v20.8H, v12.8H, v0.H[5] // ..............................................*............................................... - mul v21.8H, v12.8H, v0.H[4] // .............................................*................................................ - // gap // .............................................................................................. - // gap // .............................................................................................. - sqrdmulh v10.8H, v22.8H, v0.H[1] // ..............................................................*............................... - str q25, [x0], #(16) // ..........................................................................................*... - mls v19.8H, v16.8H, v7.H[0] // ......................................................................................*....... - // gap // .............................................................................................. - add v2.8H, v24.8H, v26.8H // ............................................................*................................. - mul v14.8H, v11.8H, v0.H[6] // ..........e................................................................................... - ldr q11, [x0, #448] // .......e...................................................................................... - // gap // .............................................................................................. 
- mul v25.8H, v22.8H, v0.H[0] // .............................................................*................................ - mls v21.8H, v20.8H, v7.H[0] // ...............................................*.............................................. - // gap // .............................................................................................. - // gap // .............................................................................................. - sqrdmulh v24.8H, v2.8H, v30.8H // ..................................................................................*........... - str q19, [x0, #112] // ............................................................................................*. - mul v20.8H, v6.8H, v1.H[0] // ...............e.............................................................................. - // gap // .............................................................................................. - sub v18.8H, v15.8H, v3.8H // ..................e........................................................................... - // gap // .............................................................................................. - // gap // .............................................................................................. - mls v5.8H, v13.8H, v7.H[0] // ..........................................................*................................... - add v26.8H, v17.8H, v21.8H // ......................................................................*....................... - sub v13.8H, v17.8H, v21.8H // .....................................................................*........................ - // gap // .............................................................................................. - // gap // .............................................................................................. 
- mul v17.8H, v2.8H, v29.8H // .................................................................................*............ - mls v25.8H, v10.8H, v7.H[0] // ...............................................................*.............................. - // gap // .............................................................................................. - // gap // .............................................................................................. - str q5, [x0, #240] // ..........................................................................*................... - ldr q5, [x0, #384] // ......e....................................................................................... - mul v10.8H, v13.8H, v0.H[0] // .......................................................................*...................... - sqrdmulh v2.8H, v13.8H, v0.H[1] // ........................................................................*..................... - mul v16.8H, v26.8H, v29.8H // .......................................................................................*...... - mul v13.8H, v23.8H, v0.H[0] // ..................................................................*........................... - // gap // .............................................................................................. - // gap // .............................................................................................. - str q25, [x0, #304] // ...........................................................................*.................. - sqrdmulh v22.8H, v26.8H, v30.8H // ........................................................................................*..... - mls v17.8H, v24.8H, v7.H[0] // ...................................................................................*.......... - // gap // .............................................................................................. 
- add v12.8H, v31.8H, v28.8H // .........e.................................................................................... - // gap // .............................................................................................. - // gap // .............................................................................................. - mls v10.8H, v2.8H, v7.H[0] // .........................................................................*.................... - sqrdmulh v2.8H, v6.8H, v1.H[1] // ................e............................................................................. - mls v13.8H, v9.8H, v7.H[0] // ....................................................................*......................... - // gap // .............................................................................................. - // gap // .............................................................................................. - str q17, [x0, #48] // ...........................................................................................*.. - // gap // .............................................................................................. - sub v9.8H, v12.8H, v27.8H // ............................e................................................................. - mls v16.8H, v22.8H, v7.H[0] // .........................................................................................*.... - add v25.8H, v12.8H, v27.8H // .............................e................................................................ - str q10, [x0, #432] // .............................................................................*................ - mls v14.8H, v4.8H, v7.H[0] // ............e................................................................................. - // gap // .............................................................................................. 
- mls v20.8H, v2.8H, v7.H[0] // .................e............................................................................ - str q13, [x0, #368] // ............................................................................*................. - sqrdmulh v6.8H, v18.8H, v1.H[3] // .....................e........................................................................ - // gap // .............................................................................................. - str q16, [x0, #176] // .............................................................................................* - sqdmulh v2.8H, v25.8H, v7.H[1] // ................................................e............................................. - // gap // .............................................................................................. - sqrdmulh v8.8H, v9.8H, v0.H[3] // ...............................e.............................................................. + ldr q10, [x0, #272] // ....e................................................................................... + sub v3.8H, v22.8H, v16.8H // .................................*...................................................... + add v22.8H, v6.8H, v9.8H // .........*.............................................................................. + ldr q9, [x0, #80] // .e...................................................................................... + ldr q17, [x0, #464] // .......e................................................................................ + add v18.8H, v25.8H, v15.8H // ..............*......................................................................... + ldr q15, [x0, #208] // ...e.................................................................................... + sqrdmulh v5.8H, v12.8H, v0.H[5] // ..............................................*......................................... 
+ ldr q6, [x0, #16] // e....................................................................................... + mls v4.8H, v19.8H, v7.H[0] // .............................................................................*.......... + sqrdmulh v19.8H, v27.8H, v0.H[1] // ........................................................*............................... + ldr q28, [x0, #336] // .....e.................................................................................. + add v16.8H, v22.8H, v18.8H // .............................*.......................................................... + mul v23.8H, v27.8H, v0.H[0] // .......................................................*................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + mls v14.8H, v5.8H, v7.H[0] // ...............................................*........................................ + sub v27.8H, v22.8H, v18.8H // ............................*........................................................... + // gap // ........................................................................................ + ldr q2, [x0, #400] // ......e................................................................................. + mul v5.8H, v20.8H, v0.H[4] // ........................................*............................................... + ldr q25, [x0, #144] // ..e..................................................................................... + mul v18.8H, v3.8H, v0.H[2] // ...................................*.................................................... + str q4, [x0, #64] // .....................................................................................*.. + sqrdmulh v3.8H, v3.8H, v0.H[3] // ....................................*................................................... 
+ // gap // ........................................................................................ + // gap // ........................................................................................ + sub v22.8H, v6.8H, v9.8H // ........e............................................................................... + sub v4.8H, v10.8H, v28.8H // ..................e..................................................................... + mls v23.8H, v19.8H, v7.H[0] // .........................................................*.............................. + // gap // ........................................................................................ + // gap // ........................................................................................ + mls v5.8H, v11.8H, v7.H[0] // ..........................................*............................................. + sqrdmulh v8.8H, v22.8H, v0.H[7] // ...........e............................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + mls v18.8H, v3.8H, v7.H[0] // .....................................*.................................................. + // gap // ........................................................................................ + // gap // ........................................................................................ + sqrdmulh v20.8H, v27.8H, v0.H[3] // ...............................*........................................................ + str q23, [x0, #320] // .....................................................................*.................. + sub v19.8H, v16.8H, v13.8H // ................................................*....................................... 
+ mul v22.8H, v22.8H, v0.H[6] // ..........e............................................................................. + // gap // ........................................................................................ + sub v23.8H, v2.8H, v17.8H // .......................e................................................................ + // gap // ........................................................................................ + mul v3.8H, v27.8H, v0.H[2] // ..............................*......................................................... + // gap // ........................................................................................ + sqrdmulh v12.8H, v4.8H, v1.H[3] // .....................e.................................................................. + // gap // ........................................................................................ + // gap // ........................................................................................ + sub v26.8H, v25.8H, v15.8H // .............e.......................................................................... + sqrdmulh v24.8H, v23.8H, v1.H[5] // ..........................e............................................................. + mul v31.8H, v23.8H, v1.H[4] // .........................e.............................................................. + // gap // ........................................................................................ + // gap // ........................................................................................ + mls v3.8H, v20.8H, v7.H[0] // ................................*....................................................... + mul v21.8H, v4.8H, v1.H[2] // ....................e................................................................... + // gap // ........................................................................................ 
+ // gap // ........................................................................................ + add v11.8H, v16.8H, v13.8H // .................................................*...................................... + mls v22.8H, v8.8H, v7.H[0] // ............e........................................................................... + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + sqrdmulh v8.8H, v26.8H, v1.H[1] // ................e....................................................................... + // gap // ........................................................................................ + mls v31.8H, v24.8H, v7.H[0] // ...........................e............................................................ + mls v21.8H, v12.8H, v7.H[0] // ......................e................................................................. + mul v16.8H, v26.8H, v1.H[0] // ...............e........................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + sqrdmulh v23.8H, v19.8H, v0.H[1] // ...................................................*.................................... + // gap // ........................................................................................ + // gap // ........................................................................................ + mul v4.8H, v19.8H, v0.H[0] // ..................................................*..................................... + // gap // ........................................................................................ 
+ sub v19.8H, v3.8H, v5.8H // ..........................................................*............................. + add v3.8H, v3.8H, v5.8H // ...........................................................*............................ + // gap // ........................................................................................ + add v27.8H, v21.8H, v31.8H // ............................................e........................................... + // gap // ........................................................................................ + // gap // ........................................................................................ + mls v16.8H, v8.8H, v7.H[0] // .................e...................................................................... + sub v24.8H, v18.8H, v14.8H // ...............................................................*........................ + // gap // ........................................................................................ + // gap // ........................................................................................ + mul v8.8H, v19.8H, v0.H[0] // ............................................................*........................... + add v5.8H, v18.8H, v14.8H // ................................................................*....................... + // gap // ........................................................................................ + sub v12.8H, v21.8H, v31.8H // ...........................................e............................................ + // gap // ........................................................................................ + add v31.8H, v2.8H, v17.8H // ........................e............................................................... + // gap // ........................................................................................ 
+ mls v4.8H, v23.8H, v7.H[0] // ....................................................*................................... + // gap // ........................................................................................ + sqrdmulh v17.8H, v5.8H, v30.8H // ..................................................................................*..... + // gap // ........................................................................................ + // gap // ........................................................................................ + mul v2.8H, v5.8H, v29.8H // .................................................................................*...... + add v21.8H, v22.8H, v16.8H // ..................................e..................................................... + // gap // ........................................................................................ + // gap // ........................................................................................ + mul v14.8H, v3.8H, v29.8H // ..............................................................................*......... + sqrdmulh v5.8H, v19.8H, v0.H[1] // .............................................................*.......................... + str q4, [x0, #256] // ....................................................................*................... + sqrdmulh v20.8H, v24.8H, v0.H[1] // ..................................................................*..................... + // gap // ........................................................................................ + mul v13.8H, v24.8H, v0.H[0] // .................................................................*...................... + mul v24.8H, v11.8H, v29.8H // ........................................................................*............... + // gap // ........................................................................................ 
+ // gap // ........................................................................................ + sqrdmulh v19.8H, v11.8H, v30.8H // .........................................................................*.............. + // gap // ........................................................................................ + // gap // ........................................................................................ + mls v2.8H, v17.8H, v7.H[0] // ...................................................................................*.... + // gap // ........................................................................................ + sqrdmulh v3.8H, v3.8H, v30.8H // ...............................................................................*........ + mls v8.8H, v5.8H, v7.H[0] // ..............................................................*......................... + // gap // ........................................................................................ + add v23.8H, v21.8H, v27.8H // ......................................................e................................. + // gap // ........................................................................................ + mls v13.8H, v20.8H, v7.H[0] // ...................................................................*.................... + // gap // ........................................................................................ + mls v24.8H, v19.8H, v7.H[0] // ..........................................................................*............. + str q2, [x0, #192] // .......................................................................................* + add v26.8H, v10.8H, v28.8H // ...................e.................................................................... + // gap // ........................................................................................ 
+ str q8, [x0, #384] // ......................................................................*................. + mul v4.8H, v23.8H, v29.8H // ...........................................................................e............ + mls v14.8H, v3.8H, v7.H[0] // ................................................................................*....... + // gap // ........................................................................................ + sqrdmulh v19.8H, v23.8H, v30.8H // ............................................................................e........... + sub v20.8H, v26.8H, v31.8H // ......................................e................................................. + // gap // ........................................................................................ + str q13, [x0, #448] // .......................................................................*................ + sub v27.8H, v21.8H, v27.8H // .....................................................e.................................. + add v13.8H, v26.8H, v31.8H // .......................................e................................................ + str q24, [x0], #(16) // ....................................................................................*... + // gap // ........................................................................................ + sqrdmulh v11.8H, v20.8H, v0.H[5] // .........................................e.............................................. + str q14, [x0, #112] // ......................................................................................*. + mul v14.8H, v12.8H, v0.H[4] // .............................................e.......................................... + // gap // ........................................................................................ 
// original source code - // ldr q8, [x0, #0] // e...........................................................................................|.e......................................................................................... - // ldr q9, [x0, #(1*(512/8))] // .e..........................................................................................|..e........................................................................................ - // ldr q10, [x0, #(2*(512/8))] // .......e....................................................................................|........e.................................................................................. - // ldr q11, [x0, #(3*(512/8))] // ......e.....................................................................................|.......e................................................................................... - // ldr q12, [x0, #(4*(512/8))] // ....e.......................................................................................|.....e..................................................................................... - // ldr q13, [x0, #(5*(512/8))] // ...e........................................................................................|....e...................................................................................... - // ldr q14, [x0, #(6*(512/8))] // ....................................................................e.......................|.....................................................................e..................... - // ldr q15, [x0, #(7*(512/8))] // .......................................................e....................................|........................................................e.................................. - // sub v24.8h, v8.8h, v9.8h // ............e...............................................................................|.............e............................................................................. 
- // add v8.8h, v8.8h, v9.8h // ............................................................................e...............|.............................................................................e............. - // mul v9.8h, v24.8h, v0.h[6] // ......................................................e.....................................|.......................................................e................................... - // sqrdmulh v24.8h, v24.8h, v0.h[7] // .........................e..................................................................|..........................e................................................................ - // mls v9.8h, v24.8h, v7.h[0] // .....................................................................................e......|......................................................................................e.... - // sub v24.8h, v10.8h, v11.8h // ........................................e...................................................|.........................................e................................................. - // add v10.8h, v10.8h, v11.8h // ...............................e............................................................|................................e.......................................................... - // mul v11.8h, v24.8h, v1.h[0] // ............................................................e...............................|.............................................................e............................. - // sqrdmulh v24.8h, v24.8h, v1.h[1] // ..............................................................................e.............|...............................................................................e........... - // mls v11.8h, v24.8h, v7.h[0] // ......................................................................................e.....|.......................................................................................e... 
- // sub v24.8h, v12.8h, v13.8h // .............................................................e..............................|..............................................................e............................ - // add v12.8h, v12.8h, v13.8h // ............................................................................................*........................................................................................... - // mul v13.8h, v24.8h, v1.h[2] // ............................................................................................|*.......................................................................................... - // sqrdmulh v24.8h, v24.8h, v1.h[3] // ........................................................................................e...|.........................................................................................e. - // mls v13.8h, v24.8h, v7.h[0] // ..........*.................................................................................|...........*............................................................................... - // sub v24.8h, v14.8h, v15.8h // ........*...................................................................................|.........*................................................................................. - // add v14.8h, v14.8h, v15.8h // ..*.........................................................................................|...*....................................................................................... - // mul v15.8h, v24.8h, v1.h[4] // ................*...........................................................................|.................*......................................................................... - // sqrdmulh v24.8h, v24.8h, v1.h[5] // .................*..........................................................................|..................*........................................................................ 
- // mls v15.8h, v24.8h, v7.h[0] // ......................*.....................................................................|.......................*................................................................... - // sub v24.8h, v8.8h, v10.8h // .................................................................................e..........|..................................................................................e........ - // add v8.8h, v8.8h, v10.8h // ...................................................................................e........|....................................................................................e...... - // mul v10.8h, v24.8h, v0.h[2] // .........*..................................................................................|..........*................................................................................ - // sqrdmulh v24.8h, v24.8h, v0.h[3] // ...........................................................................................e|........................................................................................... - // mls v10.8h, v24.8h, v7.h[0] // ........................*...................................................................|.........................*................................................................. - // sub v24.8h, v9.8h, v11.8h // .....*......................................................................................|......*.................................................................................... - // add v9.8h, v9.8h, v11.8h // ..........................................*.................................................|...........................................*............................................... - // mul v11.8h, v24.8h, v0.h[2] // ..................*.........................................................................|...................*....................................................................... 
- // sqrdmulh v24.8h, v24.8h, v0.h[3] // .............*..............................................................................|..............*............................................................................ - // mls v11.8h, v24.8h, v7.h[0] // .............................*..............................................................|..............................*............................................................ - // sub v24.8h, v12.8h, v14.8h // ..............*.............................................................................|...............*........................................................................... - // add v12.8h, v12.8h, v14.8h // ...........*................................................................................|............*.............................................................................. - // mul v14.8h, v24.8h, v0.h[4] // ...................*........................................................................|....................*...................................................................... - // sqrdmulh v24.8h, v24.8h, v0.h[5] // .......................*....................................................................|........................*.................................................................. - // mls v14.8h, v24.8h, v7.h[0] // ............................*...............................................................|.............................*............................................................. - // sub v24.8h, v13.8h, v15.8h // .............................................*..............................................|..............................................*............................................ - // add v13.8h, v13.8h, v15.8h // ..............................*.............................................................|...............................*........................................................... 
- // mul v15.8h, v24.8h, v0.h[4] // .................................................*..........................................|..................................................*........................................ - // sqrdmulh v24.8h, v24.8h, v0.h[5] // ................................................*...........................................|.................................................*......................................... - // mls v15.8h, v24.8h, v7.h[0] // .........................................................*..................................|..........................................................*................................ - // sqdmulh v25.8h, v8.8h, v7.h[1] // ..........................................................................................e.|........................................................................................... - // srshr v25.8h, v25.8h, #11 // .....................*......................................................................|......................*.................................................................... - // mls v8.8h, v25.8h, v7.h[0] // ..........................*.................................................................|...........................*............................................................... - // sqdmulh v25.8h, v12.8h, v7.h[1] // ...............*............................................................................|................*.......................................................................... - // srshr v25.8h, v25.8h, #11 // ....................*.......................................................................|.....................*..................................................................... - // mls v12.8h, v25.8h, v7.h[0] // ...........................*................................................................|............................*.............................................................. 
- // sub v24.8h, v8.8h, v12.8h // ................................*...........................................................|.................................*......................................................... - // add v8.8h, v8.8h, v12.8h // .................................*..........................................................|..................................*........................................................ - // mul v12.8h, v24.8h, v0.h[0] // ....................................*.......................................................|.....................................*..................................................... - // sqrdmulh v24.8h, v24.8h, v0.h[1] // ..............................................*.............................................|...............................................*........................................... - // mls v12.8h, v24.8h, v7.h[0] // ..............................................................*.............................|...............................................................*........................... - // sub v24.8h, v9.8h, v13.8h // ...............................................*............................................|................................................*.......................................... - // add v9.8h, v9.8h, v13.8h // .....................................................*......................................|......................................................*.................................... - // mul v13.8h, v24.8h, v0.h[0] // ........................................................*...................................|.........................................................*................................. - // sqrdmulh v24.8h, v24.8h, v0.h[1] // ..................................................*.........................................|...................................................*....................................... 
- // mls v13.8h, v24.8h, v7.h[0] // ..................................................................*.........................|...................................................................*....................... - // sub v24.8h, v10.8h, v14.8h // ...................................*........................................................|....................................*...................................................... - // add v10.8h, v10.8h, v14.8h // ..................................*.........................................................|...................................*....................................................... - // mul v14.8h, v24.8h, v0.h[0] // ........................................................................*...................|.........................................................................*................. - // sqrdmulh v24.8h, v24.8h, v0.h[1] // .........................................*..................................................|..........................................*................................................ - // mls v14.8h, v24.8h, v7.h[0] // ...............................................................................*............|................................................................................*.......... - // sub v24.8h, v11.8h, v15.8h // ................................................................*...........................|.................................................................*......................... - // add v11.8h, v11.8h, v15.8h // ...............................................................*............................|................................................................*.......................... - // mul v15.8h, v24.8h, v0.h[0] // .....................................................................*......................|......................................................................*.................... 
- // sqrdmulh v24.8h, v24.8h, v0.h[1] // ......................................................................*.....................|.......................................................................*................... - // mls v15.8h, v24.8h, v7.h[0] // .............................................................................*..............|..............................................................................*............ - // str q12, [x0, #(4*(512/8))] // ...................................................................*........................|....................................................................*...................... - // str q13, [x0, #(5*(512/8))] // .........................................................................*..................|..........................................................................*................ - // str q14, [x0, #(6*(512/8))] // .......................................................................................*....|........................................................................................*.. - // str q15, [x0, #(7*(512/8))] // ....................................................................................*.......|.....................................................................................*..... - // mul v12.8h, v8.8h, v29.8h // ......................................*.....................................................|.......................................*................................................... - // sqrdmulh v8.8h, v8.8h, v30.8h // .....................................*......................................................|......................................*.................................................... - // mls v12.8h, v8.8h, v7.h[0] // ............................................*...............................................|.............................................*............................................. 
- // mul v13.8h, v9.8h, v29.8h // .................................................................*..........................|..................................................................*........................ - // sqrdmulh v9.8h, v9.8h, v30.8h // ..........................................................*.................................|...........................................................*............................... - // mls v13.8h, v9.8h, v7.h[0] // ...........................................................................*................|............................................................................*.............. - // mul v14.8h, v10.8h, v29.8h // ...........................................*................................................|............................................*.............................................. - // sqrdmulh v10.8h, v10.8h, v30.8h // .......................................*....................................................|........................................*.................................................. - // mls v14.8h, v10.8h, v7.h[0] // ....................................................*.......................................|.....................................................*..................................... - // mul v15.8h, v11.8h, v29.8h // .......................................................................*....................|........................................................................*.................. - // sqrdmulh v11.8h, v11.8h, v30.8h // ..........................................................................*.................|...........................................................................*............... - // mls v15.8h, v11.8h, v7.h[0] // ..................................................................................*.........|...................................................................................*....... 
- // str q12, [x0], #(16) // ...................................................*........................................|....................................................*...................................... - // str q13, [x0, #(-16 + 1*(512/8))] // ................................................................................*...........|.................................................................................*......... - // str q14, [x0, #(-16 + 2*(512/8))] // ...........................................................*................................|............................................................*.............................. - // str q15, [x0, #(-16 + 3*(512/8))] // .........................................................................................*..|..........................................................................................* + // ldr q8, [x0, #0] // ........e...............................................................................|.......e.............................................................................. + // ldr q9, [x0, #(1*(512/8))] // ...e....................................................................................|..e................................................................................... + // ldr q10, [x0, #(2*(512/8))] // ..................e.....................................................................|.................e.................................................................... + // ldr q11, [x0, #(3*(512/8))] // ......e.................................................................................|.....e................................................................................ + // ldr q12, [x0, #(4*(512/8))] // e.......................................................................................e...................................................................................... 
+ // ldr q13, [x0, #(5*(512/8))] // ...........e............................................................................|..........e........................................................................... + // ldr q14, [x0, #(6*(512/8))] // ................e.......................................................................|...............e...................................................................... + // ldr q15, [x0, #(7*(512/8))] // ....e...................................................................................|...e.................................................................................. + // sub v24.8h, v8.8h, v9.8h // ......................e.................................................................|.....................e................................................................ + // add v8.8h, v8.8h, v9.8h // ..*.....................................................................................|.*.................................................................................... + // mul v9.8h, v24.8h, v0.h[6] // ...............................e........................................................|..............................e....................................................... + // sqrdmulh v24.8h, v24.8h, v0.h[7] // ..........................e.............................................................|.........................e............................................................ + // mls v9.8h, v24.8h, v7.h[0] // .........................................e..............................................|........................................e............................................. + // sub v24.8h, v10.8h, v11.8h // ...................................e....................................................|..................................e................................................... 
+ // add v10.8h, v10.8h, v11.8h // .....*..................................................................................|....*................................................................................. + // mul v11.8h, v24.8h, v1.h[0] // .............................................e..........................................|............................................e......................................... + // sqrdmulh v24.8h, v24.8h, v1.h[1] // ..........................................e.............................................|.........................................e............................................ + // mls v11.8h, v24.8h, v7.h[0] // ...................................................e....................................|..................................................e................................... + // sub v24.8h, v12.8h, v13.8h // .......................e................................................................|......................e............................................................... + // add v12.8h, v12.8h, v13.8h // ...........................................................................e............|..........................................................................e........... + // mul v13.8h, v24.8h, v1.h[2] // .......................................e................................................|......................................e............................................... + // sqrdmulh v24.8h, v24.8h, v1.h[3] // ..................................e.....................................................|.................................e.................................................... + // mls v13.8h, v24.8h, v7.h[0] // ............................................e...........................................|...........................................e.......................................... 
+ // sub v24.8h, v14.8h, v15.8h // ................................e.......................................................|...............................e...................................................... + // add v14.8h, v14.8h, v15.8h // ........................................................e...............................|.......................................................e.............................. + // mul v15.8h, v24.8h, v1.h[4] // .....................................e..................................................|....................................e................................................. + // sqrdmulh v24.8h, v24.8h, v1.h[5] // ....................................e...................................................|...................................e.................................................. + // mls v15.8h, v24.8h, v7.h[0] // ...........................................e............................................|..........................................e........................................... + // sub v24.8h, v8.8h, v10.8h // ...............*........................................................................|..............*....................................................................... + // add v8.8h, v8.8h, v10.8h // ............*...........................................................................|...........*.......................................................................... + // mul v10.8h, v24.8h, v0.h[2] // .................................*......................................................|................................*..................................................... + // sqrdmulh v24.8h, v24.8h, v0.h[3] // ............................*...........................................................|...........................*.......................................................... 
+ // mls v10.8h, v24.8h, v7.h[0] // ......................................*.................................................|.....................................*................................................ + // sub v24.8h, v9.8h, v11.8h // .*......................................................................................|*..................................................................................... + // add v9.8h, v9.8h, v11.8h // ............................................................e...........................|...........................................................e.......................... + // mul v11.8h, v24.8h, v0.h[2] // ...................*....................................................................|..................*................................................................... + // sqrdmulh v24.8h, v24.8h, v0.h[3] // .....................*..................................................................|....................*................................................................. + // mls v11.8h, v24.8h, v7.h[0] // ...........................*............................................................|..........................*........................................................... + // sub v24.8h, v12.8h, v14.8h // ................................................................................e.......|...............................................................................e...... + // add v12.8h, v12.8h, v14.8h // ...................................................................................e....|..................................................................................e... + // mul v14.8h, v24.8h, v0.h[4] // .................*......................................................................|................*..................................................................... 
+ // sqrdmulh v24.8h, v24.8h, v0.h[5] // .....................................................................................e..|....................................................................................e. + // mls v14.8h, v24.8h, v7.h[0] // .........................*..............................................................|........................*............................................................. + // sub v24.8h, v13.8h, v15.8h // .......................................................e................................|......................................................e............................... + // add v13.8h, v13.8h, v15.8h // ..................................................e.....................................|.................................................e.................................... + // mul v15.8h, v24.8h, v0.h[4] // .......................................................................................e|...................................................................................... + // sqrdmulh v24.8h, v24.8h, v0.h[5] // .......*................................................................................|......*............................................................................... + // mls v15.8h, v24.8h, v7.h[0] // ..............*.........................................................................|.............*........................................................................ + // sub v24.8h, v8.8h, v12.8h // ..............................*.........................................................|.............................*........................................................ + // add v8.8h, v8.8h, v12.8h // ........................................*...............................................|.......................................*.............................................. 
+ // mul v12.8h, v24.8h, v0.h[0] // ...............................................*........................................|..............................................*....................................... + // sqrdmulh v24.8h, v24.8h, v0.h[1] // ..............................................*.........................................|.............................................*........................................ + // mls v12.8h, v24.8h, v7.h[0] // .........................................................*..............................|........................................................*............................. + // sub v24.8h, v9.8h, v13.8h // ..................................................................................e.....|.................................................................................e.... + // add v9.8h, v9.8h, v13.8h // .......................................................................e................|......................................................................e............... + // mul v13.8h, v24.8h, v0.h[0] // .............*..........................................................................|............*......................................................................... + // sqrdmulh v24.8h, v24.8h, v0.h[1] // ..........*.............................................................................|.........*............................................................................ + // mls v13.8h, v24.8h, v7.h[0] // ........................*...............................................................|.......................*.............................................................. + // sub v24.8h, v10.8h, v14.8h // ................................................*.......................................|...............................................*...................................... 
+ // add v10.8h, v10.8h, v14.8h // .................................................*......................................|................................................*..................................... + // mul v14.8h, v24.8h, v0.h[0] // .....................................................*..................................|....................................................*................................. + // sqrdmulh v24.8h, v24.8h, v0.h[1] // ..............................................................*.........................|.............................................................*........................ + // mls v14.8h, v24.8h, v7.h[0] // ......................................................................*.................|.....................................................................*................ + // sub v24.8h, v11.8h, v15.8h // ....................................................*...................................|...................................................*.................................. + // add v11.8h, v11.8h, v15.8h // ......................................................*.................................|.....................................................*................................ + // mul v15.8h, v24.8h, v0.h[0] // .................................................................*......................|................................................................*..................... + // sqrdmulh v24.8h, v24.8h, v0.h[1] // ................................................................*.......................|...............................................................*...................... + // mls v15.8h, v24.8h, v7.h[0] // ........................................................................*...............|.......................................................................*.............. 
+ // str q12, [x0, #(4*(512/8))] // ...............................................................*........................|..............................................................*....................... + // str q13, [x0, #(5*(512/8))] // .............................*..........................................................|............................*......................................................... + // str q14, [x0, #(6*(512/8))] // ............................................................................*...........|...........................................................................*.......... + // str q15, [x0, #(7*(512/8))] // .................................................................................*......|................................................................................*..... + // mul v12.8h, v8.8h, v29.8h // ..................................................................*.....................|.................................................................*.................... + // sqrdmulh v8.8h, v8.8h, v30.8h // ...................................................................*....................|..................................................................*................... + // mls v12.8h, v8.8h, v7.h[0] // .........................................................................*..............|........................................................................*............. + // mul v13.8h, v9.8h, v29.8h // .............................................................................e..........|............................................................................e......... + // sqrdmulh v9.8h, v9.8h, v30.8h // ...............................................................................e........|..............................................................................e....... 
+ // mls v13.8h, v9.8h, v7.h[0] // .........*..............................................................................|........*............................................................................. + // mul v14.8h, v10.8h, v29.8h // .............................................................*..........................|............................................................*......................... + // sqrdmulh v10.8h, v10.8h, v30.8h // .....................................................................*..................|....................................................................*................. + // mls v14.8h, v10.8h, v7.h[0] // ..............................................................................*.........|.............................................................................*........ + // mul v15.8h, v11.8h, v29.8h // ...........................................................*............................|..........................................................*........................... + // sqrdmulh v11.8h, v11.8h, v30.8h // ..........................................................*.............................|.........................................................*............................ + // mls v15.8h, v11.8h, v7.h[0] // ....................................................................*...................|...................................................................*.................. + // str q12, [x0], #(16) // ....................................................................................*...|...................................................................................*.. + // str q13, [x0, #(-16 + 1*(512/8))] // ....................*...................................................................|...................*.................................................................. 
+ // str q14, [x0, #(-16 + 2*(512/8))] // ......................................................................................*.|.....................................................................................* + // str q15, [x0, #(-16 + 3*(512/8))] // ..........................................................................*.............|.........................................................................*............ sub count, count, #1 cbnz count, layer123_start - add v16.8H, v15.8H, v3.8H // *..................................................................... - mul v23.8H, v18.8H, v1.H[2] // .*.................................................................... - // gap // ...................................................................... - // gap // ...................................................................... - mul v21.8H, v9.8H, v0.H[2] // .....*................................................................ - srshr v22.8H, v2.8H, #11 // ................*..................................................... - // gap // ...................................................................... - // gap // ...................................................................... - sub v10.8H, v14.8H, v20.8H // ...*.................................................................. - // gap // ...................................................................... - // gap // ...................................................................... - sub v15.8H, v5.8H, v11.8H // ....*................................................................. - add v20.8H, v14.8H, v20.8H // ..................................*................................... - add v18.8H, v5.8H, v11.8H // ..*................................................................... - // gap // ...................................................................... - // gap // ...................................................................... 
- mul v13.8H, v15.8H, v1.H[4] // ...........*.......................................................... - sqrdmulh v26.8H, v15.8H, v1.H[5] // ............*......................................................... - // gap // ...................................................................... - // gap // ...................................................................... - sub v19.8H, v16.8H, v18.8H // .........*............................................................ - add v16.8H, v16.8H, v18.8H // .......*.............................................................. - // gap // ...................................................................... - // gap // ...................................................................... - sqrdmulh v24.8H, v10.8H, v0.H[3] // ........*............................................................. - mls v23.8H, v6.8H, v7.H[0] // ......*............................................................... - // gap // ...................................................................... - // gap // ...................................................................... - mls v13.8H, v26.8H, v7.H[0] // .................*.................................................... - sqdmulh v4.8H, v16.8H, v7.H[1] // ..........*........................................................... - // gap // ...................................................................... - // gap // ...................................................................... - mul v3.8H, v19.8H, v0.H[4] // ..............*....................................................... - sqrdmulh v2.8H, v19.8H, v0.H[5] // ..................*................................................... - // gap // ...................................................................... - // gap // ...................................................................... - mul v17.8H, v10.8H, v0.H[2] // .............*........................................................ 
- mls v25.8H, v22.8H, v7.H[0] // ....................*................................................. - // gap // ...................................................................... - // gap // ...................................................................... - srshr v11.8H, v4.8H, #11 // ...............*...................................................... - mls v21.8H, v8.8H, v7.H[0] // ...................*.................................................. - // gap // ...................................................................... - // gap // ...................................................................... - mls v3.8H, v2.8H, v7.H[0] // ......................*............................................... - add v2.8H, v23.8H, v13.8H // ........................*............................................. - // gap // ...................................................................... - // gap // ...................................................................... - sub v23.8H, v23.8H, v13.8H // .....................................*................................ - mls v17.8H, v24.8H, v7.H[0] // .......................*.............................................. - // gap // ...................................................................... - // gap // ...................................................................... - mls v16.8H, v11.8H, v7.H[0] // .....................*................................................ - sub v4.8H, v20.8H, v2.8H // .......................................*.............................. - // gap // ...................................................................... - // gap // ...................................................................... - sqrdmulh v10.8H, v23.8H, v0.H[5] // ........................................*............................. - mul v23.8H, v23.8H, v0.H[4] // .........................................*............................ 
- // gap // ...................................................................... - // gap // ...................................................................... - add v26.8H, v21.8H, v3.8H // ...........................*.......................................... - add v2.8H, v20.8H, v2.8H // .............................................*........................ - // gap // ...................................................................... - // gap // ...................................................................... - sub v20.8H, v25.8H, v16.8H // .........................*............................................ - add v16.8H, v25.8H, v16.8H // ..........................*........................................... - // gap // ...................................................................... - // gap // ...................................................................... - sqrdmulh v11.8H, v26.8H, v30.8H // ................................*..................................... - mul v26.8H, v26.8H, v29.8H // ...................................*.................................. - // gap // ...................................................................... - // gap // ...................................................................... - mul v13.8H, v20.8H, v0.H[0] // .............................*........................................ - sqrdmulh v20.8H, v20.8H, v0.H[1] // ......................................*............................... - // gap // ...................................................................... - // gap // ...................................................................... - sub v21.8H, v21.8H, v3.8H // ............................*......................................... - sqrdmulh v3.8H, v16.8H, v30.8H // ..............................*....................................... - // gap // ...................................................................... 
- // gap // ...................................................................... - mls v26.8H, v11.8H, v7.H[0] // ............................................*......................... - mul v16.8H, v16.8H, v29.8H // ...............................*...................................... - // gap // ...................................................................... - // gap // ...................................................................... - sqrdmulh v11.8H, v2.8H, v30.8H // ................................................*..................... - // gap // ...................................................................... - // gap // ...................................................................... - mls v13.8H, v20.8H, v7.H[0] // ..................................................*................... - mul v2.8H, v2.8H, v29.8H // .....................................................*................ - mls v23.8H, v10.8H, v7.H[0] // ...............................................*...................... - // gap // ...................................................................... - // gap // ...................................................................... - str q26, [x0, #128] // .................................................*.................... - sqrdmulh v19.8H, v4.8H, v0.H[1] // ..........................................*........................... - // gap // ...................................................................... - mls v16.8H, v3.8H, v7.H[0] // ....................................*................................. - sqrdmulh v25.8H, v21.8H, v0.H[1] // .................................*.................................... - str q13, [x0, #256] // .......................................................*.............. - mul v21.8H, v21.8H, v0.H[0] // ...........................................................*.......... - // gap // ...................................................................... 
- // gap // ...................................................................... - // gap // ...................................................................... - // gap // ...................................................................... - mls v2.8H, v11.8H, v7.H[0] // ..............................................................*....... - str q16, [x0], #(16) // ...........................................*.......................... - sub v16.8H, v17.8H, v23.8H // ....................................................*................. - add v23.8H, v17.8H, v23.8H // ...................................................*.................. - // gap // ...................................................................... - mls v21.8H, v25.8H, v7.H[0] // ................................................................*..... - mul v4.8H, v4.8H, v0.H[0] // ..............................................*....................... - // gap // ...................................................................... - // gap // ...................................................................... - mul v20.8H, v23.8H, v29.8H // ..........................................................*........... - sqrdmulh v23.8H, v23.8H, v30.8H // .............................................................*........ - str q2, [x0, #48] // .................................................................*.... - // gap // ...................................................................... - mul v26.8H, v16.8H, v0.H[0] // ........................................................*............. - sqrdmulh v16.8H, v16.8H, v0.H[1] // .........................................................*............ - // gap // ...................................................................... - // gap // ...................................................................... - str q21, [x0, #368] // ....................................................................*. 
- mls v4.8H, v19.8H, v7.H[0] // ......................................................*............... - // gap // ...................................................................... - // gap // ...................................................................... - mls v20.8H, v23.8H, v7.H[0] // ..................................................................*... - // gap // ...................................................................... - // gap // ...................................................................... - // gap // ...................................................................... - // gap // ...................................................................... - // gap // ...................................................................... - // gap // ...................................................................... - mls v26.8H, v16.8H, v7.H[0] // ...............................................................*...... - str q4, [x0, #304] // ............................................................*......... - // gap // ...................................................................... - // gap // ...................................................................... - // gap // ...................................................................... - str q20, [x0, #176] // .....................................................................* - // gap // ...................................................................... - // gap // ...................................................................... - // gap // ...................................................................... - str q26, [x0, #432] // ...................................................................*.. - // gap // ...................................................................... - // gap // ...................................................................... 
- // gap // ...................................................................... + sub v23.8H, v22.8H, v16.8H // *.................................................. + add v22.8H, v6.8H, v9.8H // .*................................................. + // gap // ................................................... + // gap // ................................................... + sqrdmulh v24.8H, v12.8H, v0.H[5] // ...*............................................... + mul v3.8H, v20.8H, v0.H[4] // ..........*........................................ + // gap // ................................................... + // gap // ................................................... + add v26.8H, v25.8H, v15.8H // ..*................................................ + sqrdmulh v20.8H, v27.8H, v0.H[1] // .....*............................................. + // gap // ................................................... + // gap // ................................................... + mul v28.8H, v27.8H, v0.H[0] // .......*........................................... + mls v4.8H, v19.8H, v7.H[0] // ....*.............................................. + // gap // ................................................... + // gap // ................................................... + mls v3.8H, v11.8H, v7.H[0] // ...............*................................... + mls v14.8H, v24.8H, v7.H[0] // ........*.......................................... + // gap // ................................................... + // gap // ................................................... + add v19.8H, v22.8H, v26.8H // ......*............................................ + sqrdmulh v27.8H, v23.8H, v0.H[3] // .............*..................................... + // gap // ................................................... + // gap // ................................................... + mul v23.8H, v23.8H, v0.H[2] // ...........*....................................... 
+ sub v22.8H, v22.8H, v26.8H // .........*......................................... + str q4, [x0, #64] // ............*...................................... + // gap // ................................................... + sub v24.8H, v19.8H, v13.8H // ...................*............................... + add v19.8H, v19.8H, v13.8H // ......................*............................ + // gap // ................................................... + // gap // ................................................... + mls v28.8H, v20.8H, v7.H[0] // ..............*.................................... + sqrdmulh v26.8H, v22.8H, v0.H[3] // .................*................................. + // gap // ................................................... + // gap // ................................................... + mls v23.8H, v27.8H, v7.H[0] // ................*.................................. + mul v22.8H, v22.8H, v0.H[2] // ....................*.............................. + // gap // ................................................... + // gap // ................................................... + sqrdmulh v27.8H, v24.8H, v0.H[1] // .......................*........................... + mul v24.8H, v24.8H, v0.H[0] // ........................*.......................... + // gap // ................................................... + // gap // ................................................... + mul v20.8H, v19.8H, v29.8H // ......................................*............ + sqrdmulh v19.8H, v19.8H, v30.8H // .......................................*........... + str q28, [x0, #320] // ..................*................................ + // gap // ................................................... + sub v28.8H, v23.8H, v14.8H // ...........................*....................... + mls v22.8H, v26.8H, v7.H[0] // .....................*............................. + // gap // ................................................... 
+ // gap // ................................................... + add v23.8H, v23.8H, v14.8H // .............................*..................... + mls v24.8H, v27.8H, v7.H[0] // ..............................*.................... + // gap // ................................................... + // gap // ................................................... + sqrdmulh v27.8H, v28.8H, v0.H[1] // ....................................*.............. + mul v28.8H, v28.8H, v0.H[0] // .....................................*............. + // gap // ................................................... + // gap // ................................................... + sub v26.8H, v22.8H, v3.8H // .........................*......................... + add v22.8H, v22.8H, v3.8H // ..........................*........................ + // gap // ................................................... + // gap // ................................................... + sqrdmulh v3.8H, v23.8H, v30.8H // ...............................*................... + mul v23.8H, v23.8H, v29.8H // ................................*.................. + str q24, [x0, #256] // ...................................*............... + // gap // ................................................... + mul v24.8H, v26.8H, v0.H[0] // ............................*...................... + mul v11.8H, v22.8H, v29.8H // .................................*................. + // gap // ................................................... + // gap // ................................................... + mls v28.8H, v27.8H, v7.H[0] // ...........................................*....... + sqrdmulh v27.8H, v26.8H, v0.H[1] // ..................................*................ + // gap // ................................................... + // gap // ................................................... + sqrdmulh v22.8H, v22.8H, v30.8H // .........................................*......... 
+ mls v23.8H, v3.8H, v7.H[0] // ........................................*.......... + // gap // ................................................... + // gap // ................................................... + mls v20.8H, v19.8H, v7.H[0] // ............................................*...... + // gap // ................................................... + // gap // ................................................... + // gap // ................................................... + mls v24.8H, v27.8H, v7.H[0] // ..........................................*........ + str q28, [x0, #448] // ................................................*.. + // gap // ................................................... + // gap // ................................................... + mls v11.8H, v22.8H, v7.H[0] // ...............................................*... + str q23, [x0, #192] // .............................................*..... + // gap // ................................................... + // gap // ................................................... + str q20, [x0], #(16) // .................................................*. + // gap // ................................................... + // gap // ................................................... + // gap // ................................................... + str q24, [x0, #368] // ..............................................*.... + // gap // ................................................... + // gap // ................................................... + // gap // ................................................... + str q11, [x0, #112] // ..................................................* + // gap // ................................................... + // gap // ................................................... + // gap // ................................................... 
// original source code - // add v17.8H, v15.8H, v3.8H // *..................................................................... - // mul v22.8H, v18.8H, v1.H[2] // .*.................................................................... - // add v23.8H, v5.8H, v11.8H // .......*.............................................................. - // sub v18.8H, v14.8H, v20.8H // ....*................................................................. - // sub v4.8H, v5.8H, v11.8H // .....*................................................................ - // mul v19.8H, v9.8H, v0.H[2] // ..*................................................................... - // mls v22.8H, v6.8H, v7.H[0] // .............*........................................................ - // add v6.8H, v17.8H, v23.8H // ...........*.......................................................... - // sqrdmulh v24.8H, v18.8H, v0.H[3] // ............*......................................................... - // sub v27.8H, v17.8H, v23.8H // ..........*........................................................... - // sqdmulh v16.8H, v6.8H, v7.H[1] // ...............*...................................................... - // mul v13.8H, v4.8H, v1.H[4] // ........*............................................................. - // sqrdmulh v26.8H, v4.8H, v1.H[5] // .........*............................................................ - // mul v17.8H, v18.8H, v0.H[2] // ..................*................................................... - // mul v12.8H, v27.8H, v0.H[4] // ................*..................................................... - // srshr v16.8H, v16.8H, #11 // ....................*................................................. - // srshr v2.8H, v2.8H, #11 // ...*.................................................................. - // mls v13.8H, v26.8H, v7.H[0] // ..............*....................................................... 
- // sqrdmulh v23.8H, v27.8H, v0.H[5] // .................*.................................................... - // mls v19.8H, v8.8H, v7.H[0] // .....................*................................................ - // mls v25.8H, v2.8H, v7.H[0] // ...................*.................................................. - // mls v6.8H, v16.8H, v7.H[0] // ..........................*........................................... - // mls v12.8H, v23.8H, v7.H[0] // ......................*............................................... - // mls v17.8H, v24.8H, v7.H[0] // .........................*............................................ - // add v26.8H, v22.8H, v13.8H // .......................*.............................................. - // sub v18.8H, v25.8H, v6.8H // ................................*..................................... - // add v25.8H, v25.8H, v6.8H // .................................*.................................... - // add v8.8H, v19.8H, v12.8H // ..............................*....................................... - // sub v23.8H, v19.8H, v12.8H // ......................................*............................... - // mul v5.8H, v18.8H, v0.H[0] // ....................................*................................. - // sqrdmulh v2.8H, v25.8H, v30.8H // .......................................*.............................. - // mul v25.8H, v25.8H, v29.8H // .........................................*............................ - // sqrdmulh v16.8H, v8.8H, v30.8H // ..................................*................................... - // sqrdmulh v9.8H, v23.8H, v0.H[1] // .................................................*.................... - // add v24.8H, v14.8H, v20.8H // ......*............................................................... - // mul v19.8H, v8.8H, v29.8H // ...................................*.................................. 
- // mls v25.8H, v2.8H, v7.H[0] // ................................................*..................... - // sub v12.8H, v22.8H, v13.8H // ........................*............................................. - // sqrdmulh v13.8H, v18.8H, v0.H[1] // .....................................*................................ - // sub v22.8H, v24.8H, v26.8H // ...........................*.......................................... - // sqrdmulh v20.8H, v12.8H, v0.H[5] // ............................*......................................... - // mul v21.8H, v12.8H, v0.H[4] // .............................*........................................ - // sqrdmulh v10.8H, v22.8H, v0.H[1] // ...............................................*...................... - // str q25, [x0], #(16) // .....................................................*................ - // mls v19.8H, v16.8H, v7.H[0] // ........................................*............................. - // add v2.8H, v24.8H, v26.8H // ...............................*...................................... - // mul v25.8H, v22.8H, v0.H[0] // .........................................................*............ - // mls v21.8H, v20.8H, v7.H[0] // .............................................*........................ - // sqrdmulh v24.8H, v2.8H, v30.8H // ..........................................*........................... - // str q19, [x0, #112] // ..............................................*....................... - // mls v5.8H, v13.8H, v7.H[0] // ...........................................*.......................... - // add v26.8H, v17.8H, v21.8H // .......................................................*.............. - // sub v13.8H, v17.8H, v21.8H // ......................................................*............... - // mul v17.8H, v2.8H, v29.8H // ............................................*......................... 
- // mls v25.8H, v10.8H, v7.H[0] // ................................................................*..... - // str q5, [x0, #240] // ..................................................*................... - // mul v10.8H, v13.8H, v0.H[0] // .............................................................*........ - // sqrdmulh v2.8H, v13.8H, v0.H[1] // ..............................................................*....... - // mul v16.8H, v26.8H, v29.8H // ..........................................................*........... - // mul v13.8H, v23.8H, v0.H[0] // ...................................................*.................. - // str q25, [x0, #304] // ...................................................................*.. - // sqrdmulh v22.8H, v26.8H, v30.8H // ...........................................................*.......... - // mls v17.8H, v24.8H, v7.H[0] // ....................................................*................. - // mls v10.8H, v2.8H, v7.H[0] // ..................................................................*... - // mls v13.8H, v9.8H, v7.H[0] // ........................................................*............. - // str q17, [x0, #48] // ............................................................*......... - // mls v16.8H, v22.8H, v7.H[0] // .................................................................*.... - // str q10, [x0, #432] // .....................................................................* - // str q13, [x0, #368] // ...............................................................*...... - // str q16, [x0, #176] // ....................................................................*. + // sub v3.8H, v22.8H, v16.8H // *.................................................. + // add v22.8H, v6.8H, v9.8H // .*................................................. + // add v18.8H, v25.8H, v15.8H // ....*.............................................. 
+ // sqrdmulh v5.8H, v12.8H, v0.H[5] // ..*................................................ + // mls v4.8H, v19.8H, v7.H[0] // .......*........................................... + // sqrdmulh v19.8H, v27.8H, v0.H[1] // .....*............................................. + // add v16.8H, v22.8H, v18.8H // ..........*........................................ + // mul v23.8H, v27.8H, v0.H[0] // ......*............................................ + // mls v14.8H, v5.8H, v7.H[0] // .........*......................................... + // sub v27.8H, v22.8H, v18.8H // .............*..................................... + // mul v5.8H, v20.8H, v0.H[4] // ...*............................................... + // mul v18.8H, v3.8H, v0.H[2] // ............*...................................... + // str q4, [x0, #64] // ..............*.................................... + // sqrdmulh v3.8H, v3.8H, v0.H[3] // ...........*....................................... + // mls v23.8H, v19.8H, v7.H[0] // .................*................................. + // mls v5.8H, v11.8H, v7.H[0] // ........*.......................................... + // mls v18.8H, v3.8H, v7.H[0] // ...................*............................... + // sqrdmulh v20.8H, v27.8H, v0.H[3] // ..................*................................ + // str q23, [x0, #320] // .........................*......................... + // sub v19.8H, v16.8H, v13.8H // ...............*................................... + // mul v3.8H, v27.8H, v0.H[2] // ....................*.............................. + // mls v3.8H, v20.8H, v7.H[0] // ...........................*....................... + // add v11.8H, v16.8H, v13.8H // ................*.................................. + // sqrdmulh v23.8H, v19.8H, v0.H[1] // .....................*............................. + // mul v4.8H, v19.8H, v0.H[0] // ......................*............................ 
+ // sub v19.8H, v3.8H, v5.8H // ................................*.................. + // add v3.8H, v3.8H, v5.8H // .................................*................. + // sub v24.8H, v18.8H, v14.8H // ..........................*........................ + // mul v8.8H, v19.8H, v0.H[0] // .....................................*............. + // add v5.8H, v18.8H, v14.8H // ............................*...................... + // mls v4.8H, v23.8H, v7.H[0] // .............................*..................... + // sqrdmulh v17.8H, v5.8H, v30.8H // ..................................*................ + // mul v2.8H, v5.8H, v29.8H // ...................................*............... + // mul v14.8H, v3.8H, v29.8H // ......................................*............ + // sqrdmulh v5.8H, v19.8H, v0.H[1] // ........................................*.......... + // str q4, [x0, #256] // ....................................*.............. + // sqrdmulh v20.8H, v24.8H, v0.H[1] // ..............................*.................... + // mul v13.8H, v24.8H, v0.H[0] // ...............................*................... + // mul v24.8H, v11.8H, v29.8H // .......................*........................... + // sqrdmulh v19.8H, v11.8H, v30.8H // ........................*.......................... + // mls v2.8H, v17.8H, v7.H[0] // ..........................................*........ + // sqrdmulh v3.8H, v3.8H, v30.8H // .........................................*......... + // mls v8.8H, v5.8H, v7.H[0] // ............................................*...... + // mls v13.8H, v20.8H, v7.H[0] // .......................................*........... + // mls v24.8H, v19.8H, v7.H[0] // ...........................................*....... + // str q2, [x0, #192] // ...............................................*... + // str q8, [x0, #384] // .................................................*. + // mls v14.8H, v3.8H, v7.H[0] // ..............................................*.... 
+ // str q13, [x0, #448] // .............................................*..... + // str q24, [x0], #(16) // ................................................*.. + // str q14, [x0, #112] // ..................................................* pop_stack From 061cefb98b6b18bf4c04894cf4c0cc6a0352a31c Mon Sep 17 00:00:00 2001 From: Amin Abdulrahman Date: Wed, 3 Apr 2024 12:32:13 +0200 Subject: [PATCH 16/18] Update tests --- tests/ntt_dilithium/main.c | 6 +- ...t_dilithium_123_45678_manual_ld4_opt_a72.s | 3075 ++++++++------- ...um_123_45678_manual_ld4_opt_m1_firestorm.s | 3472 +++++++++-------- ...ium_123_45678_manual_ld4_opt_m1_icestorm.s | 3030 +++++++------- .../ntt_dilithium_1234_5678_manual_st4.s | 16 +- 5 files changed, 4897 insertions(+), 4702 deletions(-) diff --git a/tests/ntt_dilithium/main.c b/tests/ntt_dilithium/main.c index 5db247d..b6759fb 100644 --- a/tests/ntt_dilithium/main.c +++ b/tests/ntt_dilithium/main.c @@ -597,7 +597,7 @@ int main( void ) if (test_ntt_asm_1234_5678_opt_a72() != 0){return 1;} if (test_ntt_asm_123_45678_inv_opt_a72() != 0){return 1;} - /* if (test_ntt_asm_123_45678_inv_manual_ld4_opt_a72() != 0){return 1;} */ + if (test_ntt_asm_123_45678_inv_manual_ld4_opt_a72() != 0){return 1;} if (test_ntt_asm_1234_5678_inv_opt_a72() != 0){return 1;} if (test_ntt_asm_1234_5678_inv_manual_ld4_opt_a72() != 0){return 1;} @@ -608,7 +608,7 @@ int main( void ) if (test_ntt_asm_1234_5678_manual_st4_opt_m1_firestorm() != 0){return 1;} if (test_ntt_asm_123_45678_inv_opt_m1_firestorm() != 0){return 1;} - /* if (test_ntt_asm_123_45678_inv_manual_ld4_opt_m1_firestorm() != 0){return 1;} */ + if (test_ntt_asm_123_45678_inv_manual_ld4_opt_m1_firestorm() != 0){return 1;} if (test_ntt_asm_1234_5678_inv_opt_m1_firestorm() != 0){return 1;} if (test_ntt_asm_1234_5678_inv_manual_ld4_opt_m1_firestorm() != 0){return 1;} @@ -620,7 +620,7 @@ int main( void ) if (test_ntt_asm_1234_5678_manual_st4_opt_m1_icestorm() != 0){return 1;} if 
(test_ntt_asm_123_45678_inv_opt_m1_icestorm() != 0){return 1;} - /* if (test_ntt_asm_123_45678_inv_manual_ld4_opt_m1_icestorm() != 0){return 1;} */ + if (test_ntt_asm_123_45678_inv_manual_ld4_opt_m1_icestorm() != 0){return 1;} if (test_ntt_asm_1234_5678_inv_opt_m1_icestorm() != 0){return 1;} if (test_ntt_asm_1234_5678_inv_manual_ld4_opt_m1_icestorm() != 0){return 1;} // other diff --git a/tests/ntt_dilithium/manual/intt_dilithium_123_45678_manual_ld4_opt_a72.s b/tests/ntt_dilithium/manual/intt_dilithium_123_45678_manual_ld4_opt_a72.s index 917ba79..b31538c 100644 --- a/tests/ntt_dilithium/manual/intt_dilithium_123_45678_manual_ld4_opt_a72.s +++ b/tests/ntt_dilithium/manual/intt_dilithium_123_45678_manual_ld4_opt_a72.s @@ -67,7 +67,7 @@ xtmp1 .req x11 cmge \tmp1\().4s, \neg_modulus_half\().4s, \a\().4s cmge \tmp2\().4s, \a\().4s, \modulus_half\().4s sub \tmp2\().4s, \tmp1\().4s, \tmp2\().4s - vmls \a, \tmp2, modulus + vmls \a, \tmp2, consts .endm .macro gs_butterfly a, b, root, idx0, idx1 @@ -76,12 +76,6 @@ xtmp1 .req x11 mulmodq \b, tmp, \root, \idx0, \idx1 .endm -.macro mulmod_v dst, src, const, const_twisted - vmul \dst, \src, \const - vqrdmulh \src, \src, \const_twisted - vmls \dst, \src, modulus -.endm - .macro gs_butterfly_v a, b, root, root_twisted sub tmp.4s, \a\().4s, \b\().4s add \a\().4s, \a\().4s, \b\().4s @@ -193,7 +187,7 @@ xtmp1 .req x11 trn2 \data_out3\().4s, \data_in2\().4s, \data_in3\().4s .endm -.macro save_gprs // slothy:no-unfold +.macro save_gprs // @slothy:no-unfold sub sp, sp, #(16*6) stp x19, x20, [sp, #16*0] stp x19, x20, [sp, #16*0] @@ -204,7 +198,7 @@ xtmp1 .req x11 stp x29, x30, [sp, #16*5] .endm -.macro restore_gprs // slothy:no-unfold +.macro restore_gprs // @slothy:no-unfold ldp x19, x20, [sp, #16*0] ldp x21, x22, [sp, #16*1] ldp x23, x24, [sp, #16*2] @@ -214,7 +208,7 @@ xtmp1 .req x11 add sp, sp, #(16*6) .endm -.macro save_vregs // slothy:no-unfold +.macro save_vregs // @slothy:no-unfold sub sp, sp, #(16*4) stp d8, d9, [sp, #16*0] 
stp d10, d11, [sp, #16*1] @@ -222,7 +216,7 @@ xtmp1 .req x11 stp d14, d15, [sp, #16*3] .endm -.macro restore_vregs // slothy:no-unfold +.macro restore_vregs // @slothy:no-unfold ldp d8, d9, [sp, #16*0] ldp d10, d11, [sp, #16*1] ldp d12, d13, [sp, #16*2] @@ -233,19 +227,19 @@ xtmp1 .req x11 #define STACK_SIZE 16 #define STACK0 0 -.macro restore a, loc // slothy:no-unfold +.macro restore a, loc // @slothy:no-unfold ldr \a, [sp, #\loc\()] .endm -.macro save loc, a // slothy:no-unfold +.macro save loc, a // @slothy:no-unfold str \a, [sp, #\loc\()] .endm -.macro push_stack // slothy:no-unfold +.macro push_stack // @slothy:no-unfold save_gprs save_vregs sub sp, sp, #STACK_SIZE .endm -.macro pop_stack // slothy:no-unfold +.macro pop_stack // @slothy:no-unfold add sp, sp, #STACK_SIZE restore_vregs restore_gprs @@ -371,8 +365,6 @@ _intt_dilithium_123_45678_manual_ld4_opt_a72: consts .req v8 qform_consts .req q8 - modulus .req v29 - ASM_LOAD(r_ptr0, roots_l345) ASM_LOAD(r_ptr1, roots_l67) @@ -395,1191 +387,1230 @@ _intt_dilithium_123_45678_manual_ld4_opt_a72: qform_root3_tw .req q7 .p2align 2 - // gap // ............................................................................................................................................................ - ldr q26, [x1, #16] // ........*................................................................................................................................................... - ldr q10, [x1, #0] // .........*.................................................................................................................................................. - ldr q17, [x1, #32] // .......*.................................................................................................................................................... - // gap // ............................................................................................................................................................ 
- ldr q30, [x1, #48] // ......*..................................................................................................................................................... - ldr q22, [x2, #48] // ....................................*....................................................................................................................... - ldr q21, [x5, #96] // .............*.............................................................................................................................................. - // gap // ............................................................................................................................................................ - ldr q12, [x2, #16] // .......................................*.................................................................................................................... - ldr q25, [x2, #0] // ........................................*................................................................................................................... - // gap // ............................................................................................................................................................ - ldr q11, [x2, #32] // .....................................*...................................................................................................................... - trn1 v18.4S, v10.4S, v26.4S // ................*........................................................................................................................................... - trn2 v1.4S, v10.4S, v26.4S // .................*.......................................................................................................................................... - trn2 v5.4S, v17.4S, v30.4S // ...................*........................................................................................................................................ 
- trn1 v7.4S, v17.4S, v30.4S // ..............*............................................................................................................................................. - ldr q24, [x5, #80] // ............*............................................................................................................................................... - ldr q20, [x5, #32] // ..*......................................................................................................................................................... - // gap // ............................................................................................................................................................ - // gap // ............................................................................................................................................................ - trn2 v16.4S, v25.4S, v12.4S // ............................................*............................................................................................................... - trn1 v9.4S, v25.4S, v12.4S // ..............................................*............................................................................................................. - // gap // ............................................................................................................................................................ - trn2 v0.2D, v18.2D, v7.2D // ......................*..................................................................................................................................... - trn2 v23.2D, v1.2D, v5.2D // ........................*................................................................................................................................... - ldr q29, [x5, #160] // *........................................................................................................................................................... 
- trn2 v19.4S, v11.4S, v22.4S // .............................................*.............................................................................................................. - trn1 v31.4S, v11.4S, v22.4S // ..........................................*................................................................................................................. - ldr q2, [x5, #48] // ...............*............................................................................................................................................ - trn1 v4.2D, v1.2D, v5.2D // .........................*.................................................................................................................................. - trn1 v3.2D, v18.2D, v7.2D // ..........................*................................................................................................................................. - // gap // ............................................................................................................................................................ - ldr q30, [x5, #64] // .......................*.................................................................................................................................... - sub v5.4S, v0.4S, v23.4S // ...........................*................................................................................................................................ - // gap // ............................................................................................................................................................ - trn1 v15.2D, v16.2D, v19.2D // ..................................................*......................................................................................................... 
- trn1 v12.2D, v9.2D, v31.2D // ...................................................*........................................................................................................ - // gap // ............................................................................................................................................................ - sub v25.4S, v3.4S, v4.4S // ............................*............................................................................................................................... - trn2 v27.2D, v16.2D, v19.2D // ....................................................*....................................................................................................... - // gap // ............................................................................................................................................................ - trn2 v22.2D, v9.2D, v31.2D // ......................................................*..................................................................................................... - sqrdmulh v18.4S, v5.4S, v24.4S // ..............................*............................................................................................................................. - // gap // ............................................................................................................................................................ - sub v13.4S, v12.4S, v15.4S // ........................................................*................................................................................................... - // gap // ............................................................................................................................................................ - // gap // ............................................................................................................................................................ 
- add v9.4S, v12.4S, v15.4S // .........................................................*.................................................................................................. - sqrdmulh v7.4S, v25.4S, v2.4S // ................................*........................................................................................................................... - // gap // ............................................................................................................................................................ - add v14.4S, v22.4S, v27.4S // ............................................................*............................................................................................... - // gap // ............................................................................................................................................................ - // gap // ............................................................................................................................................................ - mul v30.4S, v5.4S, v30.4S // ..................................*......................................................................................................................... - add v4.4S, v3.4S, v4.4S // ...............................*............................................................................................................................ - // gap // ............................................................................................................................................................ - sub v11.4S, v22.4S, v27.4S // ...........................................................*................................................................................................ - ldr q15, [x5, #112] // .*.......................................................................................................................................................... 
- // gap // ............................................................................................................................................................ - mul v26.4S, v25.4S, v20.4S // ......................................*..................................................................................................................... - add v31.4S, v0.4S, v23.4S // .............................*.............................................................................................................................. - // gap // ............................................................................................................................................................ - // gap // ............................................................................................................................................................ - // gap // ............................................................................................................................................................ - // gap // ............................................................................................................................................................ - mls v26.4S, v7.4S, v8.S[0] // .........................................*.................................................................................................................. - ldr q25, [x5, #16] // .................................*.......................................................................................................................... - ldr q12, [x5], #(12*16) // ..................*......................................................................................................................................... - sub v20.4S, v4.4S, v31.4S // ...................................*........................................................................................................................ 
- // gap // ............................................................................................................................................................ - // gap // ............................................................................................................................................................ - mls v30.4S, v18.4S, v8.S[0] // ...........................................*................................................................................................................ - sub v23.4S, v9.4S, v14.4S // ................................................................*........................................................................................... - // gap // ............................................................................................................................................................ - // gap // ............................................................................................................................................................ - // gap // ............................................................................................................................................................ - // gap // ............................................................................................................................................................ - add v7.4S, v4.4S, v31.4S // ................................................*........................................................................................................... - ldr q3, [x5, #-48] // ....................*....................................................................................................................................... - mul v2.4S, v20.4S, v12.4S // .................................................*.......................................................................................................... 
- // gap // ............................................................................................................................................................ - // gap // ............................................................................................................................................................ - // gap // ............................................................................................................................................................ - add v5.4S, v9.4S, v14.4S // .....................................................................................*...................................................................... - sqrdmulh v18.4S, v20.4S, v25.4S // ...............................................*............................................................................................................ - // gap // ............................................................................................................................................................ - sub v10.4S, v26.4S, v30.4S // .....................................................*...................................................................................................... - // gap // ............................................................................................................................................................ - // gap // ............................................................................................................................................................ - sqrdmulh v4.4S, v13.4S, v3.4S // .....................................................................*...................................................................................... - // gap // ............................................................................................................................................................ 
- // gap // ............................................................................................................................................................ - ldr q14, [x5, #-64] // ..........*................................................................................................................................................. - // gap // ............................................................................................................................................................ - // gap // ............................................................................................................................................................ - sqrdmulh v19.4S, v10.4S, v25.4S // ..........................................................*................................................................................................. - // gap // ............................................................................................................................................................ - // gap // ............................................................................................................................................................ - // gap // ............................................................................................................................................................ - // gap // ............................................................................................................................................................ - // gap // ............................................................................................................................................................ - mls v2.4S, v18.4S, v8.S[0] // .......................................................*.................................................................................................... 
- // gap // ............................................................................................................................................................ - // gap // ............................................................................................................................................................ - // gap // ............................................................................................................................................................ - // gap // ............................................................................................................................................................ - // gap // ............................................................................................................................................................ - mul v16.4S, v10.4S, v12.4S // ...............................................................*............................................................................................ - // gap // ............................................................................................................................................................ - // gap // ............................................................................................................................................................ - // gap // ............................................................................................................................................................ - // gap // ............................................................................................................................................................ - // gap // ............................................................................................................................................................ 
- mls v16.4S, v19.4S, v8.S[0] // .................................................................*.......................................................................................... - // gap // ............................................................................................................................................................ - // gap // ............................................................................................................................................................ - add v0.4S, v26.4S, v30.4S // ..............................................................*............................................................................................. - ldr q30, [x4, #48] // ...*........................................................................................................................................................ - // gap // ............................................................................................................................................................ - mul v1.4S, v11.4S, v29.4S // ...................................................................*........................................................................................ - ldr q29, [x5, #-16] // ...........*................................................................................................................................................ - // gap // ............................................................................................................................................................ - // gap // ............................................................................................................................................................ - // gap // ............................................................................................................................................................ 
- // gap // ............................................................................................................................................................ - trn2 v28.4S, v7.4S, v0.4S // ....................................................................*....................................................................................... - mul v31.4S, v23.4S, v21.4S // ............................................................................*............................................................................... - // gap // ............................................................................................................................................................ - trn2 v9.4S, v2.4S, v16.4S // ......................................................................*..................................................................................... - // gap // ............................................................................................................................................................ - // gap // ............................................................................................................................................................ - mul v18.4S, v13.4S, v14.4S // .............................................................*.............................................................................................. - trn1 v16.4S, v2.4S, v16.4S // ........................................................................*................................................................................... - // gap // ............................................................................................................................................................ - trn1 v24.4S, v7.4S, v0.4S // ..................................................................*......................................................................................... 
- ldr q0, [x4, #16] // .....*...................................................................................................................................................... - // gap // ............................................................................................................................................................ - sqrdmulh v17.4S, v11.4S, v29.4S // .......................................................................*.................................................................................... - trn1 v12.2D, v28.2D, v9.2D // .........................................................................*.................................................................................. - // gap // ............................................................................................................................................................ - trn2 v20.2D, v28.2D, v9.2D // ...........................................................................*................................................................................ - // gap // ............................................................................................................................................................ - // gap // ............................................................................................................................................................ - sqrdmulh v6.4S, v23.4S, v15.4S // ..........................................................................*................................................................................. - trn1 v14.2D, v24.2D, v16.2D // ..............................................................................*............................................................................. - // gap // ............................................................................................................................................................ 
- trn2 v26.2D, v24.2D, v16.2D // .............................................................................*.............................................................................. - // gap // ............................................................................................................................................................ - // gap // ............................................................................................................................................................ - mls v18.4S, v4.4S, v8.S[0] // .................................................................................*.......................................................................... - // gap // ............................................................................................................................................................ - // gap // ............................................................................................................................................................ - add v7.4S, v14.4S, v12.4S // ..................................................................................*......................................................................... - // gap // ............................................................................................................................................................ - // gap // ............................................................................................................................................................ - mls v1.4S, v17.4S, v8.S[0] // ...............................................................................*............................................................................ - add v9.4S, v26.4S, v20.4S // ................................................................................*........................................................................... 
- // gap // ............................................................................................................................................................ - sub v23.4S, v14.4S, v12.4S // ...........................................................................................*................................................................ - // gap // ............................................................................................................................................................ - // gap // ............................................................................................................................................................ - mls v31.4S, v6.4S, v8.S[0] // ....................................................................................*....................................................................... - ldr q6, [x4], #64 // .....................*...................................................................................................................................... - // gap // ............................................................................................................................................................ - sub v28.4S, v7.4S, v9.4S // ......................................................................................*..................................................................... - // gap // ............................................................................................................................................................ - // gap // ............................................................................................................................................................ - add v29.4S, v7.4S, v9.4S // ............................................................................................*............................................................... 
- sqrdmulh v16.4S, v23.4S, v0.S[3] // ...................................................................................................*........................................................ - // gap // ............................................................................................................................................................ - sub v3.4S, v18.4S, v1.4S // ........................................................................................*................................................................... - // gap // ............................................................................................................................................................ - // gap // ............................................................................................................................................................ - add v17.4S, v18.4S, v1.4S // .........................................................................................*.................................................................. - mul v2.4S, v23.4S, v0.S[2] // ............................................................................................................*............................................... - // gap // ............................................................................................................................................................ - srshr v27.4S, v29.4S, #23 // ..................................................................................................*......................................................... - // gap // ............................................................................................................................................................ - // gap // ............................................................................................................................................................ 
- sqrdmulh v23.4S, v3.4S, v15.4S // .............................................................................................*.............................................................. - // gap // ............................................................................................................................................................ - // gap // ............................................................................................................................................................ - trn2 v1.4S, v5.4S, v17.4S // ................................................................................................*........................................................... - // gap // ............................................................................................................................................................ - // gap // ............................................................................................................................................................ - trn1 v19.4S, v5.4S, v17.4S // ....................................................................................................*....................................................... - sqrdmulh v4.4S, v28.4S, v6.S[3] // ..........................................................................................*................................................................. - // gap // ............................................................................................................................................................ - // gap // ............................................................................................................................................................ - // gap // ............................................................................................................................................................ 
- // gap // ............................................................................................................................................................ - mul v7.4S, v3.4S, v21.4S // ...............................................................................................*............................................................ - // gap // ............................................................................................................................................................ - // gap // ............................................................................................................................................................ - // gap // ............................................................................................................................................................ - // gap // ............................................................................................................................................................ - // gap // ............................................................................................................................................................ - mls v7.4S, v23.4S, v8.S[0] // .................................................................................................*.......................................................... - // gap // ............................................................................................................................................................ - // gap // ............................................................................................................................................................ - // gap // ............................................................................................................................................................ 
- // gap // ............................................................................................................................................................ - // gap // ............................................................................................................................................................ - sub v10.4S, v26.4S, v20.4S // ...................................................................................*........................................................................ - mul v15.4S, v28.4S, v6.S[2] // .........................................................................................................*.................................................. - // gap // ............................................................................................................................................................ - // gap // ............................................................................................................................................................ - // gap // ............................................................................................................................................................ - // gap // ............................................................................................................................................................ - mls v2.4S, v16.4S, v8.S[0] // ...............................................................................................................*............................................ - ldr q9, [x4, #-32] // ....*....................................................................................................................................................... - // gap // ............................................................................................................................................................ 
- trn1 v14.4S, v31.4S, v7.4S // ......................................................................................................*..................................................... - // gap // ............................................................................................................................................................ - // gap // ............................................................................................................................................................ - trn2 v16.4S, v31.4S, v7.4S // ........................................................................................................*................................................... - mls v15.4S, v4.4S, v8.S[0] // ..................................................................................................................*......................................... - // gap // ............................................................................................................................................................ - // gap // ............................................................................................................................................................ - // gap // ............................................................................................................................................................ - // gap // ............................................................................................................................................................ - sqrdmulh v4.4S, v10.4S, v9.S[1] // ..............................................................................................*............................................................. - trn1 v23.2D, v19.2D, v14.2D // .............................................................................................................*.............................................. 
- // gap // ............................................................................................................................................................ - trn2 v3.2D, v19.2D, v14.2D // ..........................................................................................................*................................................. - // gap // ............................................................................................................................................................ - // gap // ............................................................................................................................................................ - mls v29.4S, v27.4S, v8.4S // .....................................................................................................*...................................................... - trn2 v14.2D, v1.2D, v16.2D // ...........................................................................................................*................................................ - // gap // ............................................................................................................................................................ - trn1 v25.2D, v1.2D, v16.2D // ..............................................................................................................*............................................. - // gap // ............................................................................................................................................................ - // gap // ............................................................................................................................................................ - mul v28.4S, v10.4S, v9.S[0] // .......................................................................................*.................................................................... 
- // gap // ............................................................................................................................................................ - // gap // ............................................................................................................................................................ - sub v13.4S, v3.4S, v14.4S // ................................................................................................................*........................................... - // gap // ............................................................................................................................................................ - // gap // ............................................................................................................................................................ - mls v28.4S, v4.4S, v8.S[0] // .......................................................................................................*.................................................... - add v26.4S, v3.4S, v14.4S // .................................................................................................................*.......................................... - // gap // ............................................................................................................................................................ - add v12.4S, v23.4S, v25.4S // ...................................................................................................................*........................................ - // gap // ............................................................................................................................................................ - // gap // ............................................................................................................................................................ 
- sub v18.4S, v23.4S, v25.4S // ....................................................................................................................*....................................... - sqrdmulh v19.4S, v13.4S, v30.S[1] // .....................................................................................................................*...................................... - // gap // ............................................................................................................................................................ - // gap // ............................................................................................................................................................ - // gap // ............................................................................................................................................................ - // gap // ............................................................................................................................................................ - mul v22.4S, v13.4S, v30.S[0] // .......................................................................................................................*.................................... - sub v17.4S, v12.4S, v26.4S // ...............................................................................................................................*............................ - // gap // ............................................................................................................................................................ - add v23.4S, v2.4S, v28.4S // ..................................................................................................................................*......................... - // gap // ............................................................................................................................................................ 
- // gap // ............................................................................................................................................................ - add v10.4S, v12.4S, v26.4S // ........................................................................................................................*................................... - mul v14.4S, v18.4S, v9.S[2] // ..............................................................................................................................*............................. - // gap // ............................................................................................................................................................ - // gap // ............................................................................................................................................................ - // gap // ............................................................................................................................................................ - // gap // ............................................................................................................................................................ - sqrdmulh v4.4S, v18.4S, v9.S[3] // ............................................................................................................................*............................... - srshr v13.4S, v23.4S, #23 // .......................................................................................................................................*.................... - // gap // ............................................................................................................................................................ - srshr v27.4S, v10.4S, #23 // ...........................................................................................................................*................................ 
- // gap // ............................................................................................................................................................ - // gap // ............................................................................................................................................................ - mls v22.4S, v19.4S, v8.S[0] // ...................................................................................................................................*........................ - // gap // ............................................................................................................................................................ - // gap // ............................................................................................................................................................ - // gap // ............................................................................................................................................................ - // gap // ............................................................................................................................................................ - // gap // ............................................................................................................................................................ - sqrdmulh v25.4S, v17.4S, v0.S[1] // ......................................................................................................................................*..................... - // gap // ............................................................................................................................................................ - // gap // ............................................................................................................................................................ 
- // gap // ............................................................................................................................................................ - // gap // ............................................................................................................................................................ - // gap // ............................................................................................................................................................ - mls v14.4S, v4.4S, v8.S[0] // .................................................................................................................................*.......................... - // gap // ............................................................................................................................................................ - // gap // ............................................................................................................................................................ - // gap // ............................................................................................................................................................ - // gap // ............................................................................................................................................................ - // gap // ............................................................................................................................................................ - mls v23.4S, v13.4S, v8.4S // ............................................................................................................................................*............... - // gap // ............................................................................................................................................................ 
- // gap // ............................................................................................................................................................ - // gap // ............................................................................................................................................................ - // gap // ............................................................................................................................................................ - // gap // ............................................................................................................................................................ - mls v10.4S, v27.4S, v8.4S // ................................................................................................................................*........................... - // gap // ............................................................................................................................................................ - // gap // ............................................................................................................................................................ - sub v7.4S, v14.4S, v22.4S // ..........................................................................................................................................*................. - // gap // ............................................................................................................................................................ - // gap // ............................................................................................................................................................ - mul v11.4S, v17.4S, v0.S[0] // ........................................................................................................................................*................... 
- add v21.4S, v14.4S, v22.4S // ...........................................................................................................................................*................ - // gap // ............................................................................................................................................................ - // gap // ............................................................................................................................................................ - // gap // ............................................................................................................................................................ - // gap // ............................................................................................................................................................ - sqrdmulh v5.4S, v7.4S, v0.S[1] // .............................................................................................................................................*.............. - // gap // ............................................................................................................................................................ - // gap // ............................................................................................................................................................ - srshr v18.4S, v21.4S, #23 // ..............................................................................................................................................*............. - // gap // ............................................................................................................................................................ - // gap // ............................................................................................................................................................ 
- mls v11.4S, v25.4S, v8.S[0] // ...............................................................................................................................................*............ - // gap // ............................................................................................................................................................ - // gap // ............................................................................................................................................................ - sub v14.4S, v2.4S, v28.4S // ......................................................................................................................*..................................... - // gap // ............................................................................................................................................................ - // gap // ............................................................................................................................................................ - mls v21.4S, v18.4S, v8.4S // ................................................................................................................................................*........... - // gap // ............................................................................................................................................................ - // gap // ............................................................................................................................................................ - // gap // ............................................................................................................................................................ - // gap // ............................................................................................................................................................ 
- // gap // ............................................................................................................................................................ - sqrdmulh v12.4S, v14.4S, v6.S[3] // .........................................................................................................................*.................................. - // gap // ............................................................................................................................................................ - // gap // ............................................................................................................................................................ - add v26.4S, v15.4S, v11.4S // ...................................................................................................................................................*........ - // gap // ............................................................................................................................................................ - // gap // ............................................................................................................................................................ - mul v18.4S, v7.4S, v0.S[0] // .................................................................................................................................................*.......... - sub v0.4S, v15.4S, v11.4S // ..................................................................................................................................................*......... - // gap // ............................................................................................................................................................ - add v11.4S, v29.4S, v10.4S // .....................................................................................................................................*...................... 
- // gap // ............................................................................................................................................................ - // gap // ............................................................................................................................................................ - mls v18.4S, v5.4S, v8.S[0] // ....................................................................................................................................................*....... - sub v5.4S, v29.4S, v10.4S // ....................................................................................................................................*....................... - str q26, [x1, #32] // ........................................................................................................................................................*... - add v10.4S, v23.4S, v21.4S // ......................................................................................................................................................*..... - // gap // ............................................................................................................................................................ - // gap // ............................................................................................................................................................ - mul v27.4S, v14.4S, v6.S[2] // ..........................................................................................................................*................................. - str q11, [x1], #(16*4) // .........................................................................................................................................*.................. - // gap // ............................................................................................................................................................ 
- // gap // ............................................................................................................................................................ - // gap // ............................................................................................................................................................ - // gap // ............................................................................................................................................................ - mls v27.4S, v12.4S, v8.S[0] // .............................................................................................................................*.............................. - str q10, [x1, #-48] // ..........................................................................................................................................................*. - // gap // ............................................................................................................................................................ - // gap // ............................................................................................................................................................ - // gap // ............................................................................................................................................................ - // gap // ............................................................................................................................................................ - sub v2.4S, v23.4S, v21.4S // .....................................................................................................................................................*...... - // gap // ............................................................................................................................................................ 
- // gap // ............................................................................................................................................................ - sqrdmulh v19.4S, v5.4S, v6.S[1] // .......................................................................................................................................................*.... - // gap // ............................................................................................................................................................ - // gap // ............................................................................................................................................................ - // gap // ............................................................................................................................................................ - // gap // ............................................................................................................................................................ - // gap // ............................................................................................................................................................ - mul v13.4S, v2.4S, v6.S[0] // .........................................................................................................................................................*.. - add v29.4S, v27.4S, v18.4S // ...........................................................................................................................................................* - // gap // ............................................................................................................................................................ - - // original source code - // ldr q16, [x5, #160] // ...................*........................................................................................................................................ 
- // ldr q7, [x5, #112] // ........................................*................................................................................................................... - // ldr q25, [x5, #32] // ..............*............................................................................................................................................. - // ldr q12, [x4, #48] // ..............................................................*............................................................................................. - // ldr q24, [x4, #32] // .....................................................................................................*...................................................... - // ldr q3, [x4, #16] // .......................................................................*.................................................................................... - // ldr q17, [x1, #48] // ...*........................................................................................................................................................ - // ldr q11, [x1, #32] // ..*......................................................................................................................................................... - // ldr q26, [x1, #16] // *........................................................................................................................................................... - // ldr q21, [x1, #0] // .*.......................................................................................................................................................... - // ldr q31, [x5, #128] // ........................................................*................................................................................................... 
- // ldr q18, [x5, #176] // ................................................................*........................................................................................... - // ldr q20, [x5, #80] // .............*.............................................................................................................................................. - // ldr q22, [x5, #96] // .....*...................................................................................................................................................... - // trn1 v14.4S, v11.4S, v17.4S // ............*............................................................................................................................................... - // ldr q27, [x5, #48] // ......................*..................................................................................................................................... - // trn1 v30.4S, v21.4S, v26.4S // .........*.................................................................................................................................................. - // trn2 v23.4S, v21.4S, v26.4S // ..........*................................................................................................................................................. - // ldr q4, [x5], #(12*16) // .............................................*.............................................................................................................. - // trn2 v28.4S, v11.4S, v17.4S // ...........*................................................................................................................................................ - // ldr q17, [x5, #-48] // ..................................................*......................................................................................................... 
- // ldr q6, [x4], #64 // ....................................................................................*....................................................................... - // trn2 v26.2D, v30.2D, v14.2D // .................*.......................................................................................................................................... - // ldr q10, [x5, #-128] // .........................*.................................................................................................................................. - // trn2 v1.2D, v23.2D, v28.2D // ..................*......................................................................................................................................... - // trn1 v0.2D, v23.2D, v28.2D // .......................*.................................................................................................................................... - // trn1 v23.2D, v30.2D, v14.2D // ........................*................................................................................................................................... - // sub v29.4S, v26.4S, v1.4S // ..........................*................................................................................................................................. - // sub v21.4S, v23.4S, v0.4S // .............................*.............................................................................................................................. - // add v14.4S, v26.4S, v1.4S // ..........................................*................................................................................................................. - // sqrdmulh v9.4S, v29.4S, v20.4S // ................................*........................................................................................................................... 
- // add v30.4S, v23.4S, v0.4S // ......................................*..................................................................................................................... - // sqrdmulh v23.4S, v21.4S, v27.4S // ...................................*........................................................................................................................ - // ldr q13, [x5, #-176] // ............................................*............................................................................................................... - // mul v15.4S, v29.4S, v10.4S // .....................................*...................................................................................................................... - // sub v29.4S, v30.4S, v14.4S // ..............................................*............................................................................................................. - // ldr q19, [x2, #48] // ....*....................................................................................................................................................... - // ldr q5, [x2, #32] // ........*................................................................................................................................................... - // mul v28.4S, v21.4S, v25.4S // .........................................*.................................................................................................................. - // ldr q1, [x2, #16] // ......*..................................................................................................................................................... - // ldr q0, [x2, #0] // .......*.................................................................................................................................................... 
- // mls v28.4S, v23.4S, v8.S[0] // ...........................................*................................................................................................................ - // trn1 v11.4S, v5.4S, v19.4S // .....................*...................................................................................................................................... - // mls v15.4S, v9.4S, v8.S[0] // ...............................................*............................................................................................................ - // trn2 v9.4S, v0.4S, v1.4S // ...............*............................................................................................................................................ - // trn2 v10.4S, v5.4S, v19.4S // ....................*....................................................................................................................................... - // trn1 v0.4S, v0.4S, v1.4S // ................*........................................................................................................................................... - // sqrdmulh v19.4S, v29.4S, v13.4S // .....................................................*...................................................................................................... - // add v27.4S, v30.4S, v14.4S // .................................................*.......................................................................................................... - // mul v25.4S, v29.4S, v4.4S // ...................................................*........................................................................................................ - // trn1 v30.2D, v9.2D, v10.2D // ...........................*................................................................................................................................ 
- // trn1 v21.2D, v0.2D, v11.2D // ............................*............................................................................................................................... - // trn2 v26.2D, v9.2D, v10.2D // ..............................*............................................................................................................................. - // sub v5.4S, v28.4S, v15.4S // ......................................................*..................................................................................................... - // trn2 v2.2D, v0.2D, v11.2D // ...............................*............................................................................................................................ - // mls v25.4S, v19.4S, v8.S[0] // ..........................................................*................................................................................................. - // sub v0.4S, v21.4S, v30.4S // .................................*.......................................................................................................................... - // add v21.4S, v21.4S, v30.4S // ..................................*......................................................................................................................... - // sqrdmulh v29.4S, v5.4S, v13.4S // .........................................................*.................................................................................................. - // sub v23.4S, v2.4S, v26.4S // .......................................*.................................................................................................................... - // add v2.4S, v2.4S, v26.4S // ....................................*....................................................................................................................... 
- // mul v14.4S, v0.4S, v31.4S // ....................................................................*....................................................................................... - // add v19.4S, v28.4S, v15.4S // .............................................................*.............................................................................................. - // mul v20.4S, v5.4S, v4.4S // ...........................................................*................................................................................................ - // sub v5.4S, v21.4S, v2.4S // ................................................*........................................................................................................... - // mls v20.4S, v29.4S, v8.S[0] // ............................................................*............................................................................................... - // trn1 v9.4S, v27.4S, v19.4S // ......................................................................*..................................................................................... - // mul v29.4S, v23.4S, v16.4S // ...............................................................*............................................................................................ - // trn2 v11.4S, v27.4S, v19.4S // .................................................................*.......................................................................................... - // sqrdmulh v1.4S, v0.4S, v17.4S // .......................................................*.................................................................................................... - // trn2 v30.4S, v25.4S, v20.4S // ...................................................................*........................................................................................ 
- // sqrdmulh v26.4S, v23.4S, v18.4S // ........................................................................*................................................................................... - // trn1 v19.4S, v25.4S, v20.4S // .....................................................................*...................................................................................... - // trn1 v0.2D, v11.2D, v30.2D // .........................................................................*.................................................................................. - // sqrdmulh v28.4S, v5.4S, v7.4S // ...........................................................................*................................................................................ - // trn2 v30.2D, v11.2D, v30.2D // ..........................................................................*................................................................................. - // mul v5.4S, v5.4S, v22.4S // ..................................................................*......................................................................................... - // trn2 v18.2D, v9.2D, v19.2D // .............................................................................*.............................................................................. - // trn1 v4.2D, v9.2D, v19.2D // ............................................................................*............................................................................... - // mls v29.4S, v26.4S, v8.S[0] // ................................................................................*........................................................................... - // add v11.4S, v18.4S, v30.4S // .................................................................................*.......................................................................... 
- // mls v14.4S, v1.4S, v8.S[0] // ..............................................................................*............................................................................. - // add v20.4S, v4.4S, v0.4S // ...............................................................................*............................................................................ - // sub v18.4S, v18.4S, v30.4S // ..................................................................................................*......................................................... - // mls v5.4S, v28.4S, v8.S[0] // ...................................................................................*........................................................................ - // add v26.4S, v21.4S, v2.4S // ....................................................*....................................................................................................... - // sub v2.4S, v20.4S, v11.4S // .....................................................................................*...................................................................... - // mul v25.4S, v18.4S, v24.S[0] // ...............................................................................................................*............................................ - // sub v1.4S, v14.4S, v29.4S // ........................................................................................*................................................................... - // add v9.4S, v14.4S, v29.4S // .........................................................................................*.................................................................. - // sqrdmulh v21.4S, v2.4S, v6.S[3] // ...............................................................................................*............................................................ 
- // sub v0.4S, v4.4S, v0.4S // ..................................................................................*......................................................................... - // add v4.4S, v20.4S, v11.4S // ......................................................................................*..................................................................... - // sqrdmulh v10.4S, v1.4S, v7.4S // ............................................................................................*............................................................... - // sqrdmulh v29.4S, v18.4S, v24.S[1] // .........................................................................................................*.................................................. - // mul v23.4S, v1.4S, v22.4S // ................................................................................................*........................................................... - // trn2 v27.4S, v26.4S, v9.4S // .............................................................................................*.............................................................. - // mls v23.4S, v10.4S, v8.S[0] // .................................................................................................*.......................................................... - // srshr v19.4S, v4.4S, #23 // ...........................................................................................*................................................................ - // sqrdmulh v17.4S, v0.4S, v3.S[3] // .......................................................................................*.................................................................... - // trn1 v10.4S, v26.4S, v9.4S // ..............................................................................................*............................................................. 
- // mls v4.4S, v19.4S, v8.4S // ............................................................................................................*............................................... - // trn1 v18.4S, v5.4S, v23.4S // ......................................................................................................*..................................................... - // mls v25.4S, v29.4S, v8.S[0] // .................................................................................................................*.......................................... - // trn2 v29.4S, v5.4S, v23.4S // .......................................................................................................*.................................................... - // mul v13.4S, v2.4S, v6.S[2] // ...................................................................................................*........................................................ - // trn2 v30.2D, v10.2D, v18.2D // ...........................................................................................................*................................................ - // trn2 v16.2D, v27.2D, v29.2D // .............................................................................................................*.............................................. - // mul v2.4S, v0.4S, v3.S[2] // ..........................................................................................*................................................................. - // trn1 v7.2D, v10.2D, v18.2D // ..........................................................................................................*................................................. - // trn1 v28.2D, v27.2D, v29.2D // ..............................................................................................................*............................................. 
- // mls v2.4S, v17.4S, v8.S[0] // ....................................................................................................*....................................................... - // sub v10.4S, v30.4S, v16.4S // ................................................................................................................*........................................... - // add v19.4S, v30.4S, v16.4S // ..................................................................................................................*......................................... - // mls v13.4S, v21.4S, v8.S[0] // ........................................................................................................*................................................... - // add v5.4S, v7.4S, v28.4S // ...................................................................................................................*........................................ - // sub v14.4S, v7.4S, v28.4S // ....................................................................................................................*....................................... - // sqrdmulh v23.4S, v10.4S, v12.S[1] // .....................................................................................................................*...................................... - // sub v30.4S, v2.4S, v25.4S // .........................................................................................................................................*.................. - // mul v10.4S, v10.4S, v12.S[0] // ......................................................................................................................*..................................... - // add v15.4S, v5.4S, v19.4S // .........................................................................................................................*.................................. 
- // sqrdmulh v9.4S, v30.4S, v6.S[3] // ...........................................................................................................................................*................ - // mul v27.4S, v30.4S, v6.S[2] // ....................................................................................................................................................*....... - // srshr v29.4S, v15.4S, #23 // .............................................................................................................................*.............................. - // sqrdmulh v0.4S, v14.4S, v24.S[3] // ...........................................................................................................................*................................ - // mls v27.4S, v9.4S, v8.S[0] // ......................................................................................................................................................*..... - // mul v11.4S, v14.4S, v24.S[2] // ..........................................................................................................................*................................. - // sub v9.4S, v5.4S, v19.4S // .......................................................................................................................*.................................... - // mls v15.4S, v29.4S, v8.4S // ..................................................................................................................................*......................... - // mls v11.4S, v0.4S, v8.S[0] // ................................................................................................................................*........................... - // add v19.4S, v2.4S, v25.4S // ........................................................................................................................*................................... 
- // mls v10.4S, v23.4S, v8.S[0] // ..............................................................................................................................*............................. - // sub v5.4S, v4.4S, v15.4S // .................................................................................................................................................*.......... - // add v29.4S, v4.4S, v15.4S // ...............................................................................................................................................*............ - // sqrdmulh v1.4S, v9.4S, v3.S[1] // ...............................................................................................................................*............................ - // srshr v25.4S, v19.4S, #23 // ............................................................................................................................*............................... - // mul v16.4S, v9.4S, v3.S[0] // ....................................................................................................................................*....................... - // str q29, [x1], #(16*4) // .....................................................................................................................................................*...... - // sub v14.4S, v11.4S, v10.4S // ...................................................................................................................................*........................ - // add v29.4S, v11.4S, v10.4S // .....................................................................................................................................*...................... - // mls v19.4S, v25.4S, v8.4S // .................................................................................................................................*.......................... 
- // sqrdmulh v10.4S, v14.4S, v3.S[1] // ......................................................................................................................................*..................... - // srshr v18.4S, v29.4S, #23 // .......................................................................................................................................*.................... - // mls v16.4S, v1.4S, v8.S[0] // ........................................................................................................................................*................... - // mls v29.4S, v18.4S, v8.4S // ..........................................................................................................................................*................. - // mul v18.4S, v14.4S, v3.S[0] // .............................................................................................................................................*.............. - // sub v0.4S, v13.4S, v16.4S // ..............................................................................................................................................*............. - // add v26.4S, v13.4S, v16.4S // ............................................................................................................................................*............... - // mls v18.4S, v10.4S, v8.S[0] // ................................................................................................................................................*........... - // sub v2.4S, v19.4S, v29.4S // ........................................................................................................................................................*... - // add v29.4S, v19.4S, v29.4S // ...................................................................................................................................................*........ 
- // sqrdmulh v19.4S, v5.4S, v6.S[1] // .........................................................................................................................................................*.. - // str q26, [x1, #-32] // ..................................................................................................................................................*......... - // mul v13.4S, v2.4S, v6.S[0] // ..........................................................................................................................................................*. - // str q29, [x1, #-48] // .......................................................................................................................................................*.... - // add v29.4S, v27.4S, v18.4S // ...........................................................................................................................................................* + // Instructions: 30 + // Expected cycles: 23 + // Expected IPC: 1.30 + // + // Wall time: 0.38s + // User time: 0.38s + // + // ----- original position -----> + // 0 25 + // |------------------------|---- + ldr q23, [x1, #0] // .....*........................ + ldr q2, [x1, #16] // ......*....................... + // gap // .............................. + ldr q11, [x1, #32] // .......*...................... + ldr q27, [x1, #48] // ........*..................... + // gap // .............................. + ldr q25, [x5, #32] // .*............................ + // gap // .............................. + // gap // .............................. + ldr q12, [x5, #48] // ..*........................... + // gap // .............................. + // gap // .............................. + trn2 v10.4S, v23.4S, v2.4S // ..........*................... + trn1 v23.4S, v23.4S, v2.4S // .........*.................... + ldr q30, [x5, #16] // ....*......................... + trn2 v18.4S, v11.4S, v27.4S // ............*................. 
+ trn1 v11.4S, v11.4S, v27.4S // ...........*.................. + ldr q0, [x2, #48] // ..........................*... + ldr q17, [x2, #16] // ............................*. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + ldr q13, [x5, #80] // *............................. + trn2 v28.2D, v10.2D, v18.2D // ..............*............... + trn2 v7.2D, v23.2D, v11.2D // .............*................ + // gap // .............................. + trn1 v11.2D, v23.2D, v11.2D // ...............*.............. + // gap // .............................. + // gap // .............................. + trn1 v27.2D, v10.2D, v18.2D // ................*............. + ldr q18, [x2, #32] // .........................*.... + // gap // .............................. + // gap // .............................. + // gap // .............................. + sub v10.4S, v7.4S, v28.4S // .................*............ + add v24.4S, v7.4S, v28.4S // ..................*........... + // gap // .............................. + // gap // .............................. + add v1.4S, v11.4S, v27.4S // .....................*........ + sub v20.4S, v11.4S, v27.4S // ...................*.......... + // gap // .............................. + sqrdmulh v27.4S, v10.4S, v13.4S // ....................*......... + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + ldr q7, [x5, #64] // ...*.......................... + mul v25.4S, v20.4S, v25.4S // ......................*....... + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + sqrdmulh v9.4S, v20.4S, v12.4S // ........................*..... 
+ // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + mul v14.4S, v10.4S, v7.4S // ...........................*.. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + mls v14.4S, v27.4S, v8.S[0] // .............................* + // gap // .............................. + sub v16.4S, v1.4S, v24.4S // .......................*...... + + // -------- new position --------> + // 0 25 + // |------------------------|----- + // ldr q7, [x5, #80] // .............*................. + // ldr q21, [x5, #32] // ....*.......................... + // ldr q14, [x5, #48] // .....*......................... + // ldr q29, [x5, #64] // ........................*...... + // ldr q30, [x5, #16] // ........*...................... + // ldr q25, [x1, #0] // *.............................. + // ldr q1, [x1, #16] // .*............................. + // ldr q20, [x1, #32] // ..*............................ + // ldr q15, [x1, #48] // ...*........................... + // trn1 v26.4S, v25.4S, v1.4S // .......*....................... + // trn2 v25.4S, v25.4S, v1.4S // ......*........................ + // trn1 v1.4S, v20.4S, v15.4S // ..........*.................... + // trn2 v20.4S, v20.4S, v15.4S // .........*..................... + // trn2 v27.2D, v26.2D, v1.2D // ...............*............... + // trn2 v23.2D, v25.2D, v20.2D // ..............*................ + // trn1 v1.2D, v26.2D, v1.2D // ................*.............. + // trn1 v11.2D, v25.2D, v20.2D // .................*............. + // sub v13.4S, v27.4S, v23.4S // ...................*........... + // add v24.4S, v27.4S, v23.4S // ....................*.......... 
+ // sub v27.4S, v1.4S, v11.4S // ......................*........ + // sqrdmulh v7.4S, v13.4S, v7.4S // .......................*....... + // add v1.4S, v1.4S, v11.4S // .....................*......... + // mul v25.4S, v27.4S, v21.4S // .........................*..... + // sub v16.4S, v1.4S, v24.4S // .............................*. + // sqrdmulh v9.4S, v27.4S, v14.4S // ..........................*.... + // ldr q18, [x2, #32] // ..................*............ + // ldr q0, [x2, #48] // ...........*................... + // mul v14.4S, v13.4S, v29.4S // ...........................*... + // ldr q17, [x2, #16] // ............*.................. + // mls v14.4S, v7.4S, v8.S[0] // ............................*.. sub count, count, #1 layer45678_start: - ldr q16, [x5, #160] // ......................................................e....................................................................................................................... - sqrdmulh v9.4S, v0.4S, v6.S[1] // .............................................................................................................................................................*................ - ldr q7, [x5, #112] // ...................................................e.......................................................................................................................... - ldr q25, [x5, #32] // ..........................e................................................................................................................................................... - // gap // .............................................................................................................................................................................. - ldr q12, [x4, #48] // ...............................................................................................e.............................................................................. 
- mul v15.4S, v5.4S, v6.S[0] // ..................................................................................................................................................*........................... - str q29, [x1, #-16] // .......................................................................................................................................................................*...... - add x1, x1, #64 // ............................................................................................................................................................................*. - ldr q24, [x4, #32] // ..............................................................................................e............................................................................... - sub v5.4S, v27.4S, v18.4S // ...............................................................................................................................................................*.............. - ldr q3, [x4, #16] // .............................................................................................e................................................................................ - ldr q17, [x1, #48] // ...e.......................................................................................................................................................................... - mls v15.4S, v19.4S, v8.S[0] // ....................................................................................................................................................*......................... - ldr q11, [x1, #32] // ..e........................................................................................................................................................................... 
- ldr q26, [x1, #16] // .e............................................................................................................................................................................ - ldr q21, [x1, #0] // e............................................................................................................................................................................. - // gap // .............................................................................................................................................................................. - sqrdmulh v2.4S, v2.4S, v6.S[1] // ........................................................................................................................................................*..................... - ldr q31, [x5, #128] // ....................................................e......................................................................................................................... - // gap // .............................................................................................................................................................................. - ldr q18, [x5, #176] // .......................................................e...................................................................................................................... - ldr q20, [x5, #80] // .............................e................................................................................................................................................ - // gap // .............................................................................................................................................................................. - mul v19.4S, v5.4S, v6.S[0] // .................................................................................................................................................................*............ 
- ldr q22, [x5, #96] // ..................................................e........................................................................................................................... - trn1 v14.4S, v11.4S, v17.4S // ......e....................................................................................................................................................................... - ldr q27, [x5, #48] // ...........................e.................................................................................................................................................. - // gap // .............................................................................................................................................................................. - trn1 v30.4S, v21.4S, v26.4S // ....e......................................................................................................................................................................... - sqrdmulh v29.4S, v5.4S, v6.S[1] // ..................................................................................................................................................................*........... - trn2 v23.4S, v21.4S, v26.4S // .....e........................................................................................................................................................................ - ldr q4, [x5], #(12*16) // ........................e..................................................................................................................................................... - trn2 v28.4S, v11.4S, v17.4S // .......e...................................................................................................................................................................... 
- ldr q17, [x5, #-48] // .....................................................e........................................................................................................................ + // Instructions: 174 + // Expected cycles: 129 + // Expected IPC: 1.35 + // + // Wall time: 2804.18s + // User time: 2804.18s + // + // ----------------------------------------------------------------------------- original position -----------------------------------------------------------------------------> + // 0 25 50 75 100 125 150 + // |------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|----------------------- + ldr q27, [x2, #0] // ............*................................................................................................................................................................. + trn1 v11.4S, v18.4S, v0.4S // ..................*........................................................................................................................................................... + ldr q7, [x5, #272] // .............................e................................................................................................................................................ + ldr q23, [x5], #(12*16) // ........................*..................................................................................................................................................... + mls v25.4S, v9.4S, v8.S[0] // ..................................*........................................................................................................................................... + add v24.4S, v1.4S, v24.4S // .........................................*.................................................................................................................................... 
+ ldr q9, [x5, #-96] // ..................................................*........................................................................................................................... + ldr q13, [x5, #-80] // ...................................................*.......................................................................................................................... // gap // .............................................................................................................................................................................. - mul v5.4S, v0.4S, v6.S[0] // ............................................................................................................................................................*................. - ldr q6, [x4], #64 // ............................................................................................e................................................................................. + sqrdmulh v1.4S, v16.4S, v30.4S // ...........................................*.................................................................................................................................. + ldr q3, [x5, #-64] // ....................................................*......................................................................................................................... + ldr q20, [x5, #-48] // .....................................................*........................................................................................................................ + trn1 v21.4S, v27.4S, v17.4S // ................*............................................................................................................................................................. 
+ ldr q15, [x5, #-32] // ......................................................*....................................................................................................................... + ldr q26, [x5, #-16] // .......................................................*...................................................................................................................... + trn2 v27.4S, v27.4S, v17.4S // .................*............................................................................................................................................................ + trn2 v18.4S, v18.4S, v0.4S // ...................*.......................................................................................................................................................... + ldr q0, [x4], #64 // ............................................................................................*................................................................................. + mul v16.4S, v16.4S, v23.4S // ..........................................*................................................................................................................................... + sub v17.4S, v25.4S, v14.4S // .............................................*................................................................................................................................ + ldr q2, [x4, #-48] // .............................................................................................*................................................................................ + ldr q4, [x4, #-32] // ..............................................................................................*............................................................................... 
+ trn2 v5.2D, v21.2D, v11.2D // ....................*......................................................................................................................................................... // gap // .............................................................................................................................................................................. - trn2 v26.2D, v30.2D, v14.2D // ........e..................................................................................................................................................................... - ldr q10, [x5, #-128] // ............................e................................................................................................................................................. + mls v16.4S, v1.4S, v8.S[0] // ............................................*................................................................................................................................. + trn2 v1.2D, v27.2D, v18.2D // .....................*........................................................................................................................................................ + ldr q22, [x4, #-16] // ...............................................................................................*.............................................................................. + trn1 v11.2D, v21.2D, v11.2D // ......................*....................................................................................................................................................... + ldr q21, [x5, #32] // ..........................e................................................................................................................................................... 
// gap // .............................................................................................................................................................................. - trn2 v1.2D, v23.2D, v28.2D // .........e.................................................................................................................................................................... - mls v13.4S, v2.4S, v8.S[0] // .........................................................................................................................................................*.................... + mul v23.4S, v17.4S, v23.4S // ...............................................*.............................................................................................................................. + add v25.4S, v25.4S, v14.4S // ..............................................*............................................................................................................................... + ldr q14, [x5, #48] // ...........................e.................................................................................................................................................. + sub v19.4S, v5.4S, v1.4S // .............................................................*................................................................................................................ + ldr q29, [x5, #64] // ............................e................................................................................................................................................. // gap // .............................................................................................................................................................................. 
+ trn1 v27.2D, v27.2D, v18.2D // .......................*...................................................................................................................................................... + sqrdmulh v18.4S, v17.4S, v30.4S // ................................................*............................................................................................................................. + ldr q30, [x5, #16] // .........................e.................................................................................................................................................... + trn1 v17.4S, v24.4S, v25.4S // ............................................................................*................................................................................................. // gap // .............................................................................................................................................................................. - trn1 v0.2D, v23.2D, v28.2D // ...........e.................................................................................................................................................................. // gap // .............................................................................................................................................................................. - mls v19.4S, v29.4S, v8.S[0] // ...................................................................................................................................................................*.......... - trn1 v23.2D, v30.2D, v14.2D // ..........e................................................................................................................................................................... 
+ trn2 v25.4S, v24.4S, v25.4S // .............................................................................*................................................................................................ + mul v24.4S, v19.4S, v15.4S // ...............................................................*.............................................................................................................. // gap // .............................................................................................................................................................................. + sub v15.4S, v11.4S, v27.4S // ........................................................*..................................................................................................................... // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. - sub v29.4S, v26.4S, v1.4S // ...................................e.......................................................................................................................................... + sqrdmulh v26.4S, v19.4S, v26.4S // ................................................................*............................................................................................................. + add v27.4S, v11.4S, v27.4S // .........................................................*.................................................................................................................... // gap // .............................................................................................................................................................................. 
- mls v5.4S, v9.4S, v8.S[0] // ..............................................................................................................................................................*............... + add v11.4S, v5.4S, v1.4S // ..............................................................*............................................................................................................... // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. + mls v23.4S, v18.4S, v8.S[0] // .................................................*............................................................................................................................ // gap // .............................................................................................................................................................................. - sub v21.4S, v23.4S, v0.4S // ..............................e............................................................................................................................................... - add v14.4S, v26.4S, v1.4S // ....................................e......................................................................................................................................... // gap // .............................................................................................................................................................................. - sqrdmulh v9.4S, v29.4S, v20.4S // ......................................e....................................................................................................................................... 
- str q15, [x2], #(16*4) // ........................................................................................................................................................................*..... - str q13, [x2, #-48] // .........................................................................................................................................................................*.... - add v30.4S, v23.4S, v0.4S // ...............................e.............................................................................................................................................. - sqrdmulh v23.4S, v21.4S, v27.4S // .................................e............................................................................................................................................ - str q19, [x2, #-16] // ...........................................................................................................................................................................*.. - ldr q13, [x5, #-176] // .........................e.................................................................................................................................................... - str q5, [x2, #-32] // ..........................................................................................................................................................................*... - add x2, x2, #64 // .............................................................................................................................................................................* - // gap // .............................................................................................................................................................................. 
- mul v15.4S, v29.4S, v10.4S // .....................................e........................................................................................................................................ - // gap // .............................................................................................................................................................................. - // gap // .............................................................................................................................................................................. - sub v29.4S, v30.4S, v14.4S // ........................................e..................................................................................................................................... - ldr q19, [x2, #48] // ...............e.............................................................................................................................................................. - ldr q5, [x2, #32] // ..............e............................................................................................................................................................... - mul v28.4S, v21.4S, v25.4S // ................................e............................................................................................................................................. - ldr q1, [x2, #16] // .............e................................................................................................................................................................ - ldr q0, [x2, #0] // ............e................................................................................................................................................................. // gap // .............................................................................................................................................................................. 
// gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. - mls v28.4S, v23.4S, v8.S[0] // ..................................e........................................................................................................................................... + mul v1.4S, v15.4S, v3.4S // ..........................................................*................................................................................................................... + sub v3.4S, v27.4S, v11.4S // ..................................................................*........................................................................................................... // gap // .............................................................................................................................................................................. + add v27.4S, v27.4S, v11.4S // ...................................................................*.......................................................................................................... // gap // .............................................................................................................................................................................. - trn1 v11.4S, v5.4S, v19.4S // ..................e........................................................................................................................................................... // gap // .............................................................................................................................................................................. 
+ sqrdmulh v11.4S, v15.4S, v20.4S // ...........................................................*.................................................................................................................. // gap // .............................................................................................................................................................................. - mls v15.4S, v9.4S, v8.S[0] // .......................................e...................................................................................................................................... // gap // .............................................................................................................................................................................. - trn2 v9.4S, v0.4S, v1.4S // .................e............................................................................................................................................................ - trn2 v10.4S, v5.4S, v19.4S // ...................e.......................................................................................................................................................... + trn1 v20.4S, v16.4S, v23.4S // ..............................................................................*............................................................................................... // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. - trn1 v0.4S, v0.4S, v1.4S // ................e............................................................................................................................................................. 
+ trn2 v23.4S, v16.4S, v23.4S // ...............................................................................*.............................................................................................. + mls v24.4S, v26.4S, v8.S[0] // .................................................................*............................................................................................................ // gap // .............................................................................................................................................................................. - sqrdmulh v19.4S, v29.4S, v13.4S // ...........................................e.................................................................................................................................. - add v27.4S, v30.4S, v14.4S // .........................................e.................................................................................................................................... // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. - mul v25.4S, v29.4S, v4.4S // ..........................................e................................................................................................................................... - trn1 v30.2D, v9.2D, v10.2D // .......................e...................................................................................................................................................... // gap // .............................................................................................................................................................................. 
+ sqrdmulh v15.4S, v3.4S, v13.4S // .....................................................................*........................................................................................................ + trn2 v26.2D, v17.2D, v20.2D // ................................................................................*............................................................................................. // gap // .............................................................................................................................................................................. + trn1 v20.2D, v17.2D, v20.2D // ..................................................................................*........................................................................................... // gap // .............................................................................................................................................................................. - trn1 v21.2D, v0.2D, v11.2D // ......................e....................................................................................................................................................... - trn2 v26.2D, v9.2D, v10.2D // .....................e........................................................................................................................................................ // gap // .............................................................................................................................................................................. - sub v5.4S, v28.4S, v15.4S // .............................................e................................................................................................................................ 
- trn2 v2.2D, v0.2D, v11.2D // ....................e......................................................................................................................................................... - mls v25.4S, v19.4S, v8.S[0] // ............................................e................................................................................................................................. + mls v1.4S, v11.4S, v8.S[0] // ............................................................*................................................................................................................. + trn1 v11.2D, v25.2D, v23.2D // ...................................................................................*.......................................................................................... // gap // .............................................................................................................................................................................. + trn2 v23.2D, v25.2D, v23.2D // .................................................................................*............................................................................................ // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. - sub v0.4S, v21.4S, v30.4S // ........................................................e..................................................................................................................... 
- add v21.4S, v21.4S, v30.4S // .........................................................e.................................................................................................................... + mul v25.4S, v3.4S, v9.4S // ....................................................................*......................................................................................................... // gap // .............................................................................................................................................................................. - sqrdmulh v29.4S, v5.4S, v13.4S // ................................................e............................................................................................................................. // gap // .............................................................................................................................................................................. + sub v3.4S, v20.4S, v11.4S // ................................................................................................*............................................................................. // gap // .............................................................................................................................................................................. - sub v23.4S, v2.4S, v26.4S // .............................................................e................................................................................................................ - add v2.4S, v2.4S, v26.4S // ..............................................................e............................................................................................................... // gap // .............................................................................................................................................................................. 
- mul v14.4S, v0.4S, v31.4S // ..........................................................e................................................................................................................... - add v19.4S, v28.4S, v15.4S // ..............................................e............................................................................................................................... + mls v25.4S, v15.4S, v8.S[0] // ......................................................................*....................................................................................................... + add v11.4S, v20.4S, v11.4S // .................................................................................................*............................................................................ // gap // .............................................................................................................................................................................. + sub v20.4S, v1.4S, v24.4S // .......................................................................*...................................................................................................... // gap // .............................................................................................................................................................................. - mul v20.4S, v5.4S, v4.4S // ...............................................e.............................................................................................................................. // gap // .............................................................................................................................................................................. + add v24.4S, v1.4S, v24.4S // ........................................................................*..................................................................................................... 
+ mul v1.4S, v3.4S, v2.S[2] // ..................................................................................................*........................................................................... // gap // .............................................................................................................................................................................. + sub v15.4S, v26.4S, v23.4S // .....................................................................................................*........................................................................ // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. - sub v5.4S, v21.4S, v2.4S // ..................................................................e........................................................................................................... - mls v20.4S, v29.4S, v8.S[0] // .................................................e............................................................................................................................ + mul v9.4S, v20.4S, v9.4S // .........................................................................*.................................................................................................... + add v23.4S, v26.4S, v23.4S // ......................................................................................................*....................................................................... // gap // .............................................................................................................................................................................. 
- trn1 v9.4S, v27.4S, v19.4S // ............................................................................e................................................................................................. + trn1 v26.4S, v27.4S, v24.4S // ....................................................................................*......................................................................................... // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. + sqrdmulh v13.4S, v20.4S, v13.4S // ..........................................................................*................................................................................................... + trn2 v27.4S, v27.4S, v24.4S // .....................................................................................*........................................................................................ // gap // .............................................................................................................................................................................. + sub v24.4S, v11.4S, v23.4S // ....................................................................................................................*......................................................... // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. 
- mul v29.4S, v23.4S, v16.4S // ...............................................................e.............................................................................................................. + sqrdmulh v3.4S, v3.4S, v2.S[3] // ...................................................................................................*.......................................................................... + add v11.4S, v11.4S, v23.4S // .....................................................................................................................*........................................................ // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. - trn2 v11.4S, v27.4S, v19.4S // .............................................................................e................................................................................................ - sqrdmulh v1.4S, v0.4S, v17.4S // ...........................................................e.................................................................................................................. // gap // .............................................................................................................................................................................. + mul v23.4S, v15.4S, v4.S[0] // .......................................................................................................*...................................................................... 
// gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. - trn2 v30.4S, v25.4S, v20.4S // ...............................................................................e.............................................................................................. - sqrdmulh v26.4S, v23.4S, v18.4S // ................................................................e............................................................................................................. - trn1 v19.4S, v25.4S, v20.4S // ..............................................................................e............................................................................................... + srshr v20.4S, v11.4S, #23 // ........................................................................................................................................*..................................... // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. + mls v9.4S, v13.4S, v8.S[0] // ...........................................................................*.................................................................................................. // gap // .............................................................................................................................................................................. 
// gap // .............................................................................................................................................................................. - trn1 v0.2D, v11.2D, v30.2D // ...................................................................................e.......................................................................................... // gap // .............................................................................................................................................................................. - sqrdmulh v28.4S, v5.4S, v7.4S // .....................................................................e........................................................................................................ - trn2 v30.2D, v11.2D, v30.2D // .................................................................................e............................................................................................ // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. - mul v5.4S, v5.4S, v22.4S // ....................................................................e......................................................................................................... + mls v1.4S, v3.4S, v8.S[0] // ....................................................................................................*......................................................................... // gap // .............................................................................................................................................................................. 
- trn2 v18.2D, v9.2D, v19.2D // ................................................................................e............................................................................................. - trn1 v4.2D, v9.2D, v19.2D // ..................................................................................e........................................................................................... // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. - mls v29.4S, v26.4S, v8.S[0] // .................................................................e............................................................................................................ // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. - add v11.4S, v18.4S, v30.4S // ......................................................................................................e....................................................................... + sqrdmulh v13.4S, v15.4S, v4.S[1] // ........................................................................................................*..................................................................... // gap // .............................................................................................................................................................................. 
// gap // .............................................................................................................................................................................. - mls v14.4S, v1.4S, v8.S[0] // ............................................................e................................................................................................................. + trn1 v3.4S, v25.4S, v9.4S // ......................................................................................*....................................................................................... // gap // .............................................................................................................................................................................. - add v20.4S, v4.4S, v0.4S // .................................................................................................e............................................................................ - sub v18.4S, v18.4S, v30.4S // .....................................................................................................e........................................................................ // gap // .............................................................................................................................................................................. + trn2 v25.4S, v25.4S, v9.4S // .......................................................................................*...................................................................................... + mul v9.4S, v24.4S, v0.S[2] // ......................................................................................................................*....................................................... // gap // .............................................................................................................................................................................. 
- mls v5.4S, v28.4S, v8.S[0] // ......................................................................e....................................................................................................... - add v26.4S, v21.4S, v2.4S // ...................................................................e.......................................................................................................... // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. - sub v2.4S, v20.4S, v11.4S // ....................................................................................................................e......................................................... - mul v25.4S, v18.4S, v24.S[0] // .......................................................................................................e...................................................................... + sqrdmulh v24.4S, v24.4S, v0.S[3] // .......................................................................................................................*...................................................... + trn2 v15.2D, v26.2D, v3.2D // ........................................................................................*..................................................................................... // gap // .............................................................................................................................................................................. 
+ trn1 v3.2D, v26.2D, v3.2D // ..........................................................................................*................................................................................... // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. + mls v23.4S, v13.4S, v8.S[0] // .........................................................................................................*.................................................................... + trn1 v13.2D, v27.2D, v25.2D // ...........................................................................................*.................................................................................. // gap // .............................................................................................................................................................................. - sub v1.4S, v14.4S, v29.4S // .......................................................................e...................................................................................................... - add v9.4S, v14.4S, v29.4S // ........................................................................e..................................................................................................... + trn2 v27.2D, v27.2D, v25.2D // .........................................................................................*.................................................................................... // gap // .............................................................................................................................................................................. 
- sqrdmulh v21.4S, v2.4S, v6.S[3] // .......................................................................................................................e...................................................... - sub v0.4S, v4.4S, v0.4S // ................................................................................................e............................................................................. // gap // .............................................................................................................................................................................. + mls v11.4S, v20.4S, v8.4S // .........................................................................................................................................*.................................... // gap // .............................................................................................................................................................................. - add v4.4S, v20.4S, v11.4S // .....................................................................................................................e........................................................ // gap // .............................................................................................................................................................................. - sqrdmulh v10.4S, v1.4S, v7.4S // ..........................................................................e................................................................................................... + sub v25.4S, v3.4S, v13.4S // ..........................................................................................................*................................................................... // gap // .............................................................................................................................................................................. 
// gap // .............................................................................................................................................................................. + mls v9.4S, v24.4S, v8.S[0] // ........................................................................................................................*..................................................... + add v24.4S, v3.4S, v13.4S // ...........................................................................................................*.................................................................. // gap // .............................................................................................................................................................................. - sqrdmulh v29.4S, v18.4S, v24.S[1] // ........................................................................................................e..................................................................... + sub v13.4S, v15.4S, v27.4S // ...............................................................................................................*.............................................................. // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. + add v27.4S, v15.4S, v27.4S // ................................................................................................................*............................................................. + mul v3.4S, v25.4S, v4.S[2] // ............................................................................................................*................................................................. 
// gap // .............................................................................................................................................................................. + sub v20.4S, v1.4S, v23.4S // .........................................................................................................................*.................................................... // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. - mul v23.4S, v1.4S, v22.4S // .........................................................................e.................................................................................................... + sqrdmulh v25.4S, v25.4S, v4.S[3] // .............................................................................................................*................................................................ + add v23.4S, v1.4S, v23.4S // ..........................................................................................................................*................................................... // gap // .............................................................................................................................................................................. - trn2 v27.4S, v26.4S, v9.4S // .....................................................................................e........................................................................................ + sub v1.4S, v24.4S, v27.4S // ..............................................................................................................................*............................................... 
// gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. + add v27.4S, v24.4S, v27.4S // ...............................................................................................................................*.............................................. + sqrdmulh v24.4S, v13.4S, v22.S[1] // ..................................................................................................................*........................................................... // gap // .............................................................................................................................................................................. - mls v23.4S, v10.4S, v8.S[0] // ...........................................................................e.................................................................................................. + srshr v15.4S, v23.4S, #23 // ..........................................................................................................................................*................................... // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. + mul v13.4S, v13.4S, v22.S[0] // .................................................................................................................*............................................................ 
// gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. - srshr v19.4S, v4.4S, #23 // ........................................................................................................................................e..................................... + srshr v26.4S, v27.4S, #23 // ............................................................................................................................................*................................. // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. - sqrdmulh v17.4S, v0.4S, v3.S[3] // ...................................................................................................e.......................................................................... - trn1 v10.4S, v26.4S, v9.4S // ....................................................................................e......................................................................................... + mls v3.4S, v25.4S, v8.S[0] // ..............................................................................................................*............................................................... // gap // .............................................................................................................................................................................. 
// gap // .............................................................................................................................................................................. - mls v4.4S, v19.4S, v8.4S // .........................................................................................................................................e.................................... // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. + mls v13.4S, v24.4S, v8.S[0] // ...................................................................................................................*.......................................................... // gap // .............................................................................................................................................................................. - trn1 v18.4S, v5.4S, v23.4S // ......................................................................................e....................................................................................... - mls v25.4S, v29.4S, v8.S[0] // .........................................................................................................e.................................................................... - trn2 v29.4S, v5.4S, v23.4S // .......................................................................................e...................................................................................... 
// gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. - mul v13.4S, v2.4S, v6.S[2] // ......................................................................................................................e....................................................... + mul v25.4S, v20.4S, v0.S[2] // ...........................................................................................................................*.................................................. // gap // .............................................................................................................................................................................. - trn2 v30.2D, v10.2D, v18.2D // ........................................................................................e..................................................................................... // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. 
- trn2 v16.2D, v27.2D, v29.2D // .........................................................................................e.................................................................................... - mul v2.4S, v0.4S, v3.S[2] // ..................................................................................................e........................................................................... - trn1 v7.2D, v10.2D, v18.2D // ..........................................................................................e................................................................................... // gap // .............................................................................................................................................................................. - trn1 v28.2D, v27.2D, v29.2D // ...........................................................................................e.................................................................................. // gap // .............................................................................................................................................................................. + mul v24.4S, v1.4S, v2.S[0] // ................................................................................................................................*............................................. // gap // .............................................................................................................................................................................. - mls v2.4S, v17.4S, v8.S[0] // ....................................................................................................e......................................................................... 
- sub v10.4S, v30.4S, v16.4S // ...............................................................................................................e.............................................................. // gap // .............................................................................................................................................................................. - add v19.4S, v30.4S, v16.4S // ................................................................................................................e............................................................. + sub v18.4S, v3.4S, v13.4S // ...................................................................................................................................*.......................................... // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. - mls v13.4S, v21.4S, v8.S[0] // ........................................................................................................................e..................................................... - add v5.4S, v7.4S, v28.4S // ...........................................................................................................e.................................................................. + sqrdmulh v1.4S, v1.4S, v2.S[1] // .................................................................................................................................*............................................ 
+ add v13.4S, v3.4S, v13.4S // ....................................................................................................................................*......................................... // gap // .............................................................................................................................................................................. - sub v14.4S, v7.4S, v28.4S // ..........................................................................................................e................................................................... // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. + mul v3.4S, v18.4S, v2.S[0] // .....................................................................................................................................*........................................ // gap // .............................................................................................................................................................................. - sqrdmulh v23.4S, v10.4S, v12.S[1] // ..................................................................................................................e........................................................... // gap // .............................................................................................................................................................................. 
+ srshr v16.4S, v13.4S, #23 // ..............................................................................................................................................*............................... // gap // .............................................................................................................................................................................. - sub v30.4S, v2.4S, v25.4S // .........................................................................................................................e.................................................... - mul v10.4S, v10.4S, v12.S[0] // .................................................................................................................e............................................................ // gap // .............................................................................................................................................................................. - add v15.4S, v5.4S, v19.4S // ...............................................................................................................................e.............................................. + sqrdmulh v18.4S, v18.4S, v2.S[1] // ......................................................................................................................................*....................................... // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. 
// gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. - sqrdmulh v9.4S, v30.4S, v6.S[3] // ............................................................................................................................e................................................. + sqrdmulh v20.4S, v20.4S, v0.S[3] // ............................................................................................................................*................................................. // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. - mul v27.4S, v30.4S, v6.S[2] // ...........................................................................................................................e.................................................. // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. 
+ mls v24.4S, v1.4S, v8.S[0] // ..................................................................................................................................*........................................... // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. - srshr v29.4S, v15.4S, #23 // ............................................................................................................................................e................................. // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. - sqrdmulh v0.4S, v14.4S, v24.S[3] // .............................................................................................................e................................................................ // gap // .............................................................................................................................................................................. + mls v3.4S, v18.4S, v8.S[0] // .......................................................................................................................................*...................................... // gap // .............................................................................................................................................................................. 
// gap // .............................................................................................................................................................................. - mls v27.4S, v9.4S, v8.S[0] // .............................................................................................................................e................................................ // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. + mls v25.4S, v20.4S, v8.S[0] // .............................................................................................................................*................................................ // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. - mul v11.4S, v14.4S, v24.S[2] // ............................................................................................................e................................................................. + sub v1.4S, v9.4S, v24.4S // ..........................................................................................................................................................*................... 
// gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. - sub v9.4S, v5.4S, v19.4S // ..............................................................................................................................e............................................... + add v24.4S, v9.4S, v24.4S // ...........................................................................................................................................................*.................. + mls v23.4S, v15.4S, v8.4S // ...........................................................................................................................................*.................................. // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. - mls v15.4S, v29.4S, v8.4S // .............................................................................................................................................e................................ // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. 
+ mls v27.4S, v26.4S, v8.4S // .............................................................................................................................................*................................ // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. + str q24, [x1, #32] // ......................................................................................................................................................................*....... + sub v24.4S, v25.4S, v3.4S // ...............................................................................................................................................................*.............. // gap // .............................................................................................................................................................................. - mls v11.4S, v0.4S, v8.S[0] // ..............................................................................................................e............................................................... + mls v13.4S, v16.4S, v8.4S // ...............................................................................................................................................*.............................. + add v25.4S, v25.4S, v3.4S // ................................................................................................................................................................*............. // gap // .............................................................................................................................................................................. 
// gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. + mul v9.4S, v1.4S, v0.S[0] // ............................................................................................................................................................*................. // gap // .............................................................................................................................................................................. - add v19.4S, v2.4S, v25.4S // ..........................................................................................................................e................................................... - mls v10.4S, v23.4S, v8.S[0] // ...................................................................................................................e.......................................................... // gap // .............................................................................................................................................................................. + str q25, [x1, #48] // .......................................................................................................................................................................*...... + sub v25.4S, v11.4S, v27.4S // ................................................................................................................................................*............................. 
// gap // .............................................................................................................................................................................. + add v27.4S, v11.4S, v27.4S // .................................................................................................................................................*............................ + sqrdmulh v11.4S, v1.4S, v0.S[1] // .............................................................................................................................................................*................ // gap // .............................................................................................................................................................................. - sub v5.4S, v4.4S, v15.4S // ................................................................................................................................................e............................. - add v29.4S, v4.4S, v15.4S // .................................................................................................................................................e............................ + add v1.4S, v23.4S, v13.4S // ......................................................................................................................................................*....................... // gap // .............................................................................................................................................................................. - sqrdmulh v1.4S, v9.4S, v3.S[1] // .................................................................................................................................e............................................ // gap // .............................................................................................................................................................................. 
+ sub v23.4S, v23.4S, v13.4S // .....................................................................................................................................................*........................ + mul v13.4S, v25.4S, v0.S[0] // ..................................................................................................................................................*........................... // gap // .............................................................................................................................................................................. - srshr v25.4S, v19.4S, #23 // ..........................................................................................................................................e................................... - mul v16.4S, v9.4S, v3.S[0] // ................................................................................................................................e............................................. + str q27, [x1], #(16*4) // ....................................................................................................................................................................*......... // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. - str q29, [x1], #(16*4) // ....................................................................................................................................................................e......... - // gap // .............................................................................................................................................................................. 
- sub v14.4S, v11.4S, v10.4S // ...................................................................................................................................e.......................................... - add v29.4S, v11.4S, v10.4S // ....................................................................................................................................e......................................... - mls v19.4S, v25.4S, v8.4S // ...........................................................................................................................................e.................................. + sqrdmulh v27.4S, v25.4S, v0.S[1] // ...................................................................................................................................................*.......................... + str q1, [x1, #-48] // .....................................................................................................................................................................*........ + add x1, x1, #64 // ............................................................................................................................................................................*. // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. + ldr q25, [x1, #0] // e............................................................................................................................................................................. 
+ ldr q1, [x1, #16] // .e............................................................................................................................................................................ + mul v3.4S, v23.4S, v0.S[0] // .......................................................................................................................................................*...................... + ldr q20, [x1, #32] // ..e........................................................................................................................................................................... + ldr q15, [x1, #48] // ...e.......................................................................................................................................................................... // gap // .............................................................................................................................................................................. + sqrdmulh v23.4S, v23.4S, v0.S[1] // ........................................................................................................................................................*..................... // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. - sqrdmulh v10.4S, v14.4S, v3.S[1] // ......................................................................................................................................e....................................... // gap // .............................................................................................................................................................................. 
// gap // .............................................................................................................................................................................. - srshr v18.4S, v29.4S, #23 // ..............................................................................................................................................e............................... - mls v16.4S, v1.4S, v8.S[0] // ..................................................................................................................................e........................................... // gap // .............................................................................................................................................................................. + trn1 v26.4S, v25.4S, v1.4S // ....e......................................................................................................................................................................... + mul v18.4S, v24.4S, v0.S[0] // .................................................................................................................................................................*............ // gap // .............................................................................................................................................................................. + trn2 v25.4S, v25.4S, v1.4S // .....e........................................................................................................................................................................ // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. 
+ sqrdmulh v24.4S, v24.4S, v0.S[1] // ..................................................................................................................................................................*........... + trn1 v1.4S, v20.4S, v15.4S // ......e....................................................................................................................................................................... // gap // .............................................................................................................................................................................. + trn2 v20.4S, v20.4S, v15.4S // .......e...................................................................................................................................................................... // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. - mls v29.4S, v18.4S, v8.4S // ...............................................................................................................................................e.............................. + mls v13.4S, v27.4S, v8.S[0] // ....................................................................................................................................................*......................... // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. 
+ trn2 v27.2D, v26.2D, v1.2D // ........e..................................................................................................................................................................... // gap // .............................................................................................................................................................................. - mul v18.4S, v14.4S, v3.S[0] // .....................................................................................................................................e........................................ // gap // .............................................................................................................................................................................. + mls v3.4S, v23.4S, v8.S[0] // .........................................................................................................................................................*.................... + trn2 v23.2D, v25.2D, v20.2D // .........e.................................................................................................................................................................... // gap // .............................................................................................................................................................................. + trn1 v1.2D, v26.2D, v1.2D // ..........e................................................................................................................................................................... // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. 
- sub v0.4S, v13.4S, v16.4S // ..........................................................................................................................................................e................... - add v26.4S, v13.4S, v16.4S // ...........................................................................................................................................................e.................. - mls v18.4S, v10.4S, v8.S[0] // .......................................................................................................................................e...................................... + mls v9.4S, v11.4S, v8.S[0] // ..............................................................................................................................................................*............... + trn1 v11.2D, v25.2D, v20.2D // ...........e.................................................................................................................................................................. // gap // .............................................................................................................................................................................. - sub v2.4S, v19.4S, v29.4S // .....................................................................................................................................................e........................ + str q13, [x2], #(16*4) // ........................................................................................................................................................................*..... + sub v13.4S, v27.4S, v23.4S // ...................................e.......................................................................................................................................... 
// gap // .............................................................................................................................................................................. + mls v18.4S, v24.4S, v8.S[0] // ...................................................................................................................................................................*.......... + add v24.4S, v27.4S, v23.4S // ....................................e......................................................................................................................................... // gap // .............................................................................................................................................................................. - add v29.4S, v19.4S, v29.4S // ......................................................................................................................................................e....................... + str q3, [x2, #-48] // .........................................................................................................................................................................*.... + sub v27.4S, v1.4S, v11.4S // ..............................e............................................................................................................................................... // gap // .............................................................................................................................................................................. - sqrdmulh v19.4S, v5.4S, v6.S[1] // ...................................................................................................................................................e.......................... - str q26, [x1, #-32] // ......................................................................................................................................................................e....... 
+ sqrdmulh v7.4S, v13.4S, v7.4S // ......................................e....................................................................................................................................... + add v1.4S, v1.4S, v11.4S // ...............................e.............................................................................................................................................. // gap // .............................................................................................................................................................................. + str q9, [x2, #-32] // ..........................................................................................................................................................................*... // gap // .............................................................................................................................................................................. - mul v13.4S, v2.4S, v6.S[0] // .......................................................................................................................................................e...................... // gap // .............................................................................................................................................................................. + mul v25.4S, v27.4S, v21.4S // ................................e............................................................................................................................................. // gap // .............................................................................................................................................................................. - str q29, [x1, #-48] // .....................................................................................................................................................................e........ 
// gap // .............................................................................................................................................................................. - add v29.4S, v27.4S, v18.4S // ................................................................................................................................................................e............. - - // original source code - // ldr q9, [x1, #0] // ...............e..............................................................................................................................................................|..............e.................................... - // ldr q10, [x1, #16] // ..............e...............................................................................................................................................................|.............e..................................... - // ldr q11, [x1, #32] // .............e................................................................................................................................................................|............e...................................... - // ldr q12, [x1, #48] // ...........e..................................................................................................................................................................|..........e........................................ - // trn1 v25.4s, v9.4s, v10.4s // ........................e.....................................................................................................................................................|.......................e........................... - // trn2 v26.4s, v9.4s, v10.4s // ..........................e...................................................................................................................................................|.........................e......................... 
- // trn1 v27.4s, v11.4s, v12.4s // ......................e.......................................................................................................................................................|.....................e............................. - // trn2 v28.4s, v11.4s, v12.4s // ............................e.................................................................................................................................................|...........................e....................... - // trn2 v11.2d, v25.2d, v27.2d // ................................e.............................................................................................................................................|...............................e................... - // trn2 v12.2d, v26.2d, v28.2d // ..................................e...........................................................................................................................................|.................................e................. - // trn1 v9.2d, v25.2d, v27.2d // ......................................e.......................................................................................................................................|.....................................e............. - // trn1 v10.2d, v26.2d, v28.2d // ....................................e.........................................................................................................................................|...................................e............... - // ldr q13, [x2, #0] // ..........................................................e...................................................................................................................|................................................... 
- // ldr q14, [x2, #16] // .........................................................e....................................................................................................................|................................................... - // ldr q15, [x2, #32] // .......................................................e......................................................................................................................|................................................... - // ldr q16, [x2, #48] // ......................................................e.......................................................................................................................|................................................... - // trn1 v25.4s, v13.4s, v14.4s // ................................................................e.............................................................................................................|................................................... - // trn2 v26.4s, v13.4s, v14.4s // ..............................................................e...............................................................................................................|................................................... - // trn1 v27.4s, v15.4s, v16.4s // ............................................................e.................................................................................................................|................................................... - // trn2 v28.4s, v15.4s, v16.4s // ...............................................................e..............................................................................................................|................................................... 
- // trn2 v15.2d, v25.2d, v27.2d // ........................................................................e.....................................................................................................|................................................... - // trn2 v16.2d, v26.2d, v28.2d // ......................................................................e.......................................................................................................|................................................... - // trn1 v13.2d, v25.2d, v27.2d // .....................................................................e........................................................................................................|................................................... - // trn1 v14.2d, v26.2d, v28.2d // ....................................................................e.........................................................................................................|................................................... - // ldr q0, [x5], #(12*16) // ...........................e..................................................................................................................................................|..........................e........................ - // ldr q4, [x5, #(-12*16 + 1*16)] // .................................................e............................................................................................................................|................................................e.. - // ldr q1, [x5, #(-12*16 + 2*16)] // ...e..........................................................................................................................................................................|..e................................................ 
- // ldr q5, [x5, #(-12*16 + 3*16)] // .......................e......................................................................................................................................................|......................e............................ - // ldr q2, [x5, #(-12*16 + 4*16)] // .................................e............................................................................................................................................|................................e.................. - // ldr q6, [x5, #(-12*16 + 5*16)] // ...................e..........................................................................................................................................................|..................e................................ - // sub v24.4s, v9.4s, v10.4s // .........................................e....................................................................................................................................|........................................e.......... - // add v9.4s, v9.4s, v10.4s // ..............................................e...............................................................................................................................|.............................................e..... - // mul v10.4s, v24.4s, v1.4s // ........................................................e.....................................................................................................................|................................................... - // sqrdmulh v24.4s, v24.4s, v5.4s // ...............................................e..............................................................................................................................|..............................................e.... 
- // mls v10.4s, v24.4s, v8.s[0] // ...........................................................e..................................................................................................................|................................................... - // sub v24.4s, v11.4s, v12.4s // .......................................e......................................................................................................................................|......................................e............ - // add v11.4s, v11.4s, v12.4s // ..........................................e...................................................................................................................................|.........................................e......... - // mul v12.4s, v24.4s, v2.4s // ....................................................e.........................................................................................................................|................................................... - // sqrdmulh v24.4s, v24.4s, v6.4s // ...........................................e..................................................................................................................................|..........................................e........ - // mls v12.4s, v24.4s, v8.s[0] // .............................................................e................................................................................................................|................................................... - // sub v24.4s, v9.4s, v11.4s // .....................................................e........................................................................................................................|................................................... 
- // add v9.4s, v9.4s, v11.4s // ..................................................................e...........................................................................................................|................................................... - // mul v11.4s, v24.4s, v0.4s // ...................................................................e..........................................................................................................|................................................... - // sqrdmulh v24.4s, v24.4s, v4.4s // .................................................................e............................................................................................................|................................................... - // mls v11.4s, v24.4s, v8.s[0] // .........................................................................e....................................................................................................|................................................... - // sub v24.4s, v10.4s, v12.4s // .......................................................................e......................................................................................................|................................................... - // add v10.4s, v10.4s, v12.4s // ................................................................................e.............................................................................................|................................................... - // mul v12.4s, v24.4s, v0.4s // .................................................................................e............................................................................................|................................................... 
- // sqrdmulh v24.4s, v24.4s, v4.4s // ............................................................................e.................................................................................................|................................................... - // mls v12.4s, v24.4s, v8.s[0] // ...................................................................................e..........................................................................................|................................................... - // ldr q0, [x5, #(-12*16 + 6*16)] // .....................e........................................................................................................................................................|....................e.............................. - // ldr q4, [x5, #(-12*16 + 7*16)] // ..e...........................................................................................................................................................................|.e................................................. - // ldr q1, [x5, #(-12*16 + 8*16)] // .................e............................................................................................................................................................|................e.................................. - // ldr q5, [x5, #(-12*16 + 9*16)] // .............................e................................................................................................................................................|............................e...................... - // ldr q2, [x5, #(-12*16 + 10*16)] // e.............................................................................................................................................................................e................................................... 
- // ldr q6, [x5, #(-12*16 + 11*16)] // ..................e...........................................................................................................................................................|.................e................................. - // sub v24.4s, v13.4s, v14.4s // ..........................................................................e...................................................................................................|................................................... - // add v13.4s, v13.4s, v14.4s // ...........................................................................e..................................................................................................|................................................... - // mul v14.4s, v24.4s, v1.4s // ...............................................................................e..............................................................................................|................................................... - // sqrdmulh v24.4s, v24.4s, v5.4s // .......................................................................................e......................................................................................|................................................... - // mls v14.4s, v24.4s, v8.s[0] // ...................................................................................................e..........................................................................|................................................... - // sub v24.4s, v15.4s, v16.4s // .............................................................................e................................................................................................|................................................... 
- // add v15.4s, v15.4s, v16.4s // ..............................................................................e...............................................................................................|................................................... - // mul v16.4s, v24.4s, v2.4s // .....................................................................................e........................................................................................|................................................... - // sqrdmulh v24.4s, v24.4s, v6.4s // .........................................................................................e....................................................................................|................................................... - // mls v16.4s, v24.4s, v8.s[0] // .................................................................................................e............................................................................|................................................... - // sub v24.4s, v13.4s, v15.4s // ..................................................................................e...........................................................................................|................................................... - // add v13.4s, v13.4s, v15.4s // .......................................................................................................e......................................................................|................................................... - // mul v15.4s, v24.4s, v0.4s // ..............................................................................................e...............................................................................|................................................... 
- // sqrdmulh v24.4s, v24.4s, v4.4s // ............................................................................................e.................................................................................|................................................... - // mls v15.4s, v24.4s, v8.s[0] // ......................................................................................................e.......................................................................|................................................... - // sub v24.4s, v14.4s, v16.4s // ..........................................................................................................e...................................................................|................................................... - // add v14.4s, v14.4s, v16.4s // ...........................................................................................................e..................................................................|................................................... - // mul v16.4s, v24.4s, v0.4s // .................................................................................................................e............................................................|................................................... - // sqrdmulh v24.4s, v24.4s, v4.4s // ...............................................................................................................e..............................................................|................................................... - // mls v16.4s, v24.4s, v8.s[0] // ...................................................................................................................e..........................................................|................................................... 
- // trn1 v25.4s, v9.4s, v10.4s // ....................................................................................e.........................................................................................|................................................... - // trn2 v26.4s, v9.4s, v10.4s // ......................................................................................e.......................................................................................|................................................... - // trn1 v27.4s, v11.4s, v12.4s // ..........................................................................................e...................................................................................|................................................... - // trn2 v28.4s, v11.4s, v12.4s // ........................................................................................e.....................................................................................|................................................... - // trn2 v11.2d, v25.2d, v27.2d // ...............................................................................................e..............................................................................|................................................... - // trn2 v12.2d, v26.2d, v28.2d // .............................................................................................e................................................................................|................................................... - // trn1 v9.2d, v25.2d, v27.2d // ................................................................................................e.............................................................................|................................................... 
- // trn1 v10.2d, v26.2d, v28.2d // ...........................................................................................e..................................................................................|................................................... - // trn1 v25.4s, v13.4s, v14.4s // ......................................................................................................................e.......................................................|................................................... - // trn2 v26.4s, v13.4s, v14.4s // ..................................................................................................................e...........................................................|................................................... - // trn1 v27.4s, v15.4s, v16.4s // ........................................................................................................................e.....................................................|................................................... - // trn2 v28.4s, v15.4s, v16.4s // ..........................................................................................................................e...................................................|................................................... - // trn2 v15.2d, v25.2d, v27.2d // ............................................................................................................................e.................................................|................................................... - // trn2 v16.2d, v26.2d, v28.2d // .............................................................................................................................e................................................|................................................... 
- // trn1 v13.2d, v25.2d, v27.2d // ...............................................................................................................................e..............................................|................................................... - // trn1 v14.2d, v26.2d, v28.2d // ................................................................................................................................e.............................................|................................................... - // ldr q0, [x4], #64 // ...............................e..............................................................................................................................................|..............................e.................... - // ldr q1, [x4, #(-64 + 16)] // ..........e...................................................................................................................................................................|.........e......................................... - // ldr q2, [x4, #(-64 + 32)] // ........e.....................................................................................................................................................................|.......e........................................... - // ldr q3, [x4, #(-64 + 48)] // ....e.........................................................................................................................................................................|...e............................................... - // sub v24.4s, v9.4s, v10.4s // .............................................................................................................e................................................................|................................................... 
- // add v9.4s, v9.4s, v10.4s // ....................................................................................................e.........................................................................|................................................... - // mul v10.4s, v24.4s, v1.s[2] // ..............................................................................................................................e...............................................|................................................... - // sqrdmulh v24.4s, v24.4s, v1.s[3] // .....................................................................................................................e........................................................|................................................... - // mls v10.4s, v24.4s, v8.s[0] // .................................................................................................................................e............................................|................................................... - // sub v24.4s, v11.4s, v12.4s // .....................................................................................................e........................................................................|................................................... - // add v11.4s, v11.4s, v12.4s // ..................................................................................................e...........................................................................|................................................... - // mul v12.4s, v24.4s, v2.s[0] // .........................................................................................................e....................................................................|................................................... 
- // sqrdmulh v24.4s, v24.4s, v2.s[1] // ................................................................................................................e.............................................................|................................................... - // mls v12.4s, v24.4s, v8.s[0] // .........................................................................................................................e....................................................|................................................... - // sub v24.4s, v13.4s, v14.4s // ......................................................................................................................................e.......................................|................................................... - // add v13.4s, v13.4s, v14.4s // .....................................................................................................................................e........................................|................................................... - // mul v14.4s, v24.4s, v2.s[2] // ................................................................................................................................................e.............................|................................................... - // sqrdmulh v24.4s, v24.4s, v2.s[3] // ..............................................................................................................................................e...............................|................................................... - // mls v14.4s, v24.4s, v8.s[0] // ...................................................................................................................................................e..........................|................................................... 
- // sub v24.4s, v15.4s, v16.4s // ..................................................................................................................................e...........................................|................................................... - // add v15.4s, v15.4s, v16.4s // ...................................................................................................................................e..........................................|................................................... - // mul v16.4s, v24.4s, v3.s[0] // .........................................................................................................................................e....................................|................................................... - // sqrdmulh v24.4s, v24.4s, v3.s[1] // .......................................................................................................................................e......................................|................................................... - // mls v16.4s, v24.4s, v8.s[0] // .....................................................................................................................................................e........................|................................................... - // sub v24.4s, v9.4s, v11.4s // ........................................................................................................e.....................................................................|................................................... - // add v9.4s, v9.4s, v11.4s // ..............................................................................................................e...............................................................|................................................... 
- // mul v11.4s, v24.4s, v0.s[2] // ...........................................................................................................................e..................................................|................................................... - // sqrdmulh v24.4s, v24.4s, v0.s[3] // ............................................................................................................e.................................................................|................................................... - // mls v11.4s, v24.4s, v8.s[0] // ....................................................................................................................................e.........................................|................................................... - // sub v24.4s, v10.4s, v12.4s // ........................................................................................................................................e.....................................|................................................... - // add v10.4s, v10.4s, v12.4s // ....................................................................................................................................................e.........................|................................................... - // mul v12.4s, v24.4s, v0.s[2] // ............................................................................................................................................e.................................|................................................... - // sqrdmulh v24.4s, v24.4s, v0.s[3] // ...........................................................................................................................................e..................................|................................................... 
- // mls v12.4s, v24.4s, v8.s[0] // ...............................................................................................................................................e..............................|................................................... - // sub v24.4s, v13.4s, v15.4s // .................................................................................................................................................e............................|................................................... - // add v13.4s, v13.4s, v15.4s // ..........................................................................................................................................e...................................|................................................... - // mul v15.4s, v24.4s, v1.s[0] // ..........................................................................................................................................................e...................|................................................... - // sqrdmulh v24.4s, v24.4s, v1.s[1] // ........................................................................................................................................................e.....................|................................................... - // mls v15.4s, v24.4s, v8.s[0] // .................................................................................................................................................................e............|................................................... - // sub v24.4s, v14.4s, v16.4s // ............................................................................................................................................................e.................|................................................... 
- // add v14.4s, v14.4s, v16.4s // .............................................................................................................................................................e................|................................................... - // mul v16.4s, v24.4s, v1.s[0] // ...................................................................................................................................................................e..........|................................................... - // sqrdmulh v24.4s, v24.4s, v1.s[1] // ...............................................................................................................................................................e..............|................................................... - // mls v16.4s, v24.4s, v8.s[0] // ......................................................................................................................................................................e.......|................................................... - // srshr v24.4S, v9.4S, #23 // ....................................................................................................................e.........................................................|................................................... - // mls v9.4s, v24.4s, v8.4s // .......................................................................................................................e......................................................|................................................... - // srshr v24.4S, v10.4S, #23 // .........................................................................................................................................................e....................|................................................... 
- // mls v10.4s, v24.4s, v8.4s // ..............................................................................................................................................................e...............|................................................... - // srshr v24.4S, v13.4S, #23 // .............................................................................................................................................e................................|................................................... - // mls v13.4s, v24.4s, v8.4s // ..................................................................................................................................................e...........................|................................................... - // srshr v24.4S, v14.4S, #23 // ................................................................................................................................................................e.............|................................................... - // mls v14.4s, v24.4s, v8.4s // ..................................................................................................................................................................e...........|................................................... - // sub v24.4s, v9.4s, v13.4s // ......................................................................................................................................................e.......................|................................................... - // add v9.4s, v9.4s, v13.4s // .......................................................................................................................................................e......................|................................................... 
- // mul v13.4s, v24.4s, v0.s[0] // .....*........................................................................................................................................................................|....*.............................................. - // sqrdmulh v24.4s, v24.4s, v0.s[1] // .........................................................................................................................................................................e....|................................................... - // mls v13.4s, v24.4s, v8.s[0] // ............*.................................................................................................................................................................|...........*....................................... - // sub v24.4s, v10.4s, v14.4s // .......................................................................................................................................................................e......|................................................... - // add v10.4s, v10.4s, v14.4s // ........................................................................................................................................................................e.....|................................................... - // mul v14.4s, v24.4s, v0.s[0] // ...........................................................................................................................................................................e..|................................................... - // sqrdmulh v24.4s, v24.4s, v0.s[1] // ................*.............................................................................................................................................................|...............*................................... 
- // mls v14.4s, v24.4s, v8.s[0] // ...................................*..........................................................................................................................................|..................................*................ - // sub v24.4s, v11.4s, v15.4s // ....................................................................................................................................................................e.........|................................................... - // add v11.4s, v11.4s, v15.4s // .....................................................................................................................................................................e........|................................................... - // mul v15.4s, v24.4s, v0.s[0] // ..............................*...............................................................................................................................................|.............................*..................... - // sqrdmulh v24.4s, v24.4s, v0.s[1] // .*............................................................................................................................................................................|*.................................................. - // mls v15.4s, v24.4s, v8.s[0] // ........................................*.....................................................................................................................................|.......................................*........... - // sub v24.4s, v12.4s, v16.4s // .........*....................................................................................................................................................................|........*.......................................... 
- // add v12.4s, v12.4s, v16.4s // .............................................................................................................................................................................e|................................................... - // mul v16.4s, v24.4s, v0.s[0] // ....................*.........................................................................................................................................................|...................*............................... - // sqrdmulh v24.4s, v24.4s, v0.s[1] // .........................*....................................................................................................................................................|........................*.......................... - // mls v16.4s, v24.4s, v8.s[0] // .....................................*........................................................................................................................................|....................................*.............. - // str q9, [x1], #(16*4) // ...........................................................................................................................................................e..................|................................................... - // str q10, [x1, #(-16*4 + 1*16)] // ............................................................................................................................................................................e.|................................................... - // str q11, [x1, #(-16*4 + 2*16)] // ..........................................................................................................................................................................e...|................................................... 
- // str q12, [x1, #(-16*4 + 3*16)] // ......*.......................................................................................................................................................................|.....*............................................. - // str q13, [x2], #(16*4) // ............................................*.................................................................................................................................|...........................................*....... - // str q14, [x2, #(-16*4 + 1*16)] // .............................................*................................................................................................................................|............................................*...... - // str q15, [x2, #(-16*4 + 2*16)] // ..................................................*...........................................................................................................................|.................................................*. - // str q16, [x2, #(-16*4 + 3*16)] // ................................................*.............................................................................................................................|...............................................*... - // add x1, x1, #64 // .......*......................................................................................................................................................................|......*............................................ 
- // add x2, x2, #64 // ...................................................*..........................................................................................................................|..................................................* + str q18, [x2, #-16] // ...........................................................................................................................................................................*.. + sub v16.4S, v1.4S, v24.4S // ........................................e..................................................................................................................................... + add x2, x2, #64 // .............................................................................................................................................................................* + sqrdmulh v9.4S, v27.4S, v14.4S // .................................e............................................................................................................................................ + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + ldr q18, [x2, #32] // ..............e............................................................................................................................................................... + ldr q0, [x2, #48] // ...............e.............................................................................................................................................................. 
+ // gap // .............................................................................................................................................................................. + mul v14.4S, v13.4S, v29.4S // .....................................e........................................................................................................................................ + ldr q17, [x2, #16] // .............e................................................................................................................................................................ + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + mls v14.4S, v7.4S, v8.S[0] // .......................................e...................................................................................................................................... + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. 
+ + // ------------------------------------------------------------------------------------------------------------------------------------------------------------------ new position -------------------------------------------------------------------------------------------------------------------------------------------------------------------> + // 0 25 50 75 100 125 150 175 200 225 250 275 300 325 + // |------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|-------------- + // ldr q9, [x1, #0] // ......................................................................................................................................e.....................................'.......................................................................................................................................~............................... + // ldr q10, [x1, #16] // .......................................................................................................................................e....................................'........................................................................................................................................~.............................. + // ldr q11, [x1, #32] // .........................................................................................................................................e..................................'..........................................................................................................................................~............................ 
+ // ldr q12, [x1, #48] // ..........................................................................................................................................e.................................'...........................................................................................................................................~........................... + // trn1 v25.4s, v9.4s, v10.4s // ............................................................................................................................................e...............................'.............................................................................................................................................~......................... + // trn2 v26.4s, v9.4s, v10.4s // ..............................................................................................................................................e.............................'...............................................................................................................................................~....................... + // trn1 v27.4s, v11.4s, v12.4s // ................................................................................................................................................e...........................'.................................................................................................................................................~..................... + // trn2 v28.4s, v11.4s, v12.4s // .................................................................................................................................................e..........................'..................................................................................................................................................~.................... 
+ // trn2 v11.2d, v25.2d, v27.2d // ...................................................................................................................................................e........................'....................................................................................................................................................~.................. + // trn2 v12.2d, v26.2d, v28.2d // .....................................................................................................................................................e......................'......................................................................................................................................................~................ + // trn1 v9.2d, v25.2d, v27.2d // ......................................................................................................................................................e.....................'.......................................................................................................................................................~............... + // trn1 v10.2d, v26.2d, v28.2d // ........................................................................................................................................................e...................'.........................................................................................................................................................~............. + // ldr q13, [x2, #0] // ............................................................................................................................................................................*....................................................................................................................................................................... 
+ // ldr q14, [x2, #16] // ..........................................................................................................................................................................e.'....................................................................................................................................................................... + // ldr q15, [x2, #32] // .......................................................................................................................................................................e....'....................................................................................................................................................................... + // ldr q16, [x2, #48] // ........................................................................................................................................................................e...'....................................................................................................................................................................... + // trn1 v25.4s, v13.4s, v14.4s // .........~..................................................................................................................................................................'..........*............................................................................................................................................................ + // trn2 v26.4s, v13.4s, v14.4s // ............~...............................................................................................................................................................'.............*......................................................................................................................................................... 
+ // trn1 v27.4s, v15.4s, v16.4s // ............................................................................................................................................................................'*...................................................................................................................................................................... + // trn2 v28.4s, v15.4s, v16.4s // .............~..............................................................................................................................................................'..............*........................................................................................................................................................ + // trn2 v15.2d, v25.2d, v27.2d // ...................~........................................................................................................................................................'....................*.................................................................................................................................................. + // trn2 v16.2d, v26.2d, v28.2d // .....................~......................................................................................................................................................'......................*................................................................................................................................................ + // trn1 v13.2d, v25.2d, v27.2d // .......................~....................................................................................................................................................'........................*.............................................................................................................................................. 
+ // trn1 v14.2d, v26.2d, v28.2d // ..............................~.............................................................................................................................................'...............................*....................................................................................................................................... + // ldr q0, [x5], #(12*16) // .~..........................................................................................................................................................................'..*.................................................................................................................................................................... + // ldr q4, [x5, #(-12*16 + 1*16)] // ................................e...........................................................................................................................................'.................................~..................................................................................................................................... + // ldr q1, [x5, #(-12*16 + 2*16)] // ........................e...................................................................................................................................................'.........................~............................................................................................................................................. + // ldr q5, [x5, #(-12*16 + 3*16)] // ...........................e................................................................................................................................................'............................~.......................................................................................................................................... 
+ // ldr q2, [x5, #(-12*16 + 4*16)] // .............................e..............................................................................................................................................'..............................~........................................................................................................................................ + // ldr q6, [x5, #(-12*16 + 5*16)] // e...........................................................................................................................................................................'.~..................................................................................................................................................................... + // sub v24.4s, v9.4s, v10.4s // ..............................................................................................................................................................e.............'...............................................................................................................................................................~....... + // add v9.4s, v9.4s, v10.4s // ................................................................................................................................................................e...........'.................................................................................................................................................................~..... + // mul v10.4s, v24.4s, v1.4s // ..................................................................................................................................................................e.........'...................................................................................................................................................................~... 
+ // sqrdmulh v24.4s, v24.4s, v5.4s // ......................................................................................................................................................................e.....'....................................................................................................................................................................... + // mls v10.4s, v24.4s, v8.s[0] // ..~.........................................................................................................................................................................'...*................................................................................................................................................................... + // sub v24.4s, v11.4s, v12.4s // ..........................................................................................................................................................e.................'...........................................................................................................................................................~........... + // add v11.4s, v11.4s, v12.4s // ............................................................................................................................................................e...............'.............................................................................................................................................................~......... + // mul v12.4s, v24.4s, v2.4s // .........................................................................................................................................................................e..'....................................................................................................................................................................... 
+ // sqrdmulh v24.4s, v24.4s, v6.4s // ...............................................................................................................................................................e............'................................................................................................................................................................~...... + // mls v12.4s, v24.4s, v8.s[0] // ...........................................................................................................................................................................e'....................................................................................................................................................................... + // sub v24.4s, v9.4s, v11.4s // ....................................................................................................................................................................e.......'.....................................................................................................................................................................~. + // add v9.4s, v9.4s, v11.4s // ...~........................................................................................................................................................................'....*.................................................................................................................................................................. + // mul v11.4s, v24.4s, v0.4s // ...............~............................................................................................................................................................'................*...................................................................................................................................................... 
+ // sqrdmulh v24.4s, v24.4s, v4.4s // ......~.....................................................................................................................................................................'.......*............................................................................................................................................................... + // mls v11.4s, v24.4s, v8.s[0] // ....................~.......................................................................................................................................................'.....................*................................................................................................................................................. + // sub v24.4s, v10.4s, v12.4s // ................~...........................................................................................................................................................'.................*..................................................................................................................................................... + // add v10.4s, v10.4s, v12.4s // ..........................~.................................................................................................................................................'...........................*........................................................................................................................................... + // mul v12.4s, v24.4s, v0.4s // .........................~..................................................................................................................................................'..........................*............................................................................................................................................ 
+ // sqrdmulh v24.4s, v24.4s, v4.4s // ...............................~............................................................................................................................................'................................*...................................................................................................................................... + // mls v12.4s, v24.4s, v8.s[0] // ........................................~...................................................................................................................................'.........................................*............................................................................................................................. + // ldr q0, [x5, #(-12*16 + 6*16)] // ....~.......................................................................................................................................................................'.....*................................................................................................................................................................. + // ldr q4, [x5, #(-12*16 + 7*16)] // .....~......................................................................................................................................................................'......*................................................................................................................................................................ + // ldr q1, [x5, #(-12*16 + 8*16)] // .......~....................................................................................................................................................................'........*.............................................................................................................................................................. 
+ // ldr q5, [x5, #(-12*16 + 9*16)] // ........~...................................................................................................................................................................'.........*............................................................................................................................................................. + // ldr q2, [x5, #(-12*16 + 10*16)] // ..........~.................................................................................................................................................................'...........*........................................................................................................................................................... + // ldr q6, [x5, #(-12*16 + 11*16)] // ...........~................................................................................................................................................................'............*.......................................................................................................................................................... + // sub v24.4s, v13.4s, v14.4s // ....................................~.......................................................................................................................................'.....................................*................................................................................................................................. + // add v13.4s, v13.4s, v14.4s // ......................................~.....................................................................................................................................'.......................................*............................................................................................................................... 
+ // mul v14.4s, v24.4s, v1.4s // .........................................~..................................................................................................................................'..........................................*............................................................................................................................ + // sqrdmulh v24.4s, v24.4s, v5.4s // ............................................~...............................................................................................................................'.............................................*......................................................................................................................... + // mls v14.4s, v24.4s, v8.s[0] // ...................................................~........................................................................................................................'....................................................*.................................................................................................................. + // sub v24.4s, v15.4s, v16.4s // ............................~...............................................................................................................................................'.............................*......................................................................................................................................... + // add v15.4s, v15.4s, v16.4s // .......................................~....................................................................................................................................'........................................*.............................................................................................................................. 
+ // mul v16.4s, v24.4s, v2.4s // ...................................~........................................................................................................................................'....................................*.................................................................................................................................. + // sqrdmulh v24.4s, v24.4s, v6.4s // .....................................~......................................................................................................................................'......................................*................................................................................................................................ + // mls v16.4s, v24.4s, v8.s[0] // ...............................................~............................................................................................................................'................................................*...................................................................................................................... + // sub v24.4s, v13.4s, v15.4s // ..........................................~.................................................................................................................................'...........................................*........................................................................................................................... + // add v13.4s, v13.4s, v15.4s // ...........................................~................................................................................................................................'............................................*.......................................................................................................................... 
+ // mul v15.4s, v24.4s, v0.4s // ......................................................~.....................................................................................................................'.......................................................*............................................................................................................... + // sqrdmulh v24.4s, v24.4s, v4.4s // ................................................~...........................................................................................................................'.................................................*..................................................................................................................... + // mls v15.4s, v24.4s, v8.s[0] // ........................................................~...................................................................................................................'.........................................................*............................................................................................................. + // sub v24.4s, v14.4s, v16.4s // ..........................................................~.................................................................................................................'...........................................................*........................................................................................................... + // add v14.4s, v14.4s, v16.4s // ...........................................................~................................................................................................................'............................................................*.......................................................................................................... 
+ // mul v16.4s, v24.4s, v0.4s // ..............................................................~.............................................................................................................'...............................................................*....................................................................................................... + // sqrdmulh v24.4s, v24.4s, v4.4s // .................................................................~..........................................................................................................'..................................................................*.................................................................................................... + // mls v16.4s, v24.4s, v8.s[0] // ........................................................................~...................................................................................................'.........................................................................*............................................................................................. + // trn1 v25.4s, v9.4s, v10.4s // .................................~..........................................................................................................................................'..................................*.................................................................................................................................... + // trn2 v26.4s, v9.4s, v10.4s // ..................................~.........................................................................................................................................'...................................*................................................................................................................................... 
+ // trn1 v27.4s, v11.4s, v12.4s // .............................................~..............................................................................................................................'..............................................*........................................................................................................................ + // trn2 v28.4s, v11.4s, v12.4s // ..............................................~.............................................................................................................................'...............................................*....................................................................................................................... + // trn2 v11.2d, v25.2d, v27.2d // .................................................~..........................................................................................................................'..................................................*.................................................................................................................... + // trn2 v12.2d, v26.2d, v28.2d // .....................................................~......................................................................................................................'......................................................*................................................................................................................ + // trn1 v9.2d, v25.2d, v27.2d // ..................................................~.........................................................................................................................'...................................................*................................................................................................................... 
+ // trn1 v10.2d, v26.2d, v28.2d // ....................................................~.......................................................................................................................'.....................................................*................................................................................................................. + // trn1 v25.4s, v13.4s, v14.4s // ................................................................~...........................................................................................................'.................................................................*..................................................................................................... + // trn2 v26.4s, v13.4s, v14.4s // ..................................................................~.........................................................................................................'...................................................................*................................................................................................... + // trn1 v27.4s, v15.4s, v16.4s // ...........................................................................~................................................................................................'............................................................................*.......................................................................................... + // trn2 v28.4s, v15.4s, v16.4s // ............................................................................~...............................................................................................'.............................................................................*......................................................................................... 
+ // trn2 v15.2d, v25.2d, v27.2d // ...............................................................................~............................................................................................'................................................................................*...................................................................................... + // trn2 v16.2d, v26.2d, v28.2d // ...................................................................................~........................................................................................'....................................................................................*.................................................................................. + // trn1 v13.2d, v25.2d, v27.2d // ................................................................................~...........................................................................................'.................................................................................*..................................................................................... + // trn1 v14.2d, v26.2d, v28.2d // ..................................................................................~.........................................................................................'...................................................................................*................................................................................... + // ldr q0, [x4], #64 // ..............~.............................................................................................................................................................'...............*....................................................................................................................................................... 
+ // ldr q1, [x4, #(-64 + 16)] // .................~..........................................................................................................................................................'..................*.................................................................................................................................................... + // ldr q2, [x4, #(-64 + 32)] // ..................~.........................................................................................................................................................'...................*................................................................................................................................................... + // ldr q3, [x4, #(-64 + 48)] // ......................~.....................................................................................................................................................'.......................*............................................................................................................................................... + // sub v24.4s, v9.4s, v10.4s // .......................................................~....................................................................................................................'........................................................*.............................................................................................................. + // add v9.4s, v9.4s, v10.4s // .........................................................~..................................................................................................................'..........................................................*............................................................................................................ 
+ // mul v10.4s, v24.4s, v1.s[2] // ............................................................~...............................................................................................................'.............................................................*......................................................................................................... + // sqrdmulh v24.4s, v24.4s, v1.s[3] // ....................................................................~.......................................................................................................'.....................................................................*................................................................................................. + // mls v10.4s, v24.4s, v8.s[0] // .........................................................................~..................................................................................................'..........................................................................*............................................................................................ + // sub v24.4s, v11.4s, v12.4s // .............................................................~..............................................................................................................'..............................................................*........................................................................................................ + // add v11.4s, v11.4s, v12.4s // ...............................................................~............................................................................................................'................................................................*...................................................................................................... 
+ // mul v12.4s, v24.4s, v2.s[0] // ......................................................................~.....................................................................................................'.......................................................................*............................................................................................... + // sqrdmulh v24.4s, v24.4s, v2.s[1] // ..........................................................................~.................................................................................................'...........................................................................*........................................................................................... + // mls v12.4s, v24.4s, v8.s[0] // .................................................................................~..........................................................................................'..................................................................................*.................................................................................... + // sub v24.4s, v13.4s, v14.4s // .....................................................................................~......................................................................................'......................................................................................*................................................................................ + // add v13.4s, v13.4s, v14.4s // .......................................................................................~....................................................................................'........................................................................................*.............................................................................. 
+ // mul v14.4s, v24.4s, v2.s[2] // ..........................................................................................~.................................................................................'...........................................................................................*........................................................................... + // sqrdmulh v24.4s, v24.4s, v2.s[3] // ............................................................................................~...............................................................................'.............................................................................................*......................................................................... + // mls v14.4s, v24.4s, v8.s[0] // ....................................................................................................~.......................................................................'.....................................................................................................*................................................................. + // sub v24.4s, v15.4s, v16.4s // ........................................................................................~...................................................................................'.........................................................................................*............................................................................. + // add v15.4s, v15.4s, v16.4s // .........................................................................................~..................................................................................'..........................................................................................*............................................................................ 
+ // mul v16.4s, v24.4s, v3.s[0] // ..................................................................................................~.........................................................................'...................................................................................................*................................................................... + // sqrdmulh v24.4s, v24.4s, v3.s[1] // ................................................................................................~...........................................................................'.................................................................................................*..................................................................... + // mls v16.4s, v24.4s, v8.s[0] // .....................................................................................................~......................................................................'......................................................................................................*................................................................ + // sub v24.4s, v9.4s, v11.4s // ...................................................................~........................................................................................................'....................................................................*.................................................................................................. + // add v9.4s, v9.4s, v11.4s // .....................................................................~......................................................................................................'......................................................................*................................................................................................ 
+ // mul v11.4s, v24.4s, v0.s[2] // .............................................................................~..............................................................................................'..............................................................................*........................................................................................ + // sqrdmulh v24.4s, v24.4s, v0.s[3] // ..............................................................................~.............................................................................................'...............................................................................*....................................................................................... + // mls v11.4s, v24.4s, v8.s[0] // ......................................................................................~.....................................................................................'.......................................................................................*............................................................................... + // sub v24.4s, v10.4s, v12.4s // ...........................................................................................~................................................................................'............................................................................................*.......................................................................... + // add v10.4s, v10.4s, v12.4s // .............................................................................................~..............................................................................'..............................................................................................*........................................................................ 
+ // mul v12.4s, v24.4s, v0.s[2] // ......................................................................................................~.....................................................................'.......................................................................................................*............................................................... + // sqrdmulh v24.4s, v24.4s, v0.s[3] // ..............................................................................................................~.............................................................'...............................................................................................................*....................................................... + // mls v12.4s, v24.4s, v8.s[0] // .................................................................................................................~..........................................................'..................................................................................................................*.................................................... + // sub v24.4s, v13.4s, v15.4s // ..............................................................................................~.............................................................................'...............................................................................................*....................................................................... + // add v13.4s, v13.4s, v15.4s // ...............................................................................................~............................................................................'................................................................................................*...................................................................... 
+ // mul v15.4s, v24.4s, v1.s[0] // .......................................................................................................~....................................................................'........................................................................................................*.............................................................. + // sqrdmulh v24.4s, v24.4s, v1.s[1] // .........................................................................................................~..................................................................'..........................................................................................................*............................................................ + // mls v15.4s, v24.4s, v8.s[0] // ...............................................................................................................~............................................................'................................................................................................................*...................................................... + // sub v24.4s, v14.4s, v16.4s // ........................................................................................................~...................................................................'.........................................................................................................*............................................................. + // add v14.4s, v14.4s, v16.4s // ..........................................................................................................~.................................................................'...........................................................................................................*........................................................... 
+ // mul v16.4s, v24.4s, v1.s[0] // ...........................................................................................................~................................................................'............................................................................................................*.......................................................... + // sqrdmulh v24.4s, v24.4s, v1.s[1] // .............................................................................................................~..............................................................'..............................................................................................................*........................................................ + // mls v16.4s, v24.4s, v8.s[0] // ................................................................................................................~...........................................................'.................................................................................................................*..................................................... + // srshr v24.4S, v9.4S, #23 // .......................................................................~....................................................................................................'........................................................................*.............................................................................................. + // mls v9.4s, v24.4s, v8.4s // ....................................................................................~.......................................................................................'.....................................................................................*................................................................................. 
+ // srshr v24.4S, v10.4S, #23 // .................................................................................................~..........................................................................'..................................................................................................*.................................................................... + // mls v10.4s, v24.4s, v8.4s // ....................................................................................................................~.......................................................'.....................................................................................................................*................................................. + // srshr v24.4S, v13.4S, #23 // ...................................................................................................~........................................................................'....................................................................................................*.................................................................. + // mls v13.4s, v24.4s, v8.4s // .....................................................................................................................~......................................................'......................................................................................................................*................................................ + // srshr v24.4S, v14.4S, #23 // ............................................................................................................~...............................................................'.............................................................................................................*......................................................... 
+ // mls v14.4s, v24.4s, v8.4s // ........................................................................................................................~...................................................'.........................................................................................................................*............................................. + // sub v24.4s, v9.4s, v13.4s // ............................................................................................................................~...............................................'.............................................................................................................................*......................................... + // add v9.4s, v9.4s, v13.4s // .............................................................................................................................~..............................................'..............................................................................................................................*........................................ + // mul v13.4s, v24.4s, v0.s[0] // .................................................................................................................................~..........................................'..................................................................................................................................*.................................... + // sqrdmulh v24.4s, v24.4s, v0.s[1] // ...................................................................................................................................~........................................'....................................................................................................................................*.................................. 
+ // mls v13.4s, v24.4s, v8.s[0] // ..................................................................................................................................................~.........................'...................................................................................................................................................*................... + // sub v24.4s, v10.4s, v14.4s // ................................................................................................................................~...........................................'.................................................................................................................................*..................................... + // add v10.4s, v10.4s, v14.4s // ...............................................................................................................................~............................................'................................................................................................................................*...................................... + // mul v14.4s, v24.4s, v0.s[0] // ........................................................................................................................................~...................................'.........................................................................................................................................*............................. + // sqrdmulh v24.4s, v24.4s, v0.s[1] // ...........................................................................................................................................~................................'............................................................................................................................................*.......................... 
+ // mls v14.4s, v24.4s, v8.s[0] // ....................................................................................................................................................~.......................'.....................................................................................................................................................*................. + // sub v24.4s, v11.4s, v15.4s // ..................................................................................................................~.........................................................'...................................................................................................................*................................................... + // add v11.4s, v11.4s, v15.4s // ...................................................................................................................~........................................................'....................................................................................................................*.................................................. + // mul v15.4s, v24.4s, v0.s[0] // ..........................................................................................................................~.................................................'...........................................................................................................................*........................................... + // sqrdmulh v24.4s, v24.4s, v0.s[1] // ..............................................................................................................................~.............................................'...............................................................................................................................*....................................... 
+ // mls v15.4s, v24.4s, v8.s[0] // .......................................................................................................................................................~....................'........................................................................................................................................................*.............. + // sub v24.4s, v12.4s, v16.4s // .......................................................................................................................~....................................................'........................................................................................................................*.............................................. + // add v12.4s, v12.4s, v16.4s // .........................................................................................................................~..................................................'..........................................................................................................................*............................................ + // mul v16.4s, v24.4s, v0.s[0] // .............................................................................................................................................~..............................'..............................................................................................................................................*........................ + // sqrdmulh v24.4s, v24.4s, v0.s[1] // ...............................................................................................................................................~............................'................................................................................................................................................*...................... 
+ // mls v16.4s, v24.4s, v8.s[0] // ...........................................................................................................................................................~................'............................................................................................................................................................*.......... + // str q9, [x1], #(16*4) // ..................................................................................................................................~.........................................'...................................................................................................................................*................................... + // str q10, [x1, #(-16*4 + 1*16)] // ....................................................................................................................................~.......................................'.....................................................................................................................................*................................. + // str q11, [x1, #(-16*4 + 2*16)] // ......................................................................................................................~.....................................................'.......................................................................................................................*............................................... + // str q12, [x1, #(-16*4 + 3*16)] // ...........................................................................................................................~................................................'............................................................................................................................*.......................................... 
+ // str q13, [x2], #(16*4) // .........................................................................................................................................................~..................'..........................................................................................................................................................*............ + // str q14, [x2, #(-16*4 + 1*16)] // .............................................................................................................................................................~..............'..............................................................................................................................................................*........ + // str q15, [x2, #(-16*4 + 2*16)] // .................................................................................................................................................................~..........'..................................................................................................................................................................*.... + // str q16, [x2, #(-16*4 + 3*16)] // ...................................................................................................................................................................~........'....................................................................................................................................................................*.. + // add x1, x1, #64 // .....................................................................................................................................~......................................'......................................................................................................................................*................................ 
+ // add x2, x2, #64 // .....................................................................................................................................................................~......'......................................................................................................................................................................* sub count, count, #1 cbnz count, layer45678_start - str q29, [x1, #-16] // ..*............... - add x1, x1, #64 // ...*.............. - sqrdmulh v21.4S, v0.4S, v6.S[1] // *................. - // gap // .................. - // gap // .................. - // gap // .................. - mul v26.4S, v0.4S, v6.S[0] // .........*........ - // gap // .................. - // gap // .................. - // gap // .................. - // gap // .................. - // gap // .................. - mul v1.4S, v5.4S, v6.S[0] // .*................ - sub v5.4S, v27.4S, v18.4S // ....*............. - // gap // .................. - // gap // .................. - // gap // .................. - // gap // .................. - mls v1.4S, v19.4S, v8.S[0] // .....*............ - // gap // .................. - // gap // .................. - // gap // .................. - // gap // .................. - // gap // .................. - // gap // .................. - // gap // .................. - mul v0.4S, v5.4S, v6.S[0] // .......*.......... - // gap // .................. - // gap // .................. - // gap // .................. - sqrdmulh v28.4S, v2.4S, v6.S[1] // ......*........... - // gap // .................. - // gap // .................. - str q1, [x2], #(16*4) // .............*.... - // gap // .................. - // gap // .................. - sqrdmulh v5.4S, v5.4S, v6.S[1] // ........*......... - // gap // .................. - // gap // .................. - // gap // .................. - // gap // .................. - // gap // .................. - mls v26.4S, v21.4S, v8.S[0] // ............*..... 
- // gap // .................. - // gap // .................. - // gap // .................. - // gap // .................. - // gap // .................. - mls v13.4S, v28.4S, v8.S[0] // ..........*....... - // gap // .................. - // gap // .................. - // gap // .................. - // gap // .................. - // gap // .................. - mls v0.4S, v5.4S, v8.S[0] // ...........*...... - // gap // .................. - // gap // .................. - str q26, [x2, #-32] // ................*. - // gap // .................. - // gap // .................. - // gap // .................. - // gap // .................. - // gap // .................. - str q13, [x2, #-48] // ..............*... - // gap // .................. - // gap // .................. - // gap // .................. - // gap // .................. - // gap // .................. - str q0, [x2, #-16] // ...............*.. - add x2, x2, #64 // .................* - // gap // .................. - - // original source code - // sqrdmulh v9.4S, v0.4S, v6.S[1] // ..*............... - // mul v15.4S, v5.4S, v6.S[0] // ....*............. - // str q29, [x1, #-16] // *................. - // add x1, x1, #64 // .*................ - // sub v5.4S, v27.4S, v18.4S // .....*............ - // mls v15.4S, v19.4S, v8.S[0] // ......*........... - // sqrdmulh v2.4S, v2.4S, v6.S[1] // ........*......... - // mul v19.4S, v5.4S, v6.S[0] // .......*.......... - // sqrdmulh v29.4S, v5.4S, v6.S[1] // ..........*....... - // mul v5.4S, v0.4S, v6.S[0] // ...*.............. - // mls v13.4S, v2.4S, v8.S[0] // ............*..... - // mls v19.4S, v29.4S, v8.S[0] // .............*.... - // mls v5.4S, v9.4S, v8.S[0] // ...........*...... - // str q15, [x2], #(16*4) // .........*........ - // str q13, [x2, #-48] // ...............*.. - // str q19, [x2, #-16] // ................*. - // str q5, [x2, #-32] // ..............*... 
- // add x2, x2, #64 // .................* + // Instructions: 144 + // Expected cycles: 124 + // Expected IPC: 1.16 + // + // Wall time: 86.82s + // User time: 86.82s + // + // -------------------------------------------------------------- original position --------------------------------------------------------------> + // 0 25 50 75 100 125 + // |------------------------|------------------------|------------------------|------------------------|------------------------|------------------ + trn1 v11.4S, v18.4S, v0.4S // .*.............................................................................................................................................. + mls v25.4S, v9.4S, v8.S[0] // ...*............................................................................................................................................ + ldr q27, [x2, #0] // *............................................................................................................................................... + ldr q5, [x4, #16] // ..................*............................................................................................................................. + add v21.4S, v1.4S, v24.4S // ....*........................................................................................................................................... + ldr q13, [x5], #(12*16) // ..*............................................................................................................................................. + trn2 v29.4S, v18.4S, v0.4S // ..............*................................................................................................................................. + ldr q20, [x5, #-96] // .....*.......................................................................................................................................... 
+ ldr q4, [x4], #64 // ...............*................................................................................................................................ + sqrdmulh v7.4S, v16.4S, v30.4S // .......*........................................................................................................................................ + ldr q10, [x4, #-16] // .......................*........................................................................................................................ + ldr q26, [x4, #-32] // ...................*............................................................................................................................ + ldr q2, [x5, #-64] // ........*....................................................................................................................................... + trn1 v18.4S, v27.4S, v17.4S // ..........*..................................................................................................................................... + // gap // ................................................................................................................................................ + ldr q15, [x5, #-48] // .........*...................................................................................................................................... + mul v23.4S, v16.4S, v13.4S // ................*............................................................................................................................... + sub v28.4S, v25.4S, v14.4S // .................*.............................................................................................................................. + ldr q19, [x5, #-32] // ...........*.................................................................................................................................... 
+ trn2 v24.4S, v27.4S, v17.4S // .............*.................................................................................................................................. + // gap // ................................................................................................................................................ + trn1 v12.2D, v18.2D, v11.2D // ........................*....................................................................................................................... + trn2 v11.2D, v18.2D, v11.2D // ....................*........................................................................................................................... + ldr q31, [x5, #-16] // ............*................................................................................................................................... + sqrdmulh v27.4S, v28.4S, v30.4S // .............................*.................................................................................................................. + ldr q1, [x5, #-80] // ......*......................................................................................................................................... + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ 
+ mul v9.4S, v28.4S, v13.4S // .........................*...................................................................................................................... + // gap // ................................................................................................................................................ + trn2 v0.2D, v24.2D, v29.2D // ......................*......................................................................................................................... + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + mls v23.4S, v7.4S, v8.S[0] // .....................*.......................................................................................................................... + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + sub v18.4S, v11.4S, v0.4S // ...........................*.................................................................................................................... + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ 
+ mls v9.4S, v27.4S, v8.S[0] // .....................................*.......................................................................................................... + trn1 v27.2D, v24.2D, v29.2D // ............................*................................................................................................................... + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + sqrdmulh v16.4S, v18.4S, v31.4S // ..................................*............................................................................................................. + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + sub v31.4S, v12.4S, v27.4S // .................................*.............................................................................................................. + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ 
+ add v3.4S, v12.4S, v27.4S // ...................................*............................................................................................................ + mul v18.4S, v18.4S, v19.4S // ................................*............................................................................................................... + // gap // ................................................................................................................................................ + add v12.4S, v25.4S, v14.4S // ..........................*..................................................................................................................... + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + sqrdmulh v28.4S, v31.4S, v15.4S // .........................................*...................................................................................................... + add v27.4S, v11.4S, v0.4S // ....................................*........................................................................................................... + // gap // ................................................................................................................................................ + trn1 v29.4S, v23.4S, v9.4S // ..........................................*..................................................................................................... + // gap // ................................................................................................................................................ 
+ // gap // ................................................................................................................................................ + trn2 v11.4S, v23.4S, v9.4S // ...........................................*.................................................................................................... + mul v14.4S, v31.4S, v2.4S // ......................................*......................................................................................................... + // gap // ................................................................................................................................................ + sub v0.4S, v3.4S, v27.4S // .......................................*........................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + mls v18.4S, v16.4S, v8.S[0] // ............................................*................................................................................................... + trn2 v13.4S, v21.4S, v12.4S // ...............................*................................................................................................................ + // gap // ................................................................................................................................................ + trn1 v22.4S, v21.4S, v12.4S // ..............................*................................................................................................................. 
+ // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + sqrdmulh v12.4S, v0.4S, v1.4S // .............................................*.................................................................................................. + add v30.4S, v3.4S, v27.4S // ........................................*....................................................................................................... + // gap // ................................................................................................................................................ + trn1 v31.2D, v13.2D, v11.2D // .................................................*.............................................................................................. + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + trn1 v2.2D, v22.2D, v29.2D // ...............................................*................................................................................................ + mls v14.4S, v28.4S, v8.S[0] // ................................................*............................................................................................... + // gap // ................................................................................................................................................ 
+ trn2 v7.2D, v22.2D, v29.2D // ..............................................*................................................................................................. + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + trn2 v25.2D, v13.2D, v11.2D // ..................................................*............................................................................................. + mul v9.4S, v0.4S, v20.4S // ...................................................*............................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + sub v11.4S, v2.4S, v31.4S // ....................................................*........................................................................................... + mls v9.4S, v12.4S, v8.S[0] // .....................................................*.......................................................................................... + add v23.4S, v2.4S, v31.4S // ......................................................*......................................................................................... 
+ // gap // ................................................................................................................................................ + add v28.4S, v14.4S, v18.4S // ........................................................*....................................................................................... + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + mul v2.4S, v11.4S, v5.S[2] // .........................................................*...................................................................................... + add v0.4S, v7.4S, v25.4S // ............................................................*................................................................................... + // gap // ................................................................................................................................................ + sub v21.4S, v7.4S, v25.4S // ..........................................................*..................................................................................... + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + trn1 v29.4S, v30.4S, v28.4S // .............................................................*.................................................................................. + // gap // ................................................................................................................................................ 
+ sqrdmulh v12.4S, v11.4S, v5.S[3] // .................................................................*.............................................................................. + sub v3.4S, v23.4S, v0.4S // ................................................................*............................................................................... + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + add v17.4S, v23.4S, v0.4S // ..................................................................*............................................................................. + mul v22.4S, v21.4S, v26.S[0] // ...................................................................*............................................................................ + // gap // ................................................................................................................................................ + trn2 v11.4S, v30.4S, v28.4S // ...............................................................*................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + sub v15.4S, v14.4S, v18.4S // .......................................................*........................................................................................ 
+ mul v6.4S, v3.4S, v4.S[2] // ..........................................................................*..................................................................... + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + srshr v27.4S, v17.4S, #23 // ....................................................................*........................................................................... + mls v2.4S, v12.4S, v8.S[0] // ......................................................................*......................................................................... + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + sqrdmulh v24.4S, v21.4S, v26.S[1] // .......................................................................*........................................................................ 
+ // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + sqrdmulh v16.4S, v15.4S, v1.4S // ..............................................................*................................................................................. + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + mul v30.4S, v15.4S, v20.4S // ...........................................................*.................................................................................... 
+ // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + mls v22.4S, v24.4S, v8.S[0] // ..............................................................................*................................................................. + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + mls v30.4S, v16.4S, v8.S[0] // .....................................................................*.......................................................................... 
+ // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + sqrdmulh v21.4S, v3.4S, v4.S[3] // ...........................................................................*.................................................................... + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + sub v18.4S, v2.4S, v22.4S // ........................................................................................*....................................................... + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + add v16.4S, v2.4S, v22.4S // ..........................................................................................*..................................................... 
+ mls v17.4S, v27.4S, v8.4S // .................................................................................*.............................................................. + // gap // ................................................................................................................................................ + trn1 v31.4S, v9.4S, v30.4S // ........................................................................*....................................................................... + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + sqrdmulh v13.4S, v18.4S, v4.S[3] // ...........................................................................................................*.................................... + trn2 v30.4S, v9.4S, v30.4S // .........................................................................*...................................................................... + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + srshr v27.4S, v16.4S, #23 // ..............................................................................................*................................................. 
+ mls v6.4S, v21.4S, v8.S[0] // ...................................................................................*............................................................ + trn2 v23.2D, v29.2D, v31.2D // ............................................................................*................................................................... + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + trn2 v24.2D, v11.2D, v30.2D // ................................................................................*............................................................... + trn1 v21.2D, v29.2D, v31.2D // .............................................................................*.................................................................. + // gap // ................................................................................................................................................ + mul v3.4S, v18.4S, v4.S[2] // ...................................................................................................*............................................ + trn1 v15.2D, v11.2D, v30.2D // ...............................................................................*................................................................ + // gap // ................................................................................................................................................ 
+ // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + add v22.4S, v23.4S, v24.4S // ......................................................................................*......................................................... + mls v16.4S, v27.4S, v8.4S // .................................................................................................................*.............................. + sub v28.4S, v23.4S, v24.4S // .....................................................................................*.......................................................... + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + mls v3.4S, v13.4S, v8.S[0] // ..............................................................................................................*................................. + add v13.4S, v21.4S, v15.4S // ....................................................................................*........................................................... + // gap // ................................................................................................................................................ + sub v20.4S, v21.4S, v15.4S // ..................................................................................*............................................................. 
+ // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + sqrdmulh v30.4S, v28.4S, v10.S[1] // .............................................................................................*.................................................. + sub v14.4S, v13.4S, v22.4S // ...........................................................................................*.................................................... + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + mul v1.4S, v28.4S, v10.S[0] // ...............................................................................................*................................................ + add v29.4S, v13.4S, v22.4S // ............................................................................................*................................................... + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ 
+ // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + sqrdmulh v22.4S, v20.4S, v26.S[3] // .........................................................................................*...................................................... + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + mul v2.4S, v20.4S, v26.S[2] // .......................................................................................*........................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ 
+ // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + mls v1.4S, v30.4S, v8.S[0] // ..................................................................................................*............................................. + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + mls v2.4S, v22.4S, v8.S[0] // .................................................................................................*.............................................. + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ 
+ // gap // ................................................................................................................................................ + srshr v21.4S, v29.4S, #23 // ................................................................................................*............................................... + sqrdmulh v31.4S, v14.4S, v5.S[1] // ......................................................................................................*......................................... + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + mls v29.4S, v21.4S, v8.4S // ..................................................................................................................*............................. + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + sub v12.4S, v2.4S, v1.4S // .....................................................................................................*.......................................... 
+ // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + mul v28.4S, v14.4S, v5.S[0] // ....................................................................................................*........................................... + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + sqrdmulh v11.4S, v12.4S, v5.S[1] // ..........................................................................................................*..................................... + // gap // ................................................................................................................................................ 
+ // gap // ................................................................................................................................................ + add v27.4S, v17.4S, v29.4S // ..........................................................................................................................*..................... + mul v10.4S, v12.4S, v5.S[0] // ........................................................................................................*....................................... + add v12.4S, v2.4S, v1.4S // .......................................................................................................*........................................ + // gap // ................................................................................................................................................ + sub v19.4S, v17.4S, v29.4S // .........................................................................................................................*...................... + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + mls v28.4S, v31.4S, v8.S[0] // ............................................................................................................*................................... + str q27, [x1], #(16*4) // ...............................................................................................................................*................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ 
+ // gap // ................................................................................................................................................ + srshr v1.4S, v12.4S, #23 // .........................................................................................................*...................................... + mls v10.4S, v11.4S, v8.S[0] // .............................................................................................................*.................................. + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + mls v12.4S, v1.4S, v8.4S // .....................................................................................................................*.......................... + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ 
+ // gap // ................................................................................................................................................ + sub v31.4S, v6.4S, v28.4S // ...............................................................................................................*................................ + add v27.4S, v6.4S, v28.4S // ................................................................................................................*............................... + // gap // ................................................................................................................................................ + mul v21.4S, v19.4S, v4.S[0] // ..............................................................................................................................*................. + sub v25.4S, v3.4S, v10.4S // ....................................................................................................................*........................... + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + add v28.4S, v3.4S, v10.4S // ......................................................................................................................*......................... + mul v30.4S, v31.4S, v4.S[0] // .......................................................................................................................*........................ + // gap // ................................................................................................................................................ 
+ str q27, [x1, #-32] // ...................................................................................................................*............................ + // gap // ................................................................................................................................................ + add v27.4S, v16.4S, v12.4S // ............................................................................................................................*................... + sub v29.4S, v16.4S, v12.4S // .............................................................................................................................*.................. + mul v0.4S, v25.4S, v4.S[0] // .....................................................................................................................................*.......... + // gap // ................................................................................................................................................ + str q28, [x1, #-16] // ........................................................................................................................*....................... + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + str q27, [x1, #-48] // .................................................................................................................................*.............. + add x1, x1, #64 // ..................................................................................................................................*............. 
+ sqrdmulh v12.4S, v25.4S, v4.S[1] // ......................................................................................................................................*......... + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + sqrdmulh v20.4S, v29.4S, v4.S[1] // ....................................................................................................................................*........... + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + sqrdmulh v1.4S, v31.4S, v4.S[1] // ...........................................................................................................................*.................... + // gap // ................................................................................................................................................ 
+ // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + sqrdmulh v9.4S, v19.4S, v4.S[1] // ................................................................................................................................*............... + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + mul v14.4S, v29.4S, v4.S[0] // ...................................................................................................................................*............ + // gap // ................................................................................................................................................ 
+ // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + mls v14.4S, v20.4S, v8.S[0] // ........................................................................................................................................*....... + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + mls v30.4S, v1.4S, v8.S[0] // .........................................................................................................................................*...... + // gap // ................................................................................................................................................ 
+ // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + mls v21.4S, v9.4S, v8.S[0] // .......................................................................................................................................*........ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + str q14, [x2, #16] // ............................................................................................................................................*... + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + mls v0.4S, v12.4S, v8.S[0] // ...........................................................................................................................................*.... + // gap // ................................................................................................................................................ 
+ // gap // ................................................................................................................................................ + str q30, [x2, #32] // .............................................................................................................................................*.. + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + str q21, [x2], #(16*4) // ..........................................................................................................................................*..... + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ 
+ // gap // ................................................................................................................................................ + str q0, [x2, #-16] // ..............................................................................................................................................*. + add x2, x2, #64 // ...............................................................................................................................................* + // gap // ................................................................................................................................................ + + // ---------------------------------------------------------------- new position -----------------------------------------------------------------> + // 0 25 50 75 100 125 + // |------------------------|------------------------|------------------------|------------------------|------------------------|------------------ + // ldr q27, [x2, #0] // ..*............................................................................................................................................. + // trn1 v11.4S, v18.4S, v0.4S // *............................................................................................................................................... + // ldr q23, [x5], #(12*16) // .....*.......................................................................................................................................... + // mls v25.4S, v9.4S, v8.S[0] // .*.............................................................................................................................................. + // add v24.4S, v1.4S, v24.4S // ....*........................................................................................................................................... 
+ // ldr q9, [x5, #-96] // .......*........................................................................................................................................ + // ldr q13, [x5, #-80] // .......................*........................................................................................................................ + // sqrdmulh v1.4S, v16.4S, v30.4S // .........*...................................................................................................................................... + // ldr q3, [x5, #-64] // ............*................................................................................................................................... + // ldr q20, [x5, #-48] // ..............*................................................................................................................................. + // trn1 v21.4S, v27.4S, v17.4S // .............*.................................................................................................................................. + // ldr q15, [x5, #-32] // .................*.............................................................................................................................. + // ldr q26, [x5, #-16] // .....................*.......................................................................................................................... + // trn2 v27.4S, v27.4S, v17.4S // ..................*............................................................................................................................. + // trn2 v18.4S, v18.4S, v0.4S // ......*......................................................................................................................................... + // ldr q0, [x4], #64 // ........*....................................................................................................................................... 
+ // mul v16.4S, v16.4S, v23.4S // ...............*................................................................................................................................ + // sub v17.4S, v25.4S, v14.4S // ................*............................................................................................................................... + // ldr q2, [x4, #-48] // ...*............................................................................................................................................ + // ldr q4, [x4, #-32] // ...........*.................................................................................................................................... + // trn2 v5.2D, v21.2D, v11.2D // ....................*........................................................................................................................... + // mls v16.4S, v1.4S, v8.S[0] // ..........................*..................................................................................................................... + // trn2 v1.2D, v27.2D, v18.2D // .........................*...................................................................................................................... + // ldr q22, [x4, #-16] // ..........*..................................................................................................................................... + // trn1 v11.2D, v21.2D, v11.2D // ...................*............................................................................................................................ + // mul v23.4S, v17.4S, v23.4S // ........................*....................................................................................................................... + // add v25.4S, v25.4S, v14.4S // ..................................*............................................................................................................. 
+ // sub v19.4S, v5.4S, v1.4S // ...........................*.................................................................................................................... + // trn1 v27.2D, v27.2D, v18.2D // .............................*.................................................................................................................. + // sqrdmulh v18.4S, v17.4S, v30.4S // ......................*......................................................................................................................... + // trn1 v17.4S, v24.4S, v25.4S // ...........................................*.................................................................................................... + // trn2 v25.4S, v24.4S, v25.4S // ..........................................*..................................................................................................... + // mul v24.4S, v19.4S, v15.4S // .................................*.............................................................................................................. + // sub v15.4S, v11.4S, v27.4S // ...............................*................................................................................................................ + // sqrdmulh v26.4S, v19.4S, v26.4S // ..............................*................................................................................................................. + // add v27.4S, v11.4S, v27.4S // ................................*............................................................................................................... + // add v11.4S, v5.4S, v1.4S // ....................................*........................................................................................................... + // mls v23.4S, v18.4S, v8.S[0] // ............................*................................................................................................................... 
+ // mul v1.4S, v15.4S, v3.4S // .......................................*........................................................................................................ + // sub v3.4S, v27.4S, v11.4S // ........................................*....................................................................................................... + // add v27.4S, v27.4S, v11.4S // .............................................*.................................................................................................. + // sqrdmulh v11.4S, v15.4S, v20.4S // ...................................*............................................................................................................ + // trn1 v20.4S, v16.4S, v23.4S // .....................................*.......................................................................................................... + // trn2 v23.4S, v16.4S, v23.4S // ......................................*......................................................................................................... + // mls v24.4S, v26.4S, v8.S[0] // .........................................*...................................................................................................... + // sqrdmulh v15.4S, v3.4S, v13.4S // ............................................*................................................................................................... + // trn2 v26.2D, v17.2D, v20.2D // .................................................*.............................................................................................. + // trn1 v20.2D, v17.2D, v20.2D // ...............................................*................................................................................................ + // mls v1.4S, v11.4S, v8.S[0] // ................................................*............................................................................................... 
+ // trn1 v11.2D, v25.2D, v23.2D // ..............................................*................................................................................................. + // trn2 v23.2D, v25.2D, v23.2D // ..................................................*............................................................................................. + // mul v25.4S, v3.4S, v9.4S // ...................................................*............................................................................................ + // sub v3.4S, v20.4S, v11.4S // ....................................................*........................................................................................... + // mls v25.4S, v15.4S, v8.S[0] // .....................................................*.......................................................................................... + // add v11.4S, v20.4S, v11.4S // ......................................................*......................................................................................... + // sub v20.4S, v1.4S, v24.4S // .................................................................*.............................................................................. + // add v24.4S, v1.4S, v24.4S // .......................................................*........................................................................................ + // mul v1.4S, v3.4S, v2.S[2] // ........................................................*....................................................................................... + // sub v15.4S, v26.4S, v23.4S // ..........................................................*..................................................................................... + // mul v9.4S, v20.4S, v9.4S // .......................................................................*........................................................................ 
+ // add v23.4S, v26.4S, v23.4S // .........................................................*...................................................................................... + // trn1 v26.4S, v27.4S, v24.4S // ...........................................................*.................................................................................... + // sqrdmulh v13.4S, v20.4S, v13.4S // ......................................................................*......................................................................... + // trn2 v27.4S, v27.4S, v24.4S // ................................................................*............................................................................... + // sub v24.4S, v11.4S, v23.4S // .............................................................*.................................................................................. + // sqrdmulh v3.4S, v3.4S, v2.S[3] // ............................................................*................................................................................... + // add v11.4S, v11.4S, v23.4S // ..............................................................*................................................................................. + // mul v23.4S, v15.4S, v4.S[0] // ...............................................................*................................................................................ + // srshr v20.4S, v11.4S, #23 // ...................................................................*............................................................................ + // mls v9.4S, v13.4S, v8.S[0] // .........................................................................*...................................................................... + // mls v1.4S, v3.4S, v8.S[0] // ....................................................................*........................................................................... 
+ // sqrdmulh v13.4S, v15.4S, v4.S[1] // .....................................................................*.......................................................................... + // trn1 v3.4S, v25.4S, v9.4S // ..............................................................................*................................................................. + // trn2 v25.4S, v25.4S, v9.4S // ................................................................................*............................................................... + // mul v9.4S, v24.4S, v0.S[2] // ..................................................................*............................................................................. + // sqrdmulh v24.4S, v24.4S, v0.S[3] // ..........................................................................*..................................................................... + // trn2 v15.2D, v26.2D, v3.2D // ...................................................................................*............................................................ + // trn1 v3.2D, v26.2D, v3.2D // .....................................................................................*.......................................................... + // mls v23.4S, v13.4S, v8.S[0] // ........................................................................*....................................................................... + // trn1 v13.2D, v27.2D, v25.2D // .......................................................................................*........................................................ + // trn2 v27.2D, v27.2D, v25.2D // ....................................................................................*........................................................... + // mls v11.4S, v20.4S, v8.4S // .............................................................................*.................................................................. 
+ // sub v25.4S, v3.4S, v13.4S // .............................................................................................*.................................................. + // mls v9.4S, v24.4S, v8.S[0] // ..................................................................................*............................................................. + // add v24.4S, v3.4S, v13.4S // ............................................................................................*................................................... + // sub v13.4S, v15.4S, v27.4S // ..........................................................................................*..................................................... + // add v27.4S, v15.4S, v27.4S // ........................................................................................*....................................................... + // mul v3.4S, v25.4S, v4.S[2] // ...................................................................................................*............................................ + // sub v20.4S, v1.4S, v23.4S // ...........................................................................*.................................................................... + // sqrdmulh v25.4S, v25.4S, v4.S[3] // ..................................................................................................*............................................. + // add v23.4S, v1.4S, v23.4S // ............................................................................*................................................................... + // sub v1.4S, v24.4S, v27.4S // ...............................................................................................*................................................ + // add v27.4S, v24.4S, v27.4S // .................................................................................................*.............................................. 
+ // sqrdmulh v24.4S, v13.4S, v22.S[1] // ..............................................................................................*................................................. + // srshr v15.4S, v23.4S, #23 // .................................................................................*.............................................................. + // mul v13.4S, v13.4S, v22.S[0] // ................................................................................................*............................................... + // srshr v26.4S, v27.4S, #23 // ......................................................................................................*......................................... + // mls v3.4S, v25.4S, v8.S[0] // .....................................................................................................*.......................................... + // mls v13.4S, v24.4S, v8.S[0] // ....................................................................................................*........................................... + // mul v25.4S, v20.4S, v0.S[2] // ......................................................................................*......................................................... + // mul v24.4S, v1.4S, v2.S[0] // ..........................................................................................................*..................................... + // sub v18.4S, v3.4S, v13.4S // .........................................................................................................*...................................... + // sqrdmulh v1.4S, v1.4S, v2.S[1] // .......................................................................................................*........................................ + // add v13.4S, v3.4S, v13.4S // ..............................................................................................................*................................. 
+ // mul v3.4S, v18.4S, v2.S[0] // .............................................................................................................*.................................. + // srshr v16.4S, v13.4S, #23 // ..................................................................................................................*............................. + // sqrdmulh v18.4S, v18.4S, v2.S[1] // ...........................................................................................................*.................................... + // sqrdmulh v20.4S, v20.4S, v0.S[3] // ...............................................................................*................................................................ + // mls v24.4S, v1.4S, v8.S[0] // ................................................................................................................*............................... + // mls v3.4S, v18.4S, v8.S[0] // ...................................................................................................................*............................ + // mls v25.4S, v20.4S, v8.S[0] // ...........................................................................................*.................................................... + // sub v1.4S, v9.4S, v24.4S // .....................................................................................................................*.......................... + // add v24.4S, v9.4S, v24.4S // ......................................................................................................................*......................... + // mls v23.4S, v15.4S, v8.4S // .........................................................................................*...................................................... + // mls v27.4S, v26.4S, v8.4S // ........................................................................................................*....................................... 
+ // str q24, [x1, #32] // ...........................................................................................................................*.................... + // sub v24.4S, v25.4S, v3.4S // ........................................................................................................................*....................... + // mls v13.4S, v16.4S, v8.4S // ....................................................................................................................*........................... + // add v25.4S, v25.4S, v3.4S // .........................................................................................................................*...................... + // mul v9.4S, v1.4S, v0.S[0] // ..........................................................................................................................*..................... + // str q25, [x1, #48] // ...............................................................................................................................*................ + // sub v25.4S, v11.4S, v27.4S // ...............................................................................................................*................................ + // add v27.4S, v11.4S, v27.4S // ............................................................................................................*................................... + // sqrdmulh v11.4S, v1.4S, v0.S[1] // ....................................................................................................................................*........... + // add v1.4S, v23.4S, v13.4S // ............................................................................................................................*................... + // sub v23.4S, v23.4S, v13.4S // .............................................................................................................................*.................. 
+ // mul v13.4S, v25.4S, v0.S[0] // .......................................................................................................................*........................ + // str q27, [x1], #(16*4) // .................................................................................................................*.............................. + // sqrdmulh v27.4S, v25.4S, v0.S[1] // .....................................................................................................................................*.......... + // str q1, [x1, #-48] // ................................................................................................................................*............... + // add x1, x1, #64 // .................................................................................................................................*.............. + // mul v3.4S, v23.4S, v0.S[0] // ......................................................................................................................................*......... + // sqrdmulh v23.4S, v23.4S, v0.S[1] // ...................................................................................................................................*............ + // mul v18.4S, v24.4S, v0.S[0] // ..............................................................................................................................*................. + // sqrdmulh v24.4S, v24.4S, v0.S[1] // ..................................................................................................................................*............. + // mls v13.4S, v27.4S, v8.S[0] // .........................................................................................................................................*...... + // mls v3.4S, v23.4S, v8.S[0] // .......................................................................................................................................*........ 
+ // mls v9.4S, v11.4S, v8.S[0] // ........................................................................................................................................*....... + // str q13, [x2], #(16*4) // .............................................................................................................................................*.. + // mls v18.4S, v24.4S, v8.S[0] // ...........................................................................................................................................*.... + // str q3, [x2, #-48] // ..........................................................................................................................................*..... + // str q9, [x2, #-32] // ............................................................................................................................................*... + // str q18, [x2, #-16] // ..............................................................................................................................................*. + // add x2, x2, #64 // ...............................................................................................................................................* // ----------------------------------------------------------------------------- @@ -1594,7 +1625,7 @@ layer45678_start: ASM_LOAD(xtmp, ninv_tw_addr) ld1r {ninv_tw.4s}, [xtmp] - ushr modulus_half.4S, modulus.4S, #1 + ushr modulus_half.4S, consts.4S, #1 neg neg_modulus_half.4S, modulus_half.4S mov count, #8 @@ -1602,960 +1633,996 @@ layer45678_start: load_roots_123 .p2align 2 - ldr q5, [x0, #768] // .*........... - ldr q19, [x0, #896] // .....*....... - // gap // ............. - ldr q13, [x0, #256] // *............ - // gap // ............. - // gap // ............. - ldr q17, [x0, #384] // ..*.......... - // gap // ............. - // gap // ............. - ldr q22, [x0, #512] // ...*......... - // gap // ............. - // gap // ............. 
- add v23.4S, v5.4S, v19.4S // .......*..... - sub v5.4S, v5.4S, v19.4S // ......*...... - ldr q4, [x0, #640] // ....*........ - // gap // ............. - // gap // ............. - // gap // ............. - // gap // ............. - // gap // ............. - // gap // ............. - sqrdmulh v19.4S, v5.4S, v3.S[1] // .........*... - // gap // ............. - // gap // ............. - sub v20.4S, v22.4S, v4.4S // ........*.... - // gap // ............. - // gap // ............. - mul v21.4S, v5.4S, v3.S[0] // ..........*.. - // gap // ............. - // gap // ............. - // gap // ............. - // gap // ............. - // gap // ............. - sqrdmulh v11.4S, v20.4S, v2.S[3] // ...........*. - // gap // ............. - // gap // ............. - // gap // ............. - // gap // ............. - // gap // ............. - mls v21.4S, v19.4S, v8.S[0] // ............* - // gap // ............. - // gap // ............. - - // original source code - // ldr q13, [x0, #256] // ..*.......... - // ldr q27, [x0, #768] // *............ - // ldr q17, [x0, #384] // ...*......... - // ldr q22, [x0, #512] // ....*........ - // ldr q4, [x0, #640] // .......*..... - // ldr q20, [x0, #896] // .*........... - // sub v6.4S, v27.4S, v20.4S // ......*...... - // add v23.4S, v27.4S, v20.4S // .....*....... - // sub v20.4S, v22.4S, v4.4S // .........*... - // sqrdmulh v15.4S, v6.4S, v3.S[1] // ........*.... - // mul v21.4S, v6.4S, v3.S[0] // ..........*.. - // sqrdmulh v11.4S, v20.4S, v2.S[3] // ...........*. - // mls v21.4S, v15.4S, v8.S[0] // ............* + // Instructions: 13 + // Expected cycles: 14 + // Expected IPC: 0.93 + // + // Wall time: 0.04s + // User time: 0.04s + // + // ----- original position -----> + // 0 25 + // |------------------------|---- + ldr q11, [x0, #768] // *............................. + ldr q4, [x0, #896] // .....*........................ + // gap // .............................. + ldr q13, [x0, #256] // .*............................ 
+ // gap // .............................. + // gap // .............................. + ldr q20, [x0, #384] // ..*........................... + // gap // .............................. + // gap // .............................. + ldr q18, [x0, #512] // ...*.......................... + // gap // .............................. + // gap // .............................. + sub v28.4S, v11.4S, v4.4S // ......*....................... + ldr q16, [x0, #640] // ....*......................... + // gap // .............................. + add v14.4S, v11.4S, v4.4S // .......*...................... + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + sqrdmulh v6.4S, v28.4S, v3.S[1] // .........*.................... + // gap // .............................. + // gap // .............................. + sub v5.4S, v18.4S, v16.4S // ........*..................... + // gap // .............................. + // gap // .............................. + mul v19.4S, v28.4S, v3.S[0] // ..........*................... + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + sqrdmulh v4.4S, v5.4S, v2.S[3] // ...........*.................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + mls v19.4S, v6.4S, v8.S[0] // ............*................. + // gap // .............................. + // gap // .............................. + + // -------- new position --------> + // 0 25 + // |------------------------|----- + // ldr q21, [x0, #768] // *.............................. 
+ // ldr q13, [x0, #256] // ..*............................ + // ldr q20, [x0, #384] // ...*........................... + // ldr q18, [x0, #512] // ....*.......................... + // ldr q16, [x0, #640] // ......*........................ + // ldr q17, [x0, #896] // .*............................. + // sub v22.4S, v21.4S, v17.4S // .....*......................... + // add v14.4S, v21.4S, v17.4S // .......*....................... + // sub v5.4S, v18.4S, v16.4S // .........*..................... + // sqrdmulh v24.4S, v22.4S, v3.S[1] // ........*...................... + // mul v19.4S, v22.4S, v3.S[0] // ..........*.................... + // sqrdmulh v4.4S, v5.4S, v2.S[3] // ...........*................... + // mls v19.4S, v24.4S, v8.S[0] // ............*.................. sub count, count, #1 layer123_start: - ldr q19, [x0, #0] // *....................................................................................................................... - ldr q5, [x0, #128] // .*...................................................................................................................... - sub v9.4S, v13.4S, v17.4S // .............*.......................................................................................................... - add v15.4S, v13.4S, v17.4S // ..............*......................................................................................................... - mul v12.4S, v20.4S, v2.S[2] // ....................*................................................................................................... 
+ // Instructions: 120 + // Expected cycles: 112 + // Expected IPC: 1.07 + // + // Wall time: 13.14s + // User time: 13.14s + // + // -------------------------------------------------- original position --------------------------------------------------> + // 0 25 50 75 100 + // |------------------------|------------------------|------------------------|------------------------|------------------- + ldr q27, [x0, #0] // *....................................................................................................................... + ldr q11, [x0, #128] // .*...................................................................................................................... + sub v7.4S, v13.4S, v20.4S // .............*.......................................................................................................... + mul v24.4S, v5.4S, v2.S[2] // ....................*................................................................................................... + add v23.4S, v13.4S, v20.4S // ..............*......................................................................................................... + ldr q21, [x0, #784] // ......e................................................................................................................. + add v9.4S, v18.4S, v16.4S // ...................*.................................................................................................... ldr q13, [x0, #272] // ..e..................................................................................................................... - add v18.4S, v22.4S, v4.4S // ...................*.................................................................................................... - ldr q27, [x0, #784] // ......e................................................................................................................. 
- ldr q17, [x0, #400] // ...e.................................................................................................................... - mul v28.4S, v9.4S, v2.S[0] // ...............*........................................................................................................ - ldr q22, [x0, #528] // ....e................................................................................................................... - ldr q4, [x0, #656] // .....e.................................................................................................................. - add v16.4S, v19.4S, v5.4S // .........*.............................................................................................................. - ldr q20, [x0, #912] // .......e................................................................................................................ + ldr q20, [x0, #400] // ...e.................................................................................................................... + mul v15.4S, v7.4S, v2.S[0] // ...............*........................................................................................................ + ldr q18, [x0, #528] // ....e................................................................................................................... + ldr q16, [x0, #656] // .....e.................................................................................................................. + ldr q17, [x0, #912] // .......e................................................................................................................ + sub v5.4S, v27.4S, v11.4S // ........*............................................................................................................... // gap // ........................................................................................................................ 
- sub v19.4S, v19.4S, v5.4S // ........*............................................................................................................... - sqrdmulh v5.4S, v9.4S, v2.S[1] // ................*....................................................................................................... + add v27.4S, v27.4S, v11.4S // .........*.............................................................................................................. + sqrdmulh v11.4S, v7.4S, v2.S[1] // ................*....................................................................................................... // gap // ........................................................................................................................ - sub v9.4S, v18.4S, v23.4S // ......................................*................................................................................. + sub v7.4S, v9.4S, v14.4S // ......................................*................................................................................. // gap // ........................................................................................................................ // gap // ........................................................................................................................ - mls v12.4S, v11.4S, v8.S[0] // ......................*................................................................................................. - sub v11.4S, v16.4S, v15.4S // ............................*........................................................................................... + mls v24.4S, v4.4S, v8.S[0] // ......................*................................................................................................. + add v9.4S, v9.4S, v14.4S // .......................................*................................................................................ 
// gap // ........................................................................................................................ - add v15.4S, v16.4S, v15.4S // .............................*.......................................................................................... + sub v4.4S, v27.4S, v23.4S // ............................*........................................................................................... // gap // ........................................................................................................................ // gap // ........................................................................................................................ - add v18.4S, v18.4S, v23.4S // .......................................*................................................................................ - mul v16.4S, v19.4S, v1.S[2] // ..........*............................................................................................................. + add v27.4S, v27.4S, v23.4S // .............................*.......................................................................................... + mul v23.4S, v5.4S, v1.S[2] // ..........*............................................................................................................. // gap // ........................................................................................................................ - sub v6.4S, v27.4S, v20.4S // .......................e................................................................................................ + sub v22.4S, v21.4S, v17.4S // .......................e................................................................................................ // gap // ........................................................................................................................ 
// gap // ........................................................................................................................ - add v23.4S, v27.4S, v20.4S // ........................e............................................................................................... - mls v28.4S, v5.4S, v8.S[0] // .................*...................................................................................................... + add v14.4S, v21.4S, v17.4S // ........................e............................................................................................... + mls v15.4S, v11.4S, v8.S[0] // .................*...................................................................................................... // gap // ........................................................................................................................ - sub v5.4S, v12.4S, v21.4S // ...........................................*............................................................................ + sub v11.4S, v24.4S, v19.4S // ...........................................*............................................................................ // gap // ........................................................................................................................ // gap // ........................................................................................................................ - sqrdmulh v19.4S, v19.4S, v1.S[3] // ...........*............................................................................................................ - add v12.4S, v12.4S, v21.4S // ............................................*........................................................................... + sqrdmulh v21.4S, v5.4S, v1.S[3] // ...........*............................................................................................................ 
+ add v24.4S, v24.4S, v19.4S // ............................................*........................................................................... // gap // ........................................................................................................................ - sub v27.4S, v15.4S, v18.4S // ................................................*....................................................................... + sub v17.4S, v27.4S, v9.4S // ................................................*....................................................................... // gap // ........................................................................................................................ // gap // ........................................................................................................................ - add v15.4S, v15.4S, v18.4S // .................................................*...................................................................... - mul v18.4S, v11.4S, v0.S[2] // ..............................*......................................................................................... + add v27.4S, v27.4S, v9.4S // .................................................*...................................................................... + mul v9.4S, v4.4S, v0.S[2] // ..............................*......................................................................................... // gap // ........................................................................................................................ - sub v20.4S, v22.4S, v4.4S // ..................e..................................................................................................... + sub v5.4S, v18.4S, v16.4S // ..................e..................................................................................................... 
// gap // ........................................................................................................................ // gap // ........................................................................................................................ - sqrdmulh v11.4S, v11.4S, v0.S[3] // ...............................*........................................................................................ + sqrdmulh v4.4S, v4.4S, v0.S[3] // ...............................*........................................................................................ // gap // ........................................................................................................................ // gap // ........................................................................................................................ // gap // ........................................................................................................................ // gap // ........................................................................................................................ // gap // ........................................................................................................................ - mul v21.4S, v9.4S, v1.S[0] // ........................................*............................................................................... + mul v19.4S, v7.4S, v1.S[0] // ........................................*............................................................................... // gap // ........................................................................................................................ // gap // ........................................................................................................................ // gap // ........................................................................................................................ 
// gap // ........................................................................................................................ // gap // ........................................................................................................................ - sqrdmulh v9.4S, v9.4S, v1.S[1] // .........................................*.............................................................................. + sqrdmulh v7.4S, v7.4S, v1.S[1] // .........................................*.............................................................................. // gap // ........................................................................................................................ // gap // ........................................................................................................................ // gap // ........................................................................................................................ // gap // ........................................................................................................................ // gap // ........................................................................................................................ - mls v16.4S, v19.4S, v8.S[0] // ............*........................................................................................................... + mls v23.4S, v21.4S, v8.S[0] // ............*........................................................................................................... // gap // ........................................................................................................................ // gap // ........................................................................................................................ // gap // ........................................................................................................................ 
// gap // ........................................................................................................................ // gap // ........................................................................................................................ - mls v18.4S, v11.4S, v8.S[0] // ................................*....................................................................................... + mls v9.4S, v4.4S, v8.S[0] // ................................*....................................................................................... // gap // ........................................................................................................................ // gap // ........................................................................................................................ // gap // ........................................................................................................................ // gap // ........................................................................................................................ // gap // ........................................................................................................................ - mls v21.4S, v9.4S, v8.S[0] // ..........................................*............................................................................. + mls v19.4S, v7.4S, v8.S[0] // ..........................................*............................................................................. // gap // ........................................................................................................................ // gap // ........................................................................................................................ - sub v19.4S, v16.4S, v28.4S // .................................*...................................................................................... 
+ sub v7.4S, v23.4S, v15.4S // .................................*...................................................................................... // gap // ........................................................................................................................ // gap // ........................................................................................................................ - add v9.4S, v16.4S, v28.4S // ..................................*..................................................................................... - mul v28.4S, v5.4S, v1.S[0] // .............................................*.......................................................................... + add v23.4S, v23.4S, v15.4S // ..................................*..................................................................................... + mul v21.4S, v11.4S, v1.S[0] // .............................................*.......................................................................... // gap // ........................................................................................................................ // gap // ........................................................................................................................ // gap // ........................................................................................................................ // gap // ........................................................................................................................ - mul v16.4S, v19.4S, v0.S[2] // ...................................*.................................................................................... + mul v15.4S, v7.4S, v0.S[2] // ...................................*.................................................................................... // gap // ........................................................................................................................ 
// gap // ........................................................................................................................ - sub v11.4S, v9.4S, v12.4S // .....................................................*.................................................................. + sub v4.4S, v23.4S, v24.4S // .....................................................*.................................................................. // gap // ........................................................................................................................ // gap // ........................................................................................................................ - add v9.4S, v9.4S, v12.4S // ......................................................*................................................................. - sqrdmulh v19.4S, v19.4S, v0.S[3] // ....................................*................................................................................... + add v23.4S, v23.4S, v24.4S // ......................................................*................................................................. + sqrdmulh v7.4S, v7.4S, v0.S[3] // ....................................*................................................................................... // gap // ........................................................................................................................ - sub v12.4S, v18.4S, v21.4S // ..........................................................*............................................................. + sub v24.4S, v9.4S, v19.4S // ..........................................................*............................................................. // gap // ........................................................................................................................ 
// gap // ........................................................................................................................ - sqrdmulh v5.4S, v5.4S, v1.S[1] // ..............................................*......................................................................... - add v18.4S, v18.4S, v21.4S // ...........................................................*............................................................ + sqrdmulh v11.4S, v11.4S, v1.S[1] // ..............................................*......................................................................... + add v9.4S, v9.4S, v19.4S // ...........................................................*............................................................ // gap // ........................................................................................................................ // gap // ........................................................................................................................ // gap // ........................................................................................................................ // gap // ........................................................................................................................ - mul v21.4S, v27.4S, v0.S[0] // ..................................................*..................................................................... + mul v19.4S, v17.4S, v0.S[0] // ..................................................*..................................................................... // gap // ........................................................................................................................ // gap // ........................................................................................................................ // gap // ........................................................................................................................ 
// gap // ........................................................................................................................ // gap // ........................................................................................................................ - mls v16.4S, v19.4S, v8.S[0] // .....................................*.................................................................................. + mls v15.4S, v7.4S, v8.S[0] // .....................................*.................................................................................. // gap // ........................................................................................................................ // gap // ........................................................................................................................ // gap // ........................................................................................................................ // gap // ........................................................................................................................ // gap // ........................................................................................................................ - mls v28.4S, v5.4S, v8.S[0] // ...............................................*........................................................................ + mls v21.4S, v11.4S, v8.S[0] // ...............................................*........................................................................ // gap // ........................................................................................................................ // gap // ........................................................................................................................ // gap // ........................................................................................................................ 
// gap // ........................................................................................................................ // gap // ........................................................................................................................ - sqrdmulh v19.4S, v27.4S, v0.S[1] // ...................................................*.................................................................... + sqrdmulh v11.4S, v17.4S, v0.S[1] // ...................................................*.................................................................... // gap // ........................................................................................................................ // gap // ........................................................................................................................ // gap // ........................................................................................................................ // gap // ........................................................................................................................ // gap // ........................................................................................................................ - mul v5.4S, v11.4S, v0.S[0] // .......................................................*................................................................ + mul v7.4S, v4.4S, v0.S[0] // .......................................................*................................................................ // gap // ........................................................................................................................ // gap // ........................................................................................................................ - sub v27.4S, v16.4S, v28.4S // ...............................................................*........................................................ 
+ sub v17.4S, v15.4S, v21.4S // ...............................................................*........................................................ // gap // ........................................................................................................................ // gap // ........................................................................................................................ - add v28.4S, v16.4S, v28.4S // ................................................................*....................................................... - mul v16.4S, v15.4S, v25.4S // ........................................................................................*............................... + add v21.4S, v15.4S, v21.4S // ................................................................*....................................................... + mul v15.4S, v27.4S, v25.4S // ........................................................................................*............................... // gap // ........................................................................................................................ // gap // ........................................................................................................................ // gap // ........................................................................................................................ // gap // ........................................................................................................................ - sqrdmulh v15.4S, v15.4S, v26.4S // .........................................................................................*.............................. + sqrdmulh v27.4S, v27.4S, v26.4S // .........................................................................................*.............................. // gap // ........................................................................................................................ 
// gap // ........................................................................................................................ // gap // ........................................................................................................................ // gap // ........................................................................................................................ // gap // ........................................................................................................................ - mls v21.4S, v19.4S, v8.S[0] // ....................................................*................................................................... + mls v19.4S, v11.4S, v8.S[0] // ....................................................*................................................................... // gap // ........................................................................................................................ // gap // ........................................................................................................................ // gap // ........................................................................................................................ // gap // ........................................................................................................................ // gap // ........................................................................................................................ - sqrdmulh v19.4S, v11.4S, v0.S[1] // ........................................................*............................................................... + sqrdmulh v11.4S, v4.4S, v0.S[1] // ........................................................*............................................................... // gap // ........................................................................................................................ 
// gap // ........................................................................................................................ // gap // ........................................................................................................................ // gap // ........................................................................................................................ // gap // ........................................................................................................................ - mul v11.4S, v12.4S, v0.S[0] // ............................................................*........................................................... + mul v4.4S, v24.4S, v0.S[0] // ............................................................*........................................................... // gap // ........................................................................................................................ // gap // ........................................................................................................................ - cmge v24.4S, v31.4S, v21.4S // ....................................................................*................................................... + cmge v29.4S, v31.4S, v19.4S // ....................................................................*................................................... // gap // ........................................................................................................................ // gap // ........................................................................................................................ - sqrdmulh v12.4S, v12.4S, v0.S[1] // .............................................................*.......................................................... - cmge v7.4S, v21.4S, v30.4S // .....................................................................*.................................................. 
+ sqrdmulh v24.4S, v24.4S, v0.S[1] // .............................................................*.......................................................... + cmge v6.4S, v19.4S, v30.4S // .....................................................................*.................................................. // gap // ........................................................................................................................ // gap // ........................................................................................................................ // gap // ........................................................................................................................ // gap // ........................................................................................................................ - mls v5.4S, v19.4S, v8.S[0] // .........................................................*.............................................................. + mls v7.4S, v11.4S, v8.S[0] // .........................................................*.............................................................. // gap // ........................................................................................................................ // gap // ........................................................................................................................ - sub v19.4S, v24.4S, v7.4S // ......................................................................*................................................. + sub v11.4S, v29.4S, v6.4S // ......................................................................*................................................. // gap // ........................................................................................................................ // gap // ........................................................................................................................ 
- mul v24.4S, v27.4S, v0.S[0] // .................................................................*...................................................... + mul v29.4S, v17.4S, v0.S[0] // .................................................................*...................................................... // gap // ........................................................................................................................ // gap // ........................................................................................................................ // gap // ........................................................................................................................ // gap // ........................................................................................................................ // gap // ........................................................................................................................ - mls v11.4S, v12.4S, v8.S[0] // ..............................................................*......................................................... + mls v4.4S, v24.4S, v8.S[0] // ..............................................................*......................................................... // gap // ........................................................................................................................ // gap // ........................................................................................................................ - cmge v12.4S, v31.4S, v5.4S // ........................................................................*............................................... + cmge v24.4S, v31.4S, v7.4S // ........................................................................*............................................... // gap // ........................................................................................................................ 
// gap // ........................................................................................................................ - sqrdmulh v27.4S, v27.4S, v0.S[1] // ..................................................................*..................................................... - cmge v7.4S, v5.4S, v30.4S // .........................................................................*.............................................. + sqrdmulh v17.4S, v17.4S, v0.S[1] // ..................................................................*..................................................... + cmge v6.4S, v7.4S, v30.4S // .........................................................................*.............................................. // gap // ........................................................................................................................ // gap // ........................................................................................................................ // gap // ........................................................................................................................ // gap // ........................................................................................................................ - mls v21.4S, v19.4S, v29.4S // .......................................................................*................................................ + mls v19.4S, v11.4S, v8.4S // .......................................................................*................................................ // gap // ........................................................................................................................ // gap // ........................................................................................................................ - sub v19.4S, v12.4S, v7.4S // ..........................................................................*............................................. 
+ sub v11.4S, v24.4S, v6.4S // ..........................................................................*............................................. // gap // ........................................................................................................................ // gap // ........................................................................................................................ - mls v16.4S, v15.4S, v8.S[0] // ..........................................................................................*............................. - cmge v12.4S, v31.4S, v11.4S // ............................................................................*........................................... + mls v15.4S, v27.4S, v8.S[0] // ..........................................................................................*............................. + cmge v27.4S, v31.4S, v4.4S // ............................................................................*........................................... // gap // ........................................................................................................................ - cmge v15.4S, v11.4S, v30.4S // .............................................................................*.......................................... + cmge v24.4S, v4.4S, v30.4S // .............................................................................*.......................................... // gap // ........................................................................................................................ // gap // ........................................................................................................................ - mls v24.4S, v27.4S, v8.S[0] // ...................................................................*.................................................... 
+ mls v29.4S, v17.4S, v8.S[0] // ...................................................................*.................................................... // gap // ........................................................................................................................ // gap // ........................................................................................................................ - str q21, [x0, #512] // ....................................................................................*................................... + str q19, [x0, #512] // ....................................................................................*................................... // gap // ........................................................................................................................ // gap // ........................................................................................................................ - mls v5.4S, v19.4S, v29.4S // ...........................................................................*............................................ - sub v19.4S, v12.4S, v15.4S // ..............................................................................*......................................... + mls v7.4S, v11.4S, v8.4S // ...........................................................................*............................................ + sub v27.4S, v27.4S, v24.4S // ..............................................................................*......................................... // gap // ........................................................................................................................ - cmge v12.4S, v31.4S, v16.4S // ....................................................................................................*................... 
+ cmge v11.4S, v31.4S, v15.4S // ....................................................................................................*................... // gap // ........................................................................................................................ // gap // ........................................................................................................................ - cmge v15.4S, v16.4S, v30.4S // .....................................................................................................*.................. - mul v27.4S, v9.4S, v25.4S // ...........................................................................................*............................ + cmge v24.4S, v15.4S, v30.4S // .....................................................................................................*.................. + mul v17.4S, v23.4S, v25.4S // ...........................................................................................*............................ // gap // ........................................................................................................................ - cmge v21.4S, v31.4S, v24.4S // ................................................................................*....................................... + cmge v19.4S, v31.4S, v29.4S // ................................................................................*....................................... // gap // ........................................................................................................................ // gap // ........................................................................................................................ - mls v11.4S, v19.4S, v29.4S // ...............................................................................*........................................ 
- cmge v19.4S, v24.4S, v30.4S // .................................................................................*...................................... + mls v4.4S, v27.4S, v8.4S // ...............................................................................*........................................ + cmge v27.4S, v29.4S, v30.4S // .................................................................................*...................................... // gap // ........................................................................................................................ - str q5, [x0, #640] // .....................................................................................*.................................. - sub v5.4S, v12.4S, v15.4S // ......................................................................................................*................. + str q7, [x0, #640] // .....................................................................................*.................................. + sub v11.4S, v11.4S, v24.4S // ......................................................................................................*................. // gap // ........................................................................................................................ - sqrdmulh v9.4S, v9.4S, v26.4S // ............................................................................................*........................... + sqrdmulh v7.4S, v23.4S, v26.4S // ............................................................................................*........................... // gap // ........................................................................................................................ // gap // ........................................................................................................................ 
- sub v19.4S, v21.4S, v19.4S // ..................................................................................*..................................... + sub v27.4S, v19.4S, v27.4S // ..................................................................................*..................................... // gap // ........................................................................................................................ // gap // ........................................................................................................................ - mul v12.4S, v18.4S, v25.4S // ..............................................................................................*......................... + mul v23.4S, v9.4S, v25.4S // ..............................................................................................*......................... // gap // ........................................................................................................................ // gap // ........................................................................................................................ - str q11, [x0, #768] // ......................................................................................*................................. + str q4, [x0, #768] // ......................................................................................*................................. // gap // ........................................................................................................................ // gap // ........................................................................................................................ - sqrdmulh v15.4S, v18.4S, v26.4S // ...............................................................................................*........................ 
+ sqrdmulh v24.4S, v9.4S, v26.4S // ...............................................................................................*........................ // gap // ........................................................................................................................ // gap // ........................................................................................................................ // gap // ........................................................................................................................ // gap // ........................................................................................................................ // gap // ........................................................................................................................ - mls v27.4S, v9.4S, v8.S[0] // .............................................................................................*.......................... + mls v17.4S, v7.4S, v8.S[0] // .............................................................................................*.......................... // gap // ........................................................................................................................ // gap // ........................................................................................................................ // gap // ........................................................................................................................ // gap // ........................................................................................................................ // gap // ........................................................................................................................ - sqrdmulh v9.4S, v28.4S, v26.4S // ..................................................................................................*..................... 
+ sqrdmulh v7.4S, v21.4S, v26.4S // ..................................................................................................*..................... // gap // ........................................................................................................................ // gap // ........................................................................................................................ // gap // ........................................................................................................................ // gap // ........................................................................................................................ // gap // ........................................................................................................................ - mls v12.4S, v15.4S, v8.S[0] // ................................................................................................*....................... + mls v23.4S, v24.4S, v8.S[0] // ................................................................................................*....................... // gap // ........................................................................................................................ // gap // ........................................................................................................................ - cmge v15.4S, v31.4S, v27.4S // ........................................................................................................*............... + cmge v24.4S, v31.4S, v17.4S // ........................................................................................................*............... // gap // ........................................................................................................................ // gap // ........................................................................................................................ 
- mul v18.4S, v28.4S, v25.4S // .................................................................................................*...................... - cmge v28.4S, v27.4S, v30.4S // .........................................................................................................*.............. + mul v9.4S, v21.4S, v25.4S // .................................................................................................*...................... + cmge v21.4S, v17.4S, v30.4S // .........................................................................................................*.............. // gap // ........................................................................................................................ // gap // ........................................................................................................................ // gap // ........................................................................................................................ // gap // ........................................................................................................................ - mls v18.4S, v9.4S, v8.S[0] // ...................................................................................................*.................... + mls v9.4S, v7.4S, v8.S[0] // ...................................................................................................*.................... // gap // ........................................................................................................................ // gap // ........................................................................................................................ - sub v9.4S, v15.4S, v28.4S // ..........................................................................................................*............. 
+ sub v7.4S, v24.4S, v21.4S // ..........................................................................................................*............. // gap // ........................................................................................................................ // gap // ........................................................................................................................ - mls v24.4S, v19.4S, v29.4S // ...................................................................................*.................................... - cmge v19.4S, v31.4S, v12.4S // ............................................................................................................*........... + mls v29.4S, v27.4S, v8.4S // ...................................................................................*.................................... + cmge v27.4S, v31.4S, v23.4S // ............................................................................................................*........... // gap // ........................................................................................................................ - cmge v15.4S, v12.4S, v30.4S // .............................................................................................................*.......... + cmge v24.4S, v23.4S, v30.4S // .............................................................................................................*.......... // gap // ........................................................................................................................ // gap // ........................................................................................................................ - mls v16.4S, v5.4S, v29.4S // .......................................................................................................*................ 
+ mls v15.4S, v11.4S, v8.4S // .......................................................................................................*................ // gap // ........................................................................................................................ // gap // ........................................................................................................................ - cmge v5.4S, v31.4S, v18.4S // ................................................................................................................*....... + cmge v11.4S, v31.4S, v9.4S // ................................................................................................................*....... // gap // ........................................................................................................................ // gap // ........................................................................................................................ - mls v27.4S, v9.4S, v29.4S // ...........................................................................................................*............ - cmge v9.4S, v18.4S, v30.4S // .................................................................................................................*...... + mls v17.4S, v7.4S, v8.4S // ...........................................................................................................*............ + cmge v7.4S, v9.4S, v30.4S // .................................................................................................................*...... // gap // ........................................................................................................................ - str q24, [x0, #896] // .......................................................................................*................................ 
- sub v19.4S, v19.4S, v15.4S // ..............................................................................................................*......... + str q29, [x0, #896] // .......................................................................................*................................ + sub v27.4S, v27.4S, v24.4S // ..............................................................................................................*......... // gap // ........................................................................................................................ - sqrdmulh v15.4S, v6.4S, v3.S[1] // ..........................e............................................................................................. + sqrdmulh v24.4S, v22.4S, v3.S[1] // ..........................e............................................................................................. // gap // ........................................................................................................................ // gap // ........................................................................................................................ - str q16, [x0], #(16) // ....................................................................................................................*... - sub v5.4S, v5.4S, v9.4S // ..................................................................................................................*..... + str q15, [x0], #(16) // ....................................................................................................................*... + sub v11.4S, v11.4S, v7.4S // ..................................................................................................................*..... // gap // ........................................................................................................................ 
- mls v12.4S, v19.4S, v29.4S // ...............................................................................................................*........ + mls v23.4S, v27.4S, v8.4S // ...............................................................................................................*........ // gap // ........................................................................................................................ // gap // ........................................................................................................................ - str q27, [x0, #112] // .....................................................................................................................*.. + str q17, [x0, #112] // .....................................................................................................................*.. // gap // ........................................................................................................................ // gap // ........................................................................................................................ - mls v18.4S, v5.4S, v29.4S // ...................................................................................................................*.... + mls v9.4S, v11.4S, v8.4S // ...................................................................................................................*.... // gap // ........................................................................................................................ // gap // ........................................................................................................................ // gap // ........................................................................................................................ // gap // ........................................................................................................................ 
// gap // ........................................................................................................................ - mul v21.4S, v6.4S, v3.S[0] // .........................e.............................................................................................. + mul v19.4S, v22.4S, v3.S[0] // .........................e.............................................................................................. // gap // ........................................................................................................................ // gap // ........................................................................................................................ - str q12, [x0, #240] // ......................................................................................................................*. + str q23, [x0, #240] // ......................................................................................................................*. // gap // ........................................................................................................................ // gap // ........................................................................................................................ - sqrdmulh v11.4S, v20.4S, v2.S[3] // .....................e.................................................................................................. + sqrdmulh v4.4S, v5.4S, v2.S[3] // .....................e.................................................................................................. // gap // ........................................................................................................................ // gap // ........................................................................................................................ 
- str q18, [x0, #368] // .......................................................................................................................* + str q9, [x0, #368] // .......................................................................................................................* // gap // ........................................................................................................................ // gap // ........................................................................................................................ - mls v21.4S, v15.4S, v8.S[0] // ...........................e............................................................................................ + mls v19.4S, v24.4S, v8.S[0] // ...........................e............................................................................................ // gap // ........................................................................................................................ // gap // ........................................................................................................................ - // original source code + // ------------------------------------------------------------------------------------------------------------- new position --------------------------------------------------------------------------------------------------------------> + // 0 25 50 75 100 125 150 175 200 225 + // |------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|-------- // ldr q9, [x0, #0] // ...................................................................................................................*...................................................................................................................... 
- // ldr q10, [x0, #(1*(1024/8))] // ...................................................................................................................|*..................................................................................................................... - // ldr q11, [x0, #(2*(1024/8))] // e..................................................................................................................|....e................................................................................................................. - // ldr q12, [x0, #(3*(1024/8))] // ...e...............................................................................................................|.......e.............................................................................................................. - // ldr q13, [x0, #(4*(1024/8))] // .....e.............................................................................................................|.........e............................................................................................................ - // ldr q14, [x0, #(5*(1024/8))] // ......e............................................................................................................|..........e........................................................................................................... - // ldr q15, [x0, #(6*(1024/8))] // ..e................................................................................................................|......e............................................................................................................... - // ldr q16, [x0, #(7*(1024/8))] // ........e..........................................................................................................|............e......................................................................................................... 
- // sub v24.4s, v9.4s, v10.4s // .........*.........................................................................................................|.............*........................................................................................................ - // add v9.4s, v9.4s, v10.4s // .......*...........................................................................................................|...........*.......................................................................................................... - // mul v10.4s, v24.4s, v1.s[2] // ................*..................................................................................................|....................*................................................................................................. - // sqrdmulh v24.4s, v24.4s, v1.s[3] // .....................*.............................................................................................|.........................*............................................................................................ - // mls v10.4s, v24.4s, v8.s[0] // ..............................*....................................................................................|..................................*................................................................................... - // sub v24.4s, v11.4s, v12.4s // ...................................................................................................................|.*.................................................................................................................... - // add v11.4s, v11.4s, v12.4s // ...................................................................................................................|..*................................................................................................................... 
- // mul v12.4s, v24.4s, v2.s[0] // ....*..............................................................................................................|........*............................................................................................................. - // sqrdmulh v24.4s, v24.4s, v2.s[1] // ..........*........................................................................................................|..............*....................................................................................................... - // mls v12.4s, v24.4s, v8.s[0] // ...................*...............................................................................................|.......................*.............................................................................................. - // sub v24.4s, v13.4s, v14.4s // ..........................e........................................................................................|..............................e....................................................................................... - // add v13.4s, v13.4s, v14.4s // .*.................................................................................................................|.....*................................................................................................................ - // mul v14.4s, v24.4s, v2.s[2] // ...................................................................................................................|...*.................................................................................................................. - // sqrdmulh v24.4s, v24.4s, v2.s[3] // ................................................................................................................e..|....................................................................................................................e. 
- // mls v14.4s, v24.4s, v8.s[0] // ............*......................................................................................................|................*..................................................................................................... - // sub v24.4s, v15.4s, v16.4s // .................e.................................................................................................|.....................e................................................................................................ - // add v15.4s, v15.4s, v16.4s // ..................e................................................................................................|......................e............................................................................................... - // mul v16.4s, v24.4s, v3.s[0] // ..............................................................................................................e....|..................................................................................................................e... - // sqrdmulh v24.4s, v24.4s, v3.s[1] // ........................................................................................................e..........|............................................................................................................e......... - // mls v16.4s, v24.4s, v8.s[0] // ..................................................................................................................e|...................................................................................................................... - // sub v24.4s, v9.4s, v11.4s // .............*.....................................................................................................|.................*.................................................................................................... 
- // add v9.4s, v9.4s, v11.4s // ..............*....................................................................................................|..................*................................................................................................... - // mul v11.4s, v24.4s, v0.s[2] // .........................*.........................................................................................|.............................*........................................................................................ - // sqrdmulh v24.4s, v24.4s, v0.s[3] // ...........................*.......................................................................................|...............................*...................................................................................... - // mls v11.4s, v24.4s, v8.s[0] // ...............................*...................................................................................|...................................*.................................................................................. - // sub v24.4s, v10.4s, v12.4s // .................................*.................................................................................|.....................................*................................................................................ - // add v10.4s, v10.4s, v12.4s // ..................................*................................................................................|......................................*............................................................................... - // mul v12.4s, v24.4s, v0.s[2] // ....................................*..............................................................................|........................................*............................................................................. 
- // sqrdmulh v24.4s, v24.4s, v0.s[3] // .......................................*...........................................................................|...........................................*.......................................................................... - // mls v12.4s, v24.4s, v8.s[0] // ............................................*......................................................................|................................................*..................................................................... - // sub v24.4s, v13.4s, v15.4s // ...........*.......................................................................................................|...............*...................................................................................................... - // add v13.4s, v13.4s, v15.4s // ...............*...................................................................................................|...................*.................................................................................................. - // mul v15.4s, v24.4s, v1.s[0] // ............................*......................................................................................|................................*..................................................................................... - // sqrdmulh v24.4s, v24.4s, v1.s[1] // .............................*.....................................................................................|.................................*.................................................................................... - // mls v15.4s, v24.4s, v8.s[0] // ................................*..................................................................................|....................................*................................................................................. 
- // sub v24.4s, v14.4s, v16.4s // ....................*..............................................................................................|........................*............................................................................................. - // add v14.4s, v14.4s, v16.4s // ......................*............................................................................................|..........................*........................................................................................... - // mul v16.4s, v24.4s, v1.s[0] // ...................................*...............................................................................|.......................................*.............................................................................. - // sqrdmulh v24.4s, v24.4s, v1.s[1] // .........................................*.........................................................................|.............................................*........................................................................ - // mls v16.4s, v24.4s, v8.s[0] // .............................................*.....................................................................|.................................................*.................................................................... - // sub v24.4s, v9.4s, v13.4s // .......................*...........................................................................................|...........................*.......................................................................................... - // add v9.4s, v9.4s, v13.4s // ........................*..........................................................................................|............................*......................................................................................... 
- // mul v13.4s, v24.4s, v0.s[0] // ...........................................*.......................................................................|...............................................*...................................................................... - // sqrdmulh v24.4s, v24.4s, v0.s[1] // ..............................................*....................................................................|..................................................*................................................................... - // mls v13.4s, v24.4s, v8.s[0] // ....................................................*..............................................................|........................................................*............................................................. - // sub v24.4s, v10.4s, v14.4s // .....................................*.............................................................................|.........................................*............................................................................ - // add v10.4s, v10.4s, v14.4s // ......................................*............................................................................|..........................................*........................................................................... - // mul v14.4s, v24.4s, v0.s[0] // ...............................................*...................................................................|...................................................*.................................................................. - // sqrdmulh v24.4s, v24.4s, v0.s[1] // .....................................................*.............................................................|.........................................................*............................................................ 
- // mls v14.4s, v24.4s, v8.s[0] // ..........................................................*........................................................|..............................................................*....................................................... - // sub v24.4s, v11.4s, v15.4s // ........................................*..........................................................................|............................................*......................................................................... - // add v11.4s, v11.4s, v15.4s // ..........................................*........................................................................|..............................................*....................................................................... - // mul v15.4s, v24.4s, v0.s[0] // ......................................................*............................................................|..........................................................*........................................................... - // sqrdmulh v24.4s, v24.4s, v0.s[1] // ........................................................*..........................................................|............................................................*......................................................... - // mls v15.4s, v24.4s, v8.s[0] // .............................................................*.....................................................|.................................................................*.................................................... - // sub v24.4s, v12.4s, v16.4s // ................................................*..................................................................|....................................................*................................................................. 
- // add v12.4s, v12.4s, v16.4s // .................................................*.................................................................|.....................................................*................................................................ - // mul v16.4s, v24.4s, v0.s[0] // ............................................................*......................................................|................................................................*..................................................... - // sqrdmulh v24.4s, v24.4s, v0.s[1] // ...............................................................*...................................................|...................................................................*.................................................. - // mls v16.4s, v24.4s, v8.s[0] // ......................................................................*............................................|..........................................................................*........................................... - // cmge v27.4s, v31.4s, v13.4s // .......................................................*...........................................................|...........................................................*.......................................................... - // cmge v28.4s, v13.4s, v30.4s // .........................................................*.........................................................|.............................................................*........................................................ - // sub v28.4s, v27.4s, v28.4s // ...........................................................*.......................................................|...............................................................*...................................................... 
- // mls v13.4s, v28.4s, v29.4s // .................................................................*.................................................|.....................................................................*................................................ - // cmge v27.4s, v31.4s, v14.4s // ..............................................................*....................................................|..................................................................*................................................... - // cmge v28.4s, v14.4s, v30.4s // ................................................................*..................................................|....................................................................*................................................. - // sub v28.4s, v27.4s, v28.4s // ..................................................................*................................................|......................................................................*............................................... - // mls v14.4s, v28.4s, v29.4s // ........................................................................*..........................................|............................................................................*......................................... - // cmge v27.4s, v31.4s, v15.4s // ....................................................................*..............................................|........................................................................*............................................. - // cmge v28.4s, v15.4s, v30.4s // .....................................................................*.............................................|.........................................................................*............................................ 
- // sub v28.4s, v27.4s, v28.4s // .........................................................................*.........................................|.............................................................................*........................................ - // mls v15.4s, v28.4s, v29.4s // ..............................................................................*....................................|..................................................................................*................................... - // cmge v27.4s, v31.4s, v16.4s // .............................................................................*.....................................|.................................................................................*.................................... - // cmge v28.4s, v16.4s, v30.4s // ...............................................................................*...................................|...................................................................................*.................................. - // sub v28.4s, v27.4s, v28.4s // ...................................................................................*...............................|.......................................................................................*.............................. - // mls v16.4s, v28.4s, v29.4s // ...............................................................................................*...................|...................................................................................................*.................. - // str q13, [x0, #(4*(1024/8))] // .......................................................................*...........................................|...........................................................................*.......................................... 
- // str q14, [x0, #(5*(1024/8))] // ................................................................................*..................................|....................................................................................*................................. - // str q15, [x0, #(6*(1024/8))] // .....................................................................................*.............................|.........................................................................................*............................ - // str q16, [x0, #(7*(1024/8))] // ......................................................................................................*............|..........................................................................................................*........... - // mul v13.4s, v9.4s, v25.4s // ..................................................*................................................................|......................................................*............................................................... - // sqrdmulh v9.4s, v9.4s, v26.4s // ...................................................*...............................................................|.......................................................*.............................................................. - // mls v13.4s, v9.4s, v8.s[0] // ...................................................................*...............................................|.......................................................................*.............................................. - // mul v14.4s, v10.4s, v25.4s // ............................................................................*......................................|................................................................................*..................................... 
- // sqrdmulh v10.4s, v10.4s, v26.4s // ..................................................................................*................................|......................................................................................*............................... - // mls v14.4s, v10.4s, v8.s[0] // .......................................................................................*...........................|...........................................................................................*.......................... - // mul v15.4s, v11.4s, v25.4s // ....................................................................................*..............................|........................................................................................*............................. - // sqrdmulh v11.4s, v11.4s, v26.4s // ......................................................................................*............................|..........................................................................................*........................... - // mls v15.4s, v11.4s, v8.s[0] // .........................................................................................*.........................|.............................................................................................*........................ - // mul v16.4s, v12.4s, v25.4s // ...........................................................................................*.......................|...............................................................................................*...................... - // sqrdmulh v12.4s, v12.4s, v26.4s // ........................................................................................*..........................|............................................................................................*......................... 
- // mls v16.4s, v12.4s, v8.s[0] // .............................................................................................*.....................|.................................................................................................*.................... - // cmge v27.4s, v31.4s, v13.4s // ..........................................................................*........................................|..............................................................................*....................................... - // cmge v28.4s, v13.4s, v30.4s // ...........................................................................*.......................................|...............................................................................*...................................... - // sub v28.4s, v27.4s, v28.4s // .................................................................................*.................................|.....................................................................................*................................ - // mls v13.4s, v28.4s, v29.4s // ..................................................................................................*................|......................................................................................................*............... - // cmge v27.4s, v31.4s, v14.4s // ..........................................................................................*........................|..............................................................................................*....................... - // cmge v28.4s, v14.4s, v30.4s // ............................................................................................*......................|................................................................................................*..................... 
- // sub v28.4s, v27.4s, v28.4s // ..............................................................................................*....................|..................................................................................................*................... - // mls v14.4s, v28.4s, v29.4s // ....................................................................................................*..............|........................................................................................................*............. - // cmge v27.4s, v31.4s, v15.4s // ................................................................................................*..................|....................................................................................................*................. - // cmge v28.4s, v15.4s, v30.4s // .................................................................................................*.................|.....................................................................................................*................ - // sub v28.4s, v27.4s, v28.4s // .......................................................................................................*...........|...........................................................................................................*.......... - // mls v15.4s, v28.4s, v29.4s // ...........................................................................................................*.......|...............................................................................................................*...... - // cmge v27.4s, v31.4s, v16.4s // ...................................................................................................*...............|.......................................................................................................*.............. 
- // cmge v28.4s, v16.4s, v30.4s // .....................................................................................................*.............|.........................................................................................................*............ - // sub v28.4s, v27.4s, v28.4s // ..........................................................................................................*........|..............................................................................................................*....... - // mls v16.4s, v28.4s, v29.4s // .............................................................................................................*.....|.................................................................................................................*.... - // str q13, [x0], #(16) // .........................................................................................................*.........|.............................................................................................................*........ - // str q14, [x0, #(-16 + 1*(1024/8))] // ............................................................................................................*......|................................................................................................................*..... - // str q15, [x0, #(-16 + 2*(1024/8))] // ...............................................................................................................*...|...................................................................................................................*.. 
- // str q16, [x0, #(-16 + 3*(1024/8))] // .................................................................................................................*.|.....................................................................................................................* + // ldr q10, [x0, #(1*(1024/8))] // ...................................................................................................................'*..................................................................................................................... + // ldr q11, [x0, #(2*(1024/8))] // ..e................................................................................................................'......~............................................................................................................... + // ldr q12, [x0, #(3*(1024/8))] // ...e...............................................................................................................'.......~.............................................................................................................. + // ldr q13, [x0, #(4*(1024/8))] // .....e.............................................................................................................'.........~............................................................................................................ + // ldr q14, [x0, #(5*(1024/8))] // ......e............................................................................................................'..........~........................................................................................................... + // ldr q15, [x0, #(6*(1024/8))] // e..................................................................................................................'....~................................................................................................................. 
+ // ldr q16, [x0, #(7*(1024/8))] // .......e...........................................................................................................'...........~.......................................................................................................... + // sub v24.4s, v9.4s, v10.4s // ........~..........................................................................................................'............*......................................................................................................... + // add v9.4s, v9.4s, v10.4s // .........~.........................................................................................................'.............*........................................................................................................ + // mul v10.4s, v24.4s, v1.s[2] // ................~..................................................................................................'....................*................................................................................................. + // sqrdmulh v24.4s, v24.4s, v1.s[3] // .....................~.............................................................................................'.........................*............................................................................................ + // mls v10.4s, v24.4s, v8.s[0] // ..............................~....................................................................................'..................................*................................................................................... + // sub v24.4s, v11.4s, v12.4s // ...................................................................................................................'.*.................................................................................................................... 
+ // add v11.4s, v11.4s, v12.4s // ...................................................................................................................'...*.................................................................................................................. + // mul v12.4s, v24.4s, v2.s[0] // ....~..............................................................................................................'........*............................................................................................................. + // sqrdmulh v24.4s, v24.4s, v2.s[1] // ..........~........................................................................................................'..............*....................................................................................................... + // mls v12.4s, v24.4s, v8.s[0] // ...................~...............................................................................................'.......................*.............................................................................................. + // sub v24.4s, v13.4s, v14.4s // ..........................e........................................................................................'..............................~....................................................................................... + // add v13.4s, v13.4s, v14.4s // .~.................................................................................................................'.....*................................................................................................................ + // mul v14.4s, v24.4s, v2.s[2] // ...................................................................................................................'..*................................................................................................................... 
+ // sqrdmulh v24.4s, v24.4s, v2.s[3] // ................................................................................................................e..'....................................................................................................................~. + // mls v14.4s, v24.4s, v8.s[0] // ............~......................................................................................................'................*..................................................................................................... + // sub v24.4s, v15.4s, v16.4s // .................e.................................................................................................'.....................~................................................................................................ + // add v15.4s, v15.4s, v16.4s // ..................e................................................................................................'......................~............................................................................................... + // mul v16.4s, v24.4s, v3.s[0] // ..............................................................................................................e....'..................................................................................................................~... + // sqrdmulh v24.4s, v24.4s, v3.s[1] // ........................................................................................................e..........'............................................................................................................~......... + // mls v16.4s, v24.4s, v8.s[0] // ..................................................................................................................e'...................................................................................................................... 
+ // sub v24.4s, v9.4s, v11.4s // ..............~....................................................................................................'..................*................................................................................................... + // add v9.4s, v9.4s, v11.4s // ...............~...................................................................................................'...................*.................................................................................................. + // mul v11.4s, v24.4s, v0.s[2] // .........................~.........................................................................................'.............................*........................................................................................ + // sqrdmulh v24.4s, v24.4s, v0.s[3] // ...........................~.......................................................................................'...............................*...................................................................................... + // mls v11.4s, v24.4s, v8.s[0] // ...............................~...................................................................................'...................................*.................................................................................. + // sub v24.4s, v10.4s, v12.4s // .................................~.................................................................................'.....................................*................................................................................ + // add v10.4s, v10.4s, v12.4s // ..................................~................................................................................'......................................*............................................................................... 
+ // mul v12.4s, v24.4s, v0.s[2] // ....................................~..............................................................................'........................................*............................................................................. + // sqrdmulh v24.4s, v24.4s, v0.s[3] // .......................................~...........................................................................'...........................................*.......................................................................... + // mls v12.4s, v24.4s, v8.s[0] // ............................................~......................................................................'................................................*..................................................................... + // sub v24.4s, v13.4s, v15.4s // ...........~.......................................................................................................'...............*...................................................................................................... + // add v13.4s, v13.4s, v15.4s // .............~.....................................................................................................'.................*.................................................................................................... + // mul v15.4s, v24.4s, v1.s[0] // ............................~......................................................................................'................................*..................................................................................... + // sqrdmulh v24.4s, v24.4s, v1.s[1] // .............................~.....................................................................................'.................................*.................................................................................... 
+ // mls v15.4s, v24.4s, v8.s[0] // ................................~..................................................................................'....................................*................................................................................. + // sub v24.4s, v14.4s, v16.4s // ....................~..............................................................................................'........................*............................................................................................. + // add v14.4s, v14.4s, v16.4s // ......................~............................................................................................'..........................*........................................................................................... + // mul v16.4s, v24.4s, v1.s[0] // ...................................~...............................................................................'.......................................*.............................................................................. + // sqrdmulh v24.4s, v24.4s, v1.s[1] // .........................................~.........................................................................'.............................................*........................................................................ + // mls v16.4s, v24.4s, v8.s[0] // .............................................~.....................................................................'.................................................*.................................................................... + // sub v24.4s, v9.4s, v13.4s // .......................~...........................................................................................'...........................*.......................................................................................... 
+ // add v9.4s, v9.4s, v13.4s // ........................~..........................................................................................'............................*......................................................................................... + // mul v13.4s, v24.4s, v0.s[0] // ...........................................~.......................................................................'...............................................*...................................................................... + // sqrdmulh v24.4s, v24.4s, v0.s[1] // ..............................................~....................................................................'..................................................*................................................................... + // mls v13.4s, v24.4s, v8.s[0] // ....................................................~..............................................................'........................................................*............................................................. + // sub v24.4s, v10.4s, v14.4s // .....................................~.............................................................................'.........................................*............................................................................ + // add v10.4s, v10.4s, v14.4s // ......................................~............................................................................'..........................................*........................................................................... + // mul v14.4s, v24.4s, v0.s[0] // ...............................................~...................................................................'...................................................*.................................................................. 
+ // sqrdmulh v24.4s, v24.4s, v0.s[1] // .....................................................~.............................................................'.........................................................*............................................................ + // mls v14.4s, v24.4s, v8.s[0] // ..........................................................~........................................................'..............................................................*....................................................... + // sub v24.4s, v11.4s, v15.4s // ........................................~..........................................................................'............................................*......................................................................... + // add v11.4s, v11.4s, v15.4s // ..........................................~........................................................................'..............................................*....................................................................... + // mul v15.4s, v24.4s, v0.s[0] // ......................................................~............................................................'..........................................................*........................................................... + // sqrdmulh v24.4s, v24.4s, v0.s[1] // ........................................................~..........................................................'............................................................*......................................................... + // mls v15.4s, v24.4s, v8.s[0] // .............................................................~.....................................................'.................................................................*.................................................... 
+ // sub v24.4s, v12.4s, v16.4s // ................................................~..................................................................'....................................................*................................................................. + // add v12.4s, v12.4s, v16.4s // .................................................~.................................................................'.....................................................*................................................................ + // mul v16.4s, v24.4s, v0.s[0] // ............................................................~......................................................'................................................................*..................................................... + // sqrdmulh v24.4s, v24.4s, v0.s[1] // ...............................................................~...................................................'...................................................................*.................................................. + // mls v16.4s, v24.4s, v8.s[0] // ......................................................................~............................................'..........................................................................*........................................... + // cmge v27.4s, v31.4s, v13.4s // .......................................................~...........................................................'...........................................................*.......................................................... + // cmge v28.4s, v13.4s, v30.4s // .........................................................~.........................................................'.............................................................*........................................................ 
+ // sub v28.4s, v27.4s, v28.4s // ...........................................................~.......................................................'...............................................................*...................................................... + // mls v13.4s, v28.4s, v8.4s // .................................................................~.................................................'.....................................................................*................................................ + // cmge v27.4s, v31.4s, v14.4s // ..............................................................~....................................................'..................................................................*................................................... + // cmge v28.4s, v14.4s, v30.4s // ................................................................~..................................................'....................................................................*................................................. + // sub v28.4s, v27.4s, v28.4s // ..................................................................~................................................'......................................................................*............................................... + // mls v14.4s, v28.4s, v8.4s // ........................................................................~..........................................'............................................................................*......................................... + // cmge v27.4s, v31.4s, v15.4s // ....................................................................~..............................................'........................................................................*............................................. 
+ // cmge v28.4s, v15.4s, v30.4s // .....................................................................~.............................................'.........................................................................*............................................ + // sub v28.4s, v27.4s, v28.4s // .........................................................................~.........................................'.............................................................................*........................................ + // mls v15.4s, v28.4s, v8.4s // ..............................................................................~....................................'..................................................................................*................................... + // cmge v27.4s, v31.4s, v16.4s // .............................................................................~.....................................'.................................................................................*.................................... + // cmge v28.4s, v16.4s, v30.4s // ...............................................................................~...................................'...................................................................................*.................................. + // sub v28.4s, v27.4s, v28.4s // ...................................................................................~...............................'.......................................................................................*.............................. + // mls v16.4s, v28.4s, v8.4s // ...............................................................................................~...................'...................................................................................................*.................. 
+ // str q13, [x0, #(4*(1024/8))] // .......................................................................~...........................................'...........................................................................*.......................................... + // str q14, [x0, #(5*(1024/8))] // ................................................................................~..................................'....................................................................................*................................. + // str q15, [x0, #(6*(1024/8))] // .....................................................................................~.............................'.........................................................................................*............................ + // str q16, [x0, #(7*(1024/8))] // ......................................................................................................~............'..........................................................................................................*........... + // mul v13.4s, v9.4s, v25.4s // ..................................................~................................................................'......................................................*............................................................... + // sqrdmulh v9.4s, v9.4s, v26.4s // ...................................................~...............................................................'.......................................................*.............................................................. + // mls v13.4s, v9.4s, v8.s[0] // ...................................................................~...............................................'.......................................................................*.............................................. 
+ // mul v14.4s, v10.4s, v25.4s // ............................................................................~......................................'................................................................................*..................................... + // sqrdmulh v10.4s, v10.4s, v26.4s // ..................................................................................~................................'......................................................................................*............................... + // mls v14.4s, v10.4s, v8.s[0] // .......................................................................................~...........................'...........................................................................................*.......................... + // mul v15.4s, v11.4s, v25.4s // ....................................................................................~..............................'........................................................................................*............................. + // sqrdmulh v11.4s, v11.4s, v26.4s // ......................................................................................~............................'..........................................................................................*........................... + // mls v15.4s, v11.4s, v8.s[0] // .........................................................................................~.........................'.............................................................................................*........................ + // mul v16.4s, v12.4s, v25.4s // ...........................................................................................~.......................'...............................................................................................*...................... 
+ // sqrdmulh v12.4s, v12.4s, v26.4s // ........................................................................................~..........................'............................................................................................*......................... + // mls v16.4s, v12.4s, v8.s[0] // .............................................................................................~.....................'.................................................................................................*.................... + // cmge v27.4s, v31.4s, v13.4s // ..........................................................................~........................................'..............................................................................*....................................... + // cmge v28.4s, v13.4s, v30.4s // ...........................................................................~.......................................'...............................................................................*...................................... + // sub v28.4s, v27.4s, v28.4s // .................................................................................~.................................'.....................................................................................*................................ + // mls v13.4s, v28.4s, v8.4s // ..................................................................................................~................'......................................................................................................*............... + // cmge v27.4s, v31.4s, v14.4s // ..........................................................................................~........................'..............................................................................................*....................... 
+ // cmge v28.4s, v14.4s, v30.4s // ............................................................................................~......................'................................................................................................*..................... + // sub v28.4s, v27.4s, v28.4s // ..............................................................................................~....................'..................................................................................................*................... + // mls v14.4s, v28.4s, v8.4s // ....................................................................................................~..............'........................................................................................................*............. + // cmge v27.4s, v31.4s, v15.4s // ................................................................................................~..................'....................................................................................................*................. + // cmge v28.4s, v15.4s, v30.4s // .................................................................................................~.................'.....................................................................................................*................ + // sub v28.4s, v27.4s, v28.4s // .......................................................................................................~...........'...........................................................................................................*.......... + // mls v15.4s, v28.4s, v8.4s // ...........................................................................................................~.......'...............................................................................................................*...... 
+ // cmge v27.4s, v31.4s, v16.4s // ...................................................................................................~...............'.......................................................................................................*.............. + // cmge v28.4s, v16.4s, v30.4s // .....................................................................................................~.............'.........................................................................................................*............ + // sub v28.4s, v27.4s, v28.4s // ..........................................................................................................~........'..............................................................................................................*....... + // mls v16.4s, v28.4s, v8.4s // .............................................................................................................~.....'.................................................................................................................*.... + // str q13, [x0], #(16) // .........................................................................................................~.........'.............................................................................................................*........ + // str q14, [x0, #(-16 + 1*(1024/8))] // ............................................................................................................~......'................................................................................................................*..... + // str q15, [x0, #(-16 + 2*(1024/8))] // ...............................................................................................................~...'...................................................................................................................*.. 
+ // str q16, [x0, #(-16 + 3*(1024/8))] // .................................................................................................................~.'.....................................................................................................................* sub count, count, #1 cbnz count, layer123_start - mul v19.4S, v20.4S, v2.S[2] // ....*...................................................................................................... - add v5.4S, v22.4S, v4.4S // .....*..................................................................................................... - ldr q9, [x0, #0] // *.......................................................................................................... - ldr q12, [x0, #128] // .*......................................................................................................... - sub v15.4S, v13.4S, v17.4S // ..*........................................................................................................ + // Instructions: 107 + // Expected cycles: 108 + // Expected IPC: 0.99 + // + // Wall time: 2.05s + // User time: 2.05s + // + // ------------------------------------------- original position --------------------------------------------> + // 0 25 50 75 100 + // |------------------------|------------------------|------------------------|------------------------|------ + mul v27.4S, v5.4S, v2.S[2] // ...*....................................................................................................... + add v11.4S, v18.4S, v16.4S // .....*..................................................................................................... + ldr q7, [x0, #0] // *.......................................................................................................... + ldr q23, [x0, #128] // .*......................................................................................................... 
+ sub v24.4S, v13.4S, v20.4S // ..*........................................................................................................ // gap // ........................................................................................................... - add v13.4S, v13.4S, v17.4S // ...*....................................................................................................... - mls v19.4S, v11.4S, v8.S[0] // ...........*............................................................................................... + add v9.4S, v13.4S, v20.4S // ....*...................................................................................................... + mls v27.4S, v4.4S, v8.S[0] // ...........*............................................................................................... // gap // ........................................................................................................... - sub v18.4S, v5.4S, v23.4S // ..........*................................................................................................ + sub v13.4S, v11.4S, v14.4S // ..........*................................................................................................ // gap // ........................................................................................................... // gap // ........................................................................................................... - add v5.4S, v5.4S, v23.4S // ..............*............................................................................................ - mul v27.4S, v15.4S, v2.S[0] // ......*.................................................................................................... + add v11.4S, v11.4S, v14.4S // ............*.............................................................................................. 
+ mul v20.4S, v24.4S, v2.S[0] // ......*.................................................................................................... // gap // ........................................................................................................... - add v17.4S, v9.4S, v12.4S // .......*................................................................................................... + sub v21.4S, v7.4S, v23.4S // .......*................................................................................................... // gap // ........................................................................................................... // gap // ........................................................................................................... - sub v9.4S, v9.4S, v12.4S // ........*.................................................................................................. - sqrdmulh v12.4S, v15.4S, v2.S[1] // .........*................................................................................................. + add v7.4S, v7.4S, v23.4S // ........*.................................................................................................. + sqrdmulh v23.4S, v24.4S, v2.S[1] // .........*................................................................................................. // gap // ........................................................................................................... - sub v15.4S, v19.4S, v21.4S // .................*......................................................................................... + sub v24.4S, v27.4S, v19.4S // .................*......................................................................................... // gap // ........................................................................................................... // gap // ........................................................................................................... 
- add v19.4S, v19.4S, v21.4S // ...................*....................................................................................... - mul v28.4S, v18.4S, v1.S[0] // ........................*.................................................................................. + add v27.4S, v27.4S, v19.4S // ...................*....................................................................................... + mul v15.4S, v21.4S, v1.S[2] // ...............*........................................................................................... // gap // ........................................................................................................... - sub v22.4S, v17.4S, v13.4S // ............*.............................................................................................. + sub v18.4S, v7.4S, v9.4S // .............*............................................................................................. // gap // ........................................................................................................... // gap // ........................................................................................................... - add v13.4S, v17.4S, v13.4S // .............*............................................................................................. - sqrdmulh v18.4S, v18.4S, v1.S[1] // .........................*................................................................................. + add v7.4S, v7.4S, v9.4S // ..............*............................................................................................ + mul v9.4S, v13.4S, v1.S[0] // ........................*.................................................................................. // gap // ........................................................................................................... // gap // ........................................................................................................... 
// gap // ........................................................................................................... // gap // ........................................................................................................... - mul v17.4S, v9.4S, v1.S[2] // ...............*........................................................................................... + sqrdmulh v13.4S, v13.4S, v1.S[1] // .........................*................................................................................. // gap // ........................................................................................................... // gap // ........................................................................................................... - sub v4.4S, v13.4S, v5.4S // ....................*...................................................................................... + sub v16.4S, v7.4S, v11.4S // ....................*...................................................................................... // gap // ........................................................................................................... // gap // ........................................................................................................... - add v5.4S, v13.4S, v5.4S // .....................*..................................................................................... - mls v27.4S, v12.4S, v8.S[0] // ................*.......................................................................................... + add v11.4S, v7.4S, v11.4S // .....................*..................................................................................... + mls v20.4S, v23.4S, v8.S[0] // ................*.......................................................................................... // gap // ........................................................................................................... 
// gap // ........................................................................................................... // gap // ........................................................................................................... // gap // ........................................................................................................... - sqrdmulh v9.4S, v9.4S, v1.S[3] // ..................*........................................................................................ + sqrdmulh v7.4S, v21.4S, v1.S[3] // ..................*........................................................................................ // gap // ........................................................................................................... // gap // ........................................................................................................... // gap // ........................................................................................................... // gap // ........................................................................................................... // gap // ........................................................................................................... - mul v12.4S, v22.4S, v0.S[2] // ......................*.................................................................................... + mul v23.4S, v18.4S, v0.S[2] // ......................*.................................................................................... // gap // ........................................................................................................... // gap // ........................................................................................................... // gap // ........................................................................................................... // gap // ........................................................................................................... 
// gap // ........................................................................................................... - sqrdmulh v13.4S, v22.4S, v0.S[3] // .......................*................................................................................... + sqrdmulh v21.4S, v18.4S, v0.S[3] // .......................*................................................................................... // gap // ........................................................................................................... // gap // ........................................................................................................... // gap // ........................................................................................................... // gap // ........................................................................................................... // gap // ........................................................................................................... - mul v22.4S, v15.4S, v1.S[0] // ...............................*........................................................................... + mul v18.4S, v24.4S, v1.S[0] // ...............................*........................................................................... // gap // ........................................................................................................... // gap // ........................................................................................................... // gap // ........................................................................................................... // gap // ........................................................................................................... // gap // ........................................................................................................... 
- sqrdmulh v15.4S, v15.4S, v1.S[1] // .....................................*..................................................................... + sqrdmulh v24.4S, v24.4S, v1.S[1] // .....................................*..................................................................... // gap // ........................................................................................................... // gap // ........................................................................................................... // gap // ........................................................................................................... // gap // ........................................................................................................... // gap // ........................................................................................................... - mls v17.4S, v9.4S, v8.S[0] // ..........................*................................................................................ + mls v15.4S, v7.4S, v8.S[0] // ..........................*................................................................................ // gap // ........................................................................................................... // gap // ........................................................................................................... // gap // ........................................................................................................... // gap // ........................................................................................................... // gap // ........................................................................................................... - mls v28.4S, v18.4S, v8.S[0] // ............................*.............................................................................. 
+ mls v23.4S, v21.4S, v8.S[0] // ...........................*............................................................................... // gap // ........................................................................................................... // gap // ........................................................................................................... // gap // ........................................................................................................... // gap // ........................................................................................................... // gap // ........................................................................................................... - mls v12.4S, v13.4S, v8.S[0] // ...........................*............................................................................... + mls v9.4S, v13.4S, v8.S[0] // ............................*.............................................................................. // gap // ........................................................................................................... // gap // ........................................................................................................... - add v9.4S, v17.4S, v27.4S // ..............................*............................................................................ + add v7.4S, v15.4S, v20.4S // ..............................*............................................................................ // gap // ........................................................................................................... // gap // ........................................................................................................... - sub v13.4S, v17.4S, v27.4S // .............................*............................................................................. 
- mul v18.4S, v4.4S, v0.S[0] // .......................................*................................................................... + sub v13.4S, v15.4S, v20.4S // .............................*............................................................................. + mul v20.4S, v16.4S, v0.S[0] // .......................................*................................................................... // gap // ........................................................................................................... // gap // ........................................................................................................... // gap // ........................................................................................................... // gap // ........................................................................................................... - mls v22.4S, v15.4S, v8.S[0] // .........................................*................................................................. - sub v15.4S, v9.4S, v19.4S // .................................*......................................................................... + mls v18.4S, v24.4S, v8.S[0] // .........................................*................................................................. + sub v24.4S, v7.4S, v27.4S // .................................*......................................................................... // gap // ........................................................................................................... - add v19.4S, v9.4S, v19.4S // ..................................*........................................................................ + add v27.4S, v7.4S, v27.4S // ..................................*........................................................................ // gap // ........................................................................................................... 
// gap // ........................................................................................................... - mul v9.4S, v13.4S, v0.S[2] // ................................*.......................................................................... - sub v27.4S, v12.4S, v28.4S // ....................................*...................................................................... + mul v7.4S, v13.4S, v0.S[2] // ................................*.......................................................................... + sub v21.4S, v23.4S, v9.4S // ....................................*...................................................................... // gap // ........................................................................................................... - add v12.4S, v12.4S, v28.4S // ......................................*.................................................................... + add v23.4S, v23.4S, v9.4S // ......................................*.................................................................... // gap // ........................................................................................................... // gap // ........................................................................................................... - sqrdmulh v13.4S, v13.4S, v0.S[3] // ...................................*....................................................................... + sqrdmulh v9.4S, v13.4S, v0.S[3] // ...................................*....................................................................... // gap // ........................................................................................................... // gap // ........................................................................................................... // gap // ........................................................................................................... 
// gap // ........................................................................................................... // gap // ........................................................................................................... - sqrdmulh v17.4S, v4.4S, v0.S[1] // ..........................................*................................................................ + sqrdmulh v13.4S, v16.4S, v0.S[1] // ..........................................*................................................................ // gap // ........................................................................................................... // gap // ........................................................................................................... // gap // ........................................................................................................... // gap // ........................................................................................................... // gap // ........................................................................................................... - mul v28.4S, v15.4S, v0.S[0] // ...........................................*............................................................... + mul v15.4S, v24.4S, v0.S[0] // ...........................................*............................................................... // gap // ........................................................................................................... // gap // ........................................................................................................... // gap // ........................................................................................................... // gap // ........................................................................................................... // gap // ........................................................................................................... 
- mul v4.4S, v5.4S, v25.4S // ..............................................*............................................................ + mul v16.4S, v11.4S, v25.4S // ..............................................*............................................................ // gap // ........................................................................................................... // gap // ........................................................................................................... // gap // ........................................................................................................... // gap // ........................................................................................................... // gap // ........................................................................................................... - sqrdmulh v5.4S, v5.4S, v26.4S // ...............................................*........................................................... + sqrdmulh v11.4S, v11.4S, v26.4S // ...............................................*........................................................... // gap // ........................................................................................................... // gap // ........................................................................................................... // gap // ........................................................................................................... // gap // ........................................................................................................... // gap // ........................................................................................................... - mls v9.4S, v13.4S, v8.S[0] // ........................................*.................................................................. 
+ mls v7.4S, v9.4S, v8.S[0] // ........................................*.................................................................. // gap // ........................................................................................................... // gap // ........................................................................................................... // gap // ........................................................................................................... // gap // ........................................................................................................... // gap // ........................................................................................................... - mls v18.4S, v17.4S, v8.S[0] // ................................................*.......................................................... + mls v20.4S, v13.4S, v8.S[0] // ................................................*.......................................................... // gap // ........................................................................................................... // gap // ........................................................................................................... // gap // ........................................................................................................... // gap // ........................................................................................................... // gap // ........................................................................................................... - sqrdmulh v15.4S, v15.4S, v0.S[1] // .................................................*......................................................... + sqrdmulh v24.4S, v24.4S, v0.S[1] // .................................................*......................................................... 
// gap // ........................................................................................................... // gap // ........................................................................................................... - sub v13.4S, v9.4S, v22.4S // ............................................*.............................................................. + sub v9.4S, v7.4S, v18.4S // ............................................*.............................................................. // gap // ........................................................................................................... // gap // ........................................................................................................... - add v9.4S, v9.4S, v22.4S // .............................................*............................................................. - mul v17.4S, v27.4S, v0.S[0] // ..................................................*........................................................ + add v7.4S, v7.4S, v18.4S // .............................................*............................................................. + mul v13.4S, v21.4S, v0.S[0] // ..................................................*........................................................ // gap // ........................................................................................................... - cmge v22.4S, v31.4S, v18.4S // ...................................................*....................................................... + cmge v18.4S, v31.4S, v20.4S // ...................................................*....................................................... // gap // ........................................................................................................... // gap // ........................................................................................................... 
- sqrdmulh v27.4S, v27.4S, v0.S[1] // ....................................................*...................................................... - cmge v16.4S, v18.4S, v30.4S // .....................................................*..................................................... + sqrdmulh v21.4S, v21.4S, v0.S[1] // ....................................................*...................................................... + cmge v17.4S, v20.4S, v30.4S // .....................................................*..................................................... // gap // ........................................................................................................... // gap // ........................................................................................................... // gap // ........................................................................................................... // gap // ........................................................................................................... - mls v28.4S, v15.4S, v8.S[0] // ......................................................*.................................................... + mls v15.4S, v24.4S, v8.S[0] // ......................................................*.................................................... // gap // ........................................................................................................... // gap // ........................................................................................................... - sub v15.4S, v22.4S, v16.4S // .......................................................*................................................... + sub v24.4S, v18.4S, v17.4S // .......................................................*................................................... // gap // ........................................................................................................... 
// gap // ........................................................................................................... - mul v22.4S, v13.4S, v0.S[0] // ........................................................*.................................................. + mul v18.4S, v9.4S, v0.S[0] // ........................................................*.................................................. // gap // ........................................................................................................... // gap // ........................................................................................................... // gap // ........................................................................................................... // gap // ........................................................................................................... // gap // ........................................................................................................... - mls v17.4S, v27.4S, v8.S[0] // .........................................................*................................................. + mls v13.4S, v21.4S, v8.S[0] // .........................................................*................................................. // gap // ........................................................................................................... // gap // ........................................................................................................... - cmge v27.4S, v31.4S, v28.4S // ..........................................................*................................................ + cmge v21.4S, v31.4S, v15.4S // ..........................................................*................................................ // gap // ........................................................................................................... 
// gap // ........................................................................................................... - sqrdmulh v13.4S, v13.4S, v0.S[1] // ...........................................................*............................................... - cmge v16.4S, v28.4S, v30.4S // ............................................................*.............................................. + sqrdmulh v9.4S, v9.4S, v0.S[1] // ...........................................................*............................................... + cmge v17.4S, v15.4S, v30.4S // ............................................................*.............................................. // gap // ........................................................................................................... // gap // ........................................................................................................... // gap // ........................................................................................................... // gap // ........................................................................................................... - mls v18.4S, v15.4S, v29.4S // .............................................................*............................................. + mls v20.4S, v24.4S, v8.4S // .............................................................*............................................. // gap // ........................................................................................................... // gap // ........................................................................................................... - sub v15.4S, v27.4S, v16.4S // ..............................................................*............................................ + sub v24.4S, v21.4S, v17.4S // ..............................................................*............................................ 
// gap // ........................................................................................................... // gap // ........................................................................................................... - mls v4.4S, v5.4S, v8.S[0] // ...............................................................*........................................... - cmge v5.4S, v31.4S, v17.4S // ................................................................*.......................................... + mls v16.4S, v11.4S, v8.S[0] // ...............................................................*........................................... + cmge v11.4S, v31.4S, v13.4S // ................................................................*.......................................... // gap // ........................................................................................................... - cmge v27.4S, v17.4S, v30.4S // .................................................................*......................................... + cmge v21.4S, v13.4S, v30.4S // .................................................................*......................................... // gap // ........................................................................................................... // gap // ........................................................................................................... - mls v22.4S, v13.4S, v8.S[0] // ..................................................................*........................................ + mls v18.4S, v9.4S, v8.S[0] // ..................................................................*........................................ // gap // ........................................................................................................... // gap // ........................................................................................................... 
- str q18, [x0, #512] // ...................................................................*....................................... + str q20, [x0, #512] // ...................................................................*....................................... // gap // ........................................................................................................... // gap // ........................................................................................................... - mls v28.4S, v15.4S, v29.4S // ....................................................................*...................................... - sub v5.4S, v5.4S, v27.4S // .....................................................................*..................................... + mls v15.4S, v24.4S, v8.4S // ....................................................................*...................................... + sub v11.4S, v11.4S, v21.4S // .....................................................................*..................................... // gap // ........................................................................................................... - cmge v15.4S, v31.4S, v4.4S // ......................................................................*.................................... + cmge v24.4S, v31.4S, v16.4S // ......................................................................*.................................... // gap // ........................................................................................................... // gap // ........................................................................................................... - cmge v13.4S, v4.4S, v30.4S // .......................................................................*................................... - mul v18.4S, v19.4S, v25.4S // ........................................................................*.................................. 
+ cmge v9.4S, v16.4S, v30.4S // .......................................................................*................................... + mul v20.4S, v27.4S, v25.4S // ........................................................................*.................................. // gap // ........................................................................................................... - cmge v27.4S, v31.4S, v22.4S // .........................................................................*................................. + cmge v21.4S, v31.4S, v18.4S // .........................................................................*................................. // gap // ........................................................................................................... // gap // ........................................................................................................... - sqrdmulh v19.4S, v19.4S, v26.4S // ..............................................................................*............................ - cmge v16.4S, v22.4S, v30.4S // ...........................................................................*............................... + sqrdmulh v27.4S, v27.4S, v26.4S // ..............................................................................*............................ + cmge v17.4S, v18.4S, v30.4S // ...........................................................................*............................... // gap // ........................................................................................................... - str q28, [x0, #640] // ............................................................................*.............................. - sub v15.4S, v15.4S, v13.4S // .............................................................................*............................. 
+ str q15, [x0, #640] // ............................................................................*.............................. + sub v24.4S, v24.4S, v9.4S // .............................................................................*............................. // gap // ........................................................................................................... - mul v13.4S, v12.4S, v25.4S // ................................................................................*.......................... + mul v9.4S, v23.4S, v25.4S // ................................................................................*.......................... // gap // ........................................................................................................... // gap // ........................................................................................................... - sub v27.4S, v27.4S, v16.4S // ...............................................................................*........................... + sub v21.4S, v21.4S, v17.4S // ...............................................................................*........................... // gap // ........................................................................................................... // gap // ........................................................................................................... - sqrdmulh v12.4S, v12.4S, v26.4S // ..................................................................................*........................ + sqrdmulh v23.4S, v23.4S, v26.4S // ..................................................................................*........................ // gap // ........................................................................................................... // gap // ........................................................................................................... 
// gap // ........................................................................................................... // gap // ........................................................................................................... // gap // ........................................................................................................... - mls v18.4S, v19.4S, v8.S[0] // ...................................................................................*....................... + mls v20.4S, v27.4S, v8.S[0] // ...................................................................................*....................... // gap // ........................................................................................................... // gap // ........................................................................................................... // gap // ........................................................................................................... // gap // ........................................................................................................... // gap // ........................................................................................................... - sqrdmulh v19.4S, v9.4S, v26.4S // ....................................................................................*...................... + sqrdmulh v27.4S, v7.4S, v26.4S // ....................................................................................*...................... // gap // ........................................................................................................... // gap // ........................................................................................................... // gap // ........................................................................................................... // gap // ........................................................................................................... 
// gap // ........................................................................................................... - mls v13.4S, v12.4S, v8.S[0] // .....................................................................................*..................... + mls v9.4S, v23.4S, v8.S[0] // .....................................................................................*..................... // gap // ........................................................................................................... // gap // ........................................................................................................... - cmge v12.4S, v31.4S, v18.4S // ......................................................................................*.................... + cmge v23.4S, v31.4S, v20.4S // ......................................................................................*.................... // gap // ........................................................................................................... // gap // ........................................................................................................... - mul v9.4S, v9.4S, v25.4S // .......................................................................................*................... - cmge v28.4S, v18.4S, v30.4S // ........................................................................................*.................. + mul v7.4S, v7.4S, v25.4S // .......................................................................................*................... + cmge v15.4S, v20.4S, v30.4S // ........................................................................................*.................. // gap // ........................................................................................................... // gap // ........................................................................................................... 
// gap // ........................................................................................................... // gap // ........................................................................................................... - mls v9.4S, v19.4S, v8.S[0] // .........................................................................................*................. + mls v7.4S, v27.4S, v8.S[0] // .........................................................................................*................. // gap // ........................................................................................................... // gap // ........................................................................................................... - sub v19.4S, v12.4S, v28.4S // ..........................................................................................*................ + sub v27.4S, v23.4S, v15.4S // ..........................................................................................*................ // gap // ........................................................................................................... // gap // ........................................................................................................... - mls v17.4S, v5.4S, v29.4S // ..........................................................................*................................ - cmge v5.4S, v31.4S, v13.4S // ............................................................................................*.............. + mls v13.4S, v11.4S, v8.4S // ..........................................................................*................................ + cmge v11.4S, v31.4S, v9.4S // ............................................................................................*.............. // gap // ........................................................................................................... 
- cmge v12.4S, v13.4S, v30.4S // .............................................................................................*............. + cmge v23.4S, v9.4S, v30.4S // .............................................................................................*............. // gap // ........................................................................................................... // gap // ........................................................................................................... - mls v22.4S, v27.4S, v29.4S // ...........................................................................................*............... + mls v18.4S, v21.4S, v8.4S // ...........................................................................................*............... // gap // ........................................................................................................... // gap // ........................................................................................................... - cmge v27.4S, v31.4S, v9.4S // ...............................................................................................*........... + cmge v21.4S, v31.4S, v7.4S // ...............................................................................................*........... // gap // ........................................................................................................... // gap // ........................................................................................................... - mls v4.4S, v15.4S, v29.4S // ..............................................................................................*............ - cmge v15.4S, v9.4S, v30.4S // .................................................................................................*......... + mls v16.4S, v24.4S, v8.4S // ..............................................................................................*............ 
+ cmge v24.4S, v7.4S, v30.4S // .................................................................................................*......... // gap // ........................................................................................................... - str q17, [x0, #768] // .................................................................................*......................... - sub v5.4S, v5.4S, v12.4S // ...................................................................................................*....... + str q13, [x0, #768] // .................................................................................*......................... + sub v11.4S, v11.4S, v23.4S // ...................................................................................................*....... // gap // ........................................................................................................... - mls v18.4S, v19.4S, v29.4S // ................................................................................................*.......... + mls v20.4S, v27.4S, v8.4S // ................................................................................................*.......... // gap // ........................................................................................................... // gap // ........................................................................................................... - str q22, [x0, #896] // ..................................................................................................*........ - sub v19.4S, v27.4S, v15.4S // .....................................................................................................*..... + str q18, [x0, #896] // ..................................................................................................*........ + sub v27.4S, v21.4S, v24.4S // .....................................................................................................*..... 
// gap // ........................................................................................................... - mls v13.4S, v5.4S, v29.4S // ......................................................................................................*.... + mls v9.4S, v11.4S, v8.4S // ......................................................................................................*.... // gap // ........................................................................................................... // gap // ........................................................................................................... - str q4, [x0], #(16) // ....................................................................................................*...... + str q16, [x0], #(16) // ....................................................................................................*...... // gap // ........................................................................................................... // gap // ........................................................................................................... - mls v9.4S, v19.4S, v29.4S // ........................................................................................................*.. + mls v7.4S, v27.4S, v8.4S // ........................................................................................................*.. // gap // ........................................................................................................... // gap // ........................................................................................................... - str q18, [x0, #112] // .......................................................................................................*... + str q20, [x0, #112] // .......................................................................................................*... 
// gap // ........................................................................................................... // gap // ........................................................................................................... // gap // ........................................................................................................... // gap // ........................................................................................................... // gap // ........................................................................................................... - str q13, [x0, #240] // .........................................................................................................*. + str q9, [x0, #240] // .........................................................................................................*. // gap // ........................................................................................................... // gap // ........................................................................................................... // gap // ........................................................................................................... // gap // ........................................................................................................... // gap // ........................................................................................................... - str q9, [x0, #368] // ..........................................................................................................* + str q7, [x0, #368] // ..........................................................................................................* // gap // ........................................................................................................... // gap // ........................................................................................................... 
- // original source code - // ldr q19, [x0, #0] // ..*........................................................................................................ - // ldr q5, [x0, #128] // ...*....................................................................................................... - // sub v9.4S, v13.4S, v17.4S // ....*...................................................................................................... - // add v15.4S, v13.4S, v17.4S // .....*..................................................................................................... - // mul v12.4S, v20.4S, v2.S[2] // *.......................................................................................................... - // add v18.4S, v22.4S, v4.4S // .*......................................................................................................... - // mul v28.4S, v9.4S, v2.S[0] // .........*................................................................................................. - // add v16.4S, v19.4S, v5.4S // ..........*................................................................................................ - // sub v19.4S, v19.4S, v5.4S // ...........*............................................................................................... - // sqrdmulh v5.4S, v9.4S, v2.S[1] // ............*.............................................................................................. - // sub v9.4S, v18.4S, v23.4S // .......*................................................................................................... - // mls v12.4S, v11.4S, v8.S[0] // ......*.................................................................................................... - // sub v11.4S, v16.4S, v15.4S // ................*.......................................................................................... 
- // add v15.4S, v16.4S, v15.4S // .................*......................................................................................... - // add v18.4S, v18.4S, v23.4S // ........*.................................................................................................. - // mul v16.4S, v19.4S, v1.S[2] // ...................*....................................................................................... - // mls v28.4S, v5.4S, v8.S[0] // ......................*.................................................................................... - // sub v5.4S, v12.4S, v21.4S // .............*............................................................................................. - // sqrdmulh v19.4S, v19.4S, v1.S[3] // .......................*................................................................................... - // add v12.4S, v12.4S, v21.4S // ..............*............................................................................................ - // sub v27.4S, v15.4S, v18.4S // ....................*...................................................................................... - // add v15.4S, v15.4S, v18.4S // .....................*..................................................................................... - // mul v18.4S, v11.4S, v0.S[2] // ........................*.................................................................................. - // sqrdmulh v11.4S, v11.4S, v0.S[3] // .........................*................................................................................. - // mul v21.4S, v9.4S, v1.S[0] // ...............*........................................................................................... - // sqrdmulh v9.4S, v9.4S, v1.S[1] // ..................*........................................................................................ 
- // mls v16.4S, v19.4S, v8.S[0] // ............................*.............................................................................. - // mls v18.4S, v11.4S, v8.S[0] // ..............................*............................................................................ - // mls v21.4S, v9.4S, v8.S[0] // .............................*............................................................................. - // sub v19.4S, v16.4S, v28.4S // ................................*.......................................................................... - // add v9.4S, v16.4S, v28.4S // ...............................*........................................................................... - // mul v28.4S, v5.4S, v1.S[0] // ..........................*................................................................................ - // mul v16.4S, v19.4S, v0.S[2] // .....................................*..................................................................... - // sub v11.4S, v9.4S, v12.4S // ...................................*....................................................................... - // add v9.4S, v9.4S, v12.4S // ....................................*...................................................................... - // sqrdmulh v19.4S, v19.4S, v0.S[3] // ........................................*.................................................................. - // sub v12.4S, v18.4S, v21.4S // ......................................*.................................................................... - // sqrdmulh v5.4S, v5.4S, v1.S[1] // ...........................*............................................................................... - // add v18.4S, v18.4S, v21.4S // .......................................*................................................................... 
- // mul v21.4S, v27.4S, v0.S[0] // .................................*......................................................................... - // mls v16.4S, v19.4S, v8.S[0] // .............................................*............................................................. - // mls v28.4S, v5.4S, v8.S[0] // ..................................*........................................................................ - // sqrdmulh v19.4S, v27.4S, v0.S[1] // .........................................*................................................................. - // mul v5.4S, v11.4S, v0.S[0] // ..........................................*................................................................ - // sub v27.4S, v16.4S, v28.4S // ................................................*.......................................................... - // add v28.4S, v16.4S, v28.4S // .................................................*......................................................... - // mul v16.4S, v15.4S, v25.4S // ...........................................*............................................................... - // sqrdmulh v15.4S, v15.4S, v26.4S // ............................................*.............................................................. - // mls v21.4S, v19.4S, v8.S[0] // ..............................................*............................................................ - // sqrdmulh v19.4S, v11.4S, v0.S[1] // ...............................................*........................................................... - // mul v11.4S, v12.4S, v0.S[0] // ..................................................*........................................................ - // cmge v24.4S, v31.4S, v21.4S // ...................................................*....................................................... 
- // sqrdmulh v12.4S, v12.4S, v0.S[1] // ....................................................*...................................................... - // cmge v7.4S, v21.4S, v30.4S // .....................................................*..................................................... - // mls v5.4S, v19.4S, v8.S[0] // ......................................................*.................................................... - // sub v19.4S, v24.4S, v7.4S // .......................................................*................................................... - // mul v24.4S, v27.4S, v0.S[0] // ........................................................*.................................................. - // mls v11.4S, v12.4S, v8.S[0] // .........................................................*................................................. - // cmge v12.4S, v31.4S, v5.4S // ..........................................................*................................................ - // sqrdmulh v27.4S, v27.4S, v0.S[1] // ...........................................................*............................................... - // cmge v7.4S, v5.4S, v30.4S // ............................................................*.............................................. - // mls v21.4S, v19.4S, v29.4S // .............................................................*............................................. - // sub v19.4S, v12.4S, v7.4S // ..............................................................*............................................ - // mls v16.4S, v15.4S, v8.S[0] // ...............................................................*........................................... - // cmge v12.4S, v31.4S, v11.4S // ................................................................*.......................................... 
- // cmge v15.4S, v11.4S, v30.4S // .................................................................*......................................... - // mls v24.4S, v27.4S, v8.S[0] // ..................................................................*........................................ - // str q21, [x0, #512] // ...................................................................*....................................... - // mls v5.4S, v19.4S, v29.4S // ....................................................................*...................................... - // sub v19.4S, v12.4S, v15.4S // .....................................................................*..................................... - // cmge v12.4S, v31.4S, v16.4S // ......................................................................*.................................... - // cmge v15.4S, v16.4S, v30.4S // .......................................................................*................................... - // mul v27.4S, v9.4S, v25.4S // ........................................................................*.................................. - // cmge v21.4S, v31.4S, v24.4S // .........................................................................*................................. - // mls v11.4S, v19.4S, v29.4S // .........................................................................................*................. - // cmge v19.4S, v24.4S, v30.4S // ...........................................................................*............................... - // str q5, [x0, #640] // ............................................................................*.............................. - // sub v5.4S, v12.4S, v15.4S // .............................................................................*............................. - // sqrdmulh v9.4S, v9.4S, v26.4S // ..........................................................................*................................ 
- // sub v19.4S, v21.4S, v19.4S // ...............................................................................*........................... - // mul v12.4S, v18.4S, v25.4S // ..............................................................................*............................ - // str q11, [x0, #768] // ................................................................................................*.......... - // sqrdmulh v15.4S, v18.4S, v26.4S // ................................................................................*.......................... - // mls v27.4S, v9.4S, v8.S[0] // .................................................................................*......................... - // sqrdmulh v9.4S, v28.4S, v26.4S // ..................................................................................*........................ - // mls v12.4S, v15.4S, v8.S[0] // ...................................................................................*....................... - // cmge v15.4S, v31.4S, v27.4S // ....................................................................................*...................... - // mul v18.4S, v28.4S, v25.4S // .....................................................................................*..................... - // cmge v28.4S, v27.4S, v30.4S // ......................................................................................*.................... - // mls v18.4S, v9.4S, v8.S[0] // .......................................................................................*................... - // sub v9.4S, v15.4S, v28.4S // ........................................................................................*.................. - // mls v24.4S, v19.4S, v29.4S // ............................................................................................*.............. 
- // cmge v19.4S, v31.4S, v12.4S // ..........................................................................................*................ - // cmge v15.4S, v12.4S, v30.4S // ...........................................................................................*............... - // mls v16.4S, v5.4S, v29.4S // ..............................................................................................*............ - // cmge v5.4S, v31.4S, v18.4S // .............................................................................................*............. - // mls v27.4S, v9.4S, v29.4S // ..................................................................................................*........ - // cmge v9.4S, v18.4S, v30.4S // ...............................................................................................*........... - // str q24, [x0, #896] // ...................................................................................................*....... - // sub v19.4S, v19.4S, v15.4S // .................................................................................................*......... - // str q16, [x0], #(16) // ......................................................................................................*.... - // sub v5.4S, v5.4S, v9.4S // ....................................................................................................*...... - // mls v12.4S, v19.4S, v29.4S // .....................................................................................................*..... - // str q27, [x0, #112] // ........................................................................................................*.. - // mls v18.4S, v5.4S, v29.4S // .......................................................................................................*... - // str q12, [x0, #240] // .........................................................................................................*. 
- // str q18, [x0, #368] // ..........................................................................................................* + // ---------------------------------------------- new position ----------------------------------------------> + // 0 25 50 75 100 + // |------------------------|------------------------|------------------------|------------------------|------ + // ldr q27, [x0, #0] // ..*........................................................................................................ + // ldr q11, [x0, #128] // ...*....................................................................................................... + // sub v7.4S, v13.4S, v20.4S // ....*...................................................................................................... + // mul v24.4S, v5.4S, v2.S[2] // *.......................................................................................................... + // add v23.4S, v13.4S, v20.4S // .....*..................................................................................................... + // add v9.4S, v18.4S, v16.4S // .*......................................................................................................... + // mul v15.4S, v7.4S, v2.S[0] // .........*................................................................................................. + // sub v5.4S, v27.4S, v11.4S // ..........*................................................................................................ + // add v27.4S, v27.4S, v11.4S // ...........*............................................................................................... + // sqrdmulh v11.4S, v7.4S, v2.S[1] // ............*.............................................................................................. + // sub v7.4S, v9.4S, v14.4S // .......*................................................................................................... 
+ // mls v24.4S, v4.4S, v8.S[0] // ......*.................................................................................................... + // add v9.4S, v9.4S, v14.4S // ........*.................................................................................................. + // sub v4.4S, v27.4S, v23.4S // ................*.......................................................................................... + // add v27.4S, v27.4S, v23.4S // .................*......................................................................................... + // mul v23.4S, v5.4S, v1.S[2] // ...............*........................................................................................... + // mls v15.4S, v11.4S, v8.S[0] // ......................*.................................................................................... + // sub v11.4S, v24.4S, v19.4S // .............*............................................................................................. + // sqrdmulh v21.4S, v5.4S, v1.S[3] // .......................*................................................................................... + // add v24.4S, v24.4S, v19.4S // ..............*............................................................................................ + // sub v17.4S, v27.4S, v9.4S // ....................*...................................................................................... + // add v27.4S, v27.4S, v9.4S // .....................*..................................................................................... + // mul v9.4S, v4.4S, v0.S[2] // ........................*.................................................................................. + // sqrdmulh v4.4S, v4.4S, v0.S[3] // .........................*................................................................................. 
+ // mul v19.4S, v7.4S, v1.S[0] // ..................*........................................................................................ + // sqrdmulh v7.4S, v7.4S, v1.S[1] // ...................*....................................................................................... + // mls v23.4S, v21.4S, v8.S[0] // ............................*.............................................................................. + // mls v9.4S, v4.4S, v8.S[0] // .............................*............................................................................. + // mls v19.4S, v7.4S, v8.S[0] // ..............................*............................................................................ + // sub v7.4S, v23.4S, v15.4S // ................................*.......................................................................... + // add v23.4S, v23.4S, v15.4S // ...............................*........................................................................... + // mul v21.4S, v11.4S, v1.S[0] // ..........................*................................................................................ + // mul v15.4S, v7.4S, v0.S[2] // .....................................*..................................................................... + // sub v4.4S, v23.4S, v24.4S // ...................................*....................................................................... + // add v23.4S, v23.4S, v24.4S // ....................................*...................................................................... + // sqrdmulh v7.4S, v7.4S, v0.S[3] // ........................................*.................................................................. + // sub v24.4S, v9.4S, v19.4S // ......................................*.................................................................... 
+ // sqrdmulh v11.4S, v11.4S, v1.S[1] // ...........................*............................................................................... + // add v9.4S, v9.4S, v19.4S // .......................................*................................................................... + // mul v19.4S, v17.4S, v0.S[0] // .................................*......................................................................... + // mls v15.4S, v7.4S, v8.S[0] // .............................................*............................................................. + // mls v21.4S, v11.4S, v8.S[0] // ..................................*........................................................................ + // sqrdmulh v11.4S, v17.4S, v0.S[1] // .........................................*................................................................. + // mul v7.4S, v4.4S, v0.S[0] // ..........................................*................................................................ + // sub v17.4S, v15.4S, v21.4S // ................................................*.......................................................... + // add v21.4S, v15.4S, v21.4S // .................................................*......................................................... + // mul v15.4S, v27.4S, v25.4S // ...........................................*............................................................... + // sqrdmulh v27.4S, v27.4S, v26.4S // ............................................*.............................................................. + // mls v19.4S, v11.4S, v8.S[0] // ..............................................*............................................................ + // sqrdmulh v11.4S, v4.4S, v0.S[1] // ...............................................*........................................................... 
+ // mul v4.4S, v24.4S, v0.S[0] // ..................................................*........................................................ + // cmge v29.4S, v31.4S, v19.4S // ...................................................*....................................................... + // sqrdmulh v24.4S, v24.4S, v0.S[1] // ....................................................*...................................................... + // cmge v6.4S, v19.4S, v30.4S // .....................................................*..................................................... + // mls v7.4S, v11.4S, v8.S[0] // ......................................................*.................................................... + // sub v11.4S, v29.4S, v6.4S // .......................................................*................................................... + // mul v29.4S, v17.4S, v0.S[0] // ........................................................*.................................................. + // mls v4.4S, v24.4S, v8.S[0] // .........................................................*................................................. + // cmge v24.4S, v31.4S, v7.4S // ..........................................................*................................................ + // sqrdmulh v17.4S, v17.4S, v0.S[1] // ...........................................................*............................................... + // cmge v6.4S, v7.4S, v30.4S // ............................................................*.............................................. + // mls v19.4S, v11.4S, v8.4S // .............................................................*............................................. + // sub v11.4S, v24.4S, v6.4S // ..............................................................*............................................ 
+ // mls v15.4S, v27.4S, v8.S[0] // ...............................................................*........................................... + // cmge v27.4S, v31.4S, v4.4S // ................................................................*.......................................... + // cmge v24.4S, v4.4S, v30.4S // .................................................................*......................................... + // mls v29.4S, v17.4S, v8.S[0] // ..................................................................*........................................ + // str q19, [x0, #512] // ...................................................................*....................................... + // mls v7.4S, v11.4S, v8.4S // ....................................................................*...................................... + // sub v27.4S, v27.4S, v24.4S // .....................................................................*..................................... + // cmge v11.4S, v31.4S, v15.4S // ......................................................................*.................................... + // cmge v24.4S, v15.4S, v30.4S // .......................................................................*................................... + // mul v17.4S, v23.4S, v25.4S // ........................................................................*.................................. + // cmge v19.4S, v31.4S, v29.4S // .........................................................................*................................. + // mls v4.4S, v27.4S, v8.4S // .........................................................................................*................. + // cmge v27.4S, v29.4S, v30.4S // ...........................................................................*............................... + // str q7, [x0, #640] // ............................................................................*.............................. 
+ // sub v11.4S, v11.4S, v24.4S // .............................................................................*............................. + // sqrdmulh v7.4S, v23.4S, v26.4S // ..........................................................................*................................ + // sub v27.4S, v19.4S, v27.4S // ...............................................................................*........................... + // mul v23.4S, v9.4S, v25.4S // ..............................................................................*............................ + // str q4, [x0, #768] // ................................................................................................*.......... + // sqrdmulh v24.4S, v9.4S, v26.4S // ................................................................................*.......................... + // mls v17.4S, v7.4S, v8.S[0] // .................................................................................*......................... + // sqrdmulh v7.4S, v21.4S, v26.4S // ..................................................................................*........................ + // mls v23.4S, v24.4S, v8.S[0] // ...................................................................................*....................... + // cmge v24.4S, v31.4S, v17.4S // ....................................................................................*...................... + // mul v9.4S, v21.4S, v25.4S // .....................................................................................*..................... + // cmge v21.4S, v17.4S, v30.4S // ......................................................................................*.................... + // mls v9.4S, v7.4S, v8.S[0] // .......................................................................................*................... 
+ // sub v7.4S, v24.4S, v21.4S // ........................................................................................*.................. + // mls v29.4S, v27.4S, v8.4S // ............................................................................................*.............. + // cmge v27.4S, v31.4S, v23.4S // ..........................................................................................*................ + // cmge v24.4S, v23.4S, v30.4S // ...........................................................................................*............... + // mls v15.4S, v11.4S, v8.4S // ..............................................................................................*............ + // cmge v11.4S, v31.4S, v9.4S // .............................................................................................*............. + // mls v17.4S, v7.4S, v8.4S // ..................................................................................................*........ + // cmge v7.4S, v9.4S, v30.4S // ...............................................................................................*........... + // str q29, [x0, #896] // ...................................................................................................*....... + // sub v27.4S, v27.4S, v24.4S // .................................................................................................*......... + // str q15, [x0], #(16) // ......................................................................................................*.... + // sub v11.4S, v11.4S, v7.4S // ....................................................................................................*...... + // mls v23.4S, v27.4S, v8.4S // .....................................................................................................*..... + // str q17, [x0, #112] // ........................................................................................................*.. 
+ // mls v9.4S, v11.4S, v8.4S // .......................................................................................................*... + // str q23, [x0, #240] // .........................................................................................................*. + // str q9, [x0, #368] // ..........................................................................................................* pop_stack diff --git a/tests/ntt_dilithium/manual/intt_dilithium_123_45678_manual_ld4_opt_m1_firestorm.s b/tests/ntt_dilithium/manual/intt_dilithium_123_45678_manual_ld4_opt_m1_firestorm.s index 0c486d5..844e4e4 100644 --- a/tests/ntt_dilithium/manual/intt_dilithium_123_45678_manual_ld4_opt_m1_firestorm.s +++ b/tests/ntt_dilithium/manual/intt_dilithium_123_45678_manual_ld4_opt_m1_firestorm.s @@ -67,7 +67,7 @@ xtmp1 .req x11 cmge \tmp1\().4s, \neg_modulus_half\().4s, \a\().4s cmge \tmp2\().4s, \a\().4s, \modulus_half\().4s sub \tmp2\().4s, \tmp1\().4s, \tmp2\().4s - vmls \a, \tmp2, modulus + vmls \a, \tmp2, consts .endm .macro gs_butterfly a, b, root, idx0, idx1 @@ -76,12 +76,6 @@ xtmp1 .req x11 mulmodq \b, tmp, \root, \idx0, \idx1 .endm -.macro mulmod_v dst, src, const, const_twisted - vmul \dst, \src, \const - vqrdmulh \src, \src, \const_twisted - vmls \dst, \src, modulus -.endm - .macro gs_butterfly_v a, b, root, root_twisted sub tmp.4s, \a\().4s, \b\().4s add \a\().4s, \a\().4s, \b\().4s @@ -193,7 +187,7 @@ xtmp1 .req x11 trn2 \data_out3\().4s, \data_in2\().4s, \data_in3\().4s .endm -.macro save_gprs // slothy:no-unfold +.macro save_gprs // @slothy:no-unfold sub sp, sp, #(16*6) stp x19, x20, [sp, #16*0] stp x19, x20, [sp, #16*0] @@ -204,7 +198,7 @@ xtmp1 .req x11 stp x29, x30, [sp, #16*5] .endm -.macro restore_gprs // slothy:no-unfold +.macro restore_gprs // @slothy:no-unfold ldp x19, x20, [sp, #16*0] ldp x21, x22, [sp, #16*1] ldp x23, x24, [sp, #16*2] @@ -214,7 +208,7 @@ xtmp1 .req x11 add sp, sp, #(16*6) .endm -.macro save_vregs // slothy:no-unfold +.macro 
save_vregs // @slothy:no-unfold sub sp, sp, #(16*4) stp d8, d9, [sp, #16*0] stp d10, d11, [sp, #16*1] @@ -222,7 +216,7 @@ xtmp1 .req x11 stp d14, d15, [sp, #16*3] .endm -.macro restore_vregs // slothy:no-unfold +.macro restore_vregs // @slothy:no-unfold ldp d8, d9, [sp, #16*0] ldp d10, d11, [sp, #16*1] ldp d12, d13, [sp, #16*2] @@ -233,19 +227,19 @@ xtmp1 .req x11 #define STACK_SIZE 16 #define STACK0 0 -.macro restore a, loc // slothy:no-unfold +.macro restore a, loc // @slothy:no-unfold ldr \a, [sp, #\loc\()] .endm -.macro save loc, a // slothy:no-unfold +.macro save loc, a // @slothy:no-unfold str \a, [sp, #\loc\()] .endm -.macro push_stack // slothy:no-unfold +.macro push_stack // @slothy:no-unfold save_gprs save_vregs sub sp, sp, #STACK_SIZE .endm -.macro pop_stack // slothy:no-unfold +.macro pop_stack // @slothy:no-unfold add sp, sp, #STACK_SIZE restore_vregs restore_gprs @@ -371,8 +365,6 @@ _intt_dilithium_123_45678_manual_ld4_opt_m1_firestorm: consts .req v8 qform_consts .req q8 - modulus .req v29 - ASM_LOAD(r_ptr0, roots_l345) ASM_LOAD(r_ptr1, roots_l67) @@ -395,190 +387,240 @@ _intt_dilithium_123_45678_manual_ld4_opt_m1_firestorm: qform_root3_tw .req q7 .p2align 2 - ldr q0, [x1, #32] // *. - ldr q18, [x1, #48] // .* - // gap // .. - // gap // .. - // gap // .. - // gap // .. - // gap // .. - // gap // .. - - // original source code - // ldr q0, [x1, #32] // *. - // ldr q18, [x1, #48] // .* + // Instructions: 10 + // Expected cycles: 4 + // Expected IPC: 2.50 + // + // Wall time: 0.09s + // User time: 0.09s + // + // ----- original position -----> + // 0 25 + // |------------------------|---- + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + ldr q29, [x1, #0] // .....*........................ 
+ // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + ldr q11, [x5, #96] // .........*.................... + ldr q0, [x5, #32] // *............................. + ldr q24, [x5], #(12*16) // .*............................ + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + ldr q19, [x1, #32] // ........*..................... + // gap // .............................. + ldr q2, [x5, #-112] // ...*.......................... + // gap // .............................. + ldr q30, [x5, #-128] // ....*......................... + ldr q7, [x1, #48] // ......*....................... + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + ldr q15, [x5, #-144] // ..*........................... + ldr q6, [x1, #16] // .......*...................... + + // -------- new position --------> + // 0 25 + // |------------------------|----- + // ldr q0, [x5, #32] // ..*............................ + // ldr q24, [x5], #(12*16) // ...*........................... + // ldr q15, [x5, #-144] // ........*...................... + // ldr q2, [x5, #-112] // .....*......................... + // ldr q30, [x5, #-128] // ......*........................ + // ldr q29, [x1, #0] // *.............................. + // ldr q7, [x1, #48] // .......*....................... + // ldr q6, [x1, #16] // .........*..................... + // ldr q19, [x1, #32] // ....*.......................... + // ldr q11, [x5, #-96] // .*............................. 
sub count, count, #1 layer45678_start: - ldr q29, [x1, #0] // *............................................................................................................................................................................. - ldr q19, [x1, #16] // .*............................................................................................................................................................................ - ldr q5, [x2, #0] // ............*................................................................................................................................................................. - trn1 v9.4S, v0.4S, v18.4S // ......*....................................................................................................................................................................... - trn2 v0.4S, v0.4S, v18.4S // .......*...................................................................................................................................................................... + // Instructions: 174 + // Expected cycles: 54 + // Expected IPC: 3.22 + // + // Wall time: 209.71s + // User time: 209.71s + // + // ----------------------------------------------------------------------------- original position -----------------------------------------------------------------------------> + // 0 25 50 75 100 125 150 + // |------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|----------------------- + ldr q10, [x2, #32] // ..............*............................................................................................................................................................... + trn2 v25.4S, v29.4S, v6.4S // .....*........................................................................................................................................................................ 
+ trn2 v22.4S, v19.4S, v7.4S // .......*...................................................................................................................................................................... + ldr q14, [x2, #16] // .............*................................................................................................................................................................ + ldr q20, [x2, #0] // ............*................................................................................................................................................................. + trn1 v23.4S, v29.4S, v6.4S // ....*......................................................................................................................................................................... + trn1 v4.4S, v19.4S, v7.4S // ......*....................................................................................................................................................................... // gap // .............................................................................................................................................................................. + ldr q19, [x2, #48] // ...............*.............................................................................................................................................................. + ldr q26, [x5, #-64] // ....................................................*......................................................................................................................... // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. 
- ldr q2, [x2, #16] // .............*................................................................................................................................................................ - ldr q12, [x2, #32] // ..............*............................................................................................................................................................... - ldr q15, [x2, #48] // ...............*.............................................................................................................................................................. // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. + trn1 v17.2D, v23.2D, v4.2D // ..........*................................................................................................................................................................... + trn2 v18.2D, v23.2D, v4.2D // ........*..................................................................................................................................................................... + trn1 v5.2D, v25.2D, v22.2D // ...........*.................................................................................................................................................................. 
+ trn2 v7.2D, v25.2D, v22.2D // .........*.................................................................................................................................................................... + ldr q31, [x5, #-16] // .......................................................*...................................................................................................................... // gap // .............................................................................................................................................................................. - ldr q13, [x5, #32] // ..........................*................................................................................................................................................... - ldr q18, [x5], #(12*16) // ........................*..................................................................................................................................................... - ldr q27, [x5, #-176] // .........................*.................................................................................................................................................... // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. 
// gap // .............................................................................................................................................................................. - ldr q17, [x5, #-144] // ...........................*.................................................................................................................................................. - ldr q31, [x5, #-128] // ............................*................................................................................................................................................. - ldr q25, [x5, #-112] // .............................*................................................................................................................................................ // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. - trn1 v28.4S, v29.4S, v19.4S // ....*......................................................................................................................................................................... 
- trn2 v29.4S, v29.4S, v19.4S // .....*........................................................................................................................................................................ - ldr q19, [x5, #-64] // ....................................................*......................................................................................................................... - ldr q3, [x5, #-96] // ..................................................*........................................................................................................................... - ldr q22, [x5, #-80] // ...................................................*.......................................................................................................................... + trn2 v28.4S, v20.4S, v14.4S // .................*............................................................................................................................................................ + sub v22.4S, v18.4S, v7.4S // ...................................*.......................................................................................................................................... + sub v9.4S, v17.4S, v5.4S // ..............................*............................................................................................................................................... + add v16.4S, v17.4S, v5.4S // ...............................*.............................................................................................................................................. // gap // .............................................................................................................................................................................. 
// gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. - trn1 v1.4S, v5.4S, v2.4S // ................*............................................................................................................................................................. - trn2 v5.4S, v5.4S, v2.4S // .................*............................................................................................................................................................ - trn1 v2.4S, v12.4S, v15.4S // ..................*........................................................................................................................................................... - trn2 v12.4S, v12.4S, v15.4S // ...................*.......................................................................................................................................................... - ldr q15, [x5, #-48] // .....................................................*........................................................................................................................ - ldr q4, [x5, #-32] // ......................................................*....................................................................................................................... - ldr q16, [x5, #-16] // .......................................................*...................................................................................................................... // gap // .............................................................................................................................................................................. 
- trn2 v20.2D, v28.2D, v9.2D // ........*..................................................................................................................................................................... - trn1 v9.2D, v28.2D, v9.2D // ..........*................................................................................................................................................................... - trn2 v28.2D, v29.2D, v0.2D // .........*.................................................................................................................................................................... - trn1 v29.2D, v29.2D, v0.2D // ...........*.................................................................................................................................................................. - ldr q0, [x4, #32] // ..............................................................................................*............................................................................... - ldr q11, [x4, #16] // .............................................................................................*................................................................................ - ldr q6, [x4], #64 // ............................................................................................*................................................................................. + trn1 v21.4S, v10.4S, v19.4S // ..................*........................................................................................................................................................... + trn2 v6.4S, v10.4S, v19.4S // ...................*.......................................................................................................................................................... 
+ trn1 v19.4S, v20.4S, v14.4S // ................*............................................................................................................................................................. + add v4.4S, v18.4S, v7.4S // ....................................*......................................................................................................................................... + ldr q14, [x5, #-80] // ...................................................*.......................................................................................................................... // gap // .............................................................................................................................................................................. - trn2 v23.2D, v1.2D, v2.2D // ....................*......................................................................................................................................................... - trn1 v2.2D, v1.2D, v2.2D // ......................*....................................................................................................................................................... - trn2 v1.2D, v5.2D, v12.2D // .....................*........................................................................................................................................................ - trn1 v5.2D, v5.2D, v12.2D // .......................*...................................................................................................................................................... - ldr q12, [x4, #-16] // ...............................................................................................*.............................................................................. 
// gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. + mul v0.4S, v9.4S, v0.4S // ................................*............................................................................................................................................. + sqrdmulh v29.4S, v9.4S, v15.4S // .................................*............................................................................................................................................ + sqrdmulh v17.4S, v22.4S, v2.4S // ......................................*....................................................................................................................................... + mul v22.4S, v22.4S, v30.4S // .....................................*........................................................................................................................................ + ldr q30, [x5, #-48] // .....................................................*........................................................................................................................ // gap // .............................................................................................................................................................................. - sub v21.4S, v20.4S, v28.4S // ...................................*.......................................................................................................................................... 
- add v28.4S, v20.4S, v28.4S // ....................................*......................................................................................................................................... - sub v20.4S, v9.4S, v29.4S // ..............................*............................................................................................................................................... - add v29.4S, v9.4S, v29.4S // ...............................*.............................................................................................................................................. // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. + trn2 v20.2D, v28.2D, v6.2D // .....................*........................................................................................................................................................ + trn1 v28.2D, v28.2D, v6.2D // .......................*...................................................................................................................................................... + trn2 v10.2D, v19.2D, v21.2D // ....................*......................................................................................................................................................... + trn1 v21.2D, v19.2D, v21.2D // ......................*....................................................................................................................................................... 
+ ldr q6, [x5, #-32] // ......................................................*....................................................................................................................... // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. - sub v9.4S, v23.4S, v1.4S // .............................................................*................................................................................................................ - add v1.4S, v23.4S, v1.4S // ..............................................................*............................................................................................................... - sub v23.4S, v2.4S, v5.4S // ........................................................*..................................................................................................................... - add v5.4S, v2.4S, v5.4S // .........................................................*.................................................................................................................... // gap // .............................................................................................................................................................................. + ldr q18, [x5, #-176] // .........................*.................................................................................................................................................... + sub v12.4S, v16.4S, v4.4S // ........................................*..................................................................................................................................... 
// gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. - mul v2.4S, v20.4S, v13.4S // ................................*............................................................................................................................................. - sqrdmulh v13.4S, v20.4S, v17.4S // .................................*............................................................................................................................................ - mul v17.4S, v21.4S, v31.4S // .....................................*........................................................................................................................................ - sqrdmulh v31.4S, v21.4S, v25.4S // ......................................*....................................................................................................................................... // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. 
+ mls v22.4S, v17.4S, v8.S[0] // .......................................*...................................................................................................................................... + ldr q17, [x4, #16] // .............................................................................................*................................................................................ + sub v13.4S, v21.4S, v28.4S // ........................................................*..................................................................................................................... + sub v27.4S, v10.4S, v20.4S // .............................................................*................................................................................................................ + mls v0.4S, v29.4S, v8.S[0] // ..................................*........................................................................................................................................... // gap // .............................................................................................................................................................................. - mul v19.4S, v23.4S, v19.4S // ..........................................................*................................................................................................................... - sqrdmulh v15.4S, v23.4S, v15.4S // ...........................................................*.................................................................................................................. - mul v25.4S, v9.4S, v4.4S // ...............................................................*.............................................................................................................. 
- sqrdmulh v9.4S, v9.4S, v16.4S // ................................................................*............................................................................................................. // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. + add v2.4S, v21.4S, v28.4S // .........................................................*.................................................................................................................... + add v28.4S, v10.4S, v20.4S // ..............................................................*............................................................................................................... // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. - sub v4.4S, v5.4S, v1.4S // ..................................................................*........................................................................................................... - add v5.4S, v5.4S, v1.4S // ...................................................................*.......................................................................................................... - sub v1.4S, v29.4S, v28.4S // ........................................*..................................................................................................................................... 
// gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. + sqrdmulh v19.4S, v13.4S, v30.4S // ...........................................................*.................................................................................................................. + mul v29.4S, v13.4S, v26.4S // ..........................................................*................................................................................................................... + mul v30.4S, v27.4S, v6.4S // ...............................................................*.............................................................................................................. + sqrdmulh v6.4S, v27.4S, v31.4S // ................................................................*............................................................................................................. // gap // .............................................................................................................................................................................. - add v29.4S, v29.4S, v28.4S // .........................................*.................................................................................................................................... 
- mls v2.4S, v13.4S, v8.S[0] // ..................................*........................................................................................................................................... - mls v17.4S, v31.4S, v8.S[0] // .......................................*...................................................................................................................................... // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. + add v21.4S, v0.4S, v22.4S // ..............................................*............................................................................................................................... + sub v31.4S, v0.4S, v22.4S // .............................................*................................................................................................................................ + ldr q0, [x5, #32] // ..........................e................................................................................................................................................... + sub v10.4S, v2.4S, v28.4S // ..................................................................*........................................................................................................... // gap // .............................................................................................................................................................................. 
// gap // .............................................................................................................................................................................. - mls v19.4S, v15.4S, v8.S[0] // ............................................................*................................................................................................................. - mls v25.4S, v9.4S, v8.S[0] // .................................................................*............................................................................................................ - mul v9.4S, v1.4S, v18.4S // ..........................................*................................................................................................................................... - sqrdmulh v15.4S, v1.4S, v27.4S // ...........................................*.................................................................................................................................. // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. + mul v27.4S, v12.4S, v24.4S // ..........................................*................................................................................................................................... + sqrdmulh v12.4S, v12.4S, v18.4S // ...........................................*.................................................................................................................................. // gap // .............................................................................................................................................................................. 
// gap // .............................................................................................................................................................................. - mul v13.4S, v4.4S, v3.4S // ....................................................................*......................................................................................................... - sqrdmulh v31.4S, v4.4S, v22.4S // .....................................................................*........................................................................................................ // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. + mls v30.4S, v6.4S, v8.S[0] // .................................................................*............................................................................................................ + ldr q6, [x4, #32] // ..............................................................................................*............................................................................... + sqrdmulh v3.4S, v31.4S, v18.4S // ................................................*............................................................................................................................. 
+ mls v29.4S, v19.4S, v8.S[0] // ............................................................*................................................................................................................. + mul v19.4S, v31.4S, v24.4S // ...............................................*.............................................................................................................................. + ldr q24, [x5], #(12*16) // ........................e..................................................................................................................................................... // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. - sub v28.4S, v2.4S, v17.4S // .............................................*................................................................................................................................ - add v2.4S, v2.4S, v17.4S // ..............................................*............................................................................................................................... + add v31.4S, v16.4S, v4.4S // .........................................*.................................................................................................................................... + ldr q16, [x4], #64 // ............................................................................................*................................................................................. 
+ mul v22.4S, v10.4S, v11.4S // ....................................................................*......................................................................................................... + sqrdmulh v10.4S, v10.4S, v14.4S // .....................................................................*........................................................................................................ // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. + mls v27.4S, v12.4S, v8.S[0] // ............................................*................................................................................................................................. + add v12.4S, v2.4S, v28.4S // ...................................................................*.......................................................................................................... // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. 
- sub v17.4S, v19.4S, v25.4S // .......................................................................*...................................................................................................... - add v19.4S, v19.4S, v25.4S // ........................................................................*..................................................................................................... - mls v9.4S, v15.4S, v8.S[0] // ............................................*................................................................................................................................. // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. + mls v19.4S, v3.4S, v8.S[0] // .................................................*............................................................................................................................ + sub v1.4S, v29.4S, v30.4S // .......................................................................*...................................................................................................... + add v28.4S, v29.4S, v30.4S // ........................................................................*..................................................................................................... 
// gap // .............................................................................................................................................................................. - mul v15.4S, v28.4S, v18.4S // ...............................................*.............................................................................................................................. - sqrdmulh v18.4S, v28.4S, v27.4S // ................................................*............................................................................................................................. - mls v13.4S, v31.4S, v8.S[0] // ......................................................................*....................................................................................................... - trn1 v27.4S, v29.4S, v2.4S // ............................................................................*................................................................................................. // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. - mul v31.4S, v17.4S, v3.4S // .........................................................................*.................................................................................................... 
- sqrdmulh v17.4S, v17.4S, v22.4S // ..........................................................................*................................................................................................... - trn2 v29.4S, v29.4S, v2.4S // .............................................................................*................................................................................................ - trn1 v2.4S, v5.4S, v19.4S // ....................................................................................*......................................................................................... + mls v22.4S, v10.4S, v8.S[0] // ......................................................................*....................................................................................................... + trn2 v30.4S, v31.4S, v21.4S // .............................................................................*................................................................................................ // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. - trn2 v19.4S, v5.4S, v19.4S // .....................................................................................*........................................................................................ 
// gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. + trn1 v25.4S, v12.4S, v28.4S // ....................................................................................*......................................................................................... + trn2 v28.4S, v12.4S, v28.4S // .....................................................................................*........................................................................................ + sqrdmulh v10.4S, v1.4S, v14.4S // ..........................................................................*................................................................................................... + mul v5.4S, v1.4S, v11.4S // .........................................................................*.................................................................................................... // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. 
+ trn2 v26.4S, v27.4S, v19.4S // ...............................................................................*.............................................................................................. + trn1 v27.4S, v27.4S, v19.4S // ..............................................................................*............................................................................................... + trn1 v19.4S, v31.4S, v21.4S // ............................................................................*................................................................................................. + ldr q31, [x4, #-16] // ...............................................................................................*.............................................................................. // gap // .............................................................................................................................................................................. - mls v15.4S, v18.4S, v8.S[0] // .................................................*............................................................................................................................ // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. @@ -586,15 +628,19 @@ layer45678_start: // gap // .............................................................................................................................................................................. 
// gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. - mls v31.4S, v17.4S, v8.S[0] // ...........................................................................*.................................................................................................. // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. + mls v5.4S, v10.4S, v8.S[0] // ...........................................................................*.................................................................................................. + trn2 v10.2D, v19.2D, v27.2D // ................................................................................*............................................................................................. + trn1 v19.2D, v19.2D, v27.2D // ..................................................................................*........................................................................................... 
+ trn2 v29.2D, v30.2D, v26.2D // .................................................................................*............................................................................................ // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. + trn1 v4.2D, v30.2D, v26.2D // ...................................................................................*.......................................................................................... // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. @@ -602,219 +648,219 @@ layer45678_start: // gap // .............................................................................................................................................................................. 
// gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. - trn1 v5.4S, v9.4S, v15.4S // ..............................................................................*............................................................................................... - trn2 v9.4S, v9.4S, v15.4S // ...............................................................................*.............................................................................................. + sub v12.4S, v10.4S, v29.4S // .....................................................................................................*........................................................................ + add v23.4S, v10.4S, v29.4S // ......................................................................................................*....................................................................... // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. 
// gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. - trn1 v15.4S, v13.4S, v31.4S // ......................................................................................*....................................................................................... - trn2 v13.4S, v13.4S, v31.4S // .......................................................................................*...................................................................................... + trn1 v30.4S, v22.4S, v5.4S // ......................................................................................*....................................................................................... + trn2 v9.4S, v22.4S, v5.4S // .......................................................................................*...................................................................................... // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. 
// gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. - trn2 v18.2D, v27.2D, v5.2D // ................................................................................*............................................................................................. - trn1 v5.2D, v27.2D, v5.2D // ..................................................................................*........................................................................................... - trn2 v27.2D, v29.2D, v9.2D // .................................................................................*............................................................................................ - trn1 v29.2D, v29.2D, v9.2D // ...................................................................................*.......................................................................................... + mul v14.4S, v12.4S, v6.S[0] // .......................................................................................................*...................................................................... + add v18.4S, v19.4S, v4.4S // .................................................................................................*............................................................................ // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. 
// gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. - trn2 v9.2D, v2.2D, v15.2D // ........................................................................................*..................................................................................... - trn1 v2.2D, v2.2D, v15.2D // ..........................................................................................*................................................................................... - trn2 v15.2D, v19.2D, v13.2D // .........................................................................................*.................................................................................... - trn1 v19.2D, v19.2D, v13.2D // ...........................................................................................*.................................................................................. // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. + trn2 v3.2D, v25.2D, v30.2D // ........................................................................................*..................................................................................... + trn1 v10.2D, v28.2D, v9.2D // ...........................................................................................*.................................................................................. 
+ trn2 v7.2D, v28.2D, v9.2D // .........................................................................................*.................................................................................... + trn1 v11.2D, v25.2D, v30.2D // ..........................................................................................*................................................................................... // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. - sub v13.4S, v18.4S, v27.4S // .....................................................................................................*........................................................................ - add v18.4S, v18.4S, v27.4S // ......................................................................................................*....................................................................... - sub v27.4S, v5.4S, v29.4S // ................................................................................................*............................................................................. - add v29.4S, v5.4S, v29.4S // .................................................................................................*............................................................................ // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. 
+ sub v28.4S, v19.4S, v4.4S // ................................................................................................*............................................................................. + sqrdmulh v5.4S, v12.4S, v6.S[1] // ........................................................................................................*..................................................................... + add v1.4S, v18.4S, v23.4S // .....................................................................................................................*........................................................ // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. - sub v5.4S, v9.4S, v15.4S // ...............................................................................................................*.............................................................. - add v15.4S, v9.4S, v15.4S // ................................................................................................................*............................................................. - sub v17.4S, v2.4S, v19.4S // ..........................................................................................................*................................................................... - add v19.4S, v2.4S, v19.4S // ...........................................................................................................*.................................................................. 
// gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. + sub v29.4S, v3.4S, v7.4S // ...............................................................................................................*.............................................................. + add v27.4S, v3.4S, v7.4S // ................................................................................................................*............................................................. + sub v12.4S, v11.4S, v10.4S // ..........................................................................................................*................................................................... + add v7.4S, v11.4S, v10.4S // ...........................................................................................................*.................................................................. // gap // .............................................................................................................................................................................. - mul v2.4S, v13.4S, v0.S[0] // .......................................................................................................*...................................................................... - sqrdmulh v13.4S, v13.4S, v0.S[1] // ........................................................................................................*..................................................................... 
- mul v9.4S, v27.4S, v11.S[2] // ..................................................................................................*........................................................................... - sqrdmulh v27.4S, v27.4S, v11.S[3] // ...................................................................................................*.......................................................................... // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. + mul v22.4S, v28.4S, v17.S[2] // ..................................................................................................*........................................................................... + sqrdmulh v19.4S, v28.4S, v17.S[3] // ...................................................................................................*.......................................................................... + srshr v10.4S, v1.4S, #23 // ........................................................................................................................................*..................................... // gap // .............................................................................................................................................................................. - mul v31.4S, v17.4S, v0.S[2] // ............................................................................................................*................................................................. 
- sqrdmulh v0.4S, v17.4S, v0.S[3] // .............................................................................................................*................................................................ - mul v17.4S, v5.4S, v12.S[0] // .................................................................................................................*............................................................ - sqrdmulh v5.4S, v5.4S, v12.S[1] // ..................................................................................................................*........................................................... // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. - sub v12.4S, v29.4S, v18.4S // ....................................................................................................................*......................................................... - add v29.4S, v29.4S, v18.4S // .....................................................................................................................*........................................................ - sub v18.4S, v19.4S, v15.4S // ..............................................................................................................................*............................................... 
- add v19.4S, v19.4S, v15.4S // ...............................................................................................................................*.............................................. + mul v30.4S, v12.4S, v6.S[2] // ............................................................................................................*................................................................. + sqrdmulh v6.4S, v12.4S, v6.S[3] // .............................................................................................................*................................................................ + mul v20.4S, v29.4S, v31.S[0] // .................................................................................................................*............................................................ + sqrdmulh v15.4S, v29.4S, v31.S[1] // ..................................................................................................................*........................................................... // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. - mls v9.4S, v27.4S, v8.S[0] // ....................................................................................................*......................................................................... 
- mls v2.4S, v13.4S, v8.S[0] // .........................................................................................................*.................................................................... + mls v14.4S, v5.4S, v8.S[0] // .........................................................................................................*.................................................................... + sub v2.4S, v7.4S, v27.4S // ..............................................................................................................................*............................................... // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. - mls v31.4S, v0.4S, v8.S[0] // ..............................................................................................................*............................................................... 
- mls v17.4S, v5.4S, v8.S[0] // ...................................................................................................................*.......................................................... - mul v5.4S, v12.4S, v6.S[2] // ......................................................................................................................*....................................................... - sqrdmulh v0.4S, v12.4S, v6.S[3] // .......................................................................................................................*...................................................... + mls v1.4S, v10.4S, v8.4S // .........................................................................................................................................*.................................... + mls v22.4S, v19.4S, v8.S[0] // ....................................................................................................*......................................................................... + add v19.4S, v7.4S, v27.4S // ...............................................................................................................................*.............................................. // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. 
- mul v12.4S, v18.4S, v11.S[0] // ................................................................................................................................*............................................. - sqrdmulh v15.4S, v18.4S, v11.S[1] // .................................................................................................................................*............................................ - srshr v13.4S, v29.4S, #23 // ........................................................................................................................................*..................................... - srshr v18.4S, v19.4S, #23 // ............................................................................................................................................*................................. // gap // .............................................................................................................................................................................. + mls v20.4S, v15.4S, v8.S[0] // ...................................................................................................................*.......................................................... + mls v30.4S, v6.4S, v8.S[0] // ..............................................................................................................*............................................................... + sub v12.4S, v18.4S, v23.4S // ....................................................................................................................*......................................................... + sqrdmulh v11.4S, v2.4S, v17.S[1] // .................................................................................................................................*............................................ 
+ ldr q15, [x5, #-144] // ...........................e.................................................................................................................................................. // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. - sub v27.4S, v9.4S, v2.4S // .........................................................................................................................*.................................................... - add v9.4S, v9.4S, v2.4S // ..........................................................................................................................*................................................... + srshr v10.4S, v19.4S, #23 // ............................................................................................................................................*................................. + mul v29.4S, v2.4S, v17.S[0] // ................................................................................................................................*............................................. // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. 
// gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. - sub v2.4S, v31.4S, v17.4S // ...................................................................................................................................*.......................................... - add v17.4S, v31.4S, v17.4S // ....................................................................................................................................*......................................... - mls v5.4S, v0.4S, v8.S[0] // ........................................................................................................................*..................................................... + sub v28.4S, v22.4S, v14.4S // .........................................................................................................................*.................................................... + add v4.4S, v22.4S, v14.4S // ..........................................................................................................................*................................................... + mul v5.4S, v12.4S, v16.S[2] // ......................................................................................................................*....................................................... 
+ sqrdmulh v27.4S, v12.4S, v16.S[3] // .......................................................................................................................*...................................................... + ldr q2, [x5, #-112] // .............................e................................................................................................................................................ // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. + sub v31.4S, v30.4S, v20.4S // ...................................................................................................................................*.......................................... + add v18.4S, v30.4S, v20.4S // ....................................................................................................................................*......................................... + ldr q30, [x5, #-128] // ............................e................................................................................................................................................. // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. 
- mls v29.4S, v13.4S, v8.4S // .........................................................................................................................................*.................................... - mls v12.4S, v15.4S, v8.S[0] // ..................................................................................................................................*........................................... - mul v0.4S, v27.4S, v6.S[2] // ...........................................................................................................................*.................................................. - sqrdmulh v15.4S, v27.4S, v6.S[3] // ............................................................................................................................*................................................. // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. + mls v19.4S, v10.4S, v8.4S // .............................................................................................................................................*................................ + mul v22.4S, v28.4S, v16.S[2] // ...........................................................................................................................*.................................................. 
+ sqrdmulh v14.4S, v28.4S, v16.S[3] // ............................................................................................................................*................................................. + srshr v28.4S, v4.4S, #23 // ..........................................................................................................................................*................................... // gap // .............................................................................................................................................................................. - mul v13.4S, v2.4S, v11.S[0] // .....................................................................................................................................*........................................ - sqrdmulh v2.4S, v2.4S, v11.S[1] // ......................................................................................................................................*....................................... - srshr v27.4S, v9.4S, #23 // ..........................................................................................................................................*................................... - srshr v31.4S, v17.4S, #23 // ..............................................................................................................................................*............................... // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. 
+ mul v6.4S, v31.4S, v17.S[0] // .....................................................................................................................................*........................................ + sqrdmulh v31.4S, v31.4S, v17.S[1] // ......................................................................................................................................*....................................... + mls v5.4S, v27.4S, v8.S[0] // ........................................................................................................................*..................................................... + srshr v12.4S, v18.4S, #23 // ..............................................................................................................................................*............................... // gap // .............................................................................................................................................................................. - mls v19.4S, v18.4S, v8.4S // .............................................................................................................................................*................................ // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. + mls v29.4S, v11.4S, v8.S[0] // ..................................................................................................................................*........................................... 
// gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. - add v18.4S, v5.4S, v12.4S // ...........................................................................................................................................................*.................. - sub v5.4S, v5.4S, v12.4S // ..........................................................................................................................................................*................... - mls v0.4S, v15.4S, v8.S[0] // .............................................................................................................................*................................................ // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. 
+ sub v11.4S, v1.4S, v19.4S // ................................................................................................................................................*............................. + add v10.4S, v1.4S, v19.4S // .................................................................................................................................................*............................ + mls v22.4S, v14.4S, v8.S[0] // .............................................................................................................................*................................................ + mls v4.4S, v28.4S, v8.4S // ...........................................................................................................................................*.................................. // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. - mls v9.4S, v27.4S, v8.4S // ...........................................................................................................................................*.................................. - mls v17.4S, v31.4S, v8.4S // ...............................................................................................................................................*.............................. - mls v13.4S, v2.4S, v8.S[0] // .......................................................................................................................................*...................................... // gap // .............................................................................................................................................................................. 
// gap // .............................................................................................................................................................................. + mls v18.4S, v12.4S, v8.4S // ...............................................................................................................................................*.............................. + mls v6.4S, v31.4S, v8.S[0] // .......................................................................................................................................*...................................... // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. - add v2.4S, v29.4S, v19.4S // .................................................................................................................................................*............................ - sub v29.4S, v29.4S, v19.4S // ................................................................................................................................................*............................. - str q18, [x1, #32] // ......................................................................................................................................................................*....... - mul v19.4S, v5.4S, v6.S[0] // ............................................................................................................................................................*................. 
- sqrdmulh v5.4S, v5.4S, v6.S[1] // .............................................................................................................................................................*................ // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. + mul v7.4S, v11.4S, v16.S[0] // ..................................................................................................................................................*........................... + sqrdmulh v11.4S, v11.4S, v16.S[1] // ...................................................................................................................................................*.......................... + str q10, [x1], #(16*4) // ....................................................................................................................................................................*......... // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. 
// gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. + sub v23.4S, v5.4S, v29.4S // ..........................................................................................................................................................*................... + add v31.4S, v5.4S, v29.4S // ...........................................................................................................................................................*.................. // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. - sub v12.4S, v0.4S, v13.4S // ...............................................................................................................................................................*.............. - add v0.4S, v0.4S, v13.4S // ................................................................................................................................................................*............. - sub v15.4S, v9.4S, v17.4S // .....................................................................................................................................................*........................ 
- add v9.4S, v9.4S, v17.4S // ......................................................................................................................................................*....................... - str q2, [x1], #(16*4) // ....................................................................................................................................................................*......... // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. - mul v2.4S, v29.4S, v6.S[0] // ..................................................................................................................................................*........................... - sqrdmulh v29.4S, v29.4S, v6.S[1] // ...................................................................................................................................................*.......................... - mls v19.4S, v5.4S, v8.S[0] // ..............................................................................................................................................................*............... + sub v12.4S, v4.4S, v18.4S // .....................................................................................................................................................*........................ 
+ sub v27.4S, v22.4S, v6.4S // ...............................................................................................................................................................*.............. + add v13.4S, v4.4S, v18.4S // ......................................................................................................................................................*....................... + add v10.4S, v22.4S, v6.4S // ................................................................................................................................................................*............. // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. + mls v7.4S, v11.4S, v8.S[0] // ....................................................................................................................................................*......................... + str q31, [x1, #-32] // ......................................................................................................................................................................*....... + mul v31.4S, v23.4S, v16.S[0] // ............................................................................................................................................................*................. 
+ sqrdmulh v3.4S, v23.4S, v16.S[1] // .............................................................................................................................................................*................ // gap // .............................................................................................................................................................................. - mul v5.4S, v12.4S, v6.S[0] // .................................................................................................................................................................*............ - sqrdmulh v12.4S, v12.4S, v6.S[1] // ..................................................................................................................................................................*........... - mul v13.4S, v15.4S, v6.S[0] // .......................................................................................................................................................*...................... - sqrdmulh v15.4S, v15.4S, v6.S[1] // ........................................................................................................................................................*..................... - str q0, [x1, #-16] // .......................................................................................................................................................................*...... - str q9, [x1, #-48] // .....................................................................................................................................................................*........ - add x1, x1, #64 // ............................................................................................................................................................................*. 
// gap // .............................................................................................................................................................................. - ldr q0, [x1, #32] // ..e........................................................................................................................................................................... // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. + mul v5.4S, v12.4S, v16.S[0] // .......................................................................................................................................................*...................... + sqrdmulh v12.4S, v12.4S, v16.S[1] // ........................................................................................................................................................*..................... + mul v28.4S, v27.4S, v16.S[0] // .................................................................................................................................................................*............ + sqrdmulh v27.4S, v27.4S, v16.S[1] // ..................................................................................................................................................................*........... + str q13, [x1, #-48] // .....................................................................................................................................................................*........ // gap // .............................................................................................................................................................................. 
// gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. + str q10, [x1, #-16] // .......................................................................................................................................................................*...... + add x1, x1, #64 // ............................................................................................................................................................................*. // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. - mls v2.4S, v29.4S, v8.S[0] // ....................................................................................................................................................*......................... - str q19, [x2, #32] // ..........................................................................................................................................................................*... - ldr q18, [x1, #48] // ...e.......................................................................................................................................................................... // gap // .............................................................................................................................................................................. 
// gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. + str q7, [x2], #(16*4) // ........................................................................................................................................................................*..... + mls v31.4S, v3.4S, v8.S[0] // ..............................................................................................................................................................*............... + ldr q29, [x1, #0] // e............................................................................................................................................................................. + ldr q7, [x1, #48] // ...e.......................................................................................................................................................................... // gap // .............................................................................................................................................................................. - mls v5.4S, v12.4S, v8.S[0] // ...................................................................................................................................................................*.......... - mls v13.4S, v15.4S, v8.S[0] // .........................................................................................................................................................*.................... 
// gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. + mls v5.4S, v12.4S, v8.S[0] // .........................................................................................................................................................*.................... + mls v28.4S, v27.4S, v8.S[0] // ...................................................................................................................................................................*.......... + ldr q6, [x1, #16] // .e............................................................................................................................................................................ + ldr q19, [x1, #32] // ..e........................................................................................................................................................................... + ldr q11, [x5, #-96] // ..................................................e........................................................................................................................... // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. 
// gap // .............................................................................................................................................................................. @@ -826,7 +872,7 @@ layer45678_start: // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. - str q2, [x2], #(16*4) // ........................................................................................................................................................................*..... + str q31, [x2, #-32] // ..........................................................................................................................................................................*... // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. @@ -834,8 +880,8 @@ layer45678_start: // gap // .............................................................................................................................................................................. 
// gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. - str q5, [x2, #-16] // ...........................................................................................................................................................................*.. - str q13, [x2, #-48] // .........................................................................................................................................................................*.... + str q5, [x2, #-48] // .........................................................................................................................................................................*.... + str q28, [x2, #-16] // ...........................................................................................................................................................................*.. add x2, x2, #64 // .............................................................................................................................................................................* // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. @@ -843,790 +889,796 @@ layer45678_start: // gap // .............................................................................................................................................................................. 
// gap // .............................................................................................................................................................................. - // original source code - // ldr q9, [x1, #0] // ..........*............................................................................................................................................................................. - // ldr q10, [x1, #16] // ..........|*............................................................................................................................................................................ - // ldr q11, [x1, #32] // e.........|...................................................................................................................................................................e......... - // ldr q12, [x1, #48] // ...e......|......................................................................................................................................................................e...... - // trn1 v25.4s, v9.4s, v10.4s // ..........|.............*............................................................................................................................................................... - // trn2 v26.4s, v9.4s, v10.4s // ..........|..............*.............................................................................................................................................................. - // trn1 v27.4s, v11.4s, v12.4s // ..........|..*.......................................................................................................................................................................... - // trn2 v28.4s, v11.4s, v12.4s // ..........|...*......................................................................................................................................................................... 
- // trn2 v11.2d, v25.2d, v27.2d // ..........|.........................*................................................................................................................................................... - // trn2 v12.2d, v26.2d, v28.2d // ..........|...........................*................................................................................................................................................. - // trn1 v9.2d, v25.2d, v27.2d // ..........|..........................*.................................................................................................................................................. - // trn1 v10.2d, v26.2d, v28.2d // ..........|............................*................................................................................................................................................ - // ldr q13, [x2, #0] // ..........|.*........................................................................................................................................................................... - // ldr q14, [x2, #16] // ..........|....*........................................................................................................................................................................ - // ldr q15, [x2, #32] // ..........|.....*....................................................................................................................................................................... - // ldr q16, [x2, #48] // ..........|......*...................................................................................................................................................................... - // trn1 v25.4s, v13.4s, v14.4s // ..........|..................*.......................................................................................................................................................... 
- // trn2 v26.4s, v13.4s, v14.4s // ..........|...................*......................................................................................................................................................... - // trn1 v27.4s, v15.4s, v16.4s // ..........|....................*........................................................................................................................................................ - // trn2 v28.4s, v15.4s, v16.4s // ..........|.....................*....................................................................................................................................................... - // trn2 v15.2d, v25.2d, v27.2d // ..........|................................*............................................................................................................................................ - // trn2 v16.2d, v26.2d, v28.2d // ..........|..................................*.......................................................................................................................................... - // trn1 v13.2d, v25.2d, v27.2d // ..........|.................................*........................................................................................................................................... - // trn1 v14.2d, v26.2d, v28.2d // ..........|...................................*......................................................................................................................................... - // ldr q0, [x5], #(12*16) // ..........|........*.................................................................................................................................................................... - // ldr q4, [x5, #(-12*16 + 1*16)] // ..........|.........*................................................................................................................................................................... 
- // ldr q1, [x5, #(-12*16 + 2*16)] // ..........|.......*..................................................................................................................................................................... - // ldr q5, [x5, #(-12*16 + 3*16)] // ..........|..........*.................................................................................................................................................................. - // ldr q2, [x5, #(-12*16 + 4*16)] // ..........|...........*................................................................................................................................................................. - // ldr q6, [x5, #(-12*16 + 5*16)] // ..........|............*................................................................................................................................................................ - // sub v24.4s, v9.4s, v10.4s // ..........|.......................................*..................................................................................................................................... - // add v9.4s, v9.4s, v10.4s // ..........|........................................*.................................................................................................................................... - // mul v10.4s, v24.4s, v1.4s // ..........|.............................................*............................................................................................................................... - // sqrdmulh v24.4s, v24.4s, v5.4s // ..........|..............................................*.............................................................................................................................. - // mls v10.4s, v24.4s, v8.s[0] // ..........|.........................................................*................................................................................................................... 
- // sub v24.4s, v11.4s, v12.4s // ..........|.....................................*....................................................................................................................................... - // add v11.4s, v11.4s, v12.4s // ..........|......................................*...................................................................................................................................... - // mul v12.4s, v24.4s, v2.4s // ..........|...............................................*............................................................................................................................. - // sqrdmulh v24.4s, v24.4s, v6.4s // ..........|................................................*............................................................................................................................ - // mls v12.4s, v24.4s, v8.s[0] // ..........|..........................................................*.................................................................................................................. - // sub v24.4s, v9.4s, v11.4s // ..........|.......................................................*..................................................................................................................... - // add v9.4s, v9.4s, v11.4s // ..........|........................................................*.................................................................................................................... - // mul v11.4s, v24.4s, v0.4s // ..........|.............................................................*............................................................................................................... - // sqrdmulh v24.4s, v24.4s, v4.4s // ..........|..............................................................*.............................................................................................................. 
- // mls v11.4s, v24.4s, v8.s[0] // ..........|.....................................................................*....................................................................................................... - // sub v24.4s, v10.4s, v12.4s // ..........|.................................................................*........................................................................................................... - // add v10.4s, v10.4s, v12.4s // ..........|..................................................................*.......................................................................................................... - // mul v12.4s, v24.4s, v0.4s // ..........|......................................................................*...................................................................................................... - // sqrdmulh v24.4s, v24.4s, v4.4s // ..........|.......................................................................*..................................................................................................... - // mls v12.4s, v24.4s, v8.s[0] // ..........|...............................................................................*............................................................................................. - // ldr q0, [x5, #(-12*16 + 6*16)] // ..........|................*............................................................................................................................................................ - // ldr q4, [x5, #(-12*16 + 7*16)] // ..........|.................*........................................................................................................................................................... - // ldr q1, [x5, #(-12*16 + 8*16)] // ..........|...............*............................................................................................................................................................. 
- // ldr q5, [x5, #(-12*16 + 9*16)] // ..........|......................*...................................................................................................................................................... - // ldr q2, [x5, #(-12*16 + 10*16)] // ..........|.......................*..................................................................................................................................................... - // ldr q6, [x5, #(-12*16 + 11*16)] // ..........|........................*.................................................................................................................................................... - // sub v24.4s, v13.4s, v14.4s // ..........|...........................................*................................................................................................................................. - // add v13.4s, v13.4s, v14.4s // ..........|............................................*................................................................................................................................ - // mul v14.4s, v24.4s, v1.4s // ..........|.................................................*........................................................................................................................... - // sqrdmulh v24.4s, v24.4s, v5.4s // ..........|..................................................*.......................................................................................................................... - // mls v14.4s, v24.4s, v8.s[0] // ..........|...........................................................*................................................................................................................. - // sub v24.4s, v15.4s, v16.4s // ..........|.........................................*................................................................................................................................... 
- // add v15.4s, v15.4s, v16.4s // ..........|..........................................*.................................................................................................................................. - // mul v16.4s, v24.4s, v2.4s // ..........|...................................................*......................................................................................................................... - // sqrdmulh v24.4s, v24.4s, v6.4s // ..........|....................................................*........................................................................................................................ - // mls v16.4s, v24.4s, v8.s[0] // ..........|............................................................*................................................................................................................ - // sub v24.4s, v13.4s, v15.4s // ..........|.....................................................*....................................................................................................................... - // add v13.4s, v13.4s, v15.4s // ..........|......................................................*...................................................................................................................... - // mul v15.4s, v24.4s, v0.4s // ..........|...............................................................*............................................................................................................. - // sqrdmulh v24.4s, v24.4s, v4.4s // ..........|................................................................*............................................................................................................ - // mls v15.4s, v24.4s, v8.s[0] // ..........|........................................................................*.................................................................................................... 
- // sub v24.4s, v14.4s, v16.4s // ..........|...................................................................*......................................................................................................... - // add v14.4s, v14.4s, v16.4s // ..........|....................................................................*........................................................................................................ - // mul v16.4s, v24.4s, v0.4s // ..........|..........................................................................*.................................................................................................. - // sqrdmulh v24.4s, v24.4s, v4.4s // ..........|...........................................................................*................................................................................................. - // mls v16.4s, v24.4s, v8.s[0] // ..........|................................................................................*............................................................................................ - // trn1 v25.4s, v9.4s, v10.4s // ..........|.........................................................................*................................................................................................... - // trn2 v26.4s, v9.4s, v10.4s // ..........|............................................................................*................................................................................................ - // trn1 v27.4s, v11.4s, v12.4s // ..........|.................................................................................*........................................................................................... - // trn2 v28.4s, v11.4s, v12.4s // ..........|..................................................................................*.......................................................................................... 
- // trn2 v11.2d, v25.2d, v27.2d // ..........|.....................................................................................*....................................................................................... - // trn2 v12.2d, v26.2d, v28.2d // ..........|.......................................................................................*..................................................................................... - // trn1 v9.2d, v25.2d, v27.2d // ..........|......................................................................................*...................................................................................... - // trn1 v10.2d, v26.2d, v28.2d // ..........|........................................................................................*.................................................................................... - // trn1 v25.4s, v13.4s, v14.4s // ..........|.............................................................................*............................................................................................... - // trn2 v26.4s, v13.4s, v14.4s // ..........|..............................................................................*.............................................................................................. - // trn1 v27.4s, v15.4s, v16.4s // ..........|...................................................................................*......................................................................................... - // trn2 v28.4s, v15.4s, v16.4s // ..........|....................................................................................*........................................................................................ - // trn2 v15.2d, v25.2d, v27.2d // ..........|.........................................................................................*................................................................................... 
- // trn2 v16.2d, v26.2d, v28.2d // ..........|...........................................................................................*................................................................................. - // trn1 v13.2d, v25.2d, v27.2d // ..........|..........................................................................................*.................................................................................. - // trn1 v14.2d, v26.2d, v28.2d // ..........|............................................................................................*................................................................................ - // ldr q0, [x4], #64 // ..........|...............................*............................................................................................................................................. - // ldr q1, [x4, #(-64 + 16)] // ..........|..............................*.............................................................................................................................................. - // ldr q2, [x4, #(-64 + 32)] // ..........|.............................*............................................................................................................................................... - // ldr q3, [x4, #(-64 + 48)] // ..........|....................................*........................................................................................................................................ - // sub v24.4s, v9.4s, v10.4s // ..........|...............................................................................................*............................................................................. - // add v9.4s, v9.4s, v10.4s // ..........|................................................................................................*............................................................................ 
- // mul v10.4s, v24.4s, v1.s[2] // ..........|.......................................................................................................*..................................................................... - // sqrdmulh v24.4s, v24.4s, v1.s[3] // ..........|........................................................................................................*.................................................................... - // mls v10.4s, v24.4s, v8.s[0] // ..........|.................................................................................................................*........................................................... - // sub v24.4s, v11.4s, v12.4s // ..........|.............................................................................................*............................................................................... - // add v11.4s, v11.4s, v12.4s // ..........|..............................................................................................*.............................................................................. - // mul v12.4s, v24.4s, v2.s[0] // ..........|.....................................................................................................*....................................................................... - // sqrdmulh v24.4s, v24.4s, v2.s[1] // ..........|......................................................................................................*...................................................................... - // mls v12.4s, v24.4s, v8.s[0] // ..........|..................................................................................................................*.......................................................... - // sub v24.4s, v13.4s, v14.4s // ..........|...................................................................................................*......................................................................... 
- // add v13.4s, v13.4s, v14.4s // ..........|....................................................................................................*........................................................................ - // mul v14.4s, v24.4s, v2.s[2] // ..........|.........................................................................................................*................................................................... - // sqrdmulh v24.4s, v24.4s, v2.s[3] // ..........|..........................................................................................................*.................................................................. - // mls v14.4s, v24.4s, v8.s[0] // ..........|...................................................................................................................*......................................................... - // sub v24.4s, v15.4s, v16.4s // ..........|.................................................................................................*........................................................................... - // add v15.4s, v15.4s, v16.4s // ..........|..................................................................................................*.......................................................................... - // mul v16.4s, v24.4s, v3.s[0] // ..........|...........................................................................................................*................................................................. - // sqrdmulh v24.4s, v24.4s, v3.s[1] // ..........|............................................................................................................*................................................................ - // mls v16.4s, v24.4s, v8.s[0] // ..........|....................................................................................................................*........................................................ 
- // sub v24.4s, v9.4s, v11.4s // ..........|.............................................................................................................*............................................................... - // add v9.4s, v9.4s, v11.4s // ..........|..............................................................................................................*.............................................................. - // mul v11.4s, v24.4s, v0.s[2] // ..........|.....................................................................................................................*....................................................... - // sqrdmulh v24.4s, v24.4s, v0.s[3] // ..........|......................................................................................................................*...................................................... - // mls v11.4s, v24.4s, v8.s[0] // ..........|...............................................................................................................................*............................................. - // sub v24.4s, v10.4s, v12.4s // ..........|...........................................................................................................................*................................................. - // add v10.4s, v10.4s, v12.4s // ..........|............................................................................................................................*................................................ - // mul v12.4s, v24.4s, v0.s[2] // ..........|..................................................................................................................................*.......................................... - // sqrdmulh v24.4s, v24.4s, v0.s[3] // ..........|...................................................................................................................................*......................................... 
- // mls v12.4s, v24.4s, v8.s[0] // ..........|...........................................................................................................................................*................................. - // sub v24.4s, v13.4s, v15.4s // ..........|...............................................................................................................*............................................................. - // add v13.4s, v13.4s, v15.4s // ..........|................................................................................................................*............................................................ - // mul v15.4s, v24.4s, v1.s[0] // ..........|.......................................................................................................................*..................................................... - // sqrdmulh v24.4s, v24.4s, v1.s[1] // ..........|........................................................................................................................*.................................................... - // mls v15.4s, v24.4s, v8.s[0] // ..........|.................................................................................................................................*........................................... - // sub v24.4s, v14.4s, v16.4s // ..........|.............................................................................................................................*............................................... - // add v14.4s, v14.4s, v16.4s // ..........|..............................................................................................................................*.............................................. - // mul v16.4s, v24.4s, v1.s[0] // ..........|....................................................................................................................................*........................................ 
- // sqrdmulh v24.4s, v24.4s, v1.s[1] // ..........|.....................................................................................................................................*....................................... - // mls v16.4s, v24.4s, v8.s[0] // ..........|..............................................................................................................................................*.............................. - // srshr v24.4S, v9.4S, #23 // ..........|.........................................................................................................................*................................................... - // mls v9.4s, v24.4s, v8.4s // ..........|................................................................................................................................*............................................ - // srshr v24.4S, v10.4S, #23 // ..........|......................................................................................................................................*...................................... - // mls v10.4s, v24.4s, v8.4s // ..........|............................................................................................................................................*................................ - // srshr v24.4S, v13.4S, #23 // ..........|..........................................................................................................................*.................................................. - // mls v13.4s, v24.4s, v8.4s // ..........|........................................................................................................................................*.................................... - // srshr v24.4S, v14.4S, #23 // ..........|.......................................................................................................................................*..................................... 
- // mls v14.4s, v24.4s, v8.4s // ..........|.............................................................................................................................................*............................... - // sub v24.4s, v9.4s, v13.4s // ..........|................................................................................................................................................*............................ - // add v9.4s, v9.4s, v13.4s // ..........|...............................................................................................................................................*............................. - // mul v13.4s, v24.4s, v0.s[0] // ..........|.........................................................................................................................................................*................... - // sqrdmulh v24.4s, v24.4s, v0.s[1] // ..........|..........................................................................................................................................................*.................. - // mls v13.4s, v24.4s, v8.s[0] // .*........|....................................................................................................................................................................*........ - // sub v24.4s, v10.4s, v14.4s // ..........|......................................................................................................................................................*...................... - // add v10.4s, v10.4s, v14.4s // ..........|.......................................................................................................................................................*..................... - // mul v14.4s, v24.4s, v0.s[0] // ..........|..............................................................................................................................................................*.............. 
- // sqrdmulh v24.4s, v24.4s, v0.s[1] // ..........|...............................................................................................................................................................*............. - // mls v14.4s, v24.4s, v8.s[0] // .....*....|........................................................................................................................................................................*.... - // sub v24.4s, v11.4s, v15.4s // ..........|..........................................................................................................................................*.................................. - // add v11.4s, v11.4s, v15.4s // ..........|.........................................................................................................................................*................................... - // mul v15.4s, v24.4s, v0.s[0] // ..........|..................................................................................................................................................*.......................... - // sqrdmulh v24.4s, v24.4s, v0.s[1] // ..........|...................................................................................................................................................*......................... - // mls v15.4s, v24.4s, v8.s[0] // ..........|...........................................................................................................................................................*................. - // sub v24.4s, v12.4s, v16.4s // ..........|....................................................................................................................................................*........................ - // add v12.4s, v12.4s, v16.4s // ..........|.....................................................................................................................................................*....................... 
- // mul v16.4s, v24.4s, v0.s[0] // ..........|............................................................................................................................................................*................ - // sqrdmulh v24.4s, v24.4s, v0.s[1] // ..........|.............................................................................................................................................................*............... - // mls v16.4s, v24.4s, v8.s[0] // ....*.....|.......................................................................................................................................................................*..... - // str q9, [x1], #(16*4) // ..........|........................................................................................................................................................*.................... - // str q10, [x1, #(-16*4 + 1*16)] // ..........|.................................................................................................................................................................*........... - // str q11, [x1, #(-16*4 + 2*16)] // ..........|.................................................................................................................................................*........................... - // str q12, [x1, #(-16*4 + 3*16)] // ..........|................................................................................................................................................................*............ - // str q13, [x2], #(16*4) // ......*...|.........................................................................................................................................................................*... - // str q14, [x2, #(-16*4 + 1*16)] // ........*.|...........................................................................................................................................................................*. 
- // str q15, [x2, #(-16*4 + 2*16)] // ..*.......|.....................................................................................................................................................................*....... - // str q16, [x2, #(-16*4 + 3*16)] // .......*..|..........................................................................................................................................................................*.. - // add x1, x1, #64 // ..........|..................................................................................................................................................................*.......... - // add x2, x2, #64 // .........*|............................................................................................................................................................................* + // ---------------------------------------------------------------------------------------------------------------------------------------------- new position -----------------------------------------------------------------------------------------------------------------------------------------------> + // 0 25 50 75 100 125 150 175 200 225 250 275 + // |------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------ + // ldr q9, [x1, #0] // ...................................................................................................................e..........'..................................................................................................................................................................~.......... 
+ // ldr q10, [x1, #16] // .......................................................................................................................e......'......................................................................................................................................................................~...... + // ldr q11, [x1, #32] // ........................................................................................................................e.....'.......................................................................................................................................................................~..... + // ldr q12, [x1, #48] // ....................................................................................................................e.........'...................................................................................................................................................................~......... + // trn1 v25.4s, v9.4s, v10.4s // ..............................................................................................................................'....*........................................................................................................................................................................ + // trn2 v26.4s, v9.4s, v10.4s // ..............................................................................................................................'*............................................................................................................................................................................ + // trn1 v27.4s, v11.4s, v12.4s // ..............................................................................................................................'.....*....................................................................................................................................................................... 
+ // trn2 v28.4s, v11.4s, v12.4s // ..............................................................................................................................'.*........................................................................................................................................................................... + // trn2 v11.2d, v25.2d, v27.2d // ..............................................................................................................................'.........*................................................................................................................................................................... + // trn2 v12.2d, v26.2d, v28.2d // ..............................................................................................................................'...........*................................................................................................................................................................. + // trn1 v9.2d, v25.2d, v27.2d // ..............................................................................................................................'........*.................................................................................................................................................................... + // trn1 v10.2d, v26.2d, v28.2d // ..............................................................................................................................'..........*.................................................................................................................................................................. 
+ // ldr q13, [x2, #0] // ..............................................................................................................................'...*......................................................................................................................................................................... + // ldr q14, [x2, #16] // ..............................................................................................................................'..*.......................................................................................................................................................................... + // ldr q15, [x2, #32] // ..............................................................................................................................*............................................................................................................................................................................. + // ldr q16, [x2, #48] // ..............................................................................................................................'......*...................................................................................................................................................................... + // trn1 v25.4s, v13.4s, v14.4s // ..............................................................................................................................'...................*......................................................................................................................................................... + // trn2 v26.4s, v13.4s, v14.4s // ..............................................................................................................................'.............*............................................................................................................................................................... 
+ // trn1 v27.4s, v15.4s, v16.4s // ..............................................................................................................................'.................*........................................................................................................................................................... + // trn2 v28.4s, v15.4s, v16.4s // ..............................................................................................................................'..................*.......................................................................................................................................................... + // trn2 v15.2d, v25.2d, v27.2d // ..............................................................................................................................'.............................*............................................................................................................................................... + // trn2 v16.2d, v26.2d, v28.2d // ..............................................................................................................................'...........................*................................................................................................................................................. + // trn1 v13.2d, v25.2d, v27.2d // ..............................................................................................................................'..............................*.............................................................................................................................................. 
+ // trn1 v14.2d, v26.2d, v28.2d // ..............................................................................................................................'............................*................................................................................................................................................ + // ldr q0, [x5], #(12*16) // .........e....................................................................................................................'........................................................~.................................................................................................................... + // ldr q4, [x5, #(-12*16 + 1*16)] // ..............................................................................................................................'................................*............................................................................................................................................ + // ldr q1, [x5, #(-12*16 + 2*16)] // e.............................................................................................................................'...............................................~............................................................................................................................. + // ldr q5, [x5, #(-12*16 + 3*16)] // ...................................................................e..........................................................'..................................................................................................................~.......................................................... 
+ // ldr q2, [x5, #(-12*16 + 4*16)] // .............................................................................e................................................'............................................................................................................................~................................................ + // ldr q6, [x5, #(-12*16 + 5*16)] // ..........................................................................e...................................................'.........................................................................................................................~................................................... + // sub v24.4s, v9.4s, v10.4s // ..............................................................................................................................'...............*............................................................................................................................................................. + // add v9.4s, v9.4s, v10.4s // ..............................................................................................................................'................*............................................................................................................................................................ + // mul v10.4s, v24.4s, v1.4s // ..............................................................................................................................'......................*...................................................................................................................................................... 
+ // sqrdmulh v24.4s, v24.4s, v5.4s // ..............................................................................................................................'.......................*..................................................................................................................................................... + // mls v10.4s, v24.4s, v8.s[0] // ..............................................................................................................................'......................................*...................................................................................................................................... + // sub v24.4s, v11.4s, v12.4s // ..............................................................................................................................'..............*.............................................................................................................................................................. + // add v11.4s, v11.4s, v12.4s // ..............................................................................................................................'....................*........................................................................................................................................................ + // mul v12.4s, v24.4s, v2.4s // ..............................................................................................................................'.........................*................................................................................................................................................... 
+ // sqrdmulh v24.4s, v24.4s, v6.4s // ..............................................................................................................................'........................*.................................................................................................................................................... + // mls v12.4s, v24.4s, v8.s[0] // ..............................................................................................................................'..................................*.......................................................................................................................................... + // sub v24.4s, v9.4s, v11.4s // ..............................................................................................................................'.................................*........................................................................................................................................... + // add v9.4s, v9.4s, v11.4s // ..........~...................................................................................................................'.........................................................*................................................................................................................... + // mul v11.4s, v24.4s, v0.4s // ..~...........................................................................................................................'.................................................*........................................................................................................................... 
+ // sqrdmulh v24.4s, v24.4s, v4.4s // ...~..........................................................................................................................'..................................................*.......................................................................................................................... + // mls v11.4s, v24.4s, v8.s[0] // ..............~...............................................................................................................'.............................................................*............................................................................................................... + // sub v24.4s, v10.4s, v12.4s // ..............................................................................................................................'..............................................*.............................................................................................................................. + // add v10.4s, v10.4s, v12.4s // ..............................................................................................................................'.............................................*............................................................................................................................... + // mul v12.4s, v24.4s, v0.4s // ........~.....................................................................................................................'.......................................................*..................................................................................................................... 
+ // sqrdmulh v24.4s, v24.4s, v4.4s // ......~.......................................................................................................................'.....................................................*....................................................................................................................... + // mls v12.4s, v24.4s, v8.s[0] // ................~.............................................................................................................'...............................................................*............................................................................................................. + // ldr q0, [x5, #(-12*16 + 6*16)] // .........................................................................................................................e....'........................................................................................................................................................................~.... + // ldr q4, [x5, #(-12*16 + 7*16)] // ..............................................................................................................................'.....................*....................................................................................................................................................... + // ldr q1, [x5, #(-12*16 + 8*16)] // ..............................................................................................................................'.......*..................................................................................................................................................................... 
+ // ldr q5, [x5, #(-12*16 + 9*16)] // ..............................................................................................................................'..........................*.................................................................................................................................................. + // ldr q2, [x5, #(-12*16 + 10*16)] // ..............................................................................................................................'...............................*............................................................................................................................................. + // ldr q6, [x5, #(-12*16 + 11*16)] // ..............................................................................................................................'............*................................................................................................................................................................ + // sub v24.4s, v13.4s, v14.4s // ..............................................................................................................................'....................................*........................................................................................................................................ + // add v13.4s, v13.4s, v14.4s // ..............................................................................................................................'.......................................*..................................................................................................................................... 
+ // mul v14.4s, v24.4s, v1.4s // ..............................................................................................................................'..........................................*.................................................................................................................................. + // sqrdmulh v24.4s, v24.4s, v5.4s // ..............................................................................................................................'.........................................*................................................................................................................................... + // mls v14.4s, v24.4s, v8.s[0] // .......~......................................................................................................................'......................................................*...................................................................................................................... + // sub v24.4s, v15.4s, v16.4s // ..............................................................................................................................'.....................................*....................................................................................................................................... + // add v15.4s, v15.4s, v16.4s // ..............................................................................................................................'........................................*.................................................................................................................................... 
+ // mul v16.4s, v24.4s, v2.4s // ..............................................................................................................................'...........................................*................................................................................................................................. + // sqrdmulh v24.4s, v24.4s, v6.4s // ..............................................................................................................................'............................................*................................................................................................................................ + // mls v16.4s, v24.4s, v8.s[0] // ....~.........................................................................................................................'...................................................*......................................................................................................................... + // sub v24.4s, v13.4s, v15.4s // .~............................................................................................................................'................................................*............................................................................................................................ + // add v13.4s, v13.4s, v15.4s // ...............~..............................................................................................................'..............................................................*.............................................................................................................. 
+ // mul v15.4s, v24.4s, v0.4s // ............~.................................................................................................................'...........................................................*................................................................................................................. + // sqrdmulh v24.4s, v24.4s, v4.4s // .............~................................................................................................................'............................................................*................................................................................................................ + // mls v15.4s, v24.4s, v8.s[0] // ...................~..........................................................................................................'..................................................................*.......................................................................................................... + // sub v24.4s, v14.4s, v16.4s // .................~............................................................................................................'................................................................*............................................................................................................ + // add v14.4s, v14.4s, v16.4s // ..................~...........................................................................................................'.................................................................*........................................................................................................... 
+ // mul v16.4s, v24.4s, v0.4s // ........................~.....................................................................................................'.......................................................................*..................................................................................................... + // sqrdmulh v24.4s, v24.4s, v4.4s // .......................~......................................................................................................'......................................................................*...................................................................................................... + // mls v16.4s, v24.4s, v8.s[0] // .............................~................................................................................................'............................................................................*................................................................................................ + // trn1 v25.4s, v9.4s, v10.4s // ...........................~..................................................................................................'..........................................................................*.................................................................................................. + // trn2 v26.4s, v9.4s, v10.4s // ....................~.........................................................................................................'...................................................................*......................................................................................................... 
+ // trn1 v27.4s, v11.4s, v12.4s // ..........................~...................................................................................................'.........................................................................*................................................................................................... + // trn2 v28.4s, v11.4s, v12.4s // .........................~....................................................................................................'........................................................................*.................................................................................................... + // trn2 v11.2d, v25.2d, v27.2d // ..............................~...............................................................................................'.............................................................................*............................................................................................... + // trn2 v12.2d, v26.2d, v28.2d // ................................~.............................................................................................'...............................................................................*............................................................................................. + // trn1 v9.2d, v25.2d, v27.2d // ...............................~..............................................................................................'..............................................................................*.............................................................................................. 
+ // trn1 v10.2d, v26.2d, v28.2d // .................................~............................................................................................'................................................................................*............................................................................................ + // trn1 v25.4s, v13.4s, v14.4s // .....................~........................................................................................................'....................................................................*........................................................................................................ + // trn2 v26.4s, v13.4s, v14.4s // ......................~.......................................................................................................'.....................................................................*....................................................................................................... + // trn1 v27.4s, v15.4s, v16.4s // ....................................~.........................................................................................'...................................................................................*......................................................................................... + // trn2 v28.4s, v15.4s, v16.4s // .....................................~........................................................................................'....................................................................................*........................................................................................ 
+ // trn2 v15.2d, v25.2d, v27.2d // ........................................~.....................................................................................'.......................................................................................*..................................................................................... + // trn2 v16.2d, v26.2d, v28.2d // ..........................................~...................................................................................'.........................................................................................*................................................................................... + // trn1 v13.2d, v25.2d, v27.2d // ...........................................~..................................................................................'..........................................................................................*.................................................................................. + // trn1 v14.2d, v26.2d, v28.2d // .........................................~....................................................................................'........................................................................................*.................................................................................... + // ldr q0, [x4], #64 // ...........~..................................................................................................................'..........................................................*.................................................................................................................. 
+ // ldr q1, [x4, #(-64 + 16)] // ..............................................................................................................................'...................................*......................................................................................................................................... + // ldr q2, [x4, #(-64 + 32)] // .....~........................................................................................................................'....................................................*........................................................................................................................ + // ldr q3, [x4, #(-64 + 48)] // ............................~.................................................................................................'...........................................................................*................................................................................................. + // sub v24.4s, v9.4s, v10.4s // ............................................~.................................................................................'...........................................................................................*................................................................................. + // add v9.4s, v9.4s, v10.4s // .......................................~......................................................................................'......................................................................................*...................................................................................... 
+ // mul v10.4s, v24.4s, v1.s[2] // ...................................................~..........................................................................'..................................................................................................*.......................................................................... + // sqrdmulh v24.4s, v24.4s, v1.s[3] // ....................................................~.........................................................................'...................................................................................................*......................................................................... + // mls v10.4s, v24.4s, v8.s[0] // .............................................................~................................................................'............................................................................................................*................................................................ + // sub v24.4s, v11.4s, v12.4s // ..................................~...........................................................................................'.................................................................................*........................................................................................... + // add v11.4s, v11.4s, v12.4s // ...................................~..........................................................................................'..................................................................................*.......................................................................................... 
+ // mul v12.4s, v24.4s, v2.s[0] // ......................................~.......................................................................................'.....................................................................................*....................................................................................... + // sqrdmulh v24.4s, v24.4s, v2.s[1] // .............................................~................................................................................'............................................................................................*................................................................................ + // mls v12.4s, v24.4s, v8.s[0] // ..........................................................~...................................................................'.........................................................................................................*................................................................... + // sub v24.4s, v13.4s, v14.4s // .................................................~............................................................................'................................................................................................*............................................................................ + // add v13.4s, v13.4s, v14.4s // ..................................................~...........................................................................'.................................................................................................*........................................................................... 
+ // mul v14.4s, v24.4s, v2.s[2] // ......................................................~.......................................................................'.....................................................................................................*....................................................................... + // sqrdmulh v24.4s, v24.4s, v2.s[3] // .......................................................~......................................................................'......................................................................................................*...................................................................... + // mls v14.4s, v24.4s, v8.s[0] // ................................................................~.............................................................'...............................................................................................................*............................................................. + // sub v24.4s, v15.4s, v16.4s // ...............................................~..............................................................................'..............................................................................................*.............................................................................. + // add v15.4s, v15.4s, v16.4s // ................................................~.............................................................................'...............................................................................................*............................................................................. 
+ // mul v16.4s, v24.4s, v3.s[0] // ........................................................~.....................................................................'.......................................................................................................*..................................................................... + // sqrdmulh v24.4s, v24.4s, v3.s[1] // .........................................................~....................................................................'........................................................................................................*.................................................................... + // mls v16.4s, v24.4s, v8.s[0] // ...............................................................~..............................................................'..............................................................................................................*.............................................................. + // sub v24.4s, v9.4s, v11.4s // .................................................................~............................................................'................................................................................................................*............................................................ + // add v9.4s, v9.4s, v11.4s // ..............................................~...............................................................................'.............................................................................................*............................................................................... 
+ // mul v11.4s, v24.4s, v0.s[2] // ........................................................................~.....................................................'.......................................................................................................................*..................................................... + // sqrdmulh v24.4s, v24.4s, v0.s[3] // .........................................................................~....................................................'........................................................................................................................*.................................................... + // mls v11.4s, v24.4s, v8.s[0] // ....................................................................................~.........................................'...................................................................................................................................*......................................... + // sub v24.4s, v10.4s, v12.4s // ......................................................................~.......................................................'.....................................................................................................................*....................................................... + // add v10.4s, v10.4s, v12.4s // .......................................................................~......................................................'......................................................................................................................*...................................................... 
+ // mul v12.4s, v24.4s, v0.s[2] // ...............................................................................~..............................................'..............................................................................................................................*.............................................. + // sqrdmulh v24.4s, v24.4s, v0.s[3] // ................................................................................~.............................................'...............................................................................................................................*............................................. + // mls v12.4s, v24.4s, v8.s[0] // .........................................................................................~....................................'........................................................................................................................................*.................................... + // sub v24.4s, v13.4s, v15.4s // ...........................................................~..................................................................'..........................................................................................................*.................................................................. + // add v13.4s, v13.4s, v15.4s // ..............................................................~...............................................................'.............................................................................................................*............................................................... 
+ // mul v15.4s, v24.4s, v1.s[0] // .....................................................................~........................................................'....................................................................................................................*........................................................ + // sqrdmulh v24.4s, v24.4s, v1.s[1] // ..................................................................~...........................................................'.................................................................................................................*........................................................... + // mls v15.4s, v24.4s, v8.s[0] // ......................................................................................~.......................................'.....................................................................................................................................*....................................... + // sub v24.4s, v14.4s, v16.4s // ...........................................................................~..................................................'..........................................................................................................................*.................................................. + // add v14.4s, v14.4s, v16.4s // ............................................................................~.................................................'...........................................................................................................................*................................................. 
+ // mul v16.4s, v24.4s, v1.s[0] // ..................................................................................~...........................................'.................................................................................................................................*........................................... + // sqrdmulh v24.4s, v24.4s, v1.s[1] // ...................................................................................~..........................................'..................................................................................................................................*.......................................... + // mls v16.4s, v24.4s, v8.s[0] // ............................................................................................~.................................'...........................................................................................................................................*................................. + // srshr v24.4S, v9.4S, #23 // .....................................................~........................................................................'....................................................................................................*........................................................................ + // mls v9.4s, v24.4s, v8.4s // ............................................................~.................................................................'...........................................................................................................*................................................................. 
+ // srshr v24.4S, v10.4S, #23 // .................................................................................~............................................'................................................................................................................................*............................................ + // mls v10.4s, v24.4s, v8.4s // ..........................................................................................~...................................'.........................................................................................................................................*................................... + // srshr v24.4S, v13.4S, #23 // ....................................................................~.........................................................'...................................................................................................................*......................................................... + // mls v13.4s, v24.4s, v8.4s // ..............................................................................~...............................................'.............................................................................................................................*............................................... + // srshr v24.4S, v14.4S, #23 // .....................................................................................~........................................'....................................................................................................................................*........................................ 
+ // mls v14.4s, v24.4s, v8.4s // ...........................................................................................~..................................'..........................................................................................................................................*.................................. + // sub v24.4s, v9.4s, v13.4s // .......................................................................................~......................................'......................................................................................................................................*...................................... + // add v9.4s, v9.4s, v13.4s // ........................................................................................~.....................................'.......................................................................................................................................*..................................... + // mul v13.4s, v24.4s, v0.s[0] // .............................................................................................~................................'............................................................................................................................................*................................ + // sqrdmulh v24.4s, v24.4s, v0.s[1] // ..............................................................................................~...............................'.............................................................................................................................................*............................... 
+ // mls v13.4s, v24.4s, v8.s[0] // ......................................................................................................~.......................'.....................................................................................................................................................*....................... + // sub v24.4s, v10.4s, v14.4s // ..................................................................................................~...........................'.................................................................................................................................................*........................... + // add v10.4s, v10.4s, v14.4s // ....................................................................................................~.........................'...................................................................................................................................................*......................... + // mul v14.4s, v24.4s, v0.s[0] // ..........................................................................................................~...................'.........................................................................................................................................................*................... + // sqrdmulh v24.4s, v24.4s, v0.s[1] // ...........................................................................................................~..................'..........................................................................................................................................................*.................. 
+ // mls v14.4s, v24.4s, v8.s[0] // .....................................................................................................................~........'....................................................................................................................................................................*........ + // sub v24.4s, v11.4s, v15.4s // ................................................................................................~.............................'...............................................................................................................................................*............................. + // add v11.4s, v11.4s, v15.4s // .................................................................................................~............................'................................................................................................................................................*............................ + // mul v15.4s, v24.4s, v0.s[0] // ........................................................................................................~.....................'.......................................................................................................................................................*..................... + // sqrdmulh v24.4s, v24.4s, v0.s[1] // .........................................................................................................~....................'........................................................................................................................................................*.................... 
+ // mls v15.4s, v24.4s, v8.s[0] // ..................................................................................................................~...........'.................................................................................................................................................................*........... + // sub v24.4s, v12.4s, v16.4s // ...................................................................................................~..........................'..................................................................................................................................................*.......................... + // add v12.4s, v12.4s, v16.4s // .....................................................................................................~........................'....................................................................................................................................................*........................ + // mul v16.4s, v24.4s, v0.s[0] // ............................................................................................................~.................'...........................................................................................................................................................*................. + // sqrdmulh v24.4s, v24.4s, v0.s[1] // .............................................................................................................~................'............................................................................................................................................................*................ 
+ // mls v16.4s, v24.4s, v8.s[0] // ......................................................................................................................~.......'.....................................................................................................................................................................*....... + // str q9, [x1], #(16*4) // ...............................................................................................~..............................'..............................................................................................................................................*.............................. + // str q10, [x1, #(-16*4 + 1*16)] // ..............................................................................................................~...............'.............................................................................................................................................................*............... + // str q11, [x1, #(-16*4 + 2*16)] // .......................................................................................................~......................'......................................................................................................................................................*...................... + // str q12, [x1, #(-16*4 + 3*16)] // ...............................................................................................................~..............'..............................................................................................................................................................*.............. 
+ // str q13, [x2], #(16*4) // .................................................................................................................~............'................................................................................................................................................................*............ + // str q14, [x2, #(-16*4 + 1*16)] // ...........................................................................................................................~..'..........................................................................................................................................................................*.. + // str q15, [x2, #(-16*4 + 2*16)] // ..........................................................................................................................~...'.........................................................................................................................................................................*... + // str q16, [x2, #(-16*4 + 3*16)] // ............................................................................................................................~.'...........................................................................................................................................................................*. + // add x1, x1, #64 // ................................................................................................................~.............'...............................................................................................................................................................*............. 
+ // add x2, x2, #64 // .............................................................................................................................~'............................................................................................................................................................................* sub count, count, #1 cbnz count, layer45678_start - ldr q12, [x2, #32] // ......*..................................................................................................................................................................... - trn2 v9.4S, v0.4S, v18.4S // ....*....................................................................................................................................................................... - ldr q29, [x1, #0] // *........................................................................................................................................................................... - // gap // ............................................................................................................................................................................ - // gap // ............................................................................................................................................................................ - // gap // ............................................................................................................................................................................ - ldr q5, [x1, #16] // .*.......................................................................................................................................................................... - // gap // ............................................................................................................................................................................ 
- ldr q15, [x2, #48] // .......*.................................................................................................................................................................... - // gap // ............................................................................................................................................................................ - // gap // ............................................................................................................................................................................ - // gap // ............................................................................................................................................................................ - // gap // ............................................................................................................................................................................ - // gap // ............................................................................................................................................................................ - ldr q19, [x2, #0] // ..*......................................................................................................................................................................... - ldr q2, [x2, #16] // .....*...................................................................................................................................................................... - ldr q28, [x5, #160] // ........................*................................................................................................................................................... - ldr q13, [x5], #(12*16) // .........*.................................................................................................................................................................. 
- // gap // ............................................................................................................................................................................ - // gap // ............................................................................................................................................................................ - // gap // ............................................................................................................................................................................ - // gap // ............................................................................................................................................................................ - // gap // ............................................................................................................................................................................ - // gap // ............................................................................................................................................................................ - // gap // ............................................................................................................................................................................ - // gap // ............................................................................................................................................................................ - // gap // ............................................................................................................................................................................ - // gap // ............................................................................................................................................................................ 
- // gap // ............................................................................................................................................................................ - // gap // ............................................................................................................................................................................ - // gap // ............................................................................................................................................................................ - // gap // ............................................................................................................................................................................ - trn1 v17.4S, v29.4S, v5.4S // ..............*............................................................................................................................................................. - trn2 v14.4S, v29.4S, v5.4S // ...............*............................................................................................................................................................ - ldr q29, [x5, #-144] // ...........*................................................................................................................................................................ - // gap // ............................................................................................................................................................................ - trn1 v5.4S, v0.4S, v18.4S // ...*........................................................................................................................................................................ - // gap // ............................................................................................................................................................................ 
- // gap // ............................................................................................................................................................................ - // gap // ............................................................................................................................................................................ - trn2 v0.4S, v19.4S, v2.4S // ....................*....................................................................................................................................................... - trn1 v10.4S, v12.4S, v15.4S // .....................*...................................................................................................................................................... - // gap // ............................................................................................................................................................................ - // gap // ............................................................................................................................................................................ - // gap // ............................................................................................................................................................................ - trn2 v27.4S, v12.4S, v15.4S // ......................*..................................................................................................................................................... - trn1 v12.4S, v19.4S, v2.4S // ...................*........................................................................................................................................................ - ldr q19, [x5, #-160] // ........*................................................................................................................................................................... 
- trn2 v1.2D, v17.2D, v5.2D // ..........................*................................................................................................................................................. - trn1 v18.2D, v17.2D, v5.2D // ...........................*................................................................................................................................................ - trn2 v31.2D, v14.2D, v9.2D // ............................*............................................................................................................................................... - trn1 v25.2D, v14.2D, v9.2D // .............................*.............................................................................................................................................. - ldr q9, [x5, #-128] // ............*............................................................................................................................................................... - ldr q15, [x5, #-112] // .............*.............................................................................................................................................................. - ldr q5, [x5, #-64] // ................*........................................................................................................................................................... - // gap // ............................................................................................................................................................................ - trn1 v4.2D, v0.2D, v27.2D // ....................................*....................................................................................................................................... 
- trn2 v17.2D, v0.2D, v27.2D // ...................................*........................................................................................................................................ - trn1 v27.2D, v12.2D, v10.2D // ..................................*......................................................................................................................................... - trn2 v12.2D, v12.2D, v10.2D // .................................*.......................................................................................................................................... - ldr q2, [x5, #-48] // .......................*.................................................................................................................................................... - // gap // ............................................................................................................................................................................ - // gap // ............................................................................................................................................................................ - ldr q0, [x5, #-16] // .........................*.................................................................................................................................................. - sub v3.4S, v1.4S, v31.4S // ......................................*..................................................................................................................................... - add v31.4S, v1.4S, v31.4S // .......................................*.................................................................................................................................... - sub v22.4S, v18.4S, v25.4S // ........................................*................................................................................................................................... 
- // gap // ............................................................................................................................................................................ - // gap // ............................................................................................................................................................................ - // gap // ............................................................................................................................................................................ - // gap // ............................................................................................................................................................................ - // gap // ............................................................................................................................................................................ - sub v16.4S, v27.4S, v4.4S // ............................................*............................................................................................................................... - sub v1.4S, v12.4S, v17.4S // ..........................................*................................................................................................................................. - // gap // ............................................................................................................................................................................ - // gap // ............................................................................................................................................................................ - // gap // ............................................................................................................................................................................ 
- // gap // ............................................................................................................................................................................ - // gap // ............................................................................................................................................................................ - // gap // ............................................................................................................................................................................ - mul v14.4S, v22.4S, v19.4S // ..............................................*............................................................................................................................. - ldr q19, [x5, #-80] // ..................*......................................................................................................................................................... - sqrdmulh v29.4S, v22.4S, v29.4S // ...............................................*............................................................................................................................ - mul v20.4S, v3.4S, v9.4S // ................................................*........................................................................................................................... - sqrdmulh v3.4S, v3.4S, v15.4S // .................................................*.......................................................................................................................... - // gap // ............................................................................................................................................................................ - // gap // ............................................................................................................................................................................ 
- // gap // ............................................................................................................................................................................ - mul v22.4S, v1.4S, v28.4S // ....................................................*....................................................................................................................... - sqrdmulh v26.4S, v16.4S, v2.4S // ...................................................*........................................................................................................................ - mul v9.4S, v16.4S, v5.4S // ..................................................*......................................................................................................................... - sqrdmulh v28.4S, v1.4S, v0.4S // .....................................................*...................................................................................................................... - // gap // ............................................................................................................................................................................ - // gap // ............................................................................................................................................................................ - // gap // ............................................................................................................................................................................ - // gap // ............................................................................................................................................................................ - add v16.4S, v18.4S, v25.4S // .........................................*.................................................................................................................................. 
- add v30.4S, v12.4S, v17.4S // ...........................................*................................................................................................................................ - // gap // ............................................................................................................................................................................ - // gap // ............................................................................................................................................................................ - // gap // ............................................................................................................................................................................ - // gap // ............................................................................................................................................................................ - // gap // ............................................................................................................................................................................ - // gap // ............................................................................................................................................................................ - mls v14.4S, v29.4S, v8.S[0] // ..........................................................*................................................................................................................. - ldr q29, [x5, #-176] // ..........*................................................................................................................................................................. - mls v20.4S, v3.4S, v8.S[0] // ...........................................................*................................................................................................................ 
- // gap // ............................................................................................................................................................................ - // gap // ............................................................................................................................................................................ - // gap // ............................................................................................................................................................................ - // gap // ............................................................................................................................................................................ - // gap // ............................................................................................................................................................................ - mls v22.4S, v28.4S, v8.S[0] // .............................................................*.............................................................................................................. - mls v9.4S, v26.4S, v8.S[0] // ............................................................*............................................................................................................... - sub v18.4S, v16.4S, v31.4S // ........................................................*................................................................................................................... - // gap // ............................................................................................................................................................................ - // gap // ............................................................................................................................................................................ 
- // gap // ............................................................................................................................................................................ - // gap // ............................................................................................................................................................................ - // gap // ............................................................................................................................................................................ - add v10.4S, v27.4S, v4.4S // .............................................*.............................................................................................................................. - ldr q0, [x5, #-96] // .................*.......................................................................................................................................................... - // gap // ............................................................................................................................................................................ - // gap // ............................................................................................................................................................................ - // gap // ............................................................................................................................................................................ - // gap // ............................................................................................................................................................................ - // gap // ............................................................................................................................................................................ 
- // gap // ............................................................................................................................................................................ - add v5.4S, v14.4S, v20.4S // ...................................................................*........................................................................................................ - sub v25.4S, v14.4S, v20.4S // ..................................................................*......................................................................................................... - // gap // ............................................................................................................................................................................ - // gap // ............................................................................................................................................................................ - // gap // ............................................................................................................................................................................ - // gap // ............................................................................................................................................................................ - // gap // ............................................................................................................................................................................ - mul v3.4S, v18.4S, v13.4S // ..............................................................*............................................................................................................. - sub v17.4S, v10.4S, v30.4S // ......................................................*..................................................................................................................... 
- sub v27.4S, v9.4S, v22.4S // ....................................................................*....................................................................................................... - // gap // ............................................................................................................................................................................ - // gap // ............................................................................................................................................................................ - // gap // ............................................................................................................................................................................ - // gap // ............................................................................................................................................................................ - // gap // ............................................................................................................................................................................ - // gap // ............................................................................................................................................................................ - mul v28.4S, v25.4S, v13.4S // .......................................................................*.................................................................................................... - sqrdmulh v18.4S, v18.4S, v29.4S // ...............................................................*............................................................................................................ - sqrdmulh v25.4S, v25.4S, v29.4S // ........................................................................*................................................................................................... 
- ldr q29, [x4, #16] // ...............................*............................................................................................................................................ - // gap // ............................................................................................................................................................................ - // gap // ............................................................................................................................................................................ - // gap // ............................................................................................................................................................................ - // gap // ............................................................................................................................................................................ - sqrdmulh v12.4S, v17.4S, v19.4S // .................................................................*.......................................................................................................... - sqrdmulh v13.4S, v27.4S, v19.4S // ............................................................................*............................................................................................... - mul v2.4S, v17.4S, v0.4S // ................................................................*........................................................................................................... - mul v27.4S, v27.4S, v0.4S // ...........................................................................*................................................................................................ - // gap // ............................................................................................................................................................................ 
- // gap // ............................................................................................................................................................................ - // gap // ............................................................................................................................................................................ - // gap // ............................................................................................................................................................................ - add v14.4S, v10.4S, v30.4S // .......................................................*.................................................................................................................... - add v19.4S, v16.4S, v31.4S // .........................................................*.................................................................................................................. - // gap // ............................................................................................................................................................................ - // gap // ............................................................................................................................................................................ - // gap // ............................................................................................................................................................................ - // gap // ............................................................................................................................................................................ - // gap // ............................................................................................................................................................................ 
- // gap // ............................................................................................................................................................................ - mls v28.4S, v25.4S, v8.S[0] // ................................................................................*........................................................................................... - mls v3.4S, v18.4S, v8.S[0] // ......................................................................*..................................................................................................... - // gap // ............................................................................................................................................................................ - // gap // ............................................................................................................................................................................ - // gap // ............................................................................................................................................................................ - // gap // ............................................................................................................................................................................ - // gap // ............................................................................................................................................................................ - // gap // ............................................................................................................................................................................ - mls v2.4S, v12.4S, v8.S[0] // .........................................................................*.................................................................................................. 
- add v12.4S, v9.4S, v22.4S // .....................................................................*...................................................................................................... - mls v27.4S, v13.4S, v8.S[0] // .................................................................................*.......................................................................................... - // gap // ............................................................................................................................................................................ - // gap // ............................................................................................................................................................................ - // gap // ............................................................................................................................................................................ - // gap // ............................................................................................................................................................................ - // gap // ............................................................................................................................................................................ - trn1 v7.4S, v19.4S, v5.4S // ..........................................................................*................................................................................................. - trn2 v9.4S, v19.4S, v5.4S // .............................................................................*.............................................................................................. - ldr q19, [x4, #32] // ..............................*............................................................................................................................................. 
- // gap // ............................................................................................................................................................................ - // gap // ............................................................................................................................................................................ - // gap // ............................................................................................................................................................................ - // gap // ............................................................................................................................................................................ - // gap // ............................................................................................................................................................................ - trn2 v18.4S, v3.4S, v28.4S // ...................................................................................*........................................................................................ - trn1 v15.4S, v3.4S, v28.4S // ..................................................................................*......................................................................................... - // gap // ............................................................................................................................................................................ - // gap // ............................................................................................................................................................................ - // gap // ............................................................................................................................................................................ 
- // gap // ............................................................................................................................................................................ - // gap // ............................................................................................................................................................................ - // gap // ............................................................................................................................................................................ - trn1 v5.4S, v14.4S, v12.4S // ..............................................................................*............................................................................................. - trn2 v0.4S, v14.4S, v12.4S // ...............................................................................*............................................................................................ - trn2 v17.4S, v2.4S, v27.4S // .....................................................................................*...................................................................................... - trn1 v2.4S, v2.4S, v27.4S // ....................................................................................*....................................................................................... - // gap // ............................................................................................................................................................................ - // gap // ............................................................................................................................................................................ - // gap // ............................................................................................................................................................................ 
- // gap // ............................................................................................................................................................................ - trn1 v13.2D, v9.2D, v18.2D // .........................................................................................*.................................................................................. - trn2 v18.2D, v9.2D, v18.2D // ........................................................................................*................................................................................... - trn2 v9.2D, v7.2D, v15.2D // ......................................................................................*..................................................................................... - trn1 v15.2D, v7.2D, v15.2D // .......................................................................................*.................................................................................... - // gap // ............................................................................................................................................................................ - // gap // ............................................................................................................................................................................ - // gap // ............................................................................................................................................................................ - // gap // ............................................................................................................................................................................ - trn2 v12.2D, v0.2D, v17.2D // ............................................................................................*............................................................................... 
- trn1 v17.2D, v0.2D, v17.2D // .............................................................................................*.............................................................................. - trn1 v0.2D, v5.2D, v2.2D // ...........................................................................................*................................................................................ - trn2 v31.2D, v5.2D, v2.2D // ..........................................................................................*................................................................................. - // gap // ............................................................................................................................................................................ - // gap // ............................................................................................................................................................................ - // gap // ............................................................................................................................................................................ - ldr q5, [x4, #48] // .....................................*...................................................................................................................................... - sub v25.4S, v9.4S, v18.4S // ..............................................................................................*............................................................................. - add v18.4S, v9.4S, v18.4S // ...............................................................................................*............................................................................ - sub v9.4S, v15.4S, v13.4S // ................................................................................................*........................................................................... 
- add v2.4S, v15.4S, v13.4S // .................................................................................................*.......................................................................... - // gap // ............................................................................................................................................................................ - // gap // ............................................................................................................................................................................ - // gap // ............................................................................................................................................................................ - // gap // ............................................................................................................................................................................ - sub v15.4S, v31.4S, v12.4S // ..................................................................................................*......................................................................... - sub v13.4S, v0.4S, v17.4S // ....................................................................................................*....................................................................... - // gap // ............................................................................................................................................................................ - // gap // ............................................................................................................................................................................ - // gap // ............................................................................................................................................................................ 
- // gap // ............................................................................................................................................................................ - // gap // ............................................................................................................................................................................ - // gap // ............................................................................................................................................................................ - sqrdmulh v28.4S, v25.4S, v19.S[1] // .......................................................................................................*.................................................................... - mul v25.4S, v25.4S, v19.S[0] // ......................................................................................................*..................................................................... - sqrdmulh v3.4S, v9.4S, v29.S[3] // .........................................................................................................*.................................................................. - mul v9.4S, v9.4S, v29.S[2] // ........................................................................................................*................................................................... - // gap // ............................................................................................................................................................................ - // gap // ............................................................................................................................................................................ - // gap // ............................................................................................................................................................................ 
- // gap // ............................................................................................................................................................................ - mul v20.4S, v15.4S, v5.S[0] // ............................................................................................................*............................................................... - sqrdmulh v27.4S, v15.4S, v5.S[1] // .............................................................................................................*.............................................................. - mul v15.4S, v13.4S, v19.S[2] // ..........................................................................................................*................................................................. - sqrdmulh v22.4S, v13.4S, v19.S[3] // ...........................................................................................................*................................................................ - // gap // ............................................................................................................................................................................ - // gap // ............................................................................................................................................................................ - // gap // ............................................................................................................................................................................ - ldr q5, [x4], #64 // ................................*........................................................................................................................................... - add v19.4S, v31.4S, v12.4S // ...................................................................................................*........................................................................ 
- add v13.4S, v0.4S, v17.4S // .....................................................................................................*...................................................................... - // gap // ............................................................................................................................................................................ - // gap // ............................................................................................................................................................................ - // gap // ............................................................................................................................................................................ - // gap // ............................................................................................................................................................................ - // gap // ............................................................................................................................................................................ - // gap // ............................................................................................................................................................................ - mls v9.4S, v3.4S, v8.S[0] // ..................................................................................................................*......................................................... - mls v25.4S, v28.4S, v8.S[0] // ...................................................................................................................*........................................................ - sub v31.4S, v2.4S, v18.4S // ..............................................................................................................*............................................................. 
- // gap // ............................................................................................................................................................................ - // gap // ............................................................................................................................................................................ - // gap // ............................................................................................................................................................................ - // gap // ............................................................................................................................................................................ - // gap // ............................................................................................................................................................................ - mls v15.4S, v22.4S, v8.S[0] // ....................................................................................................................*....................................................... - sub v0.4S, v13.4S, v19.4S // ................................................................................................................*........................................................... - // gap // ............................................................................................................................................................................ - // gap // ............................................................................................................................................................................ - // gap // ............................................................................................................................................................................ 
- // gap // ............................................................................................................................................................................ - // gap // ............................................................................................................................................................................ - mls v20.4S, v27.4S, v8.S[0] // .....................................................................................................................*...................................................... - add v17.4S, v13.4S, v19.4S // .................................................................................................................*.......................................................... - sqrdmulh v13.4S, v31.4S, v5.S[3] // .......................................................................................................................*.................................................... - // gap // ............................................................................................................................................................................ - // gap // ............................................................................................................................................................................ - // gap // ............................................................................................................................................................................ - // gap // ............................................................................................................................................................................ - // gap // ............................................................................................................................................................................ 
- // gap // ............................................................................................................................................................................ - add v2.4S, v2.4S, v18.4S // ...............................................................................................................*............................................................ - sub v18.4S, v9.4S, v25.4S // ............................................................................................................................*............................................... - add v12.4S, v9.4S, v25.4S // .............................................................................................................................*.............................................. - mul v9.4S, v31.4S, v5.S[2] // ......................................................................................................................*..................................................... - // gap // ............................................................................................................................................................................ - // gap // ............................................................................................................................................................................ - // gap // ............................................................................................................................................................................ - // gap // ............................................................................................................................................................................ - sub v14.4S, v15.4S, v20.4S // ..............................................................................................................................*............................................. 
- add v28.4S, v15.4S, v20.4S // ...............................................................................................................................*............................................ - mul v15.4S, v0.4S, v29.S[0] // ........................................................................................................................*................................................... - sqrdmulh v0.4S, v0.4S, v29.S[1] // .........................................................................................................................*.................................................. - // gap // ............................................................................................................................................................................ - // gap // ............................................................................................................................................................................ - // gap // ............................................................................................................................................................................ - // gap // ............................................................................................................................................................................ - srshr v27.4S, v17.4S, #23 // ...........................................................................................................................*................................................ - srshr v25.4S, v12.4S, #23 // .......................................................................................................................................*.................................... - srshr v31.4S, v2.4S, #23 // ..........................................................................................................................*................................................. 
- sqrdmulh v22.4S, v18.4S, v5.S[3] // ....................................................................................................................................*....................................... - // gap // ............................................................................................................................................................................ - // gap // ............................................................................................................................................................................ - // gap // ............................................................................................................................................................................ - // gap // ............................................................................................................................................................................ - mul v19.4S, v18.4S, v5.S[2] // ...................................................................................................................................*........................................ - sqrdmulh v3.4S, v14.4S, v29.S[1] // ......................................................................................................................................*..................................... - mul v18.4S, v14.4S, v29.S[0] // .....................................................................................................................................*...................................... - // gap // ............................................................................................................................................................................ - // gap // ............................................................................................................................................................................ 
- // gap // ............................................................................................................................................................................ - // gap // ............................................................................................................................................................................ - srshr v14.4S, v28.4S, #23 // ........................................................................................................................................*................................... - mls v9.4S, v13.4S, v8.S[0] // ................................................................................................................................*........................................... - mls v15.4S, v0.4S, v8.S[0] // ..................................................................................................................................*......................................... - // gap // ............................................................................................................................................................................ - // gap // ............................................................................................................................................................................ - // gap // ............................................................................................................................................................................ - // gap // ............................................................................................................................................................................ - // gap // ............................................................................................................................................................................ 
- // gap // ............................................................................................................................................................................ - mls v12.4S, v25.4S, v8.4S // .............................................................................................................................................*.............................. - mls v17.4S, v27.4S, v8.4S // .........................................................................................................................................*.................................. - // gap // ............................................................................................................................................................................ - // gap // ............................................................................................................................................................................ - // gap // ............................................................................................................................................................................ - // gap // ............................................................................................................................................................................ - // gap // ............................................................................................................................................................................ - mls v2.4S, v31.4S, v8.4S // .................................................................................................................................*.......................................... - mls v28.4S, v14.4S, v8.4S // ..............................................................................................................................................*............................. 
- mls v19.4S, v22.4S, v8.S[0] // ............................................................................................................................................*............................... - mls v18.4S, v3.4S, v8.S[0] // ...............................................................................................................................................*............................ - // gap // ............................................................................................................................................................................ - // gap // ............................................................................................................................................................................ - // gap // ............................................................................................................................................................................ - // gap // ............................................................................................................................................................................ - // gap // ............................................................................................................................................................................ - sub v20.4S, v9.4S, v15.4S // ...........................................................................................................................................*................................ - // gap // ............................................................................................................................................................................ - // gap // ............................................................................................................................................................................ 
- // gap // ............................................................................................................................................................................ - // gap // ............................................................................................................................................................................ - // gap // ............................................................................................................................................................................ - // gap // ............................................................................................................................................................................ - // gap // ............................................................................................................................................................................ - sub v13.4S, v2.4S, v17.4S // .................................................................................................................................................*.......................... - // gap // ............................................................................................................................................................................ - // gap // ............................................................................................................................................................................ - // gap // ............................................................................................................................................................................ - // gap // ............................................................................................................................................................................ 
- // gap // ............................................................................................................................................................................ - // gap // ............................................................................................................................................................................ - // gap // ............................................................................................................................................................................ - sub v29.4S, v12.4S, v28.4S // .......................................................................................................................................................*.................... - sub v0.4S, v19.4S, v18.4S // .....................................................................................................................................................*...................... - add v23.4S, v12.4S, v28.4S // ........................................................................................................................................................*................... - // gap // ............................................................................................................................................................................ - // gap // ............................................................................................................................................................................ - // gap // ............................................................................................................................................................................ - // gap // ............................................................................................................................................................................ 
- // gap // ............................................................................................................................................................................ - mul v10.4S, v20.4S, v5.S[0] // ...................................................................................................................................................*........................ - sqrdmulh v14.4S, v20.4S, v5.S[1] // ....................................................................................................................................................*....................... - mul v24.4S, v13.4S, v5.S[0] // ..........................................................................................................................................................*................. - sqrdmulh v13.4S, v13.4S, v5.S[1] // ...........................................................................................................................................................*................ - // gap // ............................................................................................................................................................................ - // gap // ............................................................................................................................................................................ - // gap // ............................................................................................................................................................................ - // gap // ............................................................................................................................................................................ - sqrdmulh v28.4S, v29.4S, v5.S[1] // ................................................................................................................................................................*........... 
- mul v27.4S, v29.4S, v5.S[0] // ...............................................................................................................................................................*............ - mul v29.4S, v0.4S, v5.S[0] // .............................................................................................................................................................*.............. - sqrdmulh v0.4S, v0.4S, v5.S[1] // ..............................................................................................................................................................*............. - str q23, [x1, #16] // ..................................................................................................................................................................*......... - // gap // ............................................................................................................................................................................ - // gap // ............................................................................................................................................................................ - // gap // ............................................................................................................................................................................ - add v12.4S, v9.4S, v15.4S // ..........................................................................................................................................*................................. - add v5.4S, v2.4S, v17.4S // ................................................................................................................................................*........................... - // gap // ............................................................................................................................................................................ 
- // gap // ............................................................................................................................................................................ - // gap // ............................................................................................................................................................................ - // gap // ............................................................................................................................................................................ - // gap // ............................................................................................................................................................................ - // gap // ............................................................................................................................................................................ - add v19.4S, v19.4S, v18.4S // ......................................................................................................................................................*..................... - mls v10.4S, v14.4S, v8.S[0] // ............................................................................................................................................................*............... - // gap // ............................................................................................................................................................................ - // gap // ............................................................................................................................................................................ - // gap // ............................................................................................................................................................................ 
- // gap // ............................................................................................................................................................................ - // gap // ............................................................................................................................................................................ - mls v24.4S, v13.4S, v8.S[0] // ....................................................................................................................................................................*....... - mls v27.4S, v28.4S, v8.S[0] // .......................................................................................................................................................................*.... - mls v29.4S, v0.4S, v8.S[0] // ......................................................................................................................................................................*..... - str q12, [x1, #32] // ..................................................................................................................................................*......................... - str q5, [x1], #(16*4) // .........................................................................................................................................................*.................. - // gap // ............................................................................................................................................................................ - // gap // ............................................................................................................................................................................ - // gap // ............................................................................................................................................................................ 
- // gap // ............................................................................................................................................................................ - str q19, [x1, #-16] // .................................................................................................................................................................*.......... - add x1, x1, #64 // ...................................................................................................................................................................*........ - // gap // ............................................................................................................................................................................ - // gap // ............................................................................................................................................................................ - // gap // ............................................................................................................................................................................ - // gap // ............................................................................................................................................................................ - // gap // ............................................................................................................................................................................ - // gap // ............................................................................................................................................................................ - str q10, [x2, #32] // .....................................................................................................................................................................*...... 
- str q24, [x2], #(16*4) // ........................................................................................................................................................................*... - // gap // ............................................................................................................................................................................ - // gap // ............................................................................................................................................................................ - // gap // ............................................................................................................................................................................ - // gap // ............................................................................................................................................................................ - // gap // ............................................................................................................................................................................ - // gap // ............................................................................................................................................................................ - str q27, [x2, #-48] // ..........................................................................................................................................................................*. - str q29, [x2, #-16] // .........................................................................................................................................................................*.. - // gap // ............................................................................................................................................................................ 
- add x2, x2, #64 // ...........................................................................................................................................................................* - // gap // ............................................................................................................................................................................ - // gap // ............................................................................................................................................................................ - // gap // ............................................................................................................................................................................ - // gap // ............................................................................................................................................................................ - - // original source code - // ldr q29, [x1, #0] // ..*......................................................................................................................................................................... - // ldr q19, [x1, #16] // ...*........................................................................................................................................................................ - // ldr q5, [x2, #0] // .....*...................................................................................................................................................................... - // trn1 v9.4S, v0.4S, v18.4S // ............*............................................................................................................................................................... - // trn2 v0.4S, v0.4S, v18.4S // .*.......................................................................................................................................................................... 
- // ldr q2, [x2, #16] // ......*..................................................................................................................................................................... - // ldr q12, [x2, #32] // *........................................................................................................................................................................... - // ldr q15, [x2, #48] // ....*....................................................................................................................................................................... - // ldr q13, [x5, #32] // .................*.......................................................................................................................................................... - // ldr q18, [x5], #(12*16) // ........*................................................................................................................................................................... - // ldr q27, [x5, #-176] // ................................................*........................................................................................................................... - // ldr q17, [x5, #-144] // ...........*................................................................................................................................................................ - // ldr q31, [x5, #-128] // ......................*..................................................................................................................................................... - // ldr q25, [x5, #-112] // .......................*.................................................................................................................................................... 
- // trn1 v28.4S, v29.4S, v19.4S // .........*.................................................................................................................................................................. - // trn2 v29.4S, v29.4S, v19.4S // ..........*................................................................................................................................................................. - // ldr q19, [x5, #-64] // ........................*................................................................................................................................................... - // ldr q3, [x5, #-96] // ......................................................*..................................................................................................................... - // ldr q22, [x5, #-80] // .....................................*...................................................................................................................................... - // trn1 v1.4S, v5.4S, v2.4S // ................*........................................................................................................................................................... - // trn2 v5.4S, v5.4S, v2.4S // .............*.............................................................................................................................................................. - // trn1 v2.4S, v12.4S, v15.4S // ..............*............................................................................................................................................................. - // trn2 v12.4S, v12.4S, v15.4S // ...............*............................................................................................................................................................ 
- // ldr q15, [x5, #-48] // .............................*.............................................................................................................................................. - // ldr q4, [x5, #-32] // .......*.................................................................................................................................................................... - // ldr q16, [x5, #-16] // ..............................*............................................................................................................................................. - // trn2 v20.2D, v28.2D, v9.2D // ..................*......................................................................................................................................................... - // trn1 v9.2D, v28.2D, v9.2D // ...................*........................................................................................................................................................ - // trn2 v28.2D, v29.2D, v0.2D // ....................*....................................................................................................................................................... - // trn1 v29.2D, v29.2D, v0.2D // .....................*...................................................................................................................................................... - // ldr q0, [x4, #32] // .............................................................................*.............................................................................................. - // ldr q11, [x4, #16] // ...............................................................*............................................................................................................ 
- // ldr q6, [x4], #64 // ...........................................................................................................*................................................................ - // trn2 v23.2D, v1.2D, v2.2D // ............................*............................................................................................................................................... - // trn1 v2.2D, v1.2D, v2.2D // ...........................*................................................................................................................................................ - // trn2 v1.2D, v5.2D, v12.2D // ..........................*................................................................................................................................................. - // trn1 v5.2D, v5.2D, v12.2D // .........................*.................................................................................................................................................. - // ldr q12, [x4, #-16] // ............................................................................................*............................................................................... - // sub v21.4S, v20.4S, v28.4S // ...............................*............................................................................................................................................ - // add v28.4S, v20.4S, v28.4S // ................................*........................................................................................................................................... - // sub v20.4S, v9.4S, v29.4S // .................................*.......................................................................................................................................... 
- // add v29.4S, v9.4S, v29.4S // .............................................*.............................................................................................................................. - // sub v9.4S, v23.4S, v1.4S // ...................................*........................................................................................................................................ - // add v1.4S, v23.4S, v1.4S // ..............................................*............................................................................................................................. - // sub v23.4S, v2.4S, v5.4S // ..................................*......................................................................................................................................... - // add v5.4S, v2.4S, v5.4S // .....................................................*...................................................................................................................... - // mul v2.4S, v20.4S, v13.4S // ....................................*....................................................................................................................................... - // sqrdmulh v13.4S, v20.4S, v17.4S // ......................................*..................................................................................................................................... - // mul v17.4S, v21.4S, v31.4S // .......................................*.................................................................................................................................... - // sqrdmulh v31.4S, v21.4S, v25.4S // ........................................*................................................................................................................................... 
- // mul v19.4S, v23.4S, v19.4S // ...........................................*................................................................................................................................ - // sqrdmulh v15.4S, v23.4S, v15.4S // ..........................................*................................................................................................................................. - // mul v25.4S, v9.4S, v4.4S // .........................................*.................................................................................................................................. - // sqrdmulh v9.4S, v9.4S, v16.4S // ............................................*............................................................................................................................... - // sub v4.4S, v5.4S, v1.4S // ..........................................................*................................................................................................................. - // add v5.4S, v5.4S, v1.4S // ....................................................................*....................................................................................................... - // sub v1.4S, v29.4S, v28.4S // ....................................................*....................................................................................................................... - // add v29.4S, v29.4S, v28.4S // .....................................................................*...................................................................................................... - // mls v2.4S, v13.4S, v8.S[0] // ...............................................*............................................................................................................................ 
- // mls v17.4S, v31.4S, v8.S[0] // .................................................*.......................................................................................................................... - // mls v19.4S, v15.4S, v8.S[0] // ...................................................*........................................................................................................................ - // mls v25.4S, v9.4S, v8.S[0] // ..................................................*......................................................................................................................... - // mul v9.4S, v1.4S, v18.4S // .........................................................*.................................................................................................................. - // sqrdmulh v15.4S, v1.4S, v27.4S // .............................................................*.............................................................................................................. - // mul v13.4S, v4.4S, v3.4S // ..................................................................*......................................................................................................... - // sqrdmulh v31.4S, v4.4S, v22.4S // ................................................................*........................................................................................................... - // sub v28.4S, v2.4S, v17.4S // ........................................................*................................................................................................................... - // add v2.4S, v2.4S, v17.4S // .......................................................*.................................................................................................................... 
- // sub v17.4S, v19.4S, v25.4S // ...........................................................*................................................................................................................ - // add v19.4S, v19.4S, v25.4S // .........................................................................*.................................................................................................. - // mls v9.4S, v15.4S, v8.S[0] // .......................................................................*.................................................................................................... - // mul v15.4S, v28.4S, v18.4S // ............................................................*............................................................................................................... - // sqrdmulh v18.4S, v28.4S, v27.4S // ..............................................................*............................................................................................................. - // mls v13.4S, v31.4S, v8.S[0] // ........................................................................*................................................................................................... - // trn1 v27.4S, v29.4S, v2.4S // ...........................................................................*................................................................................................ - // mul v31.4S, v17.4S, v3.4S // ...................................................................*........................................................................................................ - // sqrdmulh v17.4S, v17.4S, v22.4S // .................................................................*.......................................................................................................... 
- // trn2 v29.4S, v29.4S, v2.4S // ............................................................................*............................................................................................... - // trn1 v2.4S, v5.4S, v19.4S // ................................................................................*........................................................................................... - // trn2 v19.4S, v5.4S, v19.4S // .................................................................................*.......................................................................................... - // mls v15.4S, v18.4S, v8.S[0] // ......................................................................*..................................................................................................... - // mls v31.4S, v17.4S, v8.S[0] // ..........................................................................*................................................................................................. - // trn1 v5.4S, v9.4S, v15.4S // ...............................................................................*............................................................................................ - // trn2 v9.4S, v9.4S, v15.4S // ..............................................................................*............................................................................................. - // trn1 v15.4S, v13.4S, v31.4S // ...................................................................................*........................................................................................ - // trn2 v13.4S, v13.4S, v31.4S // ..................................................................................*......................................................................................... 
- // trn2 v18.2D, v27.2D, v5.2D // ......................................................................................*..................................................................................... - // trn1 v5.2D, v27.2D, v5.2D // .......................................................................................*.................................................................................... - // trn2 v27.2D, v29.2D, v9.2D // .....................................................................................*...................................................................................... - // trn1 v29.2D, v29.2D, v9.2D // ....................................................................................*....................................................................................... - // trn2 v9.2D, v2.2D, v15.2D // ...........................................................................................*................................................................................ - // trn1 v2.2D, v2.2D, v15.2D // ..........................................................................................*................................................................................. - // trn2 v15.2D, v19.2D, v13.2D // ........................................................................................*................................................................................... - // trn1 v19.2D, v19.2D, v13.2D // .........................................................................................*.................................................................................. - // sub v13.4S, v18.4S, v27.4S // .............................................................................................*.............................................................................. 
- // add v18.4S, v18.4S, v27.4S // ..............................................................................................*............................................................................. - // sub v27.4S, v5.4S, v29.4S // ...............................................................................................*............................................................................ - // add v29.4S, v5.4S, v29.4S // ................................................................................................*........................................................................... - // sub v5.4S, v9.4S, v15.4S // .................................................................................................*.......................................................................... - // add v15.4S, v9.4S, v15.4S // ............................................................................................................*............................................................... - // sub v17.4S, v2.4S, v19.4S // ..................................................................................................*......................................................................... - // add v19.4S, v2.4S, v19.4S // .............................................................................................................*.............................................................. - // mul v2.4S, v13.4S, v0.S[0] // ....................................................................................................*....................................................................... - // sqrdmulh v13.4S, v13.4S, v0.S[1] // ...................................................................................................*........................................................................ 
- // mul v9.4S, v27.4S, v11.S[2] // ......................................................................................................*..................................................................... - // sqrdmulh v27.4S, v27.4S, v11.S[3] // .....................................................................................................*...................................................................... - // mul v31.4S, v17.4S, v0.S[2] // .........................................................................................................*.................................................................. - // sqrdmulh v0.4S, v17.4S, v0.S[3] // ..........................................................................................................*................................................................. - // mul v17.4S, v5.4S, v12.S[0] // .......................................................................................................*.................................................................... - // sqrdmulh v5.4S, v5.4S, v12.S[1] // ........................................................................................................*................................................................... - // sub v12.4S, v29.4S, v18.4S // ................................................................................................................*........................................................... - // add v29.4S, v29.4S, v18.4S // ......................................................................................................................*..................................................... - // sub v18.4S, v19.4S, v15.4S // ..................................................................................................................*......................................................... 
- // add v19.4S, v19.4S, v15.4S // ....................................................................................................................*....................................................... - // mls v9.4S, v27.4S, v8.S[0] // ..............................................................................................................*............................................................. - // mls v2.4S, v13.4S, v8.S[0] // ...............................................................................................................*............................................................ - // mls v31.4S, v0.4S, v8.S[0] // .................................................................................................................*.......................................................... - // mls v17.4S, v5.4S, v8.S[0] // ...................................................................................................................*........................................................ - // mul v5.4S, v12.4S, v6.S[2] // .........................................................................................................................*.................................................. - // sqrdmulh v0.4S, v12.4S, v6.S[3] // .....................................................................................................................*...................................................... - // mul v12.4S, v18.4S, v11.S[0] // ............................................................................................................................*............................................... - // sqrdmulh v15.4S, v18.4S, v11.S[1] // .............................................................................................................................*.............................................. 
- // srshr v13.4S, v29.4S, #23 // ................................................................................................................................*........................................... - // srshr v18.4S, v19.4S, #23 // ..............................................................................................................................*............................................. - // sub v27.4S, v9.4S, v2.4S // .......................................................................................................................*.................................................... - // add v9.4S, v9.4S, v2.4S // ........................................................................................................................*................................................... - // sub v2.4S, v31.4S, v17.4S // ..........................................................................................................................*................................................. - // add v17.4S, v31.4S, v17.4S // ...........................................................................................................................*................................................ - // mls v5.4S, v0.4S, v8.S[0] // ......................................................................................................................................*..................................... - // mls v29.4S, v13.4S, v8.4S // ..........................................................................................................................................*................................. - // mls v12.4S, v15.4S, v8.S[0] // .......................................................................................................................................*.................................... 
- // mul v0.4S, v27.4S, v6.S[2] // ..................................................................................................................................*......................................... - // sqrdmulh v15.4S, v27.4S, v6.S[3] // .................................................................................................................................*.......................................... - // mul v13.4S, v2.4S, v11.S[0] // ....................................................................................................................................*....................................... - // sqrdmulh v2.4S, v2.4S, v11.S[1] // ...................................................................................................................................*........................................ - // srshr v27.4S, v9.4S, #23 // ...............................................................................................................................*............................................ - // srshr v31.4S, v17.4S, #23 // .....................................................................................................................................*...................................... - // mls v19.4S, v18.4S, v8.4S // .........................................................................................................................................*.................................. - // add v18.4S, v5.4S, v12.4S // ............................................................................................................................................................*............... - // sub v5.4S, v5.4S, v12.4S // ..............................................................................................................................................*............................. 
- // mls v0.4S, v15.4S, v8.S[0] // ............................................................................................................................................*............................... - // mls v9.4S, v27.4S, v8.4S // ........................................................................................................................................*................................... - // mls v17.4S, v31.4S, v8.4S // ...........................................................................................................................................*................................ - // mls v13.4S, v2.4S, v8.S[0] // .............................................................................................................................................*.............................. - // add v2.4S, v29.4S, v19.4S // .............................................................................................................................................................*.............. - // sub v29.4S, v29.4S, v19.4S // ...............................................................................................................................................*............................ - // str q18, [x1, #32] // ...................................................................................................................................................................*........ - // mul v19.4S, v5.4S, v6.S[0] // ...................................................................................................................................................*........................ - // sqrdmulh v5.4S, v5.4S, v6.S[1] // ....................................................................................................................................................*....................... 
- // sub v12.4S, v0.4S, v13.4S // .................................................................................................................................................*.......................... - // add v0.4S, v0.4S, v13.4S // ..............................................................................................................................................................*............. - // sub v15.4S, v9.4S, v17.4S // ................................................................................................................................................*........................... - // add v9.4S, v9.4S, v17.4S // ..................................................................................................................................................*......................... - // str q2, [x1], #(16*4) // ....................................................................................................................................................................*....... - // mul v2.4S, v29.4S, v6.S[0] // .....................................................................................................................................................*...................... - // sqrdmulh v29.4S, v29.4S, v6.S[1] // ......................................................................................................................................................*..................... - // mls v19.4S, v5.4S, v8.S[0] // ...............................................................................................................................................................*............ - // mul v5.4S, v12.4S, v6.S[0] // .........................................................................................................................................................*.................. 
- // sqrdmulh v12.4S, v12.4S, v6.S[1] // ..........................................................................................................................................................*................. - // mul v13.4S, v15.4S, v6.S[0] // ........................................................................................................................................................*................... - // sqrdmulh v15.4S, v15.4S, v6.S[1] // .......................................................................................................................................................*.................... - // str q0, [x1, #-16] // .....................................................................................................................................................................*...... - // str q9, [x1, #-48] // ...........................................................................................................................................................*................ - // add x1, x1, #64 // ......................................................................................................................................................................*..... - // mls v2.4S, v29.4S, v8.S[0] // ................................................................................................................................................................*........... - // str q19, [x2, #32] // .......................................................................................................................................................................*.... - // mls v5.4S, v12.4S, v8.S[0] // ..................................................................................................................................................................*......... 
- // mls v13.4S, v15.4S, v8.S[0] // .................................................................................................................................................................*.......... - // str q2, [x2], #(16*4) // ........................................................................................................................................................................*... - // str q5, [x2, #-16] // ..........................................................................................................................................................................*. - // str q13, [x2, #-48] // .........................................................................................................................................................................*.. - // add x2, x2, #64 // ...........................................................................................................................................................................* + // Instructions: 164 + // Expected cycles: 54 + // Expected IPC: 3.04 + // + // Wall time: 78.88s + // User time: 78.88s + // + // ------------------------------------------------------------------------ original position ------------------------------------------------------------------------> + // 0 25 50 75 100 125 150 + // |------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------- + trn2 v27.4S, v29.4S, v6.4S // .*.................................................................................................................................................................. + trn1 v23.4S, v29.4S, v6.4S // .....*.............................................................................................................................................................. 
+ trn2 v25.4S, v19.4S, v7.4S // ..*................................................................................................................................................................. + trn1 v7.4S, v19.4S, v7.4S // ......*............................................................................................................................................................. + ldr q9, [x2, #16] // ...*................................................................................................................................................................ + ldr q13, [x2, #0] // ....*............................................................................................................................................................... + ldr q1, [x2, #32] // *................................................................................................................................................................... + // gap // .................................................................................................................................................................... + ldr q3, [x2, #48] // .......*............................................................................................................................................................ + ldr q20, [x5, #-64] // ........*........................................................................................................................................................... + ldr q21, [x5, #-16] // .............*...................................................................................................................................................... + // gap // .................................................................................................................................................................... 
+ // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + trn1 v26.2D, v27.2D, v25.2D // ...........*........................................................................................................................................................ + trn2 v27.2D, v27.2D, v25.2D // ............*....................................................................................................................................................... + trn1 v25.2D, v23.2D, v7.2D // .........*.......................................................................................................................................................... + trn2 v7.2D, v23.2D, v7.2D // ..........*......................................................................................................................................................... + ldr q23, [x5, #-48] // ...........................*........................................................................................................................................ + ldr q18, [x5, #-32] // ................................*................................................................................................................................... 
+ ldr q16, [x5, #-80] // ......................*............................................................................................................................................. + // gap // .................................................................................................................................................................... + ldr q17, [x5, #-176] // .................................*.................................................................................................................................. + ldr q5, [x4, #32] // ....................................................*............................................................................................................... + ldr q4, [x4, #16] // ....................................*............................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... 
+ sub v22.4S, v25.4S, v26.4S // ................*................................................................................................................................................... + add v25.4S, v25.4S, v26.4S // .................*.................................................................................................................................................. + sub v26.4S, v7.4S, v27.4S // ...............*.................................................................................................................................................... + trn2 v14.4S, v13.4S, v9.4S // ..............*..................................................................................................................................................... + ldr q19, [x4, #48] // ..........................................................................*......................................................................................... + ldr q29, [x4], #64 // .........................................................*.......................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + trn1 v9.4S, v13.4S, v9.4S // ....................*............................................................................................................................................... + trn2 v13.4S, v1.4S, v3.4S // ...................*................................................................................................................................................ 
+ trn1 v1.4S, v1.4S, v3.4S // ..................*................................................................................................................................................. + add v27.4S, v7.4S, v27.4S // .....................*.............................................................................................................................................. + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + mul v7.4S, v22.4S, v0.4S // .......................*............................................................................................................................................ + sqrdmulh v3.4S, v22.4S, v15.4S // ........................*........................................................................................................................................... + sqrdmulh v15.4S, v26.4S, v2.4S // .........................*.......................................................................................................................................... + mul v26.4S, v26.4S, v30.4S // ..........................*......................................................................................................................................... 
+ // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + trn2 v0.2D, v14.2D, v13.2D // ............................*....................................................................................................................................... + trn1 v13.2D, v14.2D, v13.2D // .............................*...................................................................................................................................... + trn2 v2.2D, v9.2D, v1.2D // ..............................*..................................................................................................................................... + trn1 v9.2D, v9.2D, v1.2D // ...............................*.................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... 
+ // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + sub v1.4S, v25.4S, v27.4S // ..................................*................................................................................................................................. + add v27.4S, v25.4S, v27.4S // ........................................................*........................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... 
+ mls v7.4S, v3.4S, v8.S[0] // .......................................*............................................................................................................................ + mls v26.4S, v15.4S, v8.S[0] // ...................................*................................................................................................................................ + sub v25.4S, v9.4S, v13.4S // .....................................*.............................................................................................................................. + sub v3.4S, v2.4S, v0.4S // ......................................*............................................................................................................................. + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + add v15.4S, v2.4S, v0.4S // .........................................*.......................................................................................................................... + add v9.4S, v9.4S, v13.4S // ........................................*........................................................................................................................... 
+ mul v13.4S, v1.4S, v24.4S // .................................................*.................................................................................................................. + sqrdmulh v1.4S, v1.4S, v17.4S // ..................................................*................................................................................................................. + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + mul v20.4S, v25.4S, v20.4S // ...........................................*........................................................................................................................ + sqrdmulh v21.4S, v3.4S, v21.4S // .............................................*...................................................................................................................... + sqrdmulh v23.4S, v25.4S, v23.4S // ..........................................*......................................................................................................................... + mul v25.4S, v3.4S, v18.4S // ............................................*....................................................................................................................... 
+ // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + sub v3.4S, v7.4S, v26.4S // ...............................................*.................................................................................................................... + add v7.4S, v7.4S, v26.4S // ..............................................*..................................................................................................................... + sub v26.4S, v9.4S, v15.4S // ................................................*................................................................................................................... + add v9.4S, v9.4S, v15.4S // .............................................................*...................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... 
+ // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + mls v13.4S, v1.4S, v8.S[0] // ............................................................*....................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + mul v24.4S, v3.4S, v24.4S // .......................................................*............................................................................................................ 
+ sqrdmulh v1.4S, v3.4S, v17.4S // .....................................................*.............................................................................................................. + mls v20.4S, v23.4S, v8.S[0] // ......................................................*............................................................................................................. + mls v25.4S, v21.4S, v8.S[0] // ...................................................*................................................................................................................ + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + trn1 v23.4S, v27.4S, v7.4S // .........................................................................*.......................................................................................... + trn2 v27.4S, v27.4S, v7.4S // ..................................................................*................................................................................................. + mul v7.4S, v26.4S, v11.4S // ..........................................................*......................................................................................................... 
+ sqrdmulh v3.4S, v26.4S, v16.4S // ...........................................................*........................................................................................................ + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... 
+ // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + mls v24.4S, v1.4S, v8.S[0] // ..............................................................*..................................................................................................... + sub v1.4S, v20.4S, v25.4S // ...............................................................*.................................................................................................... + add v25.4S, v20.4S, v25.4S // ................................................................*................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... 
+ mls v7.4S, v3.4S, v8.S[0] // .................................................................*.................................................................................................. + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + mul v11.4S, v1.4S, v11.4S // ......................................................................*............................................................................................. + sqrdmulh v1.4S, v1.4S, v16.4S // .....................................................................*.............................................................................................. 
+ trn1 v3.4S, v9.4S, v25.4S // ...................................................................*................................................................................................ + trn2 v25.4S, v9.4S, v25.4S // ....................................................................*............................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + trn1 v9.4S, v13.4S, v24.4S // ........................................................................*........................................................................................... + trn2 v24.4S, v13.4S, v24.4S // .......................................................................*............................................................................................ + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... 
+ // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... 
+ // gap // .................................................................................................................................................................... + trn2 v13.2D, v23.2D, v9.2D // ............................................................................*....................................................................................... + trn1 v23.2D, v23.2D, v9.2D // .............................................................................*...................................................................................... + mls v11.4S, v1.4S, v8.S[0] // ...........................................................................*........................................................................................ + trn2 v9.2D, v27.2D, v24.2D // ..............................................................................*..................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + trn1 v27.2D, v27.2D, v24.2D // ...............................................................................*.................................................................................... 
+ // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + sub v24.4S, v13.4S, v9.4S // ................................................................................*................................................................................... + add v9.4S, v13.4S, v9.4S // .................................................................................*.................................................................................. + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... 
+ // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + trn1 v13.4S, v7.4S, v11.4S // ..................................................................................*................................................................................. + trn2 v11.4S, v7.4S, v11.4S // ...................................................................................*................................................................................ + add v7.4S, v23.4S, v27.4S // .....................................................................................*.............................................................................. + sub v27.4S, v23.4S, v27.4S // ..........................................................................................*......................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... 
+ // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + mul v23.4S, v24.4S, v5.S[0] // ....................................................................................*............................................................................... + sqrdmulh v24.4S, v24.4S, v5.S[1] // ...........................................................................................*........................................................................ + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... 
+ trn2 v1.2D, v3.2D, v13.2D // ......................................................................................*............................................................................. + trn1 v13.2D, v3.2D, v13.2D // .........................................................................................*.......................................................................... + trn2 v3.2D, v25.2D, v11.2D // ........................................................................................*........................................................................... + trn1 v11.2D, v25.2D, v11.2D // .......................................................................................*............................................................................ + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + sub v25.4S, v7.4S, v9.4S // ...............................................................................................................*.................................................... + add v7.4S, v7.4S, v9.4S // ............................................................................................*....................................................................... 
+ mul v9.4S, v27.4S, v4.S[2] // .................................................................................................*.................................................................. + sqrdmulh v27.4S, v27.4S, v4.S[3] // ..................................................................................................*................................................................. + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + sub v20.4S, v1.4S, v3.4S // .............................................................................................*...................................................................... + add v1.4S, v1.4S, v3.4S // ..............................................................................................*..................................................................... + sub v3.4S, v13.4S, v11.4S // ...............................................................................................*.................................................................... + add v11.4S, v13.4S, v11.4S // ................................................................................................*................................................................... 
+ // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + mls v23.4S, v24.4S, v8.S[0] // ........................................................................................................*........................................................... + mul v24.4S, v25.4S, v29.S[2] // .....................................................................................................................*.............................................. + sqrdmulh v25.4S, v25.4S, v29.S[3] // ......................................................................................................................*............................................. + srshr v13.4S, v7.4S, #23 // ...................................................................................................*................................................................ + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... 
+ // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + mul v21.4S, v3.4S, v5.S[2] // ....................................................................................................*............................................................... + sqrdmulh v3.4S, v3.4S, v5.S[3] // .....................................................................................................*.............................................................. + mul v15.4S, v20.4S, v19.S[0] // ......................................................................................................*............................................................. + sqrdmulh v20.4S, v20.4S, v19.S[1] // .......................................................................................................*............................................................ + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... 
+ mls v9.4S, v27.4S, v8.S[0] // ...........................................................................................................*........................................................ + sub v27.4S, v11.4S, v1.4S // .........................................................................................................*.......................................................... + add v11.4S, v11.4S, v1.4S // ............................................................................................................*....................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + mls v7.4S, v13.4S, v8.4S // ..........................................................................................................*......................................................... + mls v24.4S, v25.4S, v8.S[0] // ...............................................................................................................................*.................................... 
+ // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + mls v21.4S, v3.4S, v8.S[0] // ..............................................................................................................*..................................................... + mls v15.4S, v20.4S, v8.S[0] // .............................................................................................................*...................................................... + srshr v25.4S, v11.4S, #23 // .................................................................................................................*.................................................. + sqrdmulh v13.4S, v27.4S, v4.S[1] // ................................................................................................................*................................................... 
+ // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + sub v1.4S, v9.4S, v23.4S // ...................................................................................................................*................................................ + add v3.4S, v9.4S, v23.4S // ....................................................................................................................*............................................... + mul v27.4S, v27.4S, v4.S[0] // ..................................................................................................................*................................................. + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... 
+ // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + mls v11.4S, v25.4S, v8.4S // .........................................................................................................................*.......................................... 
+ sub v23.4S, v21.4S, v15.4S // .......................................................................................................................*............................................ + add v9.4S, v21.4S, v15.4S // ........................................................................................................................*........................................... + mul v21.4S, v1.4S, v29.S[2] // ..........................................................................................................................*......................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + mls v27.4S, v13.4S, v8.S[0] // .................................................................................................................................*.................................. + sqrdmulh v25.4S, v1.4S, v29.S[3] // ...........................................................................................................................*........................................ + srshr v13.4S, v3.4S, #23 // ............................................................................................................................*....................................... 
+ // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + mul v20.4S, v23.4S, v4.S[0] // .............................................................................................................................*...................................... + sqrdmulh v23.4S, v23.4S, v4.S[1] // ..............................................................................................................................*..................................... + srshr v1.4S, v9.4S, #23 // ................................................................................................................................*................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... 
+ // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + add v15.4S, v7.4S, v11.4S // ...................................................................................................................................*................................ + sub v11.4S, v7.4S, v11.4S // ..................................................................................................................................*................................. + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... 
+ add v7.4S, v24.4S, v27.4S // ............................................................................................................................................*....................... + sub v27.4S, v24.4S, v27.4S // ...........................................................................................................................................*........................ + mls v3.4S, v13.4S, v8.4S // .....................................................................................................................................*.............................. + mls v21.4S, v25.4S, v8.S[0] // ....................................................................................................................................*............................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + mls v9.4S, v1.4S, v8.4S // ......................................................................................................................................*............................. + mls v20.4S, v23.4S, v8.S[0] // .......................................................................................................................................*............................ 
+ str q15, [x1], #(16*4) // ..........................................................................................................................................*......................... + mul v24.4S, v11.4S, v29.S[0] // ........................................................................................................................................*........................... + sqrdmulh v11.4S, v11.4S, v29.S[1] // .........................................................................................................................................*.......................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + str q7, [x1, #-32] // ..................................................................................................................................................*................. + mul v7.4S, v27.4S, v29.S[0] // ...................................................................................................................................................*................ + sqrdmulh v27.4S, v27.4S, v29.S[1] // ....................................................................................................................................................*............... + // gap // .................................................................................................................................................................... 
+ // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... 
+ // gap // .................................................................................................................................................................... + sub v23.4S, v3.4S, v9.4S // .............................................................................................................................................*...................... + add v9.4S, v3.4S, v9.4S // ...............................................................................................................................................*.................... + sub v25.4S, v21.4S, v20.4S // ..............................................................................................................................................*..................... + add v13.4S, v21.4S, v20.4S // ................................................................................................................................................*................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + mls v24.4S, v11.4S, v8.S[0] // .................................................................................................................................................*.................. 
+ mls v7.4S, v27.4S, v8.S[0] // .............................................................................................................................................................*...... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + str q9, [x1, #-48] // .........................................................................................................................................................*.......... + str q13, [x1, #-16] // ..........................................................................................................................................................*......... + add x1, x1, #64 // ...........................................................................................................................................................*........ 
+ mul v27.4S, v23.4S, v29.S[0] // .....................................................................................................................................................*.............. + sqrdmulh v11.4S, v23.4S, v29.S[1] // ......................................................................................................................................................*............. + mul v23.4S, v25.4S, v29.S[0] // .......................................................................................................................................................*............ + sqrdmulh v25.4S, v25.4S, v29.S[1] // ........................................................................................................................................................*........... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... 
+ // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + str q24, [x2], #(16*4) // ............................................................................................................................................................*....... + str q7, [x2, #-32] // ................................................................................................................................................................*... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... 
+ mls v27.4S, v11.4S, v8.S[0] // ..............................................................................................................................................................*..... + mls v23.4S, v25.4S, v8.S[0] // ...............................................................................................................................................................*.... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... 
+ // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... 
+ // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + str q27, [x2, #-48] // .................................................................................................................................................................*.. + str q23, [x2, #-16] // ..................................................................................................................................................................*. + add x2, x2, #64 // ...................................................................................................................................................................* + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... 
+ + // -------------------------------------------------------------------------- new position ---------------------------------------------------------------------------> + // 0 25 50 75 100 125 150 + // |------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------- + // ldr q10, [x2, #32] // ......*............................................................................................................................................................. + // trn2 v25.4S, v29.4S, v6.4S // *................................................................................................................................................................... + // trn2 v22.4S, v19.4S, v7.4S // ..*................................................................................................................................................................. + // ldr q14, [x2, #16] // ....*............................................................................................................................................................... + // ldr q20, [x2, #0] // .....*.............................................................................................................................................................. + // trn1 v23.4S, v29.4S, v6.4S // .*.................................................................................................................................................................. + // trn1 v4.4S, v19.4S, v7.4S // ...*................................................................................................................................................................ + // ldr q19, [x2, #48] // .......*............................................................................................................................................................ 
+ // ldr q26, [x5, #-64] // ........*........................................................................................................................................................... + // trn1 v17.2D, v23.2D, v4.2D // ............*....................................................................................................................................................... + // trn2 v18.2D, v23.2D, v4.2D // .............*...................................................................................................................................................... + // trn1 v5.2D, v25.2D, v22.2D // ..........*......................................................................................................................................................... + // trn2 v7.2D, v25.2D, v22.2D // ...........*........................................................................................................................................................ + // ldr q31, [x5, #-16] // .........*.......................................................................................................................................................... + // trn2 v28.4S, v20.4S, v14.4S // .......................*............................................................................................................................................ + // sub v22.4S, v18.4S, v7.4S // ......................*............................................................................................................................................. + // sub v9.4S, v17.4S, v5.4S // ....................*............................................................................................................................................... + // add v16.4S, v17.4S, v5.4S // .....................*.............................................................................................................................................. 
+ // trn1 v21.4S, v10.4S, v19.4S // ............................*....................................................................................................................................... + // trn2 v6.4S, v10.4S, v19.4S // ...........................*........................................................................................................................................ + // trn1 v19.4S, v20.4S, v14.4S // ..........................*......................................................................................................................................... + // add v4.4S, v18.4S, v7.4S // .............................*...................................................................................................................................... + // ldr q14, [x5, #-80] // ................*................................................................................................................................................... + // mul v0.4S, v9.4S, v0.4S // ..............................*..................................................................................................................................... + // sqrdmulh v29.4S, v9.4S, v15.4S // ...............................*.................................................................................................................................... + // sqrdmulh v17.4S, v22.4S, v2.4S // ................................*................................................................................................................................... + // mul v22.4S, v22.4S, v30.4S // .................................*.................................................................................................................................. + // ldr q30, [x5, #-48] // ..............*..................................................................................................................................................... 
+ // trn2 v20.2D, v28.2D, v6.2D // ..................................*................................................................................................................................. + // trn1 v28.2D, v28.2D, v6.2D // ...................................*................................................................................................................................ + // trn2 v10.2D, v19.2D, v21.2D // ....................................*............................................................................................................................... + // trn1 v21.2D, v19.2D, v21.2D // .....................................*.............................................................................................................................. + // ldr q6, [x5, #-32] // ...............*.................................................................................................................................................... + // ldr q18, [x5, #-176] // .................*.................................................................................................................................................. + // sub v12.4S, v16.4S, v4.4S // ......................................*............................................................................................................................. + // mls v22.4S, v17.4S, v8.S[0] // .........................................*.......................................................................................................................... + // ldr q17, [x4, #16] // ...................*................................................................................................................................................ + // sub v13.4S, v21.4S, v28.4S // ..........................................*......................................................................................................................... 
+ // sub v27.4S, v10.4S, v20.4S // ...........................................*........................................................................................................................ + // mls v0.4S, v29.4S, v8.S[0] // ........................................*........................................................................................................................... + // add v2.4S, v21.4S, v28.4S // .............................................*...................................................................................................................... + // add v28.4S, v10.4S, v20.4S // ............................................*....................................................................................................................... + // sqrdmulh v19.4S, v13.4S, v30.4S // ..................................................*................................................................................................................. + // mul v29.4S, v13.4S, v26.4S // ................................................*................................................................................................................... + // mul v30.4S, v27.4S, v6.4S // ...................................................*................................................................................................................ + // sqrdmulh v6.4S, v27.4S, v31.4S // .................................................*.................................................................................................................. + // add v21.4S, v0.4S, v22.4S // .....................................................*.............................................................................................................. 
+ // sub v31.4S, v0.4S, v22.4S // ....................................................*............................................................................................................... + // sub v10.4S, v2.4S, v28.4S // ......................................................*............................................................................................................. + // mul v27.4S, v12.4S, v24.4S // ..............................................*..................................................................................................................... + // sqrdmulh v12.4S, v12.4S, v18.4S // ...............................................*.................................................................................................................... + // mls v30.4S, v6.4S, v8.S[0] // ............................................................*....................................................................................................... + // ldr q6, [x4, #32] // ..................*................................................................................................................................................. + // sqrdmulh v3.4S, v31.4S, v18.4S // ..........................................................*......................................................................................................... + // mls v29.4S, v19.4S, v8.S[0] // ...........................................................*........................................................................................................ + // mul v19.4S, v31.4S, v24.4S // .........................................................*.......................................................................................................... + // add v31.4S, v16.4S, v4.4S // .......................................*............................................................................................................................ 
+ // ldr q16, [x4], #64 // .........................*.......................................................................................................................................... + // mul v22.4S, v10.4S, v11.4S // ...............................................................*.................................................................................................... + // sqrdmulh v10.4S, v10.4S, v14.4S // ................................................................*................................................................................................... + // mls v27.4S, v12.4S, v8.S[0] // ........................................................*........................................................................................................... + // add v12.4S, v2.4S, v28.4S // .......................................................*............................................................................................................ + // mls v19.4S, v3.4S, v8.S[0] // .................................................................*.................................................................................................. + // sub v1.4S, v29.4S, v30.4S // ..................................................................*................................................................................................. + // add v28.4S, v29.4S, v30.4S // ...................................................................*................................................................................................ + // mls v22.4S, v10.4S, v8.S[0] // ....................................................................*............................................................................................... + // trn2 v30.4S, v31.4S, v21.4S // ..............................................................*..................................................................................................... 
+ // trn1 v25.4S, v12.4S, v28.4S // .......................................................................*............................................................................................ + // trn2 v28.4S, v12.4S, v28.4S // ........................................................................*........................................................................................... + // sqrdmulh v10.4S, v1.4S, v14.4S // ......................................................................*............................................................................................. + // mul v5.4S, v1.4S, v11.4S // .....................................................................*.............................................................................................. + // trn2 v26.4S, v27.4S, v19.4S // ..........................................................................*......................................................................................... + // trn1 v27.4S, v27.4S, v19.4S // .........................................................................*.......................................................................................... + // trn1 v19.4S, v31.4S, v21.4S // .............................................................*...................................................................................................... + // ldr q31, [x4, #-16] // ........................*........................................................................................................................................... + // mls v5.4S, v10.4S, v8.S[0] // .............................................................................*...................................................................................... 
+ // trn2 v10.2D, v19.2D, v27.2D // ...........................................................................*........................................................................................ + // trn1 v19.2D, v19.2D, v27.2D // ............................................................................*....................................................................................... + // trn2 v29.2D, v30.2D, v26.2D // ..............................................................................*..................................................................................... + // trn1 v4.2D, v30.2D, v26.2D // ...............................................................................*.................................................................................... + // sub v12.4S, v10.4S, v29.4S // ................................................................................*................................................................................... + // add v23.4S, v10.4S, v29.4S // .................................................................................*.................................................................................. + // trn1 v30.4S, v22.4S, v5.4S // ..................................................................................*................................................................................. + // trn2 v9.4S, v22.4S, v5.4S // ...................................................................................*................................................................................ + // mul v14.4S, v12.4S, v6.S[0] // ......................................................................................*............................................................................. 
+ // add v18.4S, v19.4S, v4.4S // ....................................................................................*............................................................................... + // trn2 v3.2D, v25.2D, v30.2D // ........................................................................................*........................................................................... + // trn1 v10.2D, v28.2D, v9.2D // ...........................................................................................*........................................................................ + // trn2 v7.2D, v28.2D, v9.2D // ..........................................................................................*......................................................................... + // trn1 v11.2D, v25.2D, v30.2D // .........................................................................................*.......................................................................... + // sub v28.4S, v19.4S, v4.4S // .....................................................................................*.............................................................................. + // sqrdmulh v5.4S, v12.4S, v6.S[1] // .......................................................................................*............................................................................ + // add v1.4S, v18.4S, v23.4S // .............................................................................................*...................................................................... + // sub v29.4S, v3.4S, v7.4S // ................................................................................................*................................................................... + // add v27.4S, v3.4S, v7.4S // .................................................................................................*.................................................................. 
+ // sub v12.4S, v11.4S, v10.4S // ..................................................................................................*................................................................. + // add v7.4S, v11.4S, v10.4S // ...................................................................................................*................................................................ + // mul v22.4S, v28.4S, v17.S[2] // ..............................................................................................*..................................................................... + // sqrdmulh v19.4S, v28.4S, v17.S[3] // ...............................................................................................*.................................................................... + // srshr v10.4S, v1.4S, #23 // .......................................................................................................*............................................................ + // mul v30.4S, v12.4S, v6.S[2] // ........................................................................................................*........................................................... + // sqrdmulh v6.4S, v12.4S, v6.S[3] // .........................................................................................................*.......................................................... + // mul v20.4S, v29.4S, v31.S[0] // ..........................................................................................................*......................................................... + // sqrdmulh v15.4S, v29.4S, v31.S[1] // ...........................................................................................................*........................................................ 
+ // mls v14.4S, v5.4S, v8.S[0] // ....................................................................................................*............................................................... + // sub v2.4S, v7.4S, v27.4S // .............................................................................................................*...................................................... + // mls v1.4S, v10.4S, v8.4S // ...............................................................................................................*.................................................... + // mls v22.4S, v19.4S, v8.S[0] // ............................................................................................................*....................................................... + // add v19.4S, v7.4S, v27.4S // ..............................................................................................................*..................................................... + // mls v20.4S, v15.4S, v8.S[0] // ..................................................................................................................*................................................. + // mls v30.4S, v6.4S, v8.S[0] // .................................................................................................................*.................................................. + // sub v12.4S, v18.4S, v23.4S // ............................................................................................*....................................................................... + // sqrdmulh v11.4S, v2.4S, v17.S[1] // ....................................................................................................................*............................................... 
+ // srshr v10.4S, v19.4S, #23 // ...................................................................................................................*................................................ + // mul v29.4S, v2.4S, v17.S[0] // .......................................................................................................................*............................................ + // sub v28.4S, v22.4S, v14.4S // .....................................................................................................................*.............................................. + // add v4.4S, v22.4S, v14.4S // ......................................................................................................................*............................................. + // mul v5.4S, v12.4S, v16.S[2] // .....................................................................................................*.............................................................. + // sqrdmulh v27.4S, v12.4S, v16.S[3] // ......................................................................................................*............................................................. + // sub v31.4S, v30.4S, v20.4S // .........................................................................................................................*.......................................... + // add v18.4S, v30.4S, v20.4S // ..........................................................................................................................*......................................... + // mls v19.4S, v10.4S, v8.4S // ........................................................................................................................*........................................... 
+ // mul v22.4S, v28.4S, v16.S[2] // ...........................................................................................................................*........................................ + // sqrdmulh v14.4S, v28.4S, v16.S[3] // .............................................................................................................................*...................................... + // srshr v28.4S, v4.4S, #23 // ..............................................................................................................................*..................................... + // mul v6.4S, v31.4S, v17.S[0] // ...............................................................................................................................*.................................... + // sqrdmulh v31.4S, v31.4S, v17.S[1] // ................................................................................................................................*................................... + // mls v5.4S, v27.4S, v8.S[0] // ................................................................................................................*................................................... + // srshr v12.4S, v18.4S, #23 // .................................................................................................................................*.................................. + // mls v29.4S, v11.4S, v8.S[0] // ............................................................................................................................*....................................... + // sub v11.4S, v1.4S, v19.4S // ...................................................................................................................................*................................ 
+ // add v10.4S, v1.4S, v19.4S // ..................................................................................................................................*................................. + // mls v22.4S, v14.4S, v8.S[0] // .......................................................................................................................................*............................ + // mls v4.4S, v28.4S, v8.4S // ......................................................................................................................................*............................. + // mls v18.4S, v12.4S, v8.4S // ........................................................................................................................................*........................... + // mls v6.4S, v31.4S, v8.S[0] // .........................................................................................................................................*.......................... + // mul v7.4S, v11.4S, v16.S[0] // ...........................................................................................................................................*........................ + // sqrdmulh v11.4S, v11.4S, v16.S[1] // ............................................................................................................................................*....................... + // str q10, [x1], #(16*4) // ..........................................................................................................................................*......................... + // sub v23.4S, v5.4S, v29.4S // .....................................................................................................................................*.............................. + // add v31.4S, v5.4S, v29.4S // ....................................................................................................................................*............................... 
+ // sub v12.4S, v4.4S, v18.4S // ................................................................................................................................................*................... + // sub v27.4S, v22.4S, v6.4S // ..................................................................................................................................................*................. + // add v13.4S, v4.4S, v18.4S // .................................................................................................................................................*.................. + // add v10.4S, v22.4S, v6.4S // ...................................................................................................................................................*................ + // mls v7.4S, v11.4S, v8.S[0] // ....................................................................................................................................................*............... + // str q31, [x1, #-32] // .............................................................................................................................................*...................... + // mul v31.4S, v23.4S, v16.S[0] // ..............................................................................................................................................*..................... + // sqrdmulh v3.4S, v23.4S, v16.S[1] // ...............................................................................................................................................*.................... + // mul v5.4S, v12.4S, v16.S[0] // .........................................................................................................................................................*.......... 
+ // sqrdmulh v12.4S, v12.4S, v16.S[1] // ..........................................................................................................................................................*......... + // mul v28.4S, v27.4S, v16.S[0] // ...........................................................................................................................................................*........ + // sqrdmulh v27.4S, v27.4S, v16.S[1] // ............................................................................................................................................................*....... + // str q13, [x1, #-48] // ......................................................................................................................................................*............. + // str q10, [x1, #-16] // .......................................................................................................................................................*............ + // add x1, x1, #64 // ........................................................................................................................................................*........... + // str q7, [x2], #(16*4) // .............................................................................................................................................................*...... + // mls v31.4S, v3.4S, v8.S[0] // .....................................................................................................................................................*.............. + // mls v5.4S, v12.4S, v8.S[0] // ...............................................................................................................................................................*.... + // mls v28.4S, v27.4S, v8.S[0] // ................................................................................................................................................................*... 
+ // str q31, [x2, #-32] // ..............................................................................................................................................................*..... + // str q5, [x2, #-48] // .................................................................................................................................................................*.. + // str q28, [x2, #-16] // ..................................................................................................................................................................*. + // add x2, x2, #64 // ...................................................................................................................................................................* // ----------------------------------------------------------------------------- @@ -1641,7 +1693,7 @@ layer45678_start: ASM_LOAD(xtmp, ninv_tw_addr) ld1r {ninv_tw.4s}, [xtmp] - ushr modulus_half.4S, modulus.4S, #1 + ushr modulus_half.4S, consts.4S, #1 neg neg_modulus_half.4S, modulus_half.4S mov count, #8 @@ -1649,818 +1701,830 @@ layer45678_start: load_roots_123 .p2align 2 - ldr q5, [x0, #0] // .*................................................................................. - ldr q19, [x0, #128] // ..*................................................................................ - ldr q9, [x0, #384] // *.................................................................................. - // gap // ................................................................................... - // gap // ................................................................................... - // gap // ................................................................................... - // gap // ................................................................................... - // gap // ................................................................................... 
- ldr q12, [x0, #256] // ....*.............................................................................. - ldr q15, [x0, #768] // ...*............................................................................... - ldr q13, [x0, #896] // .....*............................................................................. - // gap // ................................................................................... - // gap // ................................................................................... - // gap // ................................................................................... - // gap // ................................................................................... - // gap // ................................................................................... - ldr q18, [x0, #640] // ......*............................................................................ - ldr q27, [x0, #512] // .......*........................................................................... - // gap // ................................................................................... - // gap // ................................................................................... - // gap // ................................................................................... - // gap // ................................................................................... - // gap // ................................................................................... - // gap // ................................................................................... - // gap // ................................................................................... - // gap // ................................................................................... - // gap // ................................................................................... - // gap // ................................................................................... 
- // gap // ................................................................................... - // gap // ................................................................................... - // gap // ................................................................................... - // gap // ................................................................................... - sub v17.4S, v5.4S, v19.4S // .........*......................................................................... - add v19.4S, v5.4S, v19.4S // ........*.......................................................................... - // gap // ................................................................................... - // gap // ................................................................................... - // gap // ................................................................................... - // gap // ................................................................................... - // gap // ................................................................................... - // gap // ................................................................................... - sub v5.4S, v12.4S, v9.4S // ............*...................................................................... - add v9.4S, v12.4S, v9.4S // .............*..................................................................... - sub v12.4S, v15.4S, v13.4S // ...........*....................................................................... - add v15.4S, v15.4S, v13.4S // ..........*........................................................................ - // gap // ................................................................................... - // gap // ................................................................................... - // gap // ................................................................................... 
- // gap // ................................................................................... - sub v13.4S, v27.4S, v18.4S // .................*................................................................. - add v18.4S, v27.4S, v18.4S // ................*.................................................................. - sqrdmulh v27.4S, v17.4S, v1.S[3] // ..............*.................................................................... - mul v17.4S, v17.4S, v1.S[2] // ...............*................................................................... - // gap // ................................................................................... - // gap // ................................................................................... - // gap // ................................................................................... - // gap // ................................................................................... - mul v28.4S, v5.4S, v2.S[0] // ....................*.............................................................. - sqrdmulh v5.4S, v5.4S, v2.S[1] // .....................*............................................................. - sub v22.4S, v19.4S, v9.4S // ...................*............................................................... - sqrdmulh v20.4S, v12.4S, v3.S[1] // ..................*................................................................ - // gap // ................................................................................... - // gap // ................................................................................... - // gap // ................................................................................... - // gap // ................................................................................... - mul v12.4S, v12.4S, v3.S[0] // .......................*........................................................... 
- mul v11.4S, v13.4S, v2.S[2] // ......................*............................................................ - sqrdmulh v13.4S, v13.4S, v2.S[3] // ........................*.......................................................... - add v16.4S, v18.4S, v15.4S // .........................*......................................................... - // gap // ................................................................................... - // gap // ................................................................................... - // gap // ................................................................................... - // gap // ................................................................................... - add v4.4S, v19.4S, v9.4S // ..........................*........................................................ - sub v19.4S, v18.4S, v15.4S // .............................*..................................................... - mul v9.4S, v22.4S, v0.S[2] // ...........................*....................................................... - sqrdmulh v15.4S, v22.4S, v0.S[3] // ............................*...................................................... - // gap // ................................................................................... - // gap // ................................................................................... - // gap // ................................................................................... - // gap // ................................................................................... - mls v28.4S, v5.4S, v8.S[0] // ...............................*................................................... - mls v17.4S, v27.4S, v8.S[0] // ..............................*.................................................... - // gap // ................................................................................... 
- // gap // ................................................................................... - // gap // ................................................................................... - // gap // ................................................................................... - // gap // ................................................................................... - // gap // ................................................................................... - mls v12.4S, v20.4S, v8.S[0] // ................................*.................................................. - mls v11.4S, v13.4S, v8.S[0] // ...................................*............................................... - sqrdmulh v5.4S, v19.4S, v1.S[1] // .................................*................................................. - mul v19.4S, v19.4S, v1.S[0] // ..................................*................................................ - // gap // ................................................................................... - // gap // ................................................................................... - // gap // ................................................................................... - // gap // ................................................................................... - mls v9.4S, v15.4S, v8.S[0] // ....................................*.............................................. - add v15.4S, v4.4S, v16.4S // .....................................*............................................. - // gap // ................................................................................... - // gap // ................................................................................... - // gap // ................................................................................... - // gap // ................................................................................... 
- // gap // ................................................................................... - // gap // ................................................................................... - sub v13.4S, v17.4S, v28.4S // .......................................*........................................... - add v18.4S, v17.4S, v28.4S // ......................................*............................................ - // gap // ................................................................................... - // gap // ................................................................................... - // gap // ................................................................................... - // gap // ................................................................................... - // gap // ................................................................................... - // gap // ................................................................................... - add v27.4S, v11.4S, v12.4S // .........................................*......................................... - sub v17.4S, v11.4S, v12.4S // ........................................*.......................................... - mls v19.4S, v5.4S, v8.S[0] // ...........................................*....................................... - sqrdmulh v5.4S, v15.4S, v26.4S // ..........................................*........................................ - // gap // ................................................................................... - // gap // ................................................................................... - // gap // ................................................................................... - // gap // ................................................................................... - mul v12.4S, v15.4S, v25.4S // ............................................*...................................... 
- sqrdmulh v15.4S, v13.4S, v0.S[3] // .............................................*..................................... - mul v28.4S, v13.4S, v0.S[2] // ..............................................*.................................... - // gap // ................................................................................... - // gap // ................................................................................... - // gap // ................................................................................... - // gap // ................................................................................... - // gap // ................................................................................... - add v13.4S, v18.4S, v27.4S // .................................................*................................. - sub v18.4S, v18.4S, v27.4S // ..................................................*................................ - mul v27.4S, v17.4S, v1.S[0] // ...............................................*................................... - sqrdmulh v17.4S, v17.4S, v1.S[1] // ................................................*.................................. - // gap // ................................................................................... - // gap // ................................................................................... - // gap // ................................................................................... - // gap // ................................................................................... - add v22.4S, v9.4S, v19.4S // ...................................................*............................... - sub v9.4S, v9.4S, v19.4S // ....................................................*.............................. - // gap // ................................................................................... - // gap // ................................................................................... 
- // gap // ................................................................................... - // gap // ................................................................................... - // gap // ................................................................................... - // gap // ................................................................................... - mls v28.4S, v15.4S, v8.S[0] // .......................................................*........................... - mul v15.4S, v13.4S, v25.4S // ......................................................*............................ - sqrdmulh v20.4S, v13.4S, v26.4S // ........................................................*.......................... - sqrdmulh v11.4S, v18.4S, v0.S[1] // .....................................................*............................. - // gap // ................................................................................... - // gap // ................................................................................... - // gap // ................................................................................... - // gap // ................................................................................... - mls v12.4S, v5.4S, v8.S[0] // ............................................................*...................... - mul v19.4S, v18.4S, v0.S[0] // .........................................................*......................... - mls v27.4S, v17.4S, v8.S[0] // ...........................................................*....................... - mul v13.4S, v22.4S, v25.4S // ..........................................................*........................ - // gap // ................................................................................... - // gap // ................................................................................... - // gap // ................................................................................... 
- // gap // ................................................................................... - sqrdmulh v18.4S, v22.4S, v26.4S // .............................................................*..................... - sqrdmulh v17.4S, v9.4S, v0.S[1] // ..............................................................*.................... - mul v5.4S, v9.4S, v0.S[0] // ...............................................................*................... - // gap // ................................................................................... - // gap // ................................................................................... - // gap // ................................................................................... - // gap // ................................................................................... - // gap // ................................................................................... - mls v15.4S, v20.4S, v8.S[0] // ................................................................*.................. - // gap // ................................................................................... - // gap // ................................................................................... - // gap // ................................................................................... - // gap // ................................................................................... - // gap // ................................................................................... - // gap // ................................................................................... - // gap // ................................................................................... - sub v9.4S, v28.4S, v27.4S // .................................................................*................. - add v27.4S, v28.4S, v27.4S // ..................................................................*................ 
- mls v19.4S, v11.4S, v8.S[0] // ...................................................................*............... - cmge v24.4S, v31.4S, v12.4S // .....................................................................*............. - // gap // ................................................................................... - // gap // ................................................................................... - // gap // ................................................................................... - // gap // ................................................................................... - mls v13.4S, v18.4S, v8.S[0] // ......................................................................*............ - mls v5.4S, v17.4S, v8.S[0] // ....................................................................*.............. - // gap // ................................................................................... - // gap // ................................................................................... - // gap // ................................................................................... - // gap // ................................................................................... - // gap // ................................................................................... - // gap // ................................................................................... - sqrdmulh v20.4S, v9.4S, v0.S[1] // .........................................................................*......... - mul v9.4S, v9.4S, v0.S[0] // ........................................................................*.......... - sqrdmulh v10.4S, v27.4S, v26.4S // .......................................................................*........... - mul v18.4S, v27.4S, v25.4S // ..........................................................................*........ - // gap // ................................................................................... 
- // gap // ................................................................................... - // gap // ................................................................................... - // gap // ................................................................................... - cmge v11.4S, v31.4S, v15.4S // ...........................................................................*....... - cmge v17.4S, v19.4S, v30.4S // ............................................................................*...... - cmge v27.4S, v15.4S, v30.4S // .............................................................................*..... - cmge v23.4S, v31.4S, v19.4S // ..............................................................................*.... - // gap // ................................................................................... - // gap // ................................................................................... - // gap // ................................................................................... - // gap // ................................................................................... - cmge v6.4S, v5.4S, v30.4S // ...............................................................................*... - cmge v7.4S, v13.4S, v30.4S // ................................................................................*.. - cmge v28.4S, v31.4S, v5.4S // .................................................................................*. - cmge v22.4S, v31.4S, v13.4S // ..................................................................................* - // gap // ................................................................................... - // gap // ................................................................................... - // gap // ................................................................................... - // gap // ................................................................................... 
- - // original source code - // ldr q16, [x0, #384] // ..*................................................................................ - // ldr q17, [x0, #0] // *.................................................................................. - // ldr q23, [x0, #128] // .*................................................................................. - // ldr q27, [x0, #768] // ....*.............................................................................. - // ldr q21, [x0, #256] // ...*............................................................................... - // ldr q28, [x0, #896] // .....*............................................................................. - // ldr q20, [x0, #640] // ......*............................................................................ - // ldr q22, [x0, #512] // .......*........................................................................... - // add v4.4S, v17.4S, v23.4S // .........*......................................................................... - // sub v23.4S, v17.4S, v23.4S // ........*.......................................................................... - // add v17.4S, v27.4S, v28.4S // .............*..................................................................... - // sub v19.4S, v27.4S, v28.4S // ............*...................................................................... - // sub v28.4S, v21.4S, v16.4S // ..........*........................................................................ - // add v15.4S, v21.4S, v16.4S // ...........*....................................................................... - // sqrdmulh v21.4S, v23.4S, v1.S[3] // ................*.................................................................. - // mul v27.4S, v23.4S, v1.S[2] // .................*................................................................. - // add v23.4S, v22.4S, v20.4S // ...............*................................................................... 
- // sub v10.4S, v22.4S, v20.4S // ..............*.................................................................... - // sqrdmulh v20.4S, v19.4S, v3.S[1] // .....................*............................................................. - // sub v6.4S, v4.4S, v15.4S // ....................*.............................................................. - // mul v13.4S, v28.4S, v2.S[0] // ..................*................................................................ - // sqrdmulh v7.4S, v28.4S, v2.S[1] // ...................*............................................................... - // mul v28.4S, v10.4S, v2.S[2] // .......................*........................................................... - // mul v22.4S, v19.4S, v3.S[0] // ......................*............................................................ - // sqrdmulh v10.4S, v10.4S, v2.S[3] // ........................*.......................................................... - // add v16.4S, v23.4S, v17.4S // .........................*......................................................... - // add v4.4S, v4.4S, v15.4S // ..........................*........................................................ - // mul v15.4S, v6.4S, v0.S[2] // ............................*...................................................... - // sqrdmulh v6.4S, v6.4S, v0.S[3] // .............................*..................................................... - // sub v23.4S, v23.4S, v17.4S // ...........................*....................................................... - // mls v27.4S, v21.4S, v8.S[0] // ...............................*................................................... - // mls v13.4S, v7.4S, v8.S[0] // ..............................*.................................................... - // mls v22.4S, v20.4S, v8.S[0] // ................................*.................................................. 
- // sqrdmulh v20.4S, v23.4S, v1.S[1] // ..................................*................................................ - // mul v17.4S, v23.4S, v1.S[0] // ...................................*............................................... - // mls v28.4S, v10.4S, v8.S[0] // .................................*................................................. - // mls v15.4S, v6.4S, v8.S[0] // ....................................*.............................................. - // add v11.4S, v4.4S, v16.4S // .....................................*............................................. - // add v10.4S, v27.4S, v13.4S // .......................................*........................................... - // sub v27.4S, v27.4S, v13.4S // ......................................*............................................ - // sub v13.4S, v28.4S, v22.4S // .........................................*......................................... - // add v19.4S, v28.4S, v22.4S // ........................................*.......................................... - // sqrdmulh v23.4S, v11.4S, v26.4S // ...........................................*....................................... - // mls v17.4S, v20.4S, v8.S[0] // ..........................................*........................................ - // mul v12.4S, v11.4S, v25.4S // ............................................*...................................... - // sqrdmulh v28.4S, v27.4S, v0.S[3] // .............................................*..................................... - // mul v22.4S, v27.4S, v0.S[2] // ..............................................*.................................... - // mul v20.4S, v13.4S, v1.S[0] // .................................................*................................. - // sqrdmulh v6.4S, v13.4S, v1.S[1] // ..................................................*................................ 
- // add v13.4S, v10.4S, v19.4S // ...............................................*................................... - // sub v19.4S, v10.4S, v19.4S // ................................................*.................................. - // add v5.4S, v15.4S, v17.4S // ...................................................*............................... - // sub v21.4S, v15.4S, v17.4S // ....................................................*.............................. - // sqrdmulh v17.4S, v19.4S, v0.S[1] // ........................................................*.......................... - // mul v15.4S, v13.4S, v25.4S // ......................................................*............................ - // mls v22.4S, v28.4S, v8.S[0] // .....................................................*............................. - // sqrdmulh v28.4S, v13.4S, v26.4S // .......................................................*........................... - // mul v19.4S, v19.4S, v0.S[0] // ..........................................................*........................ - // mul v13.4S, v5.4S, v25.4S // ............................................................*...................... - // mls v20.4S, v6.4S, v8.S[0] // ...........................................................*....................... - // mls v12.4S, v23.4S, v8.S[0] // .........................................................*......................... - // sqrdmulh v10.4S, v5.4S, v26.4S // .............................................................*..................... - // sqrdmulh v23.4S, v21.4S, v0.S[1] // ..............................................................*.................... - // mul v5.4S, v21.4S, v0.S[0] // ...............................................................*................... - // mls v15.4S, v28.4S, v8.S[0] // ................................................................*.................. 
- // sub v7.4S, v22.4S, v20.4S // .................................................................*................. - // add v6.4S, v22.4S, v20.4S // ..................................................................*................ - // mls v19.4S, v17.4S, v8.S[0] // ...................................................................*............... - // mls v5.4S, v23.4S, v8.S[0] // ......................................................................*............ - // cmge v24.4S, v31.4S, v12.4S // ....................................................................*.............. - // mls v13.4S, v10.4S, v8.S[0] // .....................................................................*............. - // sqrdmulh v10.4S, v6.4S, v26.4S // .........................................................................*......... - // mul v9.4S, v7.4S, v0.S[0] // ........................................................................*.......... - // sqrdmulh v20.4S, v7.4S, v0.S[1] // .......................................................................*........... - // mul v18.4S, v6.4S, v25.4S // ..........................................................................*........ - // cmge v11.4S, v31.4S, v15.4S // ...........................................................................*....... - // cmge v17.4S, v19.4S, v30.4S // ............................................................................*...... - // cmge v27.4S, v15.4S, v30.4S // .............................................................................*..... - // cmge v23.4S, v31.4S, v19.4S // ..............................................................................*.... - // cmge v6.4S, v5.4S, v30.4S // ...............................................................................*... - // cmge v7.4S, v13.4S, v30.4S // ................................................................................*.. 
- // cmge v28.4S, v31.4S, v5.4S // .................................................................................*. - // cmge v22.4S, v31.4S, v13.4S // ..................................................................................* + // Instructions: 78 + // Expected cycles: 25 + // Expected IPC: 3.12 + // + // Wall time: 3.49s + // User time: 3.49s + // + // ----------------------------- original position -----------------------------> + // 0 25 50 75 + // |------------------------|------------------------|------------------------|-- + ldr q11, [x0, #384] // ..*........................................................................... + ldr q27, [x0, #256] // *............................................................................. + ldr q7, [x0, #896] // .*............................................................................ + // gap // .............................................................................. + // gap // .............................................................................. + // gap // .............................................................................. + // gap // .............................................................................. + // gap // .............................................................................. + ldr q23, [x0, #768] // ...*.......................................................................... + ldr q24, [x0, #128] // ....*......................................................................... + ldr q9, [x0, #0] // .....*........................................................................ + // gap // .............................................................................. + // gap // .............................................................................. + // gap // .............................................................................. + // gap // .............................................................................. 
+ // gap // .............................................................................. + ldr q13, [x0, #640] // ......*....................................................................... + ldr q20, [x0, #512] // .......*...................................................................... + // gap // .............................................................................. + // gap // .............................................................................. + // gap // .............................................................................. + // gap // .............................................................................. + // gap // .............................................................................. + // gap // .............................................................................. + // gap // .............................................................................. + // gap // .............................................................................. + // gap // .............................................................................. + // gap // .............................................................................. + // gap // .............................................................................. + // gap // .............................................................................. + // gap // .............................................................................. + // gap // .............................................................................. + sub v21.4S, v27.4S, v11.4S // ........*..................................................................... + add v27.4S, v27.4S, v11.4S // .........*.................................................................... + // gap // .............................................................................. + // gap // .............................................................................. 
+ // gap // .............................................................................. + // gap // .............................................................................. + // gap // .............................................................................. + // gap // .............................................................................. + sub v11.4S, v23.4S, v7.4S // .............*................................................................ + add v7.4S, v23.4S, v7.4S // ............*................................................................. + sub v23.4S, v9.4S, v24.4S // ..........*................................................................... + add v24.4S, v9.4S, v24.4S // ...........*.................................................................. + // gap // .............................................................................. + // gap // .............................................................................. + // gap // .............................................................................. + // gap // .............................................................................. + sub v9.4S, v20.4S, v13.4S // .................*............................................................ + add v13.4S, v20.4S, v13.4S // ................*............................................................. + sqrdmulh v20.4S, v21.4S, v2.S[1] // ..............*............................................................... + mul v21.4S, v21.4S, v2.S[0] // ...............*.............................................................. + // gap // .............................................................................. + // gap // .............................................................................. + // gap // .............................................................................. + // gap // .............................................................................. 
+ mul v15.4S, v11.4S, v3.S[0] // ..................*........................................................... + sqrdmulh v11.4S, v11.4S, v3.S[1] // .....................*........................................................ + add v18.4S, v24.4S, v27.4S // ....................*......................................................... + mul v16.4S, v23.4S, v1.S[2] // ...................*.......................................................... + // gap // .............................................................................. + // gap // .............................................................................. + // gap // .............................................................................. + // gap // .............................................................................. + sqrdmulh v23.4S, v23.4S, v1.S[3] // .......................*...................................................... + mul v17.4S, v9.4S, v2.S[2] // ........................*..................................................... + sqrdmulh v9.4S, v9.4S, v2.S[3] // .........................*.................................................... + add v5.4S, v13.4S, v7.4S // ......................*....................................................... + // gap // .............................................................................. + // gap // .............................................................................. + // gap // .............................................................................. + // gap // .............................................................................. + sub v27.4S, v24.4S, v27.4S // ...........................*.................................................. + sub v7.4S, v13.4S, v7.4S // ............................*................................................. + mls v21.4S, v20.4S, v8.S[0] // ..........................*................................................... 
+ // gap // .............................................................................. + // gap // .............................................................................. + // gap // .............................................................................. + // gap // .............................................................................. + // gap // .............................................................................. + mls v15.4S, v11.4S, v8.S[0] // ..............................*............................................... + sub v11.4S, v18.4S, v5.4S // .............................*................................................ + add v24.4S, v18.4S, v5.4S // ...............................*.............................................. + // gap // .............................................................................. + // gap // .............................................................................. + // gap // .............................................................................. + // gap // .............................................................................. + // gap // .............................................................................. + mls v16.4S, v23.4S, v8.S[0] // ..................................*........................................... + mls v17.4S, v9.4S, v8.S[0] // ...................................*.......................................... + mul v9.4S, v27.4S, v0.S[2] // ................................*............................................. + sqrdmulh v27.4S, v27.4S, v0.S[3] // .................................*............................................ + // gap // .............................................................................. + // gap // .............................................................................. + // gap // .............................................................................. 
+ // gap // .............................................................................. + sqrdmulh v13.4S, v11.4S, v0.S[1] // .....................................*........................................ + mul v10.4S, v11.4S, v0.S[0] // .......................................*...................................... + mul v23.4S, v24.4S, v25.4S // ....................................*......................................... + sqrdmulh v11.4S, v24.4S, v26.4S // ......................................*....................................... + // gap // .............................................................................. + // gap // .............................................................................. + // gap // .............................................................................. + // gap // .............................................................................. + mul v20.4S, v7.4S, v1.S[0] // ........................................*..................................... + sqrdmulh v7.4S, v7.4S, v1.S[1] // .........................................*.................................... + // gap // .............................................................................. + // gap // .............................................................................. + // gap // .............................................................................. + // gap // .............................................................................. + // gap // .............................................................................. + // gap // .............................................................................. + add v24.4S, v16.4S, v21.4S // ...........................................*.................................. + sub v21.4S, v16.4S, v21.4S // ..................................................*........................... 
+ mls v9.4S, v27.4S, v8.S[0] // ............................................*................................. + add v27.4S, v17.4S, v15.4S // ..........................................*................................... + // gap // .............................................................................. + // gap // .............................................................................. + // gap // .............................................................................. + // gap // .............................................................................. + sub v18.4S, v17.4S, v15.4S // ........................................................*..................... + mls v10.4S, v13.4S, v8.S[0] // .............................................*................................ + mls v23.4S, v11.4S, v8.S[0] // ..............................................*............................... + // gap // .............................................................................. + // gap // .............................................................................. + // gap // .............................................................................. + // gap // .............................................................................. + // gap // .............................................................................. + add v11.4S, v24.4S, v27.4S // ................................................*............................. + mls v20.4S, v7.4S, v8.S[0] // .................................................*............................ + sub v27.4S, v24.4S, v27.4S // ...............................................*.............................. + sqrdmulh v7.4S, v21.4S, v0.S[3] // ..........................................................*................... + // gap // .............................................................................. 
+ // gap // .............................................................................. + // gap // .............................................................................. + // gap // .............................................................................. + mul v15.4S, v21.4S, v0.S[2] // ...........................................................*.................. + mul v4.4S, v18.4S, v1.S[0] // .............................................................*................ + sqrdmulh v13.4S, v18.4S, v1.S[1] // ...............................................................*.............. + // gap // .............................................................................. + // gap // .............................................................................. + // gap // .............................................................................. + // gap // .............................................................................. + // gap // .............................................................................. + mul v24.4S, v11.4S, v25.4S // ....................................................*......................... + sqrdmulh v11.4S, v11.4S, v26.4S // .......................................................*...................... + sqrdmulh v21.4S, v27.4S, v0.S[1] // .....................................................*........................ + mul v27.4S, v27.4S, v0.S[0] // ......................................................*....................... + // gap // .............................................................................. + // gap // .............................................................................. + // gap // .............................................................................. + // gap // .............................................................................. + sub v18.4S, v9.4S, v20.4S // .........................................................*.................... 
+ cmge v16.4S, v23.4S, v30.4S // ...................................................*.......................... + cmge v17.4S, v31.4S, v23.4S // ..................................................................*........... + cmge v5.4S, v31.4S, v10.4S // ....................................................................*......... + // gap // .............................................................................. + // gap // .............................................................................. + // gap // .............................................................................. + // gap // .............................................................................. + add v9.4S, v9.4S, v20.4S // ......................................................................*....... + mls v15.4S, v7.4S, v8.S[0] // ...................................................................*.......... + mls v4.4S, v13.4S, v8.S[0] // .......................................................................*...... + cmge v7.4S, v10.4S, v30.4S // .....................................................................*........ + // gap // .............................................................................. + // gap // .............................................................................. + // gap // .............................................................................. + // gap // .............................................................................. + mls v24.4S, v11.4S, v8.S[0] // ................................................................*............. + mls v27.4S, v21.4S, v8.S[0] // ............................................................*................. + mul v11.4S, v18.4S, v0.S[0] // ..............................................................*............... + sqrdmulh v13.4S, v18.4S, v0.S[1] // .................................................................*............ 
+ // gap // .............................................................................. + // gap // .............................................................................. + // gap // .............................................................................. + // gap // .............................................................................. + sub v19.4S, v17.4S, v16.4S // .............................................................................* + sub v6.4S, v5.4S, v7.4S // ...........................................................................*.. + // gap // .............................................................................. + // gap // .............................................................................. + // gap // .............................................................................. + // gap // .............................................................................. + // gap // .............................................................................. + // gap // .............................................................................. + // gap // .............................................................................. + // gap // .............................................................................. + // gap // .............................................................................. + // gap // .............................................................................. + // gap // .............................................................................. + // gap // .............................................................................. + // gap // .............................................................................. + // gap // .............................................................................. + mls v11.4S, v13.4S, v8.S[0] // ............................................................................*. 
+ cmge v28.4S, v31.4S, v27.4S // ........................................................................*..... + cmge v20.4S, v27.4S, v30.4S // .........................................................................*.... + cmge v22.4S, v24.4S, v30.4S // ..........................................................................*... + // gap // .............................................................................. + // gap // .............................................................................. + // gap // .............................................................................. + // gap // .............................................................................. + + // ------------------------------- new position --------------------------------> + // 0 25 50 75 + // |------------------------|------------------------|------------------------|-- + // ldr q16, [x0, #256] // .*............................................................................ + // ldr q12, [x0, #896] // ..*........................................................................... + // ldr q18, [x0, #384] // *............................................................................. + // ldr q5, [x0, #768] // ...*.......................................................................... + // ldr q21, [x0, #128] // ....*......................................................................... + // ldr q20, [x0, #0] // .....*........................................................................ + // ldr q29, [x0, #640] // ......*....................................................................... + // ldr q15, [x0, #512] // .......*...................................................................... + // sub v19.4S, v16.4S, v18.4S // ........*..................................................................... + // add v17.4S, v16.4S, v18.4S // .........*.................................................................... 
+ // sub v23.4S, v20.4S, v21.4S // ............*................................................................. + // add v6.4S, v20.4S, v21.4S // .............*................................................................ + // add v28.4S, v5.4S, v12.4S // ...........*.................................................................. + // sub v21.4S, v5.4S, v12.4S // ..........*................................................................... + // sqrdmulh v5.4S, v19.4S, v2.S[1] // ................*............................................................. + // mul v20.4S, v19.4S, v2.S[0] // .................*............................................................ + // add v19.4S, v15.4S, v29.4S // ...............*.............................................................. + // sub v29.4S, v15.4S, v29.4S // ..............*............................................................... + // mul v15.4S, v21.4S, v3.S[0] // ..................*........................................................... + // mul v27.4S, v23.4S, v1.S[2] // .....................*........................................................ + // add v18.4S, v6.4S, v17.4S // ....................*......................................................... + // sqrdmulh v12.4S, v21.4S, v3.S[1] // ...................*.......................................................... + // add v10.4S, v19.4S, v28.4S // .........................*.................................................... + // sqrdmulh v23.4S, v23.4S, v1.S[3] // ......................*....................................................... + // mul v21.4S, v29.4S, v2.S[2] // .......................*...................................................... + // sqrdmulh v24.4S, v29.4S, v2.S[3] // ........................*..................................................... + // mls v20.4S, v5.4S, v8.S[0] // ............................*................................................. 
+ // sub v6.4S, v6.4S, v17.4S // ..........................*................................................... + // sub v19.4S, v19.4S, v28.4S // ...........................*.................................................. + // sub v16.4S, v18.4S, v10.4S // ..............................*............................................... + // mls v15.4S, v12.4S, v8.S[0] // .............................*................................................ + // add v10.4S, v18.4S, v10.4S // ...............................*.............................................. + // mul v18.4S, v6.4S, v0.S[2] // ..................................*........................................... + // sqrdmulh v12.4S, v6.4S, v0.S[3] // ...................................*.......................................... + // mls v27.4S, v23.4S, v8.S[0] // ................................*............................................. + // mls v21.4S, v24.4S, v8.S[0] // .................................*............................................ + // mul v23.4S, v10.4S, v25.4S // ......................................*....................................... + // sqrdmulh v5.4S, v16.4S, v0.S[1] // ....................................*......................................... + // sqrdmulh v17.4S, v10.4S, v26.4S // .......................................*...................................... + // mul v10.4S, v16.4S, v0.S[0] // .....................................*........................................ + // mul v16.4S, v19.4S, v1.S[0] // ........................................*..................................... + // sqrdmulh v28.4S, v19.4S, v1.S[1] // .........................................*.................................... + // add v22.4S, v21.4S, v15.4S // .............................................*................................ + // add v24.4S, v27.4S, v20.4S // ..........................................*................................... 
+ // mls v18.4S, v12.4S, v8.S[0] // ............................................*................................. + // mls v10.4S, v5.4S, v8.S[0] // ...............................................*.............................. + // mls v23.4S, v17.4S, v8.S[0] // ................................................*............................. + // sub v4.4S, v24.4S, v22.4S // ...................................................*.......................... + // add v12.4S, v24.4S, v22.4S // .................................................*............................ + // mls v16.4S, v28.4S, v8.S[0] // ..................................................*........................... + // sub v20.4S, v27.4S, v20.4S // ...........................................*.................................. + // cmge v17.4S, v23.4S, v30.4S // .............................................................*................ + // mul v24.4S, v12.4S, v25.4S // ........................................................*..................... + // sqrdmulh v22.4S, v4.4S, v0.S[1] // ..........................................................*................... + // mul v27.4S, v4.4S, v0.S[0] // ...........................................................*.................. + // sqrdmulh v28.4S, v12.4S, v26.4S // .........................................................*.................... + // sub v29.4S, v21.4S, v15.4S // ..............................................*............................... + // sub v21.4S, v18.4S, v16.4S // ............................................................*................. + // sqrdmulh v14.4S, v20.4S, v0.S[3] // ....................................................*......................... + // mul v15.4S, v20.4S, v0.S[2] // .....................................................*........................ + // mls v27.4S, v22.4S, v8.S[0] // .....................................................................*........ 
+ // mul v4.4S, v29.4S, v1.S[0] // ......................................................*....................... + // mul v11.4S, v21.4S, v0.S[0] // ......................................................................*....... + // sqrdmulh v12.4S, v29.4S, v1.S[1] // .......................................................*...................... + // mls v24.4S, v28.4S, v8.S[0] // ....................................................................*......... + // sqrdmulh v5.4S, v21.4S, v0.S[1] // .......................................................................*...... + // cmge v19.4S, v31.4S, v23.4S // ..............................................................*............... + // mls v15.4S, v14.4S, v8.S[0] // .................................................................*............ + // cmge v29.4S, v31.4S, v10.4S // ...............................................................*.............. + // cmge v21.4S, v10.4S, v30.4S // ...................................................................*.......... + // add v9.4S, v18.4S, v16.4S // ................................................................*............. + // mls v4.4S, v12.4S, v8.S[0] // ..................................................................*........... + // cmge v28.4S, v31.4S, v27.4S // ...........................................................................*.. + // cmge v20.4S, v27.4S, v30.4S // ............................................................................*. + // cmge v22.4S, v24.4S, v30.4S // .............................................................................* + // sub v6.4S, v29.4S, v21.4S // .........................................................................*.... + // mls v11.4S, v5.4S, v8.S[0] // ..........................................................................*... + // sub v19.4S, v19.4S, v17.4S // ........................................................................*..... 
sub count, count, #1 layer123_start: - sub v14.4S, v4.4S, v16.4S // ................................................*....................................................................... - ldr q16, [x0, #400] // ...e.................................................................................................................... - sub v4.4S, v23.4S, v17.4S // ..........................................................................*............................................. - ldr q17, [x0, #16] // e....................................................................................................................... - // gap // ........................................................................................................................ - sub v11.4S, v11.4S, v27.4S // ..........................................................................................................*............. - mls v18.4S, v10.4S, v8.S[0] // ...................................................................................................*.................... - ldr q23, [x0, #144] // .e...................................................................................................................... - ldr q27, [x0, #784] // ......e................................................................................................................. - sub v6.4S, v28.4S, v6.4S // ..............................................................................*......................................... - ldr q21, [x0, #272] // ..e..................................................................................................................... - sub v10.4S, v22.4S, v7.4S // ..............................................................................................................*......... - ldr q28, [x0, #912] // .......e................................................................................................................ 
- // gap // ........................................................................................................................ + // Instructions: 120 + // Expected cycles: 26 + // Expected IPC: 4.62 + // + // Wall time: 966.20s + // User time: 966.20s + // + // -------------------------------------------------- original position --------------------------------------------------> + // 0 25 50 75 100 + // |------------------------|------------------------|------------------------|------------------------|------------------- + ldr q16, [x0, #272] // ..e..................................................................................................................... + ldr q12, [x0, #912] // .......e................................................................................................................ + ldr q18, [x0, #400] // ...e.................................................................................................................... + cmge v7.4S, v31.4S, v24.4S // ........................................................................................................*............... // gap // ........................................................................................................................ - cmge v7.4S, v12.4S, v30.4S // .....................................................................................................*.................. - mls v9.4S, v20.4S, v8.S[0] // ...................................................................*.................................................... + sub v17.4S, v28.4S, v20.4S // ..........................................................................*............................................. + sqrdmulh v13.4S, v9.4S, v26.4S // ...............................................................................................*........................ 
+ mul v9.4S, v9.4S, v25.4S // ..............................................................................................*......................... + ldr q5, [x0, #784] // ......e................................................................................................................. + mls v10.4S, v6.4S, v8.4S // .......................................................................*................................................ // gap // ........................................................................................................................ + sub v28.4S, v15.4S, v4.4S // ...............................................................*........................................................ + add v6.4S, v15.4S, v4.4S // ................................................................*....................................................... + mls v23.4S, v19.4S, v8.4S // .......................................................................................................*................ + ldr q21, [x0, #144] // .e...................................................................................................................... + ldr q20, [x0, #16] // e....................................................................................................................... + mls v27.4S, v17.4S, v8.4S // ...........................................................................*............................................ + sub v17.4S, v7.4S, v22.4S // ..........................................................................................................*............. + cmge v4.4S, v11.4S, v30.4S // .............................................................................*.......................................... + cmge v14.4S, v31.4S, v11.4S // ............................................................................*........................................... 
// gap // ........................................................................................................................ // gap // ........................................................................................................................ - ldr q20, [x0, #656] // .....e.................................................................................................................. - ldr q22, [x0, #528] // ....e................................................................................................................... - mls v19.4S, v4.4S, v29.4S // ...........................................................................*............................................ - mls v15.4S, v11.4S, v29.4S // ...........................................................................................................*............ + ldr q29, [x0, #656] // .....e.................................................................................................................. + ldr q15, [x0, #528] // ....e................................................................................................................... // gap // ........................................................................................................................ - mls v13.4S, v10.4S, v29.4S // ...............................................................................................................*........ // gap // ........................................................................................................................ // gap // ........................................................................................................................ - sqrdmulh v11.4S, v14.4S, v0.S[1] // ...................................................*.................................................................... 
- mul v14.4S, v14.4S, v0.S[0] // ..................................................*..................................................................... // gap // ........................................................................................................................ - sub v24.4S, v24.4S, v7.4S // ......................................................................................................*................. - add v4.4S, v17.4S, v23.4S // .........e.............................................................................................................. - mls v5.4S, v6.4S, v29.4S // ...............................................................................*........................................ - sub v23.4S, v17.4S, v23.4S // ........e............................................................................................................... + mul v7.4S, v28.4S, v0.S[0] // .................................................................*...................................................... + mls v9.4S, v13.4S, v8.S[0] // ................................................................................................*....................... + sqrdmulh v22.4S, v28.4S, v0.S[1] // ..................................................................*..................................................... + mul v13.4S, v6.4S, v25.4S // .................................................................................................*...................... + mls v24.4S, v17.4S, v8.4S // ...........................................................................................................*............ + sub v19.4S, v16.4S, v18.4S // .............e.......................................................................................................... + add v17.4S, v16.4S, v18.4S // ..............e......................................................................................................... 
// gap // ........................................................................................................................ // gap // ........................................................................................................................ // gap // ........................................................................................................................ // gap // ........................................................................................................................ - // gap // ........................................................................................................................ - str q15, [x0, #128] // .....................................................................................................................*.. - add v17.4S, v27.4S, v28.4S // ........................e............................................................................................... - str q19, [x0, #640] // .....................................................................................*.................................. - sub v19.4S, v27.4S, v28.4S // .......................e................................................................................................ - sub v28.4S, v21.4S, v16.4S // .............e.......................................................................................................... - add v15.4S, v21.4S, v16.4S // ..............e......................................................................................................... - // gap // ........................................................................................................................ - // gap // ........................................................................................................................ - sqrdmulh v21.4S, v23.4S, v1.S[3] // ...........e............................................................................................................ 
- mul v27.4S, v23.4S, v1.S[2] // ..........e............................................................................................................. + sqrdmulh v16.4S, v6.4S, v26.4S // ..................................................................................................*..................... // gap // ........................................................................................................................ // gap // ........................................................................................................................ // gap // ........................................................................................................................ + str q23, [x0], #(16) // ....................................................................................................................*... + sub v23.4S, v20.4S, v21.4S // ........e............................................................................................................... + add v6.4S, v20.4S, v21.4S // .........e.............................................................................................................. + add v28.4S, v5.4S, v12.4S // ........................e............................................................................................... + sub v21.4S, v5.4S, v12.4S // .......................e................................................................................................ + sqrdmulh v5.4S, v19.4S, v2.S[1] // ................e....................................................................................................... + mul v20.4S, v19.4S, v2.S[0] // ...............e........................................................................................................ + add v19.4S, v15.4S, v29.4S // ...................e.................................................................................................... 
+ str q27, [x0, #624] // .....................................................................................*.................................. // gap // ........................................................................................................................ - add v23.4S, v22.4S, v20.4S // ...................e.................................................................................................... - sub v10.4S, v22.4S, v20.4S // ..................e..................................................................................................... - sqrdmulh v20.4S, v19.4S, v3.S[1] // ..........................e............................................................................................. - sub v6.4S, v4.4S, v15.4S // ............................e........................................................................................... - str q13, [x0, #256] // ......................................................................................................................*. - mul v13.4S, v28.4S, v2.S[0] // ...............e........................................................................................................ - sqrdmulh v7.4S, v28.4S, v2.S[1] // ................e....................................................................................................... // gap // ........................................................................................................................ + sub v29.4S, v15.4S, v29.4S // ..................e..................................................................................................... + str q10, [x0, #496] // ....................................................................................*................................... + str q24, [x0, #112] // .....................................................................................................................*.. 
+ mul v15.4S, v21.4S, v3.S[0] // .........................e.............................................................................................. // gap // ........................................................................................................................ // gap // ........................................................................................................................ - mul v28.4S, v10.4S, v2.S[2] // ....................e................................................................................................... - mul v22.4S, v19.4S, v3.S[0] // .........................e.............................................................................................. - sqrdmulh v10.4S, v10.4S, v2.S[3] // .....................e.................................................................................................. // gap // ........................................................................................................................ + mul v27.4S, v23.4S, v1.S[2] // ..........e............................................................................................................. + add v18.4S, v6.4S, v17.4S // .............................e.......................................................................................... + sqrdmulh v12.4S, v21.4S, v3.S[1] // ..........................e............................................................................................. + add v10.4S, v19.4S, v28.4S // .......................................e................................................................................ + sqrdmulh v23.4S, v23.4S, v1.S[3] // ...........e............................................................................................................ // gap // ........................................................................................................................ 
// gap // ........................................................................................................................ - add v16.4S, v23.4S, v17.4S // .......................................e................................................................................ - // gap // ........................................................................................................................ - add v4.4S, v4.4S, v15.4S // .............................e.......................................................................................... - mul v15.4S, v6.4S, v0.S[2] // ..............................e......................................................................................... - sqrdmulh v6.4S, v6.4S, v0.S[3] // ...............................e........................................................................................ - // gap // ........................................................................................................................ // gap // ........................................................................................................................ // gap // ........................................................................................................................ + mul v21.4S, v29.4S, v2.S[2] // ....................e................................................................................................... + sqrdmulh v24.4S, v29.4S, v2.S[3] // .....................e.................................................................................................. // gap // ........................................................................................................................ - sub v23.4S, v23.4S, v17.4S // ......................................e................................................................................. 
- mls v27.4S, v21.4S, v8.S[0] // ............e........................................................................................................... + mls v20.4S, v5.4S, v8.S[0] // .................e...................................................................................................... // gap // ........................................................................................................................ - mls v14.4S, v11.4S, v8.S[0] // ....................................................*................................................................... // gap // ........................................................................................................................ // gap // ........................................................................................................................ - cmge v21.4S, v31.4S, v9.4S // ................................................................................*....................................... + mls v13.4S, v16.4S, v8.S[0] // ...................................................................................................*.................... + cmge v29.4S, v9.4S, v30.4S // .............................................................................................................*.......... + sub v6.4S, v6.4S, v17.4S // ............................e........................................................................................... + sub v19.4S, v19.4S, v28.4S // ......................................e................................................................................. // gap // ........................................................................................................................ - mls v13.4S, v7.4S, v8.S[0] // .................e...................................................................................................... 
- mls v22.4S, v20.4S, v8.S[0] // ...........................e............................................................................................ // gap // ........................................................................................................................ // gap // ........................................................................................................................ // gap // ........................................................................................................................ + sub v16.4S, v18.4S, v10.4S // ................................................e....................................................................... + mls v15.4S, v12.4S, v8.S[0] // ...........................e............................................................................................ + add v10.4S, v18.4S, v10.4S // .................................................e...................................................................... + mul v18.4S, v6.4S, v0.S[2] // ..............................e......................................................................................... + sqrdmulh v12.4S, v6.4S, v0.S[3] // ...............................e........................................................................................ + mls v27.4S, v23.4S, v8.S[0] // ............e........................................................................................................... // gap // ........................................................................................................................ - sqrdmulh v20.4S, v23.4S, v1.S[1] // .........................................e.............................................................................. - mul v17.4S, v23.4S, v1.S[0] // ........................................e............................................................................... 
- mls v28.4S, v10.4S, v8.S[0] // ......................e................................................................................................. - mls v12.4S, v24.4S, v29.4S // .......................................................................................................*................ - mls v15.4S, v6.4S, v8.S[0] // ................................e....................................................................................... - cmge v6.4S, v9.4S, v30.4S // .................................................................................*...................................... // gap // ........................................................................................................................ // gap // ........................................................................................................................ // gap // ........................................................................................................................ + mls v21.4S, v24.4S, v8.S[0] // ......................e................................................................................................. + mul v23.4S, v10.4S, v25.4S // ........................................................................................e............................... // gap // ........................................................................................................................ - add v11.4S, v4.4S, v16.4S // .................................................e...................................................................... + sqrdmulh v5.4S, v16.4S, v0.S[1] // ...................................................e.................................................................... // gap // ........................................................................................................................ 
- add v10.4S, v27.4S, v13.4S // ..................................e..................................................................................... + sqrdmulh v17.4S, v10.4S, v26.4S // .........................................................................................e.............................. // gap // ........................................................................................................................ + mul v10.4S, v16.4S, v0.S[0] // ..................................................e..................................................................... // gap // ........................................................................................................................ + cmge v6.4S, v31.4S, v9.4S // ............................................................................................................*........... + mls v7.4S, v22.4S, v8.S[0] // ...................................................................*.................................................... // gap // ........................................................................................................................ - cmge v24.4S, v14.4S, v30.4S // .....................................................................*.................................................. - cmge v7.4S, v31.4S, v14.4S // ....................................................................*................................................... - sub v27.4S, v27.4S, v13.4S // .................................e...................................................................................... - sub v13.4S, v28.4S, v22.4S // ...........................................e............................................................................ // gap // ........................................................................................................................ 
- add v19.4S, v28.4S, v22.4S // ............................................e........................................................................... // gap // ........................................................................................................................ // gap // ........................................................................................................................ + mul v16.4S, v19.4S, v1.S[0] // ........................................e............................................................................... + sqrdmulh v28.4S, v19.4S, v1.S[1] // .........................................e.............................................................................. + add v22.4S, v21.4S, v15.4S // ............................................e........................................................................... // gap // ........................................................................................................................ - sqrdmulh v23.4S, v11.4S, v26.4S // .........................................................................................e.............................. - mls v17.4S, v20.4S, v8.S[0] // ..........................................e............................................................................. - str q12, [x0], #(16) // ....................................................................................................................*... - mul v12.4S, v11.4S, v25.4S // ........................................................................................e............................... - sqrdmulh v28.4S, v27.4S, v0.S[3] // ....................................e................................................................................... - mul v22.4S, v27.4S, v0.S[2] // ...................................e.................................................................................... 
- sub v27.4S, v21.4S, v6.4S // ..................................................................................*..................................... + add v24.4S, v27.4S, v20.4S // ..................................e..................................................................................... // gap // ........................................................................................................................ // gap // ........................................................................................................................ // gap // ........................................................................................................................ - str q5, [x0, #752] // ......................................................................................*................................. - mul v20.4S, v13.4S, v1.S[0] // .............................................e.......................................................................... - sqrdmulh v6.4S, v13.4S, v1.S[1] // ..............................................e......................................................................... + cmge v19.4S, v13.4S, v30.4S // .................................................................................................................*...... + mls v18.4S, v12.4S, v8.S[0] // ................................e....................................................................................... + mls v10.4S, v5.4S, v8.S[0] // ....................................................e................................................................... // gap // ........................................................................................................................ - add v13.4S, v10.4S, v19.4S // ......................................................e................................................................. 
// gap // ........................................................................................................................ // gap // ........................................................................................................................ - sub v19.4S, v10.4S, v19.4S // .....................................................e.................................................................. - cmge v10.4S, v18.4S, v30.4S // .................................................................................................................*...... - cmge v11.4S, v31.4S, v18.4S // ................................................................................................................*....... // gap // ........................................................................................................................ + mls v23.4S, v17.4S, v8.S[0] // ..........................................................................................e............................. + sub v29.4S, v6.4S, v29.4S // ..............................................................................................................*......... + sub v17.4S, v14.4S, v4.4S // ..............................................................................*......................................... // gap // ........................................................................................................................ + cmge v6.4S, v7.4S, v30.4S // .................................................................................*...................................... // gap // ........................................................................................................................ + sub v4.4S, v24.4S, v22.4S // .....................................................e.................................................................. 
// gap // ........................................................................................................................ - add v5.4S, v15.4S, v17.4S // ...........................................................e............................................................ - sub v21.4S, v15.4S, v17.4S // ..........................................................e............................................................. - sqrdmulh v17.4S, v19.4S, v0.S[1] // ........................................................e............................................................... - mul v15.4S, v13.4S, v25.4S // ...........................................................................................e............................ - mls v22.4S, v28.4S, v8.S[0] // .....................................e.................................................................................. // gap // ........................................................................................................................ + add v12.4S, v24.4S, v22.4S // ......................................................e................................................................. + mls v16.4S, v28.4S, v8.S[0] // ..........................................e............................................................................. + cmge v5.4S, v31.4S, v13.4S // ................................................................................................................*....... + cmge v14.4S, v31.4S, v7.4S // ................................................................................*....................................... // gap // ........................................................................................................................ // gap // ........................................................................................................................ 
- sqrdmulh v28.4S, v13.4S, v26.4S // ............................................................................................e........................... // gap // ........................................................................................................................ - mul v19.4S, v19.4S, v0.S[0] // .......................................................e................................................................ // gap // ........................................................................................................................ - sub v11.4S, v11.4S, v10.4S // ..................................................................................................................*..... + sub v20.4S, v27.4S, v20.4S // .................................e...................................................................................... + mls v11.4S, v17.4S, v8.4S // ...............................................................................*........................................ + cmge v17.4S, v23.4S, v30.4S // .....................................................................................................e.................. + mul v24.4S, v12.4S, v25.4S // ...........................................................................................e............................ // gap // ........................................................................................................................ // gap // ........................................................................................................................ // gap // ........................................................................................................................ - mul v13.4S, v5.4S, v25.4S // ..............................................................................................e......................... 
- mls v20.4S, v6.4S, v8.S[0] // ...............................................e........................................................................ - mls v12.4S, v23.4S, v8.S[0] // ..........................................................................................e............................. // gap // ........................................................................................................................ + sqrdmulh v22.4S, v4.4S, v0.S[1] // ........................................................e............................................................... + mul v27.4S, v4.4S, v0.S[0] // .......................................................e................................................................ + sqrdmulh v28.4S, v12.4S, v26.4S // ............................................................................................e........................... // gap // ........................................................................................................................ // gap // ........................................................................................................................ // gap // ........................................................................................................................ - sqrdmulh v10.4S, v5.4S, v26.4S // ...............................................................................................e........................ - sqrdmulh v23.4S, v21.4S, v0.S[1] // .............................................................e.......................................................... - mul v5.4S, v21.4S, v0.S[0] // ............................................................e........................................................... - mls v9.4S, v27.4S, v29.4S // ...................................................................................*.................................... 
- mls v15.4S, v28.4S, v8.S[0] // .............................................................................................e.......................... - sub v27.4S, v7.4S, v24.4S // ......................................................................*................................................. - mls v18.4S, v11.4S, v29.4S // ...................................................................................................................*.... // gap // ........................................................................................................................ + mls v9.4S, v29.4S, v8.4S // ...............................................................................................................*........ + sub v29.4S, v21.4S, v15.4S // ...........................................e............................................................................ + sub v21.4S, v18.4S, v16.4S // ..........................................................e............................................................. + sub v5.4S, v5.4S, v19.4S // ..................................................................................................................*..... + sub v19.4S, v14.4S, v6.4S // ..................................................................................*..................................... // gap // ........................................................................................................................ // gap // ........................................................................................................................ // gap // ........................................................................................................................ // gap // ........................................................................................................................ 
+ sqrdmulh v14.4S, v20.4S, v0.S[3] // ....................................e................................................................................... + mul v15.4S, v20.4S, v0.S[2] // ...................................e.................................................................................... // gap // ........................................................................................................................ // gap // ........................................................................................................................ // gap // ........................................................................................................................ + mls v27.4S, v22.4S, v8.S[0] // .........................................................e.............................................................. + mul v4.4S, v29.4S, v1.S[0] // .............................................e.......................................................................... + str q11, [x0, #752] // ......................................................................................*................................. + mul v11.4S, v21.4S, v0.S[0] // ............................................................e........................................................... + sqrdmulh v12.4S, v29.4S, v1.S[1] // ..............................................e......................................................................... // gap // ........................................................................................................................ - sub v7.4S, v22.4S, v20.4S // ...............................................................e........................................................ - add v6.4S, v22.4S, v20.4S // ................................................................e....................................................... 
- mls v19.4S, v17.4S, v8.S[0] // .........................................................e.............................................................. - mls v5.4S, v23.4S, v8.S[0] // ..............................................................e......................................................... + mls v13.4S, v5.4S, v8.4S // ...................................................................................................................*.... // gap // ........................................................................................................................ - mls v14.4S, v27.4S, v29.4S // .......................................................................*................................................ + mls v24.4S, v28.4S, v8.S[0] // .............................................................................................e.......................... + sqrdmulh v5.4S, v21.4S, v0.S[1] // .............................................................e.......................................................... // gap // ........................................................................................................................ + mls v7.4S, v19.4S, v8.4S // ...................................................................................*.................................... // gap // ........................................................................................................................ + cmge v19.4S, v31.4S, v23.4S // ....................................................................................................e................... // gap // ........................................................................................................................ - cmge v24.4S, v31.4S, v12.4S // ....................................................................................................e................... 
- mls v13.4S, v10.4S, v8.S[0] // ................................................................................................e....................... - str q9, [x0, #880] // .......................................................................................*................................ - str q18, [x0, #368] // .......................................................................................................................* // gap // ........................................................................................................................ // gap // ........................................................................................................................ - sqrdmulh v10.4S, v6.4S, v26.4S // ..................................................................................................e..................... - mul v9.4S, v7.4S, v0.S[0] // .................................................................e...................................................... - sqrdmulh v20.4S, v7.4S, v0.S[1] // ..................................................................e..................................................... - mul v18.4S, v6.4S, v25.4S // .................................................................................................e...................... + str q9, [x0, #240] // ......................................................................................................................*. + mls v15.4S, v14.4S, v8.S[0] // .....................................e.................................................................................. + cmge v29.4S, v31.4S, v10.4S // ....................................................................e................................................... + cmge v21.4S, v10.4S, v30.4S // .....................................................................e.................................................. 
+ add v9.4S, v18.4S, v16.4S // ...........................................................e............................................................ // gap // ........................................................................................................................ - cmge v11.4S, v31.4S, v15.4S // ........................................................................................................e............... // gap // ........................................................................................................................ - cmge v17.4S, v19.4S, v30.4S // .........................................................................e.............................................. // gap // ........................................................................................................................ // gap // ........................................................................................................................ - cmge v27.4S, v15.4S, v30.4S // .........................................................................................................e.............. - cmge v23.4S, v31.4S, v19.4S // ........................................................................e............................................... - cmge v6.4S, v5.4S, v30.4S // .............................................................................e.......................................... - str q14, [x0, #496] // ....................................................................................*................................... - // gap // ........................................................................................................................ - // gap // ........................................................................................................................ - // gap // ........................................................................................................................ 
- cmge v7.4S, v13.4S, v30.4S // .............................................................................................................e.......... - cmge v28.4S, v31.4S, v5.4S // ............................................................................e........................................... - cmge v22.4S, v31.4S, v13.4S // ............................................................................................................e........... - - // original source code - // ldr q9, [x0, #0] // ..e....................................................................................................................|..e................................................................................................................. - // ldr q10, [x0, #(1*(1024/8))] // .....e.................................................................................................................|.....e.............................................................................................................. - // ldr q11, [x0, #(2*(1024/8))] // ........e..............................................................................................................|........e........................................................................................................... - // ldr q12, [x0, #(3*(1024/8))] // e......................................................................................................................|e................................................................................................................... - // ldr q13, [x0, #(4*(1024/8))] // ..............e........................................................................................................|..............e..................................................................................................... 
- // ldr q14, [x0, #(5*(1024/8))] // .............e.........................................................................................................|.............e...................................................................................................... - // ldr q15, [x0, #(6*(1024/8))] // ......e................................................................................................................|......e............................................................................................................. - // ldr q16, [x0, #(7*(1024/8))] // ..........e............................................................................................................|..........e......................................................................................................... - // sub v24.4s, v9.4s, v10.4s // .......................e...............................................................................................|.......................e............................................................................................ - // add v9.4s, v9.4s, v10.4s // .....................e.................................................................................................|.....................e.............................................................................................. - // mul v10.4s, v24.4s, v1.s[2] // ...............................e.......................................................................................|...............................e.................................................................................... - // sqrdmulh v24.4s, v24.4s, v1.s[3] // ..............................e........................................................................................|..............................e..................................................................................... 
- // mls v10.4s, v24.4s, v8.s[0] // ...............................................e.......................................................................|...............................................e.................................................................... - // sub v24.4s, v11.4s, v12.4s // ............................e..........................................................................................|............................e....................................................................................... - // add v11.4s, v11.4s, v12.4s // .............................e.........................................................................................|.............................e...................................................................................... - // mul v12.4s, v24.4s, v2.s[0] // .....................................e.................................................................................|.....................................e.............................................................................. - // sqrdmulh v24.4s, v24.4s, v2.s[1] // ......................................e................................................................................|......................................e............................................................................. - // mls v12.4s, v24.4s, v8.s[0] // ..................................................e....................................................................|..................................................e................................................................. - // sub v24.4s, v13.4s, v14.4s // .................................e.....................................................................................|.................................e.................................................................................. 
- // add v13.4s, v13.4s, v14.4s // ................................e......................................................................................|................................e................................................................................... - // mul v14.4s, v24.4s, v2.s[2] // .......................................e...............................................................................|.......................................e............................................................................ - // sqrdmulh v24.4s, v24.4s, v2.s[3] // .........................................e.............................................................................|.........................................e.......................................................................... - // mls v14.4s, v24.4s, v8.s[0] // ......................................................e................................................................|......................................................e............................................................. - // sub v24.4s, v15.4s, v16.4s // ...........................e...........................................................................................|...........................e........................................................................................ - // add v15.4s, v15.4s, v16.4s // .........................e.............................................................................................|.........................e.......................................................................................... - // mul v16.4s, v24.4s, v3.s[0] // ........................................e..............................................................................|........................................e........................................................................... 
- // sqrdmulh v24.4s, v24.4s, v3.s[1] // ..................................e....................................................................................|..................................e................................................................................. - // mls v16.4s, v24.4s, v8.s[0] // ...................................................e...................................................................|...................................................e................................................................ - // sub v24.4s, v9.4s, v11.4s // ...................................e...................................................................................|...................................e................................................................................ - // add v9.4s, v9.4s, v11.4s // ...........................................e...........................................................................|...........................................e........................................................................ - // mul v11.4s, v24.4s, v0.s[2] // ............................................e..........................................................................|............................................e....................................................................... - // sqrdmulh v24.4s, v24.4s, v0.s[3] // .............................................e.........................................................................|.............................................e...................................................................... - // mls v11.4s, v24.4s, v8.s[0] // ........................................................e..............................................................|........................................................e........................................................... 
- // sub v24.4s, v10.4s, v12.4s // ..............................................................e........................................................|..............................................................e..................................................... - // add v10.4s, v10.4s, v12.4s // ...........................................................e...........................................................|...........................................................e........................................................ - // mul v12.4s, v24.4s, v0.s[2] // ......................................................................e................................................|......................................................................e............................................. - // sqrdmulh v24.4s, v24.4s, v0.s[3] // .....................................................................e.................................................|.....................................................................e.............................................. - // mls v12.4s, v24.4s, v8.s[0] // ...................................................................................e...................................|...................................................................................e................................ - // sub v24.4s, v13.4s, v15.4s // ..............................................e........................................................................|..............................................e..................................................................... - // add v13.4s, v13.4s, v15.4s // ..........................................e............................................................................|..........................................e......................................................................... 
- // mul v15.4s, v24.4s, v1.s[0] // .....................................................e.................................................................|.....................................................e.............................................................. - // sqrdmulh v24.4s, v24.4s, v1.s[1] // ....................................................e..................................................................|....................................................e............................................................... - // mls v15.4s, v24.4s, v8.s[0] // ..................................................................e....................................................|..................................................................e................................................. - // sub v24.4s, v14.4s, v16.4s // ...............................................................e.......................................................|...............................................................e.................................................... - // add v14.4s, v14.4s, v16.4s // ................................................................e......................................................|................................................................e................................................... - // mul v16.4s, v24.4s, v1.s[0] // .........................................................................e.............................................|.........................................................................e.......................................... - // sqrdmulh v24.4s, v24.4s, v1.s[1] // ..........................................................................e............................................|..........................................................................e......................................... 
- // mls v16.4s, v24.4s, v8.s[0] // ........................................................................................e..............................|........................................................................................e........................... - // sub v24.4s, v9.4s, v13.4s // .......................................................................................................................*.................................................................................................................... - // add v9.4s, v9.4s, v13.4s // ..........................................................e............................................................|..........................................................e......................................................... - // mul v13.4s, v24.4s, v0.s[0] // ...................*...................................................................................................|...................*................................................................................................ - // sqrdmulh v24.4s, v24.4s, v0.s[1] // ..................*....................................................................................................|..................*................................................................................................. - // mls v13.4s, v24.4s, v8.s[0] // ................................................*......................................................................|................................................*................................................................... - // sub v24.4s, v10.4s, v14.4s // ............................................................................e..........................................|............................................................................e....................................... 
- // add v10.4s, v10.4s, v14.4s // ...........................................................................e...........................................|...........................................................................e........................................ - // mul v14.4s, v24.4s, v0.s[0] // .....................................................................................e.................................|.....................................................................................e.............................. - // sqrdmulh v24.4s, v24.4s, v0.s[1] // .................................................................................e.....................................|.................................................................................e.................................. - // mls v14.4s, v24.4s, v8.s[0] // ...................................................................................................e...................|...................................................................................................e................ - // sub v24.4s, v11.4s, v15.4s // ................................................................................e......................................|................................................................................e................................... - // add v11.4s, v11.4s, v15.4s // ...............................................................................e.......................................|...............................................................................e.................................... - // mul v15.4s, v24.4s, v0.s[0] // ............................................................................................e..........................|............................................................................................e....................... 
- // sqrdmulh v24.4s, v24.4s, v0.s[1] // ...........................................................................................e...........................|...........................................................................................e........................ - // mls v15.4s, v24.4s, v8.s[0] // ....................................................................................................e..................|....................................................................................................e............... - // sub v24.4s, v12.4s, v16.4s // .................................................................................................e.....................|.................................................................................................e.................. - // add v12.4s, v12.4s, v16.4s // ..................................................................................................e....................|..................................................................................................e................. - // mul v16.4s, v24.4s, v0.s[0] // ...........................................................................................................e...........|...........................................................................................................e........ - // sqrdmulh v24.4s, v24.4s, v0.s[1] // ............................................................................................................e..........|............................................................................................................e....... - // mls v16.4s, v24.4s, v8.s[0] // ............*..........................................................................................................|............*....................................................................................................... 
- // cmge v27.4s, v31.4s, v13.4s // .............................................................*.........................................................|.............................................................*...................................................... - // cmge v28.4s, v13.4s, v30.4s // ............................................................*..........................................................|............................................................*....................................................... - // sub v28.4s, v27.4s, v28.4s // ...............................................................................................*.......................|...............................................................................................*.................... - // mls v13.4s, v28.4s, v29.4s // .....................................................................................................*.................|.....................................................................................................*.............. - // cmge v27.4s, v31.4s, v14.4s // .................................................................................................................e.....|.................................................................................................................e.. - // cmge v28.4s, v14.4s, v30.4s // ...............................................................................................................e.......|...............................................................................................................e.... - // sub v28.4s, v27.4s, v28.4s // .*.....................................................................................................................|.*.................................................................................................................. 
- // mls v14.4s, v28.4s, v29.4s // ...............*.......................................................................................................|...............*.................................................................................................... - // cmge v27.4s, v31.4s, v15.4s // .....................................................................................................................e.|.................................................................................................................... - // cmge v28.4s, v15.4s, v30.4s // ..................................................................................................................e....|..................................................................................................................e. - // sub v28.4s, v27.4s, v28.4s // .......*...............................................................................................................|.......*............................................................................................................ - // mls v15.4s, v28.4s, v29.4s // ......................*................................................................................................|......................*............................................................................................. - // cmge v27.4s, v31.4s, v16.4s // .................................................*.....................................................................|.................................................*.................................................................. - // cmge v28.4s, v16.4s, v30.4s // .........................................................*.............................................................|.........................................................*.......................................................... 
- // sub v28.4s, v27.4s, v28.4s // .......................................................................*...............................................|.......................................................................*............................................ - // mls v16.4s, v28.4s, v29.4s // .............................................................................................*.........................|.............................................................................................*...................... - // str q13, [x0, #(4*(1024/8))] // ...................................................................................................................*...|...................................................................................................................* - // str q14, [x0, #(5*(1024/8))] // ..........................*............................................................................................|..........................*......................................................................................... - // str q15, [x0, #(6*(1024/8))] // ........................................................................*..............................................|........................................................................*........................................... - // str q16, [x0, #(7*(1024/8))] // ........................................................................................................*..............|........................................................................................................*........... - // mul v13.4s, v9.4s, v25.4s // ....................................................................e..................................................|....................................................................e............................................... 
- // sqrdmulh v9.4s, v9.4s, v26.4s // .................................................................e.....................................................|.................................................................e.................................................. - // mls v13.4s, v9.4s, v8.s[0] // .........................................................................................e.............................|.........................................................................................e.......................... - // mul v14.4s, v10.4s, v25.4s // ..................................................................................e....................................|..................................................................................e................................. - // sqrdmulh v10.4s, v10.4s, v26.4s // ....................................................................................e..................................|....................................................................................e............................... - // mls v14.4s, v10.4s, v8.s[0] // ..............................................................................................e........................|..............................................................................................e..................... - // mul v15.4s, v11.4s, v25.4s // .......................................................................................e...............................|.......................................................................................e............................ - // sqrdmulh v11.4s, v11.4s, v26.4s // ..........................................................................................e............................|..........................................................................................e......................... 
- // mls v15.4s, v11.4s, v8.s[0] // .......................................................................................................e...............|.......................................................................................................e............ - // mul v16.4s, v12.4s, v25.4s // .............................................................................................................e.........|.............................................................................................................e...... - // sqrdmulh v12.4s, v12.4s, v26.4s // ..........................................................................................................e............|..........................................................................................................e......... - // mls v16.4s, v12.4s, v8.s[0] // ....*..................................................................................................................|....*............................................................................................................... - // cmge v27.4s, v31.4s, v13.4s // ......................................................................................................e................|......................................................................................................e............. - // cmge v28.4s, v13.4s, v30.4s // ...........*...........................................................................................................|...........*........................................................................................................ - // sub v28.4s, v27.4s, v28.4s // ....................*..................................................................................................|....................*............................................................................................... 
- // mls v13.4s, v28.4s, v29.4s // .......................................................*...............................................................|.......................................................*............................................................ - // cmge v27.4s, v31.4s, v14.4s // ..............................................................................................................e........|..............................................................................................................e..... - // cmge v28.4s, v14.4s, v30.4s // ................................................................................................................e......|................................................................................................................e... - // sub v28.4s, v27.4s, v28.4s // ...*...................................................................................................................|...*................................................................................................................ - // mls v14.4s, v28.4s, v29.4s // ................*......................................................................................................|................*................................................................................................... - // cmge v27.4s, v31.4s, v15.4s // ......................................................................................................................e|.................................................................................................................... - // cmge v28.4s, v15.4s, v30.4s // ....................................................................................................................e..|.................................................................................................................... 
- // sub v28.4s, v27.4s, v28.4s // .........*.............................................................................................................|.........*.......................................................................................................... - // mls v15.4s, v28.4s, v29.4s // .................*.....................................................................................................|.................*.................................................................................................. - // cmge v27.4s, v31.4s, v16.4s // ..............................................................................*........................................|..............................................................................*..................................... - // cmge v28.4s, v16.4s, v30.4s // .............................................................................*.........................................|.............................................................................*...................................... - // sub v28.4s, v27.4s, v28.4s // ......................................................................................*................................|......................................................................................*............................. - // mls v16.4s, v28.4s, v29.4s // ................................................................................................*......................|................................................................................................*................... - // str q13, [x0], #(16) // ...................................................................*...................................................|...................................................................*................................................ 
- // str q14, [x0, #(-16 + 1*(1024/8))] // ........................*..............................................................................................|........................*........................................................................................... - // str q15, [x0, #(-16 + 2*(1024/8))] // ....................................*..................................................................................|....................................*............................................................................... - // str q16, [x0, #(-16 + 3*(1024/8))] // .........................................................................................................*.............|.........................................................................................................*.......... + mls v4.4S, v12.4S, v8.S[0] // ...............................................e........................................................................ + cmge v28.4S, v31.4S, v27.4S // ........................................................................e............................................... + cmge v20.4S, v27.4S, v30.4S // .........................................................................e.............................................. + cmge v22.4S, v24.4S, v30.4S // .........................................................................................................e.............. + sub v6.4S, v29.4S, v21.4S // ......................................................................e................................................. + mls v11.4S, v5.4S, v8.S[0] // ..............................................................e......................................................... + str q7, [x0, #880] // .......................................................................................*................................ 
+ str q13, [x0, #368] // .......................................................................................................................* + sub v19.4S, v19.4S, v17.4S // ......................................................................................................e................. + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + + // ---------------------------------------------------------------------------------------------------------------- new position ----------------------------------------------------------------------------------------------------------------> + // 0 25 50 75 100 125 150 175 200 225 + // |------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------- + // ldr q9, [x0, #0] // .............e..........................................................................................................'............~......................................................................................................... + // ldr q10, [x0, #(1*(1024/8))] // ............e...........................................................................................................'...........~.......................................................................................................... + // ldr q11, [x0, #(2*(1024/8))] // e.......................................................................................................................~...................................................................................................................... 
+ // ldr q12, [x0, #(3*(1024/8))] // ..e.....................................................................................................................'.~.................................................................................................................... + // ldr q13, [x0, #(4*(1024/8))] // ...................e....................................................................................................'..................~................................................................................................... + // ldr q14, [x0, #(5*(1024/8))] // ..................e.....................................................................................................'.................~.................................................................................................... + // ldr q15, [x0, #(6*(1024/8))] // .......e................................................................................................................'......~............................................................................................................... + // ldr q16, [x0, #(7*(1024/8))] // .e......................................................................................................................'~..................................................................................................................... + // sub v24.4s, v9.4s, v10.4s // .............................e..........................................................................................'............................~......................................................................................... + // add v9.4s, v9.4s, v10.4s // ..............................e.........................................................................................'.............................~........................................................................................ 
+ // mul v10.4s, v24.4s, v1.s[2] // .........................................e..............................................................................'........................................~............................................................................. + // sqrdmulh v24.4s, v24.4s, v1.s[3] // .............................................e..........................................................................'............................................~......................................................................... + // mls v10.4s, v24.4s, v8.s[0] // ..........................................................e.............................................................'.........................................................~............................................................ + // sub v24.4s, v11.4s, v12.4s // .........................e..............................................................................................'........................~............................................................................................. + // add v11.4s, v11.4s, v12.4s // ..........................e.............................................................................................'.........................~............................................................................................ + // mul v12.4s, v24.4s, v2.s[0] // ..................................e.....................................................................................'.................................~.................................................................................... + // sqrdmulh v24.4s, v24.4s, v2.s[1] // .................................e......................................................................................'................................~..................................................................................... 
+ // mls v12.4s, v24.4s, v8.s[0] // ................................................e.......................................................................'...............................................~...................................................................... + // sub v24.4s, v13.4s, v14.4s // .....................................e..................................................................................'....................................~................................................................................. + // add v13.4s, v13.4s, v14.4s // ...................................e....................................................................................'..................................~................................................................................... + // mul v14.4s, v24.4s, v2.s[2] // ..............................................e.........................................................................'.............................................~........................................................................ + // sqrdmulh v24.4s, v24.4s, v2.s[3] // ...............................................e........................................................................'..............................................~....................................................................... + // mls v14.4s, v24.4s, v8.s[0] // ...........................................................e............................................................'..........................................................~........................................................... + // sub v24.4s, v15.4s, v16.4s // ................................e.......................................................................................'...............................~...................................................................................... 
+ // add v15.4s, v15.4s, v16.4s // ...............................e........................................................................................'..............................~....................................................................................... + // mul v16.4s, v24.4s, v3.s[0] // ........................................e...............................................................................'.......................................~.............................................................................. + // sqrdmulh v24.4s, v24.4s, v3.s[1] // ...........................................e............................................................................'..........................................~........................................................................... + // mls v16.4s, v24.4s, v8.s[0] // ......................................................e.................................................................'.....................................................~................................................................ + // sub v24.4s, v9.4s, v11.4s // ...................................................e....................................................................'..................................................~................................................................... + // add v9.4s, v9.4s, v11.4s // ..........................................e.............................................................................'.........................................~............................................................................ + // mul v11.4s, v24.4s, v0.s[2] // ........................................................e...............................................................'.......................................................~.............................................................. 
+ // sqrdmulh v24.4s, v24.4s, v0.s[3] // .........................................................e..............................................................'........................................................~............................................................. + // mls v11.4s, v24.4s, v8.s[0] // .......................................................................e................................................'......................................................................~............................................... + // sub v24.4s, v10.4s, v12.4s // ..................................................................................e.....................................'.................................................................................~.................................... + // add v10.4s, v10.4s, v12.4s // .....................................................................e..................................................'....................................................................~................................................. + // mul v12.4s, v24.4s, v0.s[2] // ...............................................................................................e........................'..............................................................................................~....................... + // sqrdmulh v24.4s, v24.4s, v0.s[3] // ..............................................................................................e.........................'.............................................................................................~........................ + // mls v12.4s, v24.4s, v8.s[0] // ...........................................................................................................e............'..........................................................................................................~........... 
+ // sub v24.4s, v13.4s, v15.4s // ....................................................e...................................................................'...................................................~.................................................................. + // add v13.4s, v13.4s, v15.4s // ............................................e...........................................................................'...........................................~.......................................................................... + // mul v15.4s, v24.4s, v1.s[0] // ..................................................................e.....................................................'.................................................................~.................................................... + // sqrdmulh v24.4s, v24.4s, v1.s[1] // ...................................................................e....................................................'..................................................................~................................................... + // mls v15.4s, v24.4s, v8.s[0] // ...............................................................................e........................................'..............................................................................~....................................... + // sub v24.4s, v14.4s, v16.4s // ..........................................................................................e.............................'.........................................................................................~............................ + // add v14.4s, v14.4s, v16.4s // ....................................................................e...................................................'...................................................................~.................................................. 
+ // mul v16.4s, v24.4s, v1.s[0] // .................................................................................................e......................'................................................................................................~..................... + // sqrdmulh v24.4s, v24.4s, v1.s[1] // ....................................................................................................e...................'...................................................................................................~.................. + // mls v16.4s, v24.4s, v8.s[0] // ...............................................................................................................e........'..............................................................................................................~....... + // sub v24.4s, v9.4s, v13.4s // .....................................................e..................................................................'....................................................~................................................................. + // add v9.4s, v9.4s, v13.4s // .......................................................e................................................................'......................................................~............................................................... + // mul v13.4s, v24.4s, v0.s[0] // ...............................................................e........................................................'..............................................................~....................................................... + // sqrdmulh v24.4s, v24.4s, v0.s[1] // .............................................................e..........................................................'............................................................~......................................................... 
+ // mls v13.4s, v24.4s, v8.s[0] // ........................................................................e...............................................'.......................................................................~.............................................. + // sub v24.4s, v10.4s, v14.4s // .............................................................................e..........................................'............................................................................~......................................... + // add v10.4s, v10.4s, v14.4s // ..............................................................................e.........................................'.............................................................................~........................................ + // mul v14.4s, v24.4s, v0.s[0] // .......................................................................................e................................'......................................................................................~............................... + // sqrdmulh v24.4s, v24.4s, v0.s[1] // ......................................................................................e.................................'.....................................................................................~................................ + // mls v14.4s, v24.4s, v8.s[0] // ................................................................................................e.......................'...............................................................................................~...................... + // sub v24.4s, v11.4s, v15.4s // ...........................................................................................e............................'..........................................................................................~........................... 
+ // add v11.4s, v11.4s, v15.4s // ..............................................................................................................e.........'.............................................................................................................~........ + // mul v15.4s, v24.4s, v0.s[0] // ...................................................................................................e....................'..................................................................................................~................... + // sqrdmulh v24.4s, v24.4s, v0.s[1] // .......................................................................................................e................'......................................................................................................~............... + // mls v15.4s, v24.4s, v8.s[0] // ....................................................................................................................e...'...................................................................................................................~.. + // sub v24.4s, v12.4s, v16.4s // .........~..............................................................................................................'........*............................................................................................................. + // add v12.4s, v12.4s, v16.4s // ..........~.............................................................................................................'.........*............................................................................................................ + // mul v16.4s, v24.4s, v0.s[0] // ....................~...................................................................................................'...................*.................................................................................................. 
+ // sqrdmulh v24.4s, v24.4s, v0.s[1] // ......................~.................................................................................................'.....................*................................................................................................ + // mls v16.4s, v24.4s, v8.s[0] // .................................................................~......................................................'................................................................*..................................................... + // cmge v27.4s, v31.4s, v13.4s // ............................................................................................................e...........'...........................................................................................................~.......... + // cmge v28.4s, v13.4s, v30.4s // .............................................................................................................e..........'............................................................................................................~......... + // sub v28.4s, v27.4s, v28.4s // ...................................................................................................................e....'..................................................................................................................~... + // mls v13.4s, v28.4s, v8.4s // ........~...............................................................................................................'.......*.............................................................................................................. + // cmge v27.4s, v31.4s, v14.4s // ................................................................................................................e.......'...............................................................................................................~...... 
+ // cmge v28.4s, v14.4s, v30.4s // .................................................................................................................e......'................................................................................................................~..... + // sub v28.4s, v27.4s, v28.4s // ....~...................................................................................................................'...*.................................................................................................................. + // mls v14.4s, v28.4s, v8.4s // ..............~.........................................................................................................'.............*........................................................................................................ + // cmge v27.4s, v31.4s, v15.4s // .................~......................................................................................................'................*..................................................................................................... + // cmge v28.4s, v15.4s, v30.4s // ................~.......................................................................................................'...............*...................................................................................................... + // sub v28.4s, v27.4s, v28.4s // ...........................................................................~............................................'..........................................................................*........................................... + // mls v15.4s, v28.4s, v8.4s // ...................................................................................~....................................'..................................................................................*................................... 
+ // cmge v27.4s, v31.4s, v16.4s // .................................................................................~......................................'................................................................................*..................................... + // cmge v28.4s, v16.4s, v30.4s // ............................................................................~...........................................'...........................................................................*.......................................... + // sub v28.4s, v27.4s, v28.4s // .............................................................................................~..........................'............................................................................................*......................... + // mls v16.4s, v28.4s, v8.4s // ........................................................................................................~...............'.......................................................................................................*.............. + // str q13, [x0, #(4*(1024/8))] // ......................................~.................................................................................'.....................................*................................................................................ + // str q14, [x0, #(5*(1024/8))] // ....................................~...................................................................................'...................................*.................................................................................. + // str q15, [x0, #(6*(1024/8))] // ..................................................................................................~.....................'.................................................................................................*.................... 
+ // str q16, [x0, #(7*(1024/8))] // .....................................................................................................................~..'....................................................................................................................*. + // mul v13.4s, v9.4s, v25.4s // ............................................................e...........................................................'...........................................................~.......................................................... + // sqrdmulh v9.4s, v9.4s, v26.4s // ..............................................................e.........................................................'.............................................................~........................................................ + // mls v13.4s, v9.4s, v8.s[0] // .........................................................................e..............................................'........................................................................~............................................. + // mul v14.4s, v10.4s, v25.4s // .....................................................................................e..................................'....................................................................................~................................. + // sqrdmulh v10.4s, v10.4s, v26.4s // ........................................................................................e...............................'.......................................................................................~.............................. + // mls v14.4s, v10.4s, v8.s[0] // ......................................................................................................e.................'.....................................................................................................~................ 
+ // mul v15.4s, v11.4s, v25.4s // ......~.................................................................................................................'.....*................................................................................................................ + // sqrdmulh v11.4s, v11.4s, v26.4s // .....~..................................................................................................................'....*................................................................................................................. + // mls v15.4s, v11.4s, v8.s[0] // .....................~..................................................................................................'....................*................................................................................................. + // mul v16.4s, v12.4s, v25.4s // .......................~................................................................................................'......................*............................................................................................... + // sqrdmulh v12.4s, v12.4s, v26.4s // ...........................~............................................................................................'..........................*........................................................................................... + // mls v16.4s, v12.4s, v8.s[0] // .................................................~......................................................................'................................................*..................................................................... + // cmge v27.4s, v31.4s, v13.4s // .........................................................................................................e..............'........................................................................................................~............. 
+ // cmge v28.4s, v13.4s, v30.4s // ....................................................................................e...................................'...................................................................................~.................................. + // sub v28.4s, v27.4s, v28.4s // .......................................................................................................................e'...................................................................................................................... + // mls v13.4s, v28.4s, v8.4s // ...........~............................................................................................................'..........*........................................................................................................... + // cmge v27.4s, v31.4s, v14.4s // ...~....................................................................................................................'..*................................................................................................................... + // cmge v28.4s, v14.4s, v30.4s // ..................................................................................................................e.....'.................................................................................................................~.... + // sub v28.4s, v27.4s, v28.4s // ...............~........................................................................................................'..............*....................................................................................................... + // mls v14.4s, v28.4s, v8.4s // ........................~...............................................................................................'.......................*.............................................................................................. 
+ // cmge v27.4s, v31.4s, v15.4s // ................................................................~.......................................................'...............................................................*...................................................... + // cmge v28.4s, v15.4s, v30.4s // ..................................................~.....................................................................'.................................................*.................................................................... + // sub v28.4s, v27.4s, v28.4s // ..........................................................................~.............................................'.........................................................................*............................................ + // mls v15.4s, v28.4s, v8.4s // .........................................................................................~..............................'........................................................................................*............................. + // cmge v27.4s, v31.4s, v16.4s // ................................................................................~.......................................'...............................................................................*...................................... + // cmge v28.4s, v16.4s, v30.4s // ......................................................................~.................................................'.....................................................................*................................................ + // sub v28.4s, v27.4s, v28.4s // ............................................................................................~...........................'...........................................................................................*.......................... 
+ // mls v16.4s, v28.4s, v8.4s // .....................................................................................................~..................'....................................................................................................*................. + // str q13, [x0], #(16) // ............................~...........................................................................................'...........................*.......................................................................................... + // str q14, [x0, #(-16 + 1*(1024/8))] // .......................................~................................................................................'......................................*............................................................................... + // str q15, [x0, #(-16 + 2*(1024/8))] // ..........................................................................................................~.............'.........................................................................................................*............ + // str q16, [x0, #(-16 + 3*(1024/8))] // ......................................................................................................................~.'.....................................................................................................................* sub count, count, #1 cbnz count, layer123_start - sub v16.4S, v4.4S, v16.4S // *.................................... - mls v18.4S, v10.4S, v8.S[0] // ...*................................. - cmge v14.4S, v12.4S, v30.4S // ......*.............................. - // gap // ..................................... - // gap // ..................................... - // gap // ..................................... - // gap // ..................................... - // gap // ..................................... - sub v17.4S, v23.4S, v17.4S // .*................................... 
- sub v10.4S, v28.4S, v6.4S // ....*................................ - sub v7.4S, v22.4S, v7.4S // .....*............................... - // gap // ..................................... - // gap // ..................................... - // gap // ..................................... - // gap // ..................................... - // gap // ..................................... - sub v14.4S, v24.4S, v14.4S // .............*....................... - sqrdmulh v24.4S, v16.4S, v0.S[1] // ...........*......................... - mul v6.4S, v16.4S, v0.S[0] // ............*........................ - // gap // ..................................... - // gap // ..................................... - // gap // ..................................... - // gap // ..................................... - // gap // ..................................... - mls v5.4S, v10.4S, v29.4S // ..............*...................... - sub v10.4S, v11.4S, v27.4S // ..*.................................. - mls v9.4S, v20.4S, v8.S[0] // .......*............................. - mls v19.4S, v17.4S, v29.4S // ........*............................ - // gap // ..................................... - // gap // ..................................... - // gap // ..................................... - // gap // ..................................... - mls v12.4S, v14.4S, v29.4S // ....................*................ - mls v13.4S, v7.4S, v29.4S // ..........*.......................... - cmge v7.4S, v18.4S, v30.4S // ...........................*......... - cmge v14.4S, v31.4S, v18.4S // ............................*........ - // gap // ..................................... - // gap // ..................................... - // gap // ..................................... - // gap // ..................................... - mls v15.4S, v10.4S, v29.4S // .........*........................... - mls v6.4S, v24.4S, v8.S[0] // ..................*.................. 
- // gap // ..................................... - // gap // ..................................... - // gap // ..................................... - // gap // ..................................... - // gap // ..................................... - // gap // ..................................... - str q5, [x0, #768] // ..........................*.......... - str q19, [x0, #640] // ................*.................... - sub v14.4S, v14.4S, v7.4S // .............................*....... - cmge v7.4S, v31.4S, v9.4S // ...................*................. - cmge v19.4S, v9.4S, v30.4S // .....................*............... - // gap // ..................................... - // gap // ..................................... - // gap // ..................................... - str q13, [x0, #256] // .................*................... - // gap // ..................................... - // gap // ..................................... - // gap // ..................................... - // gap // ..................................... - // gap // ..................................... - // gap // ..................................... - // gap // ..................................... - mls v18.4S, v14.4S, v29.4S // ................................*.... - str q15, [x0, #128] // ...............*..................... - sub v10.4S, v7.4S, v19.4S // .........................*........... - cmge v7.4S, v6.4S, v30.4S // ......................*.............. - cmge v14.4S, v31.4S, v6.4S // .......................*............. - // gap // ..................................... - // gap // ..................................... - // gap // ..................................... - str q12, [x0], #(16) // ........................*............ - // gap // ..................................... - // gap // ..................................... - // gap // ..................................... - // gap // ..................................... 
- // gap // ..................................... - // gap // ..................................... - // gap // ..................................... - mls v9.4S, v10.4S, v29.4S // ..............................*...... - sub v10.4S, v14.4S, v7.4S // ...............................*..... - // gap // ..................................... - // gap // ..................................... - // gap // ..................................... - // gap // ..................................... - // gap // ..................................... - // gap // ..................................... - str q18, [x0, #368] // ...................................*. - // gap // ..................................... - // gap // ..................................... - // gap // ..................................... - // gap // ..................................... - // gap // ..................................... - // gap // ..................................... - // gap // ..................................... - mls v6.4S, v10.4S, v29.4S // .................................*... - // gap // ..................................... - // gap // ..................................... - // gap // ..................................... - // gap // ..................................... - // gap // ..................................... - // gap // ..................................... - // gap // ..................................... - str q9, [x0, #880] // ..................................*.. - // gap // ..................................... - // gap // ..................................... - // gap // ..................................... - // gap // ..................................... - // gap // ..................................... - // gap // ..................................... - // gap // ..................................... - // gap // ..................................... - // gap // ..................................... - // gap // ..................................... 
- // gap // ..................................... - // gap // ..................................... - // gap // ..................................... - // gap // ..................................... - // gap // ..................................... - str q6, [x0, #496] // ....................................* - // gap // ..................................... - // gap // ..................................... - // gap // ..................................... - // gap // ..................................... - // gap // ..................................... - // gap // ..................................... - // gap // ..................................... - - // original source code - // sub v14.4S, v4.4S, v16.4S // *.................................... - // sub v4.4S, v23.4S, v17.4S // ...*................................. - // sub v11.4S, v11.4S, v27.4S // ..........*.......................... - // mls v18.4S, v10.4S, v8.S[0] // .*................................... - // sub v6.4S, v28.4S, v6.4S // ....*................................ - // sub v10.4S, v22.4S, v7.4S // .....*............................... - // cmge v7.4S, v12.4S, v30.4S // ..*.................................. - // mls v9.4S, v20.4S, v8.S[0] // ...........*......................... - // mls v19.4S, v4.4S, v29.4S // ............*........................ - // mls v15.4S, v11.4S, v29.4S // .................*................... - // mls v13.4S, v10.4S, v29.4S // ..............*...................... - // sqrdmulh v11.4S, v14.4S, v0.S[1] // .......*............................. - // mul v14.4S, v14.4S, v0.S[0] // ........*............................ - // sub v24.4S, v24.4S, v7.4S // ......*.............................. - // mls v5.4S, v6.4S, v29.4S // .........*........................... - // str q15, [x0, #128] // ..........................*.......... - // str q19, [x0, #640] // ....................*................ - // str q13, [x0, #256] // ........................*............ 
- // mls v14.4S, v11.4S, v8.S[0] // ..................*.................. - // cmge v21.4S, v31.4S, v9.4S // ......................*.............. - // mls v12.4S, v24.4S, v29.4S // .............*....................... - // cmge v6.4S, v9.4S, v30.4S // .......................*............. - // cmge v24.4S, v14.4S, v30.4S // ............................*........ - // cmge v7.4S, v31.4S, v14.4S // .............................*....... - // str q12, [x0], #(16) // ..............................*...... - // sub v27.4S, v21.4S, v6.4S // ...........................*......... - // str q5, [x0, #752] // ...................*................. - // cmge v10.4S, v18.4S, v30.4S // ...............*..................... - // cmge v11.4S, v31.4S, v18.4S // ................*.................... - // sub v11.4S, v11.4S, v10.4S // .....................*............... - // mls v9.4S, v27.4S, v29.4S // ...............................*..... - // sub v27.4S, v7.4S, v24.4S // ................................*.... - // mls v18.4S, v11.4S, v29.4S // .........................*........... - // mls v14.4S, v27.4S, v29.4S // ..................................*.. - // str q9, [x0, #880] // ...................................*. - // str q18, [x0, #368] // .................................*... - // str q14, [x0, #496] // ....................................* + // Instructions: 42 + // Expected cycles: 16 + // Expected IPC: 2.62 + // + // Wall time: 0.44s + // User time: 0.44s + // + // ----------- original position -----------> + // 0 25 + // |------------------------|---------------- + add v21.4S, v15.4S, v4.4S // ......*................................... + sqrdmulh v17.4S, v9.4S, v26.4S // ..*....................................... + mul v16.4S, v9.4S, v25.4S // ...*...................................... + sub v15.4S, v15.4S, v4.4S // .....*.................................... + // gap // .......................................... + // gap // .......................................... 
+ // gap // .......................................... + // gap // .......................................... + cmge v13.4S, v11.4S, v30.4S // ..........*............................... + cmge v7.4S, v31.4S, v11.4S // ...........*.............................. + cmge v5.4S, v31.4S, v24.4S // *......................................... + // gap // .......................................... + // gap // .......................................... + // gap // .......................................... + // gap // .......................................... + // gap // .......................................... + mul v4.4S, v21.4S, v25.4S // ...............*.......................... + sqrdmulh v9.4S, v21.4S, v26.4S // .................*........................ + sqrdmulh v21.4S, v15.4S, v0.S[1] // ..............*........................... + mul v15.4S, v15.4S, v0.S[0] // ............*............................. + // gap // .......................................... + // gap // .......................................... + // gap // .......................................... + // gap // .......................................... + mls v10.4S, v6.4S, v8.4S // ....*..................................... + mls v23.4S, v19.4S, v8.4S // .......*.................................. + mls v16.4S, v17.4S, v8.S[0] // .............*............................ + sub v7.4S, v7.4S, v13.4S // ............................*............. + // gap // .......................................... + // gap // .......................................... + // gap // .......................................... + // gap // .......................................... + sub v18.4S, v5.4S, v22.4S // .........*................................ + // gap // .......................................... + // gap // .......................................... + // gap // .......................................... + // gap // .......................................... 
+ // gap // .......................................... + // gap // .......................................... + // gap // .......................................... + mls v15.4S, v21.4S, v8.S[0] // .........................*................ + sub v21.4S, v28.4S, v20.4S // .*........................................ + mls v11.4S, v7.4S, v8.4S // ................................*......... + mls v4.4S, v9.4S, v8.S[0] // ......................*................... + // gap // .......................................... + // gap // .......................................... + // gap // .......................................... + // gap // .......................................... + mls v24.4S, v18.4S, v8.4S // ................*......................... + str q10, [x0, #512] // ....................*..................... + cmge v13.4S, v16.4S, v30.4S // .......................*.................. + cmge v10.4S, v31.4S, v16.4S // ........................*................. + // gap // .......................................... + // gap // .......................................... + // gap // .......................................... + // gap // .......................................... + mls v27.4S, v21.4S, v8.4S // ........*................................. + str q23, [x0], #(16) // ..................*....................... + // gap // .......................................... + // gap // .......................................... + // gap // .......................................... + // gap // .......................................... + // gap // .......................................... + // gap // .......................................... + str q11, [x0, #752] // ....................................*..... + cmge v9.4S, v4.4S, v30.4S // ..........................*............... + cmge v11.4S, v31.4S, v4.4S // ..............................*........... + cmge v7.4S, v15.4S, v30.4S // .............................*............ 
+ cmge v23.4S, v31.4S, v15.4S // ...............................*.......... + // gap // .......................................... + // gap // .......................................... + // gap // .......................................... + str q24, [x0, #112] // .....................*.................... + sub v24.4S, v10.4S, v13.4S // ...........................*.............. + // gap // .......................................... + // gap // .......................................... + // gap // .......................................... + // gap // .......................................... + // gap // .......................................... + // gap // .......................................... + str q27, [x0, #624] // ...................*...................... + sub v11.4S, v11.4S, v9.4S // ..................................*....... + sub v27.4S, v23.4S, v7.4S // ...................................*...... + // gap // .......................................... + // gap // .......................................... + // gap // .......................................... + // gap // .......................................... + // gap // .......................................... + mls v16.4S, v24.4S, v8.4S // .................................*........ + // gap // .......................................... + // gap // .......................................... + // gap // .......................................... + // gap // .......................................... + // gap // .......................................... + // gap // .......................................... + // gap // .......................................... + mls v15.4S, v27.4S, v8.4S // ......................................*... + mls v4.4S, v11.4S, v8.4S // .....................................*.... + // gap // .......................................... + // gap // .......................................... + // gap // .......................................... 
+ // gap // .......................................... + // gap // .......................................... + // gap // .......................................... + // gap // .......................................... + // gap // .......................................... + // gap // .......................................... + // gap // .......................................... + // gap // .......................................... + // gap // .......................................... + // gap // .......................................... + // gap // .......................................... + str q16, [x0, #240] // .......................................*.. + // gap // .......................................... + // gap // .......................................... + // gap // .......................................... + // gap // .......................................... + // gap // .......................................... + // gap // .......................................... + // gap // .......................................... + str q15, [x0, #880] // ........................................*. + str q4, [x0, #368] // .........................................* + // gap // .......................................... + // gap // .......................................... + // gap // .......................................... + // gap // .......................................... + // gap // .......................................... + // gap // .......................................... + + // ------------- new position --------------> + // 0 25 + // |------------------------|---------------- + // cmge v7.4S, v31.4S, v24.4S // ......*................................... + // sub v17.4S, v28.4S, v20.4S // .................*........................ + // sqrdmulh v13.4S, v9.4S, v26.4S // .*........................................ + // mul v9.4S, v9.4S, v25.4S // ..*....................................... 
+ // mls v10.4S, v6.4S, v8.4S // ...........*.............................. + // sub v28.4S, v15.4S, v4.4S // ...*...................................... + // add v6.4S, v15.4S, v4.4S // *......................................... + // mls v23.4S, v19.4S, v8.4S // ............*............................. + // mls v27.4S, v17.4S, v8.4S // ........................*................. + // sub v17.4S, v7.4S, v22.4S // ...............*.......................... + // cmge v4.4S, v11.4S, v30.4S // ....*..................................... + // cmge v14.4S, v31.4S, v11.4S // .....*.................................... + // mul v7.4S, v28.4S, v0.S[0] // ..........*............................... + // mls v9.4S, v13.4S, v8.S[0] // .............*............................ + // sqrdmulh v22.4S, v28.4S, v0.S[1] // .........*................................ + // mul v13.4S, v6.4S, v25.4S // .......*.................................. + // mls v24.4S, v17.4S, v8.4S // ....................*..................... + // sqrdmulh v16.4S, v6.4S, v26.4S // ........*................................. + // str q23, [x0], #(16) // .........................*................ + // str q27, [x0, #624] // .................................*........ + // str q10, [x0, #496] // .....................*.................... + // str q24, [x0, #112] // ...............................*.......... + // mls v13.4S, v16.4S, v8.S[0] // ...................*...................... + // cmge v29.4S, v9.4S, v30.4S // ......................*................... + // cmge v6.4S, v31.4S, v9.4S // .......................*.................. + // mls v7.4S, v22.4S, v8.S[0] // ................*......................... + // cmge v19.4S, v13.4S, v30.4S // ...........................*.............. + // sub v29.4S, v6.4S, v29.4S // ................................*......... + // sub v17.4S, v14.4S, v4.4S // ..............*........................... + // cmge v6.4S, v7.4S, v30.4S // .............................*............ 
+ // cmge v5.4S, v31.4S, v13.4S // ............................*............. + // cmge v14.4S, v31.4S, v7.4S // ..............................*........... + // mls v11.4S, v17.4S, v8.4S // ..................*....................... + // mls v9.4S, v29.4S, v8.4S // ....................................*..... + // sub v5.4S, v5.4S, v19.4S // ..................................*....... + // sub v19.4S, v14.4S, v6.4S // ...................................*...... + // str q11, [x0, #752] // ..........................*............... + // mls v13.4S, v5.4S, v8.4S // ......................................*... + // mls v7.4S, v19.4S, v8.4S // .....................................*.... + // str q9, [x0, #240] // .......................................*.. + // str q7, [x0, #880] // ........................................*. + // str q13, [x0, #368] // .........................................* pop_stack diff --git a/tests/ntt_dilithium/manual/intt_dilithium_123_45678_manual_ld4_opt_m1_icestorm.s b/tests/ntt_dilithium/manual/intt_dilithium_123_45678_manual_ld4_opt_m1_icestorm.s index 58ae551..fa7a305 100644 --- a/tests/ntt_dilithium/manual/intt_dilithium_123_45678_manual_ld4_opt_m1_icestorm.s +++ b/tests/ntt_dilithium/manual/intt_dilithium_123_45678_manual_ld4_opt_m1_icestorm.s @@ -67,7 +67,7 @@ xtmp1 .req x11 cmge \tmp1\().4s, \neg_modulus_half\().4s, \a\().4s cmge \tmp2\().4s, \a\().4s, \modulus_half\().4s sub \tmp2\().4s, \tmp1\().4s, \tmp2\().4s - vmls \a, \tmp2, modulus + vmls \a, \tmp2, consts .endm .macro gs_butterfly a, b, root, idx0, idx1 @@ -76,12 +76,6 @@ xtmp1 .req x11 mulmodq \b, tmp, \root, \idx0, \idx1 .endm -.macro mulmod_v dst, src, const, const_twisted - vmul \dst, \src, \const - vqrdmulh \src, \src, \const_twisted - vmls \dst, \src, modulus -.endm - .macro gs_butterfly_v a, b, root, root_twisted sub tmp.4s, \a\().4s, \b\().4s add \a\().4s, \a\().4s, \b\().4s @@ -193,7 +187,7 @@ xtmp1 .req x11 trn2 \data_out3\().4s, \data_in2\().4s, \data_in3\().4s .endm 
-.macro save_gprs // slothy:no-unfold +.macro save_gprs // @slothy:no-unfold sub sp, sp, #(16*6) stp x19, x20, [sp, #16*0] stp x19, x20, [sp, #16*0] @@ -204,7 +198,7 @@ xtmp1 .req x11 stp x29, x30, [sp, #16*5] .endm -.macro restore_gprs // slothy:no-unfold +.macro restore_gprs // @slothy:no-unfold ldp x19, x20, [sp, #16*0] ldp x21, x22, [sp, #16*1] ldp x23, x24, [sp, #16*2] @@ -214,7 +208,7 @@ xtmp1 .req x11 add sp, sp, #(16*6) .endm -.macro save_vregs // slothy:no-unfold +.macro save_vregs // @slothy:no-unfold sub sp, sp, #(16*4) stp d8, d9, [sp, #16*0] stp d10, d11, [sp, #16*1] @@ -222,7 +216,7 @@ xtmp1 .req x11 stp d14, d15, [sp, #16*3] .endm -.macro restore_vregs // slothy:no-unfold +.macro restore_vregs // @slothy:no-unfold ldp d8, d9, [sp, #16*0] ldp d10, d11, [sp, #16*1] ldp d12, d13, [sp, #16*2] @@ -233,19 +227,19 @@ xtmp1 .req x11 #define STACK_SIZE 16 #define STACK0 0 -.macro restore a, loc // slothy:no-unfold +.macro restore a, loc // @slothy:no-unfold ldr \a, [sp, #\loc\()] .endm -.macro save loc, a // slothy:no-unfold +.macro save loc, a // @slothy:no-unfold str \a, [sp, #\loc\()] .endm -.macro push_stack // slothy:no-unfold +.macro push_stack // @slothy:no-unfold save_gprs save_vregs sub sp, sp, #STACK_SIZE .endm -.macro pop_stack // slothy:no-unfold +.macro pop_stack // @slothy:no-unfold add sp, sp, #STACK_SIZE restore_vregs restore_gprs @@ -371,8 +365,6 @@ _intt_dilithium_123_45678_manual_ld4_opt_m1_icestorm: consts .req v8 qform_consts .req q8 - modulus .req v29 - ASM_LOAD(r_ptr0, roots_l345) ASM_LOAD(r_ptr1, roots_l67) @@ -395,974 +387,1006 @@ _intt_dilithium_123_45678_manual_ld4_opt_m1_icestorm: qform_root3_tw .req q7 .p2align 2 - ldr q28, [x1, #16] // ...*............................................................................................................................................................. 
- // gap // ................................................................................................................................................................. - // gap // ................................................................................................................................................................. - ldr q22, [x1, #0] // ..*.............................................................................................................................................................. - // gap // ................................................................................................................................................................. - // gap // ................................................................................................................................................................. - ldr q23, [x1, #32] // .*............................................................................................................................................................... - ldr q14, [x1, #48] // *................................................................................................................................................................ - // gap // ................................................................................................................................................................. - // gap // ................................................................................................................................................................. - ldr q27, [x2, #0] // .....................*........................................................................................................................................... - ldr q29, [x2, #16] // ...................*............................................................................................................................................. 
- ldr q16, [x5, #128] // ......*.......................................................................................................................................................... - // gap // ................................................................................................................................................................. - // gap // ................................................................................................................................................................. - ldr q11, [x5, #160] // ...........*..................................................................................................................................................... - ldr q2, [x5, #144] // ..............*.................................................................................................................................................. - ldr q30, [x2, #48] // ................*................................................................................................................................................ - trn2 v18.4S, v22.4S, v28.4S // .............*................................................................................................................................................... - trn1 v10.4S, v22.4S, v28.4S // ............*.................................................................................................................................................... - // gap // ................................................................................................................................................................. - trn1 v26.4S, v23.4S, v14.4S // ..........*...................................................................................................................................................... 
- ldr q13, [x2, #32] // ...............*................................................................................................................................................. - trn2 v12.4S, v23.4S, v14.4S // ........*........................................................................................................................................................ - // gap // ................................................................................................................................................................. - trn2 v20.4S, v27.4S, v29.4S // ...................................*............................................................................................................................. - trn1 v15.4S, v27.4S, v29.4S // ....................................*............................................................................................................................ - ldr q17, [x5, #32] // ...............................*................................................................................................................................. - trn2 v29.2D, v10.2D, v26.2D // ..................*.............................................................................................................................................. - ldr q7, [x5, #80] // .....*........................................................................................................................................................... - trn2 v21.2D, v18.2D, v12.2D // .................*............................................................................................................................................... - ldr q31, [x5, #64] // .............................*................................................................................................................................... 
- ldr q27, [x4, #16] // ........................................................*........................................................................................................ - ldr q5, [x5, #48] // .......................*......................................................................................................................................... - trn1 v14.2D, v18.2D, v12.2D // ....................*............................................................................................................................................ - trn1 v25.2D, v10.2D, v26.2D // ......................*.......................................................................................................................................... - trn2 v26.4S, v13.4S, v30.4S // ..............................*.................................................................................................................................. - sub v4.4S, v29.4S, v21.4S // ........................*........................................................................................................................................ - ldr q28, [x5, #112] // .......*......................................................................................................................................................... - ldr q12, [x5, #96] // ..........................*...................................................................................................................................... - ldr q6, [x5], #(12*16) // ........................................*........................................................................................................................ - // gap // ................................................................................................................................................................. 
- add v29.4S, v29.4S, v21.4S // .........................*....................................................................................................................................... - sub v23.4S, v25.4S, v14.4S // ...........................*..................................................................................................................................... - mul v9.4S, v4.4S, v31.4S // .......................................*......................................................................................................................... - ldr q3, [x4], #64 // .................................*............................................................................................................................... - // gap // ................................................................................................................................................................. - add v1.4S, v25.4S, v14.4S // ............................*.................................................................................................................................... - sqrdmulh v24.4S, v4.4S, v7.4S // ............................................*.................................................................................................................... - // gap // ................................................................................................................................................................. - // gap // ................................................................................................................................................................. - trn1 v13.4S, v13.4S, v30.4S // ................................*................................................................................................................................ 
- mul v10.4S, v23.4S, v17.4S // ..........................................*...................................................................................................................... - sqrdmulh v30.4S, v23.4S, v5.4S // .....................................*........................................................................................................................... - // gap // ................................................................................................................................................................. - // gap // ................................................................................................................................................................. - trn1 v23.2D, v15.2D, v13.2D // .........................................*....................................................................................................................... - ldr q31, [x5, #-16] // ....*............................................................................................................................................................ - ldr q22, [x5, #-176] // .........*....................................................................................................................................................... - trn1 v7.2D, v20.2D, v26.2D // ......................................*.......................................................................................................................... - mls v9.4S, v24.4S, v8.S[0] // .................................................*............................................................................................................... - // gap // ................................................................................................................................................................. 
- // gap // ................................................................................................................................................................. - trn2 v14.2D, v15.2D, v13.2D // .............................................*................................................................................................................... - mls v10.4S, v30.4S, v8.S[0] // ................................................*................................................................................................................ - trn2 v25.2D, v20.2D, v26.2D // ...........................................*..................................................................................................................... - // gap // ................................................................................................................................................................. - // gap // ................................................................................................................................................................. - sub v18.4S, v1.4S, v29.4S // ...............................................*................................................................................................................. - // gap // ................................................................................................................................................................. - // gap // ................................................................................................................................................................. - sub v30.4S, v23.4S, v7.4S // ..............................................*.................................................................................................................. 
- add v26.4S, v14.4S, v25.4S // ...................................................................*............................................................................................. - // gap // ................................................................................................................................................................. - // gap // ................................................................................................................................................................. - add v24.4S, v23.4S, v7.4S // .....................................................*........................................................................................................... - sub v21.4S, v10.4S, v9.4S // .........................................................*....................................................................................................... - mul v5.4S, v30.4S, v16.4S // ......................................................*.......................................................................................................... - // gap // ................................................................................................................................................................. - // gap // ................................................................................................................................................................. - sub v23.4S, v24.4S, v26.4S // .........................................................................*....................................................................................... - mul v17.4S, v18.4S, v6.4S // ...................................................*............................................................................................................. 
- // gap // ................................................................................................................................................................. - // gap // ................................................................................................................................................................. - mul v0.4S, v21.4S, v6.4S // ................................................................*................................................................................................ - sqrdmulh v20.4S, v21.4S, v22.4S // ...............................................................*................................................................................................. - // gap // ................................................................................................................................................................. - // gap // ................................................................................................................................................................. - // gap // ................................................................................................................................................................. - // gap // ................................................................................................................................................................. - add v21.4S, v10.4S, v9.4S // .............................................................*................................................................................................... - sqrdmulh v7.4S, v18.4S, v22.4S // ..........................................................*...................................................................................................... 
- sqrdmulh v4.4S, v23.4S, v28.4S // .............................................................................*................................................................................... - // gap // ................................................................................................................................................................. - // gap // ................................................................................................................................................................. - sqrdmulh v22.4S, v30.4S, v2.4S // ..................................................*.............................................................................................................. - mls v0.4S, v20.4S, v8.S[0] // ......................................................................*.......................................................................................... - // gap // ................................................................................................................................................................. - // gap // ................................................................................................................................................................. - sub v19.4S, v14.4S, v25.4S // ....................................................*............................................................................................................ - // gap // ................................................................................................................................................................. - // gap // ................................................................................................................................................................. 
- mls v17.4S, v7.4S, v8.S[0] // .....................................................................*........................................................................................... - add v2.4S, v1.4S, v29.4S // ..................................*.............................................................................................................................. - // gap // ................................................................................................................................................................. - // gap // ................................................................................................................................................................. - mul v29.4S, v19.4S, v11.4S // .......................................................*......................................................................................................... - sqrdmulh v16.4S, v19.4S, v31.4S // ...........................................................*..................................................................................................... - // gap // ................................................................................................................................................................. - // gap // ................................................................................................................................................................. - mls v5.4S, v22.4S, v8.S[0] // ............................................................*.................................................................................................... - add v31.4S, v24.4S, v26.4S // ..................................................................................*.............................................................................. 
- trn1 v24.4S, v17.4S, v0.4S // ...............................................................................*................................................................................. - // gap // ................................................................................................................................................................. - // gap // ................................................................................................................................................................. - trn1 v10.4S, v2.4S, v21.4S // ..................................................................*.............................................................................................. - // gap // ................................................................................................................................................................. - // gap // ................................................................................................................................................................. - trn2 v11.4S, v2.4S, v21.4S // .......................................................................*......................................................................................... - trn2 v14.4S, v17.4S, v0.4S // ..............................................................................*.................................................................................. - // gap // ................................................................................................................................................................. - ldr q9, [x4, #-32] // ....................................................................*............................................................................................ 
- trn2 v25.2D, v10.2D, v24.2D // ...................................................................................*............................................................................. - trn1 v30.2D, v10.2D, v24.2D // .........................................................................................*....................................................................... - // gap // ................................................................................................................................................................. - trn1 v6.2D, v11.2D, v14.2D // .....................................................................................*........................................................................... - // gap // ................................................................................................................................................................. - trn2 v7.2D, v11.2D, v14.2D // .................................................................................*............................................................................... - // gap // ................................................................................................................................................................. - mul v20.4S, v23.4S, v12.4S // ................................................................................*................................................................................ - // gap // ................................................................................................................................................................. - mls v29.4S, v16.4S, v8.S[0] // .................................................................*............................................................................................... 
- // gap // ................................................................................................................................................................. - // gap // ................................................................................................................................................................. - add v14.4S, v25.4S, v7.4S // ........................................................................................*........................................................................ - add v10.4S, v30.4S, v6.4S // ..............................................................................................*.................................................................. - // gap // ................................................................................................................................................................. - // gap // ................................................................................................................................................................. - sub v24.4S, v25.4S, v7.4S // .......................................................................................*......................................................................... - sub v26.4S, v30.4S, v6.4S // ................................................................................................*................................................................ - add v13.4S, v10.4S, v14.4S // ...................................................................................................*............................................................. - sub v21.4S, v10.4S, v14.4S // ........................................................................................................................*........................................ 
- // gap // ................................................................................................................................................................. - ldr q6, [x4, #-16] // ..............................................................*.................................................................................................. - // gap // ................................................................................................................................................................. - // gap // ................................................................................................................................................................. - sqrdmulh v17.4S, v26.4S, v27.S[3] // .........................................................................................................*....................................................... - sub v14.4S, v5.4S, v29.4S // ........................................................................*........................................................................................ - srshr v7.4S, v13.4S, #23 // .......................................................................................................................*......................................... - // gap // ................................................................................................................................................................. - // gap // ................................................................................................................................................................. - mul v18.4S, v26.4S, v27.S[2] // .......................................................................................................*......................................................... 
- // gap // ................................................................................................................................................................. - // gap // ................................................................................................................................................................. - mul v10.4S, v14.4S, v12.4S // ............................................................................*.................................................................................... - sqrdmulh v14.4S, v14.4S, v28.4S // ...........................................................................*..................................................................................... - sqrdmulh v1.4S, v24.4S, v9.S[1] // .................................................................................................*............................................................... - // gap // ................................................................................................................................................................. - // gap // ................................................................................................................................................................. - add v30.4S, v5.4S, v29.4S // ..........................................................................*...................................................................................... - mul v12.4S, v24.4S, v9.S[0] // ............................................................................................*.................................................................... - // gap // ................................................................................................................................................................. 
- // gap // ................................................................................................................................................................. - mls v18.4S, v17.4S, v8.S[0] // .................................................................................................................*............................................... - // gap // ................................................................................................................................................................. - // gap // ................................................................................................................................................................. - mls v10.4S, v14.4S, v8.S[0] // ....................................................................................*............................................................................ - mls v20.4S, v4.4S, v8.S[0] // ......................................................................................*.......................................................................... - mls v13.4S, v7.4S, v8.4S // .............................................................................................................................*................................... - // gap // ................................................................................................................................................................. - // gap // ................................................................................................................................................................. - trn1 v26.4S, v31.4S, v30.4S // ...........................................................................................*..................................................................... 
- mls v12.4S, v1.4S, v8.S[0] // ...................................................................................................................*............................................. - // gap // ................................................................................................................................................................. - // gap // ................................................................................................................................................................. - trn2 v30.4S, v31.4S, v30.4S // ..........................................................................................*...................................................................... - // gap // ................................................................................................................................................................. - // gap // ................................................................................................................................................................. - trn2 v14.4S, v20.4S, v10.4S // ...............................................................................................*................................................................. - trn1 v10.4S, v20.4S, v10.4S // .............................................................................................*................................................................... - // gap // ................................................................................................................................................................. - // gap // ................................................................................................................................................................. 
- mul v19.4S, v21.4S, v3.S[2] // ............................................................................................................................*.................................... - sqrdmulh v29.4S, v21.4S, v3.S[3] // ...................................................................................................................................*............................. - // gap // ................................................................................................................................................................. - // gap // ................................................................................................................................................................. - sub v11.4S, v18.4S, v12.4S // .........................................................................................................................*....................................... - trn2 v20.2D, v30.2D, v14.2D // ......................................................................................................*.......................................................... - // gap // ................................................................................................................................................................. - // gap // ................................................................................................................................................................. - trn1 v14.2D, v30.2D, v14.2D // ....................................................................................................*............................................................ - trn1 v2.2D, v26.2D, v10.2D // ..................................................................................................*.............................................................. 
- // gap // ................................................................................................................................................................. - // gap // ................................................................................................................................................................. - sqrdmulh v24.4S, v11.4S, v3.S[3] // ....................................................................................................................................*............................ - trn2 v15.2D, v26.2D, v10.2D // .....................................................................................................*........................................................... - add v23.4S, v2.4S, v14.4S // ..............................................................................................................*.................................................. - // gap // ................................................................................................................................................................. - // gap // ................................................................................................................................................................. - sub v26.4S, v2.4S, v14.4S // ........................................................................................................*........................................................ - add v30.4S, v15.4S, v20.4S // ..........................................................................................................*...................................................... - // gap // ................................................................................................................................................................. 
- // gap // ................................................................................................................................................................. - sub v7.4S, v15.4S, v20.4S // ...........................................................................................................*..................................................... - // gap // ................................................................................................................................................................. - // gap // ................................................................................................................................................................. - add v15.4S, v18.4S, v12.4S // ..................................................................................................................................*.............................. - mul v10.4S, v26.4S, v9.S[2] // .............................................................................................................*................................................... - add v5.4S, v23.4S, v30.4S // ..................................................................................................................*.............................................. - // gap // ................................................................................................................................................................. - // gap // ................................................................................................................................................................. - mul v25.4S, v7.4S, v6.S[0] // ................................................................................................................*................................................ 
- // gap // ................................................................................................................................................................. - // gap // ................................................................................................................................................................. - sqrdmulh v14.4S, v7.4S, v6.S[1] // ...............................................................................................................*................................................. - sqrdmulh v26.4S, v26.4S, v9.S[3] // ............................................................................................................*.................................................... - sub v18.4S, v23.4S, v30.4S // .....................................................................................................................*........................................... - // gap // ................................................................................................................................................................. - // gap // ................................................................................................................................................................. - srshr v1.4S, v5.4S, #23 // ..........................................................................................................................*...................................... - // gap // ................................................................................................................................................................. - // gap // ................................................................................................................................................................. 
- srshr v31.4S, v15.4S, #23 // .......................................................................................................................................*......................... - mls v19.4S, v29.4S, v8.S[0] // .................................................................................................................................................*............... - // gap // ................................................................................................................................................................. - // gap // ................................................................................................................................................................. - mls v25.4S, v14.4S, v8.S[0] // ......................................................................................................................*.......................................... - mls v10.4S, v26.4S, v8.S[0] // ....................................................................................................................*............................................ - sqrdmulh v7.4S, v18.4S, v27.S[1] // ........................................................................................................................................*........................ - // gap // ................................................................................................................................................................. - // gap // ................................................................................................................................................................. - mls v5.4S, v1.4S, v8.4S // ...............................................................................................................................*................................. 
- mul v30.4S, v18.4S, v27.S[0] // ...........................................................................................................................................*..................... - // gap // ................................................................................................................................................................. - mul v28.4S, v11.4S, v3.S[2] // ................................................................................................................................*................................ - // gap // ................................................................................................................................................................. - // gap // ................................................................................................................................................................. - mls v15.4S, v31.4S, v8.4S // .............................................................................................................................................*................... - // gap // ................................................................................................................................................................. - add v2.4S, v10.4S, v25.4S // ..............................................................................................................................*.................................. - // gap // ................................................................................................................................................................. - // gap // ................................................................................................................................................................. 
- sub v29.4S, v10.4S, v25.4S // ...........................................................................................................................*..................................... - sub v0.4S, v13.4S, v5.4S // ......................................................................................................................................*.......................... - // gap // ................................................................................................................................................................. - // gap // ................................................................................................................................................................. - mls v30.4S, v7.4S, v8.S[0] // ..................................................................................................................................................*.............. - srshr v10.4S, v2.4S, #23 // .....................................................................................................................................*........................... - // gap // ................................................................................................................................................................. - // gap // ................................................................................................................................................................. - mul v11.4S, v0.4S, v3.S[0] // .........................................................................................................................................*....................... - sqrdmulh v7.4S, v0.4S, v3.S[1] // ..........................................................................................................................................*...................... 
- sqrdmulh v14.4S, v29.4S, v27.S[1] // .................................................................................................................................*............................... - // gap // ................................................................................................................................................................. - mul v23.4S, v29.4S, v27.S[0] // ..............................................................................................................................................*.................. - // gap // ................................................................................................................................................................. - sub v26.4S, v19.4S, v30.4S // .........................................................................................................................................................*....... - // gap // ................................................................................................................................................................. - // gap // ................................................................................................................................................................. - mls v2.4S, v10.4S, v8.4S // ............................................................................................................................................*.................... - // gap // ................................................................................................................................................................. - // gap // ................................................................................................................................................................. 
- mls v11.4S, v7.4S, v8.S[0] // ...............................................................................................................................................*................. - add v10.4S, v19.4S, v30.4S // ..........................................................................................................................................................*...... - // gap // ................................................................................................................................................................. - mls v23.4S, v14.4S, v8.S[0] // ...................................................................................................................................................*............. - mls v28.4S, v24.4S, v8.S[0] // ................................................................................................................................................*................ - // gap // ................................................................................................................................................................. - str q10, [x1, #32] // ................................................................................................................................................................* - // gap // ................................................................................................................................................................. - add v29.4S, v15.4S, v2.4S // ....................................................................................................................................................*............ - // gap // ................................................................................................................................................................. 
- // gap // ................................................................................................................................................................. - str q11, [x2], #(16*4) // ......................................................................................................................................................*.......... - mul v9.4S, v26.4S, v3.S[0] // ..............................................................................................................................................................*.. - add v31.4S, v13.4S, v5.4S // .....................................................................................................................................................*........... - add v14.4S, v28.4S, v23.4S // ............................................................................................................................................................*.... - // gap // ................................................................................................................................................................. - sub v20.4S, v15.4S, v2.4S // .......................................................................................................................................................*......... - str q29, [x1, #16] // ........................................................................................................................................................*........ - str q31, [x1], #(16*4) // ...........................................................................................................................................................*..... - // gap // ................................................................................................................................................................. 
- sqrdmulh v30.4S, v26.4S, v3.S[1] // ...............................................................................................................................................................*. - sub v17.4S, v28.4S, v23.4S // .............................................................................................................................................................*... - - // original source code - // ldr q18, [x1, #48] // ...*............................................................................................................................................................. - // ldr q23, [x1, #32] // ..*.............................................................................................................................................................. - // ldr q27, [x1, #0] // .*............................................................................................................................................................... - // ldr q14, [x1, #16] // *................................................................................................................................................................ - // ldr q4, [x5, #176] // .........................................*....................................................................................................................... - // ldr q2, [x5, #80] // ...................*............................................................................................................................................. - // ldr q26, [x5, #128] // ......*.......................................................................................................................................................... - // ldr q11, [x5, #112] // ............................*.................................................................................................................................... 
- // trn2 v13.4S, v23.4S, v18.4S // ..............*.................................................................................................................................................. - // ldr q17, [x5, #16] // ..........................................*...................................................................................................................... - // trn1 v30.4S, v23.4S, v18.4S // ............*.................................................................................................................................................... - // ldr q24, [x5, #160] // .......*......................................................................................................................................................... - // trn1 v20.4S, v27.4S, v14.4S // ...........*..................................................................................................................................................... - // trn2 v14.4S, v27.4S, v14.4S // ..........*...................................................................................................................................................... - // ldr q6, [x5, #144] // ........*........................................................................................................................................................ - // ldr q31, [x2, #32] // .............*................................................................................................................................................... - // ldr q22, [x2, #48] // .........*....................................................................................................................................................... - // trn2 v5.2D, v14.2D, v13.2D // ....................*............................................................................................................................................ 
- // trn2 v3.2D, v20.2D, v30.2D // ..................*.............................................................................................................................................. - // ldr q29, [x2, #16] // .....*........................................................................................................................................................... - // trn1 v18.2D, v14.2D, v13.2D // ........................*........................................................................................................................................ - // ldr q25, [x2, #0] // ....*............................................................................................................................................................ - // trn1 v20.2D, v20.2D, v30.2D // .........................*....................................................................................................................................... - // ldr q16, [x5, #48] // .......................*......................................................................................................................................... - // sub v1.4S, v3.4S, v5.4S // ...........................*..................................................................................................................................... - // add v23.4S, v3.4S, v5.4S // ...............................*................................................................................................................................. - // ldr q27, [x5, #96] // .............................*................................................................................................................................... - // sub v19.4S, v20.4S, v18.4S // ................................*................................................................................................................................ 
- // add v21.4S, v20.4S, v18.4S // ...................................*............................................................................................................................. - // ldr q10, [x5, #64] // .....................*........................................................................................................................................... - // trn2 v15.4S, v31.4S, v22.4S // ..........................*...................................................................................................................................... - // ldr q18, [x5, #32] // .................*............................................................................................................................................... - // trn1 v20.4S, v31.4S, v22.4S // .....................................*........................................................................................................................... - // ldr q3, [x4], #64 // ..................................*.............................................................................................................................. - // add v30.4S, v21.4S, v23.4S // .................................................................*............................................................................................... - // trn2 v9.4S, v25.4S, v29.4S // ...............*................................................................................................................................................. - // trn1 v12.4S, v25.4S, v29.4S // ................*................................................................................................................................................ - // sqrdmulh v0.4S, v19.4S, v16.4S // .......................................*......................................................................................................................... 
- // trn1 v28.2D, v9.2D, v15.2D // ...........................................*..................................................................................................................... - // mul v29.4S, v1.4S, v10.4S // .................................*............................................................................................................................... - // ldr q22, [x5], #(12*16) // ..............................*.................................................................................................................................. - // trn1 v5.2D, v12.2D, v20.2D // ........................................*........................................................................................................................ - // mul v14.4S, v19.4S, v18.4S // ......................................*.......................................................................................................................... - // trn2 v7.2D, v9.2D, v15.2D // ...............................................*................................................................................................................. - // sqrdmulh v16.4S, v1.4S, v2.4S // ....................................*............................................................................................................................ - // trn2 v13.2D, v12.2D, v20.2D // .............................................*................................................................................................................... - // sub v1.4S, v5.4S, v28.4S // .................................................*............................................................................................................... - // sub v12.4S, v21.4S, v23.4S // ................................................*................................................................................................................ 
- // mls v14.4S, v0.4S, v8.S[0] // ..............................................*.................................................................................................................. - // mls v29.4S, v16.4S, v8.S[0] // ............................................*.................................................................................................................... - // sqrdmulh v20.4S, v1.4S, v6.4S // .............................................................*................................................................................................... - // mul v19.4S, v12.4S, v22.4S // .......................................................*......................................................................................................... - // sub v10.4S, v13.4S, v7.4S // ...............................................................*................................................................................................. - // add v18.4S, v5.4S, v28.4S // ...................................................*............................................................................................................. - // mul v5.4S, v1.4S, v26.4S // .....................................................*........................................................................................................... - // mul v9.4S, v10.4S, v24.4S // ..................................................................*.............................................................................................. - // ldr q6, [x4, #-48] // ......................*.......................................................................................................................................... - // sub v15.4S, v14.4S, v29.4S // ....................................................*............................................................................................................ 
- // sqrdmulh v31.4S, v12.4S, v17.4S // ...........................................................*..................................................................................................... - // sqrdmulh v4.4S, v10.4S, v4.4S // ...................................................................*............................................................................................. - // mls v5.4S, v20.4S, v8.S[0] // ....................................................................*............................................................................................ - // add v21.4S, v14.4S, v29.4S // ..........................................................*...................................................................................................... - // ldr q23, [x4, #-16] // .......................................................................................*......................................................................... - // sqrdmulh v24.4S, v15.4S, v17.4S // .........................................................*....................................................................................................... - // mul v29.4S, v15.4S, v22.4S // ........................................................*........................................................................................................ - // mls v9.4S, v4.4S, v8.S[0] // ................................................................................*................................................................................ - // trn1 v14.4S, v30.4S, v21.4S // .......................................................................*......................................................................................... - // add v12.4S, v13.4S, v7.4S // ..................................................*.............................................................................................................. 
- // ldr q25, [x4, #-32] // ..........................................................................*...................................................................................... - // mls v19.4S, v31.4S, v8.S[0] // ................................................................*................................................................................................ - // mls v29.4S, v24.4S, v8.S[0] // ..............................................................*.................................................................................................. - // trn2 v0.4S, v30.4S, v21.4S // ........................................................................*........................................................................................ - // sub v2.4S, v5.4S, v9.4S // .........................................................................................*....................................................................... - // sub v30.4S, v18.4S, v12.4S // ......................................................*.......................................................................................................... - // add v22.4S, v5.4S, v9.4S // ...............................................................................................*................................................................. - // sqrdmulh v13.4S, v2.4S, v11.4S // .............................................................................................*................................................................... - // mul v5.4S, v2.4S, v27.4S // ............................................................................................*.................................................................... - // sqrdmulh v26.4S, v30.4S, v11.4S // ............................................................*.................................................................................................... 
- // trn2 v16.4S, v19.4S, v29.4S // .........................................................................*....................................................................................... - // trn1 v20.4S, v19.4S, v29.4S // ......................................................................*.......................................................................................... - // mul v2.4S, v30.4S, v27.4S // ...............................................................................*................................................................................. - // trn2 v28.2D, v0.2D, v16.2D // ..............................................................................*.................................................................................. - // add v19.4S, v18.4S, v12.4S // .....................................................................*........................................................................................... - // trn2 v9.2D, v14.2D, v20.2D // ...........................................................................*..................................................................................... - // mls v5.4S, v13.4S, v8.S[0] // ..................................................................................................*.............................................................. - // trn1 v29.2D, v0.2D, v16.2D // .............................................................................*................................................................................... - // mls v2.4S, v26.4S, v8.S[0] // ...................................................................................................*............................................................. - // sub v27.4S, v9.4S, v28.4S // ...................................................................................*............................................................................. 
- // add v26.4S, v9.4S, v28.4S // .................................................................................*............................................................................... - // trn1 v21.2D, v14.2D, v20.2D // ............................................................................*.................................................................................... - // trn2 v10.4S, v19.4S, v22.4S // .......................................................................................................*......................................................... - // trn1 v14.4S, v19.4S, v22.4S // .....................................................................................................*........................................................... - // mul v9.4S, v27.4S, v25.S[0] // ................................................................................................*................................................................ - // trn1 v22.4S, v2.4S, v5.4S // .........................................................................................................*....................................................... - // add v16.4S, v21.4S, v29.4S // ..................................................................................*.............................................................................. - // trn2 v28.4S, v2.4S, v5.4S // ........................................................................................................*........................................................ - // sub v15.4S, v21.4S, v29.4S // ....................................................................................*............................................................................ - // sqrdmulh v21.4S, v27.4S, v25.S[1] // ..............................................................................................*.................................................................. 
- // trn1 v20.2D, v14.2D, v22.2D // ...............................................................................................................*................................................. - // add v17.4S, v16.4S, v26.4S // .....................................................................................*........................................................................... - // trn1 v2.2D, v10.2D, v28.2D // ..............................................................................................................*.................................................. - // trn2 v22.2D, v14.2D, v22.2D // .................................................................................................................*............................................... - // trn2 v14.2D, v10.2D, v28.2D // .............................................................................................................*................................................... - // mul v29.4S, v15.4S, v6.S[2] // ...........................................................................................*..................................................................... - // sub v13.4S, v20.4S, v2.4S // ...................................................................................................................*............................................. - // sqrdmulh v11.4S, v15.4S, v6.S[3] // ........................................................................................*........................................................................ - // add v4.4S, v22.4S, v14.4S // ....................................................................................................................*............................................ - // sub v5.4S, v22.4S, v14.4S // .....................................................................................................................*........................................... 
- // sqrdmulh v31.4S, v13.4S, v25.S[3] // ...........................................................................................................................*..................................... - // mul v0.4S, v13.4S, v25.S[2] // .......................................................................................................................*......................................... - // add v12.4S, v20.4S, v2.4S // ..................................................................................................................*.............................................. - // sqrdmulh v28.4S, v5.4S, v23.S[1] // ..........................................................................................................................*...................................... - // mul v15.4S, v5.4S, v23.S[0] // .........................................................................................................................*....................................... - // mls v29.4S, v11.4S, v8.S[0] // .................................................................................................*............................................................... - // add v13.4S, v12.4S, v4.4S // ........................................................................................................................*........................................ - // mls v9.4S, v21.4S, v8.S[0] // ......................................................................................................*.......................................................... - // mls v0.4S, v31.4S, v8.S[0] // .................................................................................................................................*............................... - // sub v21.4S, v12.4S, v4.4S // ............................................................................................................................*.................................... 
- // mls v15.4S, v28.4S, v8.S[0] // ................................................................................................................................*................................ - // srshr v30.4S, v17.4S, #23 // ..........................................................................................*...................................................................... - // sub v14.4S, v16.4S, v26.4S // ......................................................................................*.......................................................................... - // sub v10.4S, v29.4S, v9.4S // ............................................................................................................*.................................................... - // srshr v7.4S, v13.4S, #23 // .............................................................................................................................*................................... - // sub v16.4S, v0.4S, v15.4S // ........................................................................................................................................*........................ - // mul v31.4S, v14.4S, v3.S[2] // ..........................................................................................................*...................................................... - // mls v17.4S, v30.4S, v8.4S // ....................................................................................................*............................................................ - // add v5.4S, v0.4S, v15.4S // .......................................................................................................................................*......................... - // mls v13.4S, v7.4S, v8.4S // ...................................................................................................................................*............................. 
- // mul v19.4S, v10.4S, v3.S[2] // .....................................................................................................................................*........................... - // sqrdmulh v4.4S, v16.4S, v6.S[1] // ..............................................................................................................................................*.................. - // add v27.4S, v29.4S, v9.4S // ......................................................................................................................*.......................................... - // sqrdmulh v9.4S, v14.4S, v3.S[3] // ...........................................................................................................*..................................................... - // sqrdmulh v10.4S, v10.4S, v3.S[3] // ................................................................................................................*................................................ - // srshr v1.4S, v5.4S, #23 // ...........................................................................................................................................*..................... - // sub v12.4S, v17.4S, v13.4S // .........................................................................................................................................*....................... - // srshr v7.4S, v27.4S, #23 // ..............................................................................................................................*.................................. - // sqrdmulh v26.4S, v21.4S, v6.S[1] // ..................................................................................................................................*.............................. - // mul v18.4S, v12.4S, v3.S[0] // ............................................................................................................................................*.................... 
- // sqrdmulh v14.4S, v12.4S, v3.S[1] // .............................................................................................................................................*................... - // mul v0.4S, v21.4S, v6.S[0] // ....................................................................................................................................*............................ - // mls v5.4S, v1.4S, v8.4S // .................................................................................................................................................*............... - // mls v27.4S, v7.4S, v8.4S // ......................................................................................................................................*.......................... - // mul v1.4S, v16.4S, v6.S[0] // ...............................................................................................................................................*................. - // mls v18.4S, v14.4S, v8.S[0] // ..................................................................................................................................................*.............. - // mls v19.4S, v10.4S, v8.S[0] // .....................................................................................................................................................*........... - // mls v31.4S, v9.4S, v8.S[0] // ...............................................................................................................................*................................. - // mls v0.4S, v26.4S, v8.S[0] // ..........................................................................................................................................*...................... - // mls v1.4S, v4.4S, v8.S[0] // ....................................................................................................................................................*............ 
- // add v22.4S, v27.4S, v5.4S // .......................................................................................................................................................*......... - // add v28.4S, v17.4S, v13.4S // ..........................................................................................................................................................*...... - // str q18, [x2], #(16*4) // ........................................................................................................................................................*........ - // sub v20.4S, v27.4S, v5.4S // ............................................................................................................................................................*.... - // str q22, [x1, #16] // .............................................................................................................................................................*... - // sub v10.4S, v31.4S, v0.4S // ................................................................................................................................................*................ - // add v5.4S, v31.4S, v0.4S // ...................................................................................................................................................*............. - // str q28, [x1], #(16*4) // ..............................................................................................................................................................*.. - // add v14.4S, v19.4S, v1.4S // ...........................................................................................................................................................*..... 
- // sub v17.4S, v19.4S, v1.4S // ................................................................................................................................................................* - // mul v9.4S, v10.4S, v3.S[0] // .........................................................................................................................................................*....... - // sqrdmulh v30.4S, v10.4S, v3.S[1] // ...............................................................................................................................................................*. - // str q5, [x1, #-32] // ......................................................................................................................................................*.......... + // Instructions: 162 + // Expected cycles: 71 + // Expected IPC: 2.28 + // + // Wall time: 162.16s + // User time: 162.16s + // + // ----------------------------------------------------------------------- original position -----------------------------------------------------------------------> + // 0 25 50 75 100 125 150 + // |------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|----------- + ldr q25, [x2, #16] // ......................*........................................................................................................................................... + // gap // .................................................................................................................................................................. + // gap // .................................................................................................................................................................. 
+ ldr q22, [x2, #0] // ........................*......................................................................................................................................... + ldr q30, [x2, #32] // .........................*........................................................................................................................................ + ldr q9, [x2, #48] // ...............................*.................................................................................................................................. + // gap // .................................................................................................................................................................. + // gap // .................................................................................................................................................................. + ldr q23, [x1, #32] // ..*............................................................................................................................................................... + // gap // .................................................................................................................................................................. + ldr q10, [x1, #48] // ...*.............................................................................................................................................................. + // gap // .................................................................................................................................................................. + // gap // .................................................................................................................................................................. + ldr q24, [x1, #16] // .*................................................................................................................................................................ 
+ ldr q20, [x1, #0] // *................................................................................................................................................................. + // gap // .................................................................................................................................................................. + trn2 v3.4S, v22.4S, v25.4S // ......................................*........................................................................................................................... + ldr q0, [x4, #16] // ........................................................................*......................................................................................... + trn1 v31.4S, v22.4S, v25.4S // ....................................*............................................................................................................................. + ldr q1, [x5, #144] // ..................*............................................................................................................................................... + ldr q16, [x4], #64 // ............................................................................................*..................................................................... + trn2 v29.4S, v30.4S, v9.4S // ..........................................*....................................................................................................................... + // gap // .................................................................................................................................................................. + trn1 v28.4S, v30.4S, v9.4S // ............................................*..................................................................................................................... 
+ ldr q6, [x4, #-16] // .....................................................................*............................................................................................ + trn2 v25.4S, v23.4S, v10.4S // .......*.......................................................................................................................................................... + // gap // .................................................................................................................................................................. + trn1 v21.4S, v23.4S, v10.4S // ........*......................................................................................................................................................... + trn2 v10.2D, v3.2D, v29.2D // ..............................................*................................................................................................................... + ldr q11, [x5, #96] // .........................................*........................................................................................................................ + trn2 v14.4S, v20.4S, v24.4S // ....*............................................................................................................................................................. + ldr q5, [x5, #160] // ...........*...................................................................................................................................................... + trn1 v17.2D, v3.2D, v29.2D // ................................................*................................................................................................................. + ldr q7, [x5, #176] // ................................*................................................................................................................................. 
+ // gap // .................................................................................................................................................................. + trn2 v26.2D, v31.2D, v28.2D // .................................................*................................................................................................................ + trn1 v12.2D, v14.2D, v25.2D // ................*................................................................................................................................................. + trn1 v2.2D, v31.2D, v28.2D // ....................................................*............................................................................................................. + ldr q31, [x5, #128] // .........*........................................................................................................................................................ + // gap // .................................................................................................................................................................. + ldr q23, [x5, #112] // ........................................*......................................................................................................................... + // gap // .................................................................................................................................................................. + // gap // .................................................................................................................................................................. + sub v13.4S, v26.4S, v10.4S // .....................................................*............................................................................................................ 
+ sub v19.4S, v2.4S, v17.4S // .........................................................*........................................................................................................ + ldr q29, [x5], #(12*16) // ...................................*.............................................................................................................................. + trn1 v28.4S, v20.4S, v24.4S // .....*............................................................................................................................................................ + ldr q15, [x5, #-128] // ......*........................................................................................................................................................... + ldr q27, [x5, #-144] // ..........*....................................................................................................................................................... + // gap // .................................................................................................................................................................. + mul v9.4S, v13.4S, v5.4S // ................................................................*................................................................................................. + sqrdmulh v20.4S, v13.4S, v7.4S // ............................................................*..................................................................................................... + sqrdmulh v1.4S, v19.4S, v1.4S // .................................................................*................................................................................................ + mul v24.4S, v19.4S, v31.4S // .............................................................*.................................................................................................... 
+ // gap // .................................................................................................................................................................. + // gap // .................................................................................................................................................................. + add v13.4S, v2.4S, v17.4S // ..........................................................................*....................................................................................... + trn1 v19.2D, v28.2D, v21.2D // .................*................................................................................................................................................ + // gap // .................................................................................................................................................................. + ldr q2, [x5, #-160] // ..............*................................................................................................................................................... + // gap // .................................................................................................................................................................. + // gap // .................................................................................................................................................................. + add v22.4S, v26.4S, v10.4S // ......................................................................*........................................................................................... + mls v9.4S, v20.4S, v8.S[0] // ...........................................................................*...................................................................................... 
+ mls v24.4S, v1.4S, v8.S[0] // .........................................................................*........................................................................................ + sub v17.4S, v19.4S, v12.4S // .....................*............................................................................................................................................ + // gap // .................................................................................................................................................................. + // gap // .................................................................................................................................................................. + // gap // .................................................................................................................................................................. + // gap // .................................................................................................................................................................. + add v26.4S, v13.4S, v22.4S // ..............................................................................*................................................................................... + sub v10.4S, v13.4S, v22.4S // ...............................................................................*.................................................................................. + sqrdmulh v18.4S, v17.4S, v27.4S // ............................*..................................................................................................................................... + // gap // .................................................................................................................................................................. 
+ // gap // .................................................................................................................................................................. + trn2 v31.2D, v14.2D, v25.2D // ............*..................................................................................................................................................... + sub v4.4S, v24.4S, v9.4S // .................................................................................*................................................................................ + // gap // .................................................................................................................................................................. + // gap // .................................................................................................................................................................. + mul v27.4S, v17.4S, v2.4S // .............................*.................................................................................................................................... + // gap // .................................................................................................................................................................. + // gap // .................................................................................................................................................................. + sqrdmulh v30.4S, v10.4S, v23.4S // ..................................................................................*............................................................................... + mul v20.4S, v10.4S, v11.4S // ...................................................................................*.............................................................................. 
+ mul v10.4S, v4.4S, v11.4S // .....................................................................................*............................................................................ + sqrdmulh v14.4S, v4.4S, v23.4S // ....................................................................................*............................................................................. + // gap // .................................................................................................................................................................. + // gap // .................................................................................................................................................................. + mls v27.4S, v18.4S, v8.S[0] // .......................................*.......................................................................................................................... + // gap // .................................................................................................................................................................. + // gap // .................................................................................................................................................................. + trn2 v3.2D, v28.2D, v21.2D // .............*.................................................................................................................................................... + mls v20.4S, v30.4S, v8.S[0] // .........................................................................................*........................................................................ + // gap // .................................................................................................................................................................. 
+ // gap // .................................................................................................................................................................. + add v22.4S, v24.4S, v9.4S // ................................................................................*................................................................................. + // gap // .................................................................................................................................................................. + ldr q17, [x5, #-112] // ...............*.................................................................................................................................................. + sub v4.4S, v3.4S, v31.4S // ...................*.............................................................................................................................................. + mls v10.4S, v14.4S, v8.S[0] // ...........................................................................................*...................................................................... + trn1 v28.4S, v26.4S, v22.4S // ........................................................................................*......................................................................... + // gap // .................................................................................................................................................................. + // gap // .................................................................................................................................................................. + trn2 v26.4S, v26.4S, v22.4S // ......................................................................................*........................................................................... 
+ // gap // .................................................................................................................................................................. + // gap // .................................................................................................................................................................. + mul v25.4S, v4.4S, v15.4S // ..........................*....................................................................................................................................... + add v5.4S, v3.4S, v31.4S // ....................*............................................................................................................................................. + trn2 v1.4S, v20.4S, v10.4S // .................................................................................................*................................................................ + trn1 v18.4S, v20.4S, v10.4S // ..................................................................................................*............................................................... + // gap // .................................................................................................................................................................. + // gap // .................................................................................................................................................................. + // gap // .................................................................................................................................................................. + // gap // .................................................................................................................................................................. 
+ sqrdmulh v7.4S, v4.4S, v17.4S // ...........................*...................................................................................................................................... + add v17.4S, v19.4S, v12.4S // .......................*.......................................................................................................................................... + trn2 v19.2D, v28.2D, v18.2D // ......................................................................................................*........................................................... + trn2 v14.2D, v26.2D, v1.2D // .....................................................................................................*............................................................ + // gap // .................................................................................................................................................................. + ldr q31, [x5, #-176] // ..............................*................................................................................................................................... + trn1 v30.2D, v26.2D, v1.2D // .......................................................................................................*.......................................................... + // gap // .................................................................................................................................................................. + // gap // .................................................................................................................................................................. + sub v10.4S, v17.4S, v5.4S // .................................*................................................................................................................................ 
+ add v22.4S, v19.4S, v14.4S // ...............................................................................................................*.................................................. + // gap // .................................................................................................................................................................. + // gap // .................................................................................................................................................................. + mls v25.4S, v7.4S, v8.S[0] // .....................................*............................................................................................................................ + ldr q26, [x4, #-32] // ....................................................................*............................................................................................. + // gap // .................................................................................................................................................................. + trn1 v12.2D, v28.2D, v18.2D // ........................................................................................................*......................................................... + mul v11.4S, v10.4S, v29.4S // .............................................*.................................................................................................................... + // gap // .................................................................................................................................................................. + // gap // .................................................................................................................................................................. 
+ sqrdmulh v21.4S, v10.4S, v31.4S // ...........................................*...................................................................................................................... + add v20.4S, v17.4S, v5.4S // ..................................*............................................................................................................................... + // gap // .................................................................................................................................................................. + add v10.4S, v12.4S, v30.4S // .................................................................................................................*................................................ + // gap // .................................................................................................................................................................. + sub v4.4S, v27.4S, v25.4S // ...............................................*.................................................................................................................. + sub v2.4S, v12.4S, v30.4S // ...........................................................................................................*...................................................... + // gap // .................................................................................................................................................................. + add v17.4S, v27.4S, v25.4S // .......................................................*.......................................................................................................... + // gap // .................................................................................................................................................................. 
+ // gap // .................................................................................................................................................................. + // gap // .................................................................................................................................................................. + mul v1.4S, v4.4S, v29.4S // ..................................................*............................................................................................................... + sqrdmulh v28.4S, v4.4S, v31.4S // ...................................................*.............................................................................................................. + trn2 v15.4S, v20.4S, v17.4S // ...........................................................*...................................................................................................... + // gap // .................................................................................................................................................................. + sub v12.4S, v10.4S, v22.4S // .....................................................................................................................*............................................ + // gap // .................................................................................................................................................................. + // gap // .................................................................................................................................................................. + sqrdmulh v30.4S, v2.4S, v26.S[3] // ...................................................................................................................*.............................................. 
+ // gap // .................................................................................................................................................................. + add v7.4S, v10.4S, v22.4S // ........................................................................................................................*......................................... + // gap // .................................................................................................................................................................. + // gap // .................................................................................................................................................................. + mls v1.4S, v28.4S, v8.S[0] // ........................................................*......................................................................................................... + mls v11.4S, v21.4S, v8.S[0] // ......................................................*........................................................................................................... + // gap // .................................................................................................................................................................. + // gap // .................................................................................................................................................................. + mul v5.4S, v12.4S, v0.S[0] // ...........................................................................................................................*...................................... + srshr v31.4S, v7.4S, #23 // ............................................................................................................................*..................................... 
+ // gap // .................................................................................................................................................................. + // gap // .................................................................................................................................................................. + sqrdmulh v28.4S, v12.4S, v0.S[1] // ..............................................................................................................................*................................... + sub v19.4S, v19.4S, v14.4S // ..........................................................................................................*....................................................... + trn1 v27.4S, v11.4S, v1.4S // ..............................................................*................................................................................................... + // gap // .................................................................................................................................................................. + // gap // .................................................................................................................................................................. + trn1 v12.4S, v20.4S, v17.4S // ..........................................................*....................................................................................................... + mul v13.4S, v2.4S, v26.S[2] // ................................................................................................................*................................................. + trn2 v24.4S, v11.4S, v1.4S // ...............................................................*.................................................................................................. 
+ // gap // .................................................................................................................................................................. + // gap // .................................................................................................................................................................. + sqrdmulh v17.4S, v19.4S, v6.S[1] // ..............................................................................................................*................................................... + // gap // .................................................................................................................................................................. + // gap // .................................................................................................................................................................. + trn2 v4.2D, v12.2D, v27.2D // .......................................................................................*.......................................................................... + // gap // .................................................................................................................................................................. + // gap // .................................................................................................................................................................. + mul v18.4S, v19.4S, v6.S[0] // .............................................................................................................*.................................................... + trn2 v2.2D, v15.2D, v24.2D // ............................................................................*..................................................................................... 
+ mls v13.4S, v30.4S, v8.S[0] // .........................................................................................................................*........................................ + // gap // .................................................................................................................................................................. + // gap // .................................................................................................................................................................. + trn1 v14.2D, v15.2D, v24.2D // ...................................................................*.............................................................................................. + // gap // .................................................................................................................................................................. + // gap // .................................................................................................................................................................. + sub v10.4S, v4.4S, v2.4S // ..........................................................................................*....................................................................... + trn1 v25.2D, v12.2D, v27.2D // ..................................................................*............................................................................................... + mls v18.4S, v17.4S, v8.S[0] // .......................................................................................................................*.......................................... + // gap // .................................................................................................................................................................. 
+ // gap // .................................................................................................................................................................. + mls v5.4S, v28.4S, v8.S[0] // ..............................................................................................................................................*................... + // gap // .................................................................................................................................................................. + sub v20.4S, v25.4S, v14.4S // .......................................................................*.......................................................................................... + // gap // .................................................................................................................................................................. + mul v15.4S, v10.4S, v26.S[0] // ...................................................................................................*.............................................................. + // gap // .................................................................................................................................................................. + // gap // .................................................................................................................................................................. + add v30.4S, v25.4S, v14.4S // .............................................................................*.................................................................................... + sqrdmulh v12.4S, v10.4S, v26.S[1] // ................................................................................................*................................................................. 
+ sqrdmulh v23.4S, v20.4S, v0.S[3] // ..............................................................................................*................................................................... + // gap // .................................................................................................................................................................. + // gap // .................................................................................................................................................................. + mul v1.4S, v20.4S, v0.S[2] // .............................................................................................*.................................................................... + // gap // .................................................................................................................................................................. + add v17.4S, v4.4S, v2.4S // ...............................................................................................*.................................................................. + // gap // .................................................................................................................................................................. + add v4.4S, v13.4S, v18.4S // ...............................................................................................................................*.................................. + sub v6.4S, v13.4S, v18.4S // .................................................................................................................................*................................ + // gap // .................................................................................................................................................................. 
+ // gap // .................................................................................................................................................................. + mls v7.4S, v31.4S, v8.4S // .....................................................................................................................................*............................ + // gap // .................................................................................................................................................................. + // gap // .................................................................................................................................................................. + mls v15.4S, v12.4S, v8.S[0] // ............................................................................................................*..................................................... + mls v1.4S, v23.4S, v8.S[0] // ....................................................................................................*............................................................. + sqrdmulh v24.4S, v6.4S, v0.S[1] // ........................................................................................................................................*......................... + sub v23.4S, v30.4S, v17.4S // ......................................................................................................................*........................................... + // gap // .................................................................................................................................................................. + // gap // .................................................................................................................................................................. 
+ mul v10.4S, v6.4S, v0.S[0] // .......................................................................................................................................*.......................... + // gap // .................................................................................................................................................................. + // gap // .................................................................................................................................................................. + srshr v28.4S, v4.4S, #23 // ...................................................................................................................................*.............................. + sqrdmulh v18.4S, v23.4S, v16.S[3] // ............................................................................................................................................*..................... + add v2.4S, v1.4S, v15.4S // ....................................................................................................................*............................................. + // gap // .................................................................................................................................................................. + // gap // .................................................................................................................................................................. + mul v21.4S, v23.4S, v16.S[2] // ..........................................................................................................................................*....................... + // gap // .................................................................................................................................................................. 
+ // gap // .................................................................................................................................................................. + sub v12.4S, v1.4S, v15.4S // ................................................................................................................................*................................. + add v26.4S, v30.4S, v17.4S // .........................................................................................................*........................................................ + srshr v30.4S, v2.4S, #23 // ..........................................................................................................................*....................................... + // gap // .................................................................................................................................................................. + // gap // .................................................................................................................................................................. + // gap // .................................................................................................................................................................. + // gap // .................................................................................................................................................................. + mul v0.4S, v12.4S, v16.S[2] // ......................................................................................................................................*........................... + sqrdmulh v13.4S, v12.4S, v16.S[3] // ....................................................................................................................................*............................. 
+ mls v21.4S, v18.4S, v8.S[0] // .................................................................................................................................................*................ + // gap // .................................................................................................................................................................. + // gap // .................................................................................................................................................................. + srshr v12.4S, v26.4S, #23 // ..................................................................................................................*............................................... + mls v4.4S, v28.4S, v8.4S // .........................................................................................................................................*........................ + mls v2.4S, v30.4S, v8.4S // ..................................................................................................................................*............................... + // gap // .................................................................................................................................................................. + // gap // .................................................................................................................................................................. + // gap // .................................................................................................................................................................. + // gap // .................................................................................................................................................................. 
+ mls v10.4S, v24.4S, v8.S[0] // .............................................................................................................................................*.................... + mls v0.4S, v13.4S, v8.S[0] // ...........................................................................................................................................*...................... + add v18.4S, v21.4S, v5.4S // ........................................................................................................................................................*......... + // gap // .................................................................................................................................................................. + mls v26.4S, v12.4S, v8.4S // .............................................................................................................................*.................................... + // gap // .................................................................................................................................................................. + sub v31.4S, v2.4S, v4.4S // ...............................................................................................................................................*.................. + // gap // .................................................................................................................................................................. + add v30.4S, v2.4S, v4.4S // ..................................................................................................................................................*............... + // gap // .................................................................................................................................................................. 
+ // gap // .................................................................................................................................................................. + // gap // .................................................................................................................................................................. + add v3.4S, v0.4S, v10.4S // ...................................................................................................................................................*.............. + sub v12.4S, v0.4S, v10.4S // ....................................................................................................................................................*............. + sqrdmulh v13.4S, v31.4S, v16.S[1] // ......................................................................................................................................................*........... + str q30, [x1, #16] // .........................................................................................................................................................*........ + mul v25.4S, v31.4S, v16.S[0] // .......................................................................................................................................................*.......... + // gap // .................................................................................................................................................................. + add v10.4S, v26.4S, v7.4S // ................................................................................................................................................*................. + // gap // .................................................................................................................................................................. 
+ sub v1.4S, v26.4S, v7.4S // ............................................................................................................................................................*..... + str q18, [x1, #32] // ...............................................................................................................................................................*.. + // gap // .................................................................................................................................................................. + sqrdmulh v27.4S, v12.4S, v16.S[1] // ..........................................................................................................................................................*....... + mul v11.4S, v12.4S, v16.S[0] // .............................................................................................................................................................*.... + str q3, [x1, #48] // ...........................................................................................................................................................*...... + str q10, [x1], #(16*4) // .....................................................................................................................................................*............ + mls v25.4S, v13.4S, v8.S[0] // ..............................................................................................................................................................*... + sub v23.4S, v21.4S, v5.4S // .................................................................................................................................................................* + add x1, x1, #64 // ................................................................................................................................................................*. 
+ + // ------------------------------------------------------------------------- new position --------------------------------------------------------------------------> + // 0 25 50 75 100 125 150 + // |------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|----------- + // ldr q15, [x1, #0] // .......*.......................................................................................................................................................... + // ldr q20, [x1, #16] // ......*........................................................................................................................................................... + // ldr q19, [x1, #32] // ....*............................................................................................................................................................. + // ldr q13, [x1, #48] // .....*............................................................................................................................................................ + // trn2 v25.4S, v15.4S, v20.4S // ....................*............................................................................................................................................. + // trn1 v17.4S, v15.4S, v20.4S // ................................*................................................................................................................................. + // ldr q9, [x5, #64] // .................................*................................................................................................................................ + // trn2 v11.4S, v19.4S, v13.4S // ................*................................................................................................................................................. 
+ // trn1 v31.4S, v19.4S, v13.4S // .................*................................................................................................................................................ + // ldr q21, [x5, #128] // ...........................*...................................................................................................................................... + // ldr q3, [x5, #48] // ..................................*............................................................................................................................... + // ldr q13, [x5, #160] // .....................*............................................................................................................................................ + // trn2 v26.2D, v25.2D, v11.2D // .................................................*................................................................................................................ + // trn2 v20.2D, v17.2D, v31.2D // .........................................................*........................................................................................................ + // ldr q24, [x5, #32] // .........................................*........................................................................................................................ + // ldr q23, [x5, #80] // ............................................................*..................................................................................................... + // trn1 v27.2D, v25.2D, v11.2D // .........................*........................................................................................................................................ + // trn1 v11.2D, v17.2D, v31.2D // ........................................*......................................................................................................................... 
+ // ldr q16, [x5, #144] // ...........*...................................................................................................................................................... + // sub v18.4S, v20.4S, v26.4S // .............................................................*.................................................................................................... + // add v25.4S, v20.4S, v26.4S // ..................................................................*............................................................................................... + // sub v14.4S, v11.4S, v27.4S // .............................................*.................................................................................................................... + // ldr q26, [x2, #16] // *................................................................................................................................................................. + // add v20.4S, v11.4S, v27.4S // ......................................................................*........................................................................................... + // ldr q0, [x2, #0] // .*................................................................................................................................................................ + // ldr q17, [x2, #32] // ..*............................................................................................................................................................... + // mul v27.4S, v18.4S, v9.4S // .................................................................*................................................................................................ + // sqrdmulh v7.4S, v18.4S, v23.4S // .....................................................................*............................................................................................ 
+ // sqrdmulh v11.4S, v14.4S, v3.4S // ................................................*................................................................................................................. + // mul v10.4S, v14.4S, v24.4S // ...................................................*.............................................................................................................. + // ldr q1, [x5, #16] // .........................................................................*........................................................................................ + // ldr q24, [x2, #48] // ...*.............................................................................................................................................................. + // ldr q23, [x5, #176] // .......................*.......................................................................................................................................... + // sub v18.4S, v20.4S, v25.4S // ...........................................................................*...................................................................................... + // add v3.4S, v20.4S, v25.4S // ..................................................................................*............................................................................... + // ldr q20, [x5], #(12*16) // ...............................*.................................................................................................................................. + // trn1 v15.4S, v0.4S, v26.4S // ..........*....................................................................................................................................................... + // mls v27.4S, v7.4S, v8.S[0] // .............................................................................*.................................................................................... 
+ // trn2 v0.4S, v0.4S, v26.4S // ........*......................................................................................................................................................... + // mls v10.4S, v11.4S, v8.S[0] // ........................................................*......................................................................................................... + // ldr q14, [x5, #-80] // ............................*..................................................................................................................................... + // ldr q5, [x5, #-96] // ...................*.............................................................................................................................................. + // trn2 v7.4S, v17.4S, v24.4S // .............*.................................................................................................................................................... + // sqrdmulh v26.4S, v18.4S, v1.4S // .................................................................................*................................................................................ + // trn1 v25.4S, v17.4S, v24.4S // ..............*................................................................................................................................................... + // mul v11.4S, v18.4S, v20.4S // ................................................................................*................................................................................. + // trn2 v9.2D, v0.2D, v7.2D // ..................*............................................................................................................................................... + // sub v24.4S, v10.4S, v27.4S // ....................................................................................*............................................................................. 
+ // trn1 v31.2D, v0.2D, v7.2D // ......................*........................................................................................................................................... + // trn2 v18.2D, v15.2D, v25.2D // ........................*......................................................................................................................................... + // mul v7.4S, v24.4S, v20.4S // .......................................................................................*.......................................................................... + // sqrdmulh v24.4S, v24.4S, v1.4S // ........................................................................................*......................................................................... + // trn1 v1.2D, v15.2D, v25.2D // ..........................*....................................................................................................................................... + // sub v25.4S, v18.4S, v9.4S // .............................*.................................................................................................................................... + // mls v11.4S, v26.4S, v8.S[0] // ..............................................................................................*................................................................... + // add v20.4S, v10.4S, v27.4S // ......................................................................................*........................................................................... + // mls v7.4S, v24.4S, v8.S[0] // .............................................................................................*.................................................................... + // sub v0.4S, v1.4S, v31.4S // ..............................*................................................................................................................................... 
+ // trn1 v15.4S, v3.4S, v20.4S // ....................................................................................................*............................................................. + // trn2 v24.4S, v3.4S, v20.4S // .........................................................................................*........................................................................ + // sqrdmulh v3.4S, v25.4S, v23.4S // ....................................*............................................................................................................................. + // mul v23.4S, v0.4S, v21.4S // ......................................*........................................................................................................................... + // trn1 v20.4S, v11.4S, v7.4S // ...................................................................................................*.............................................................. + // trn2 v7.4S, v11.4S, v7.4S // ......................................................................................................*........................................................... + // mul v10.4S, v25.4S, v13.4S // ...................................*.............................................................................................................................. + // sqrdmulh v26.4S, v0.4S, v16.4S // .....................................*............................................................................................................................ + // trn1 v11.2D, v15.2D, v20.2D // ..............................................................................................................*................................................... + // trn1 v13.2D, v24.2D, v7.2D // ............................................................................................................*..................................................... 
+ // ldr q17, [x4, #32] // ..............................................................................*................................................................................... + // ldr q25, [x4, #48] // ...............*.................................................................................................................................................. + // add v9.4S, v18.4S, v9.4S // ..........................................*....................................................................................................................... + // sub v27.4S, v11.4S, v13.4S // .................................................................................................................*................................................ + // ldr q0, [x4, #16] // .........*........................................................................................................................................................ + // mls v23.4S, v26.4S, v8.S[0] // ............................................*..................................................................................................................... + // add v21.4S, v1.4S, v31.4S // .......................................*.......................................................................................................................... + // mls v10.4S, v3.4S, v8.S[0] // ...........................................*...................................................................................................................... + // trn2 v2.2D, v24.2D, v7.2D // ..........................................................................................................*....................................................... + // add v18.4S, v11.4S, v13.4S // ...................................................................................................................*.............................................. 
+ // add v1.4S, v21.4S, v9.4S // ..............................................*................................................................................................................... + // sub v7.4S, v21.4S, v9.4S // ...............................................*.................................................................................................................. + // add v11.4S, v23.4S, v10.4S // ...........................................................*...................................................................................................... + // sub v24.4S, v23.4S, v10.4S // ..................................................*............................................................................................................... + // sqrdmulh v13.4S, v7.4S, v14.4S // ....................................................*............................................................................................................. + // mul v23.4S, v7.4S, v5.4S // .....................................................*............................................................................................................ + // sqrdmulh v7.4S, v24.4S, v14.4S // .......................................................*.......................................................................................................... + // mul v24.4S, v24.4S, v5.4S // ......................................................*........................................................................................................... + // trn2 v3.4S, v1.4S, v11.4S // ................................................................*................................................................................................. + // trn2 v9.2D, v15.2D, v20.2D // ........................................................................................................*......................................................... 
+ // trn1 v20.4S, v1.4S, v11.4S // ...............................................................*.................................................................................................. + // mls v23.4S, v13.4S, v8.S[0] // ..........................................................*....................................................................................................... + // sub v11.4S, v9.4S, v2.4S // .............................................................................................................*.................................................... + // mls v24.4S, v7.4S, v8.S[0] // ..............................................................*................................................................................................... + // ldr q16, [x4], #64 // ............*..................................................................................................................................................... + // mul v10.4S, v27.4S, v0.S[2] // ......................................................................................................................*........................................... + // sqrdmulh v7.4S, v27.4S, v0.S[3] // .....................................................................................................................*............................................ + // add v9.4S, v9.4S, v2.4S // .......................................................................................................................*.......................................... + // sqrdmulh v27.4S, v11.4S, v17.S[1] // ....................................................................................................................*............................................. + // trn2 v1.4S, v23.4S, v24.4S // ...................................................................*.............................................................................................. 
+ // trn1 v21.4S, v23.4S, v24.4S // ....................................................................*............................................................................................. + // mul v24.4S, v11.4S, v17.S[0] // ..................................................................................................................*............................................... + // mls v10.4S, v7.4S, v8.S[0] // ............................................................................................................................*..................................... + // trn2 v7.2D, v3.2D, v1.2D // ........................................................................*......................................................................................... + // trn2 v13.2D, v20.2D, v21.2D // .......................................................................*.......................................................................................... + // trn1 v26.2D, v3.2D, v1.2D // ..........................................................................*....................................................................................... + // trn1 v19.2D, v20.2D, v21.2D // ...............................................................................*.................................................................................. + // add v3.4S, v18.4S, v9.4S // .....................................................................................................................................*............................ + // sub v23.4S, v13.4S, v7.4S // ..................................................................................................*............................................................... + // sub v21.4S, v19.4S, v26.4S // .....................................................................................*............................................................................ 
+ // mls v24.4S, v27.4S, v8.S[0] // ...........................................................................................................................*...................................... + // mul v11.4S, v23.4S, v25.S[0] // .........................................................................................................*........................................................ + // sqrdmulh v27.4S, v23.4S, v25.S[1] // .......................................................................................................*.......................................................... + // add v7.4S, v13.4S, v7.4S // ............................................................................*..................................................................................... + // mul v1.4S, v21.4S, v17.S[2] // .....................................................................................................*............................................................ + // add v20.4S, v19.4S, v26.4S // ...................................................................................*.............................................................................. + // srshr v13.4S, v3.4S, #23 // ..........................................................................................................................................*....................... + // sqrdmulh v21.4S, v21.4S, v17.S[3] // ...........................................................................................*...................................................................... + // add v15.4S, v10.4S, v24.4S // ..................................................................................................................................*............................... + // sub v25.4S, v20.4S, v7.4S // ..........................................................................................*....................................................................... 
+ // sub v23.4S, v18.4S, v9.4S // ..............................................................................................................................*................................... + // mls v11.4S, v27.4S, v8.S[0] // ...............................................................................................................*.................................................. + // add v20.4S, v20.4S, v7.4S // ............................................................................................*..................................................................... + // mls v1.4S, v21.4S, v8.S[0] // ...........................................................................................................*...................................................... + // srshr v27.4S, v15.4S, #23 // ......................................................................................................................................*........................... + // mul v26.4S, v25.4S, v0.S[0] // ...............................................................................................*.................................................................. + // srshr v7.4S, v20.4S, #23 // ................................................................................................*................................................................. + // mls v3.4S, v13.4S, v8.4S // ................................................................................................................................................*................. + // sqrdmulh v25.4S, v25.4S, v0.S[1] // .................................................................................................*................................................................ + // add v9.4S, v1.4S, v11.4S // ........................................................................................................................*......................................... 
+ // sub v24.4S, v10.4S, v24.4S // ....................................................................................................................................*............................. + // sub v1.4S, v1.4S, v11.4S // .........................................................................................................................*........................................ + // mls v15.4S, v27.4S, v8.4S // ............................................................................................................................................*..................... + // srshr v27.4S, v9.4S, #23 // ................................................................................................................................*................................. + // sqrdmulh v11.4S, v24.4S, v16.S[3] // ........................................................................................................................................*......................... + // mls v20.4S, v7.4S, v8.4S // ..........................................................................................................................*....................................... + // mul v13.4S, v24.4S, v16.S[2] // .......................................................................................................................................*.......................... + // mul v24.4S, v1.4S, v0.S[0] // ...............................................................................................................................*.................................. + // sqrdmulh v7.4S, v1.4S, v0.S[1] // .............................................................................................................................*.................................... + // mls v9.4S, v27.4S, v8.4S // ...........................................................................................................................................*...................... 
+ // mul v10.4S, v23.4S, v16.S[2] // ...................................................................................................................................*.............................. + // mls v13.4S, v11.4S, v8.S[0] // ..............................................................................................................................................*................... + // sqrdmulh v23.4S, v23.4S, v16.S[3] // .................................................................................................................................*................................ + // mls v24.4S, v7.4S, v8.S[0] // .............................................................................................................................................*.................... + // mls v26.4S, v25.4S, v8.S[0] // ................................................................................................................*................................................. + // sub v11.4S, v15.4S, v9.4S // .................................................................................................................................................*................ + // add v27.4S, v3.4S, v20.4S // ........................................................................................................................................................*......... + // mls v10.4S, v23.4S, v8.S[0] // .........................................................................................................................................*........................ + // add v9.4S, v15.4S, v9.4S // ..................................................................................................................................................*............... + // add v23.4S, v13.4S, v24.4S // ...................................................................................................................................................*.............. 
+ // sub v24.4S, v13.4S, v24.4S // ....................................................................................................................................................*............. + // str q27, [x1], #(16*4) // ..............................................................................................................................................................*... + // sqrdmulh v7.4S, v11.4S, v16.S[1] // .....................................................................................................................................................*............ + // mul v25.4S, v11.4S, v16.S[0] // .......................................................................................................................................................*.......... + // add v22.4S, v10.4S, v26.4S // ...............................................................................................................................................*.................. + // str q9, [x1, #-48] // ......................................................................................................................................................*........... + // sqrdmulh v27.4S, v24.4S, v16.S[1] // ...........................................................................................................................................................*...... + // str q23, [x1, #-16] // .............................................................................................................................................................*.... + // sub v1.4S, v3.4S, v20.4S // .........................................................................................................................................................*........ + // mul v11.4S, v24.4S, v16.S[0] // ............................................................................................................................................................*..... 
+ // mls v25.4S, v7.4S, v8.S[0] // ...............................................................................................................................................................*.. + // str q22, [x1, #-32] // ..........................................................................................................................................................*....... + // add x1, x1, #64 // .................................................................................................................................................................* + // sub v23.4S, v10.4S, v26.4S // ................................................................................................................................................................*. sub count, count, #1 layer45678_start: - str q14, [x1, #-16] // .......................................................................................................................................................................*...... - add x1, x1, #64 // ............................................................................................................................................................................*. - sqrdmulh v0.4S, v17.4S, v3.S[1] // ..................................................................................................................................................................*........... - mul v19.4S, v17.4S, v3.S[0] // .................................................................................................................................................................*............ - ldr q18, [x1, #48] // ...e.......................................................................................................................................................................... 
- ldr q23, [x1, #32] // ..e........................................................................................................................................................................... - mul v1.4S, v20.4S, v3.S[0] // .......................................................................................................................................................*...................... - sqrdmulh v25.4S, v20.4S, v3.S[1] // ........................................................................................................................................................*..................... - // gap // .............................................................................................................................................................................. - mls v9.4S, v30.4S, v8.S[0] // ..............................................................................................................................................................*............... - ldr q27, [x1, #0] // e............................................................................................................................................................................. - ldr q14, [x1, #16] // .e............................................................................................................................................................................ - ldr q4, [x5, #176] // .......................................................e...................................................................................................................... - ldr q2, [x5, #80] // .............................e................................................................................................................................................ 
- mls v19.4S, v0.4S, v8.S[0] // ...................................................................................................................................................................*.......... - // gap // .............................................................................................................................................................................. - ldr q26, [x5, #128] // ....................................................e......................................................................................................................... - ldr q11, [x5, #112] // ...................................................e.......................................................................................................................... - mls v1.4S, v25.4S, v8.S[0] // .........................................................................................................................................................*.................... - // gap // .............................................................................................................................................................................. - trn2 v13.4S, v23.4S, v18.4S // .......e...................................................................................................................................................................... - ldr q17, [x5, #16] // .........................e.................................................................................................................................................... - trn1 v30.4S, v23.4S, v18.4S // ......e....................................................................................................................................................................... 
- str q9, [x2, #-32] // ..........................................................................................................................................................................*... - str q19, [x2, #-16] // ...........................................................................................................................................................................*.. - ldr q24, [x5, #160] // ......................................................e....................................................................................................................... - trn1 v20.4S, v27.4S, v14.4S // ....e......................................................................................................................................................................... - trn2 v14.4S, v27.4S, v14.4S // .....e........................................................................................................................................................................ - str q1, [x2, #-48] // .........................................................................................................................................................................*.... + // Instructions: 174 + // Expected cycles: 72 + // Expected IPC: 2.42 + // + // Wall time: 2084.65s + // User time: 2084.65s + // + // ----------------------------------------------------------------------------- original position -----------------------------------------------------------------------------> + // 0 25 50 75 100 125 150 + // |------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|----------------------- + sqrdmulh v9.4S, v1.4S, v16.S[1] // ...................................................................................................................................................*.......................... 
+ ldr q15, [x1, #0] // e............................................................................................................................................................................. + mul v1.4S, v1.4S, v16.S[0] // ..................................................................................................................................................*........................... + ldr q20, [x1, #16] // .e............................................................................................................................................................................ + sqrdmulh v24.4S, v23.4S, v16.S[1] // .............................................................................................................................................................*................ + // gap // .............................................................................................................................................................................. + ldr q19, [x1, #32] // ..e........................................................................................................................................................................... + ldr q13, [x1, #48] // ...e.......................................................................................................................................................................... + // gap // .............................................................................................................................................................................. + str q25, [x2, #16] // .........................................................................................................................................................................*.... + mls v11.4S, v27.4S, v8.S[0] // ...................................................................................................................................................................*.......... 
+ mul v7.4S, v23.4S, v16.S[0] // ............................................................................................................................................................*................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + mls v1.4S, v9.4S, v8.S[0] // ....................................................................................................................................................*......................... + trn2 v25.4S, v15.4S, v20.4S // .....e........................................................................................................................................................................ + trn1 v17.4S, v15.4S, v20.4S // ....e......................................................................................................................................................................... + // gap // .............................................................................................................................................................................. + ldr q9, [x5, #64] // ............................e................................................................................................................................................. + str q11, [x2, #48] // ...........................................................................................................................................................................*.. 
+ trn2 v11.4S, v19.4S, v13.4S // .......e...................................................................................................................................................................... + trn1 v31.4S, v19.4S, v13.4S // ......e....................................................................................................................................................................... + ldr q21, [x5, #128] // ....................................................e......................................................................................................................... + mls v7.4S, v24.4S, v8.S[0] // ..............................................................................................................................................................*............... + // gap // .............................................................................................................................................................................. + ldr q3, [x5, #48] // ...........................e.................................................................................................................................................. + ldr q13, [x5, #160] // ......................................................e....................................................................................................................... + trn2 v26.2D, v25.2D, v11.2D // .........e.................................................................................................................................................................... + trn2 v20.2D, v17.2D, v31.2D // ........e..................................................................................................................................................................... 
+ ldr q24, [x5, #32] // ..........................e................................................................................................................................................... + ldr q23, [x5, #80] // .............................e................................................................................................................................................ + trn1 v27.2D, v25.2D, v11.2D // ...........e.................................................................................................................................................................. + trn1 v11.2D, v17.2D, v31.2D // ..........e................................................................................................................................................................... + str q1, [x2], #(16*4) // ........................................................................................................................................................................*..... + ldr q16, [x5, #144] // .....................................................e........................................................................................................................ + str q7, [x2, #-32] // ..........................................................................................................................................................................*... add x2, x2, #64 // .............................................................................................................................................................................* + sub v18.4S, v20.4S, v26.4S // ...................................e.......................................................................................................................................... 
+ add v25.4S, v20.4S, v26.4S // ....................................e......................................................................................................................................... + sub v14.4S, v11.4S, v27.4S // ..............................e............................................................................................................................................... + ldr q26, [x2, #16] // .............e................................................................................................................................................................ + add v20.4S, v11.4S, v27.4S // ...............................e.............................................................................................................................................. + ldr q0, [x2, #0] // ............e................................................................................................................................................................. + ldr q17, [x2, #32] // ..............e............................................................................................................................................................... + mul v27.4S, v18.4S, v9.4S // .....................................e........................................................................................................................................ + sqrdmulh v7.4S, v18.4S, v23.4S // ......................................e....................................................................................................................................... // gap // .............................................................................................................................................................................. 
- ldr q6, [x5, #144] // .....................................................e........................................................................................................................ - ldr q31, [x2, #32] // ..............e............................................................................................................................................................... - ldr q22, [x2, #48] // ...............e.............................................................................................................................................................. - trn2 v5.2D, v14.2D, v13.2D // .........e.................................................................................................................................................................... - trn2 v3.2D, v20.2D, v30.2D // ........e..................................................................................................................................................................... - ldr q29, [x2, #16] // .............e................................................................................................................................................................ - trn1 v18.2D, v14.2D, v13.2D // ...........e.................................................................................................................................................................. - ldr q25, [x2, #0] // ............e................................................................................................................................................................. - trn1 v20.2D, v20.2D, v30.2D // ..........e................................................................................................................................................................... 
- ldr q16, [x5, #48] // ...........................e.................................................................................................................................................. - sub v1.4S, v3.4S, v5.4S // ...................................e.......................................................................................................................................... - add v23.4S, v3.4S, v5.4S // ....................................e......................................................................................................................................... - ldr q27, [x5, #96] // ..................................................e........................................................................................................................... - sub v19.4S, v20.4S, v18.4S // ..............................e............................................................................................................................................... - add v21.4S, v20.4S, v18.4S // ...............................e.............................................................................................................................................. - ldr q10, [x5, #64] // ............................e................................................................................................................................................. - // gap // .............................................................................................................................................................................. - trn2 v15.4S, v31.4S, v22.4S // ...................e.......................................................................................................................................................... 
- ldr q18, [x5, #32] // ..........................e................................................................................................................................................... - trn1 v20.4S, v31.4S, v22.4S // ..................e........................................................................................................................................................... - ldr q3, [x4], #64 // ............................................................................................e................................................................................. - add v30.4S, v21.4S, v23.4S // .........................................e.................................................................................................................................... - trn2 v9.4S, v25.4S, v29.4S // .................e............................................................................................................................................................ - // gap // .............................................................................................................................................................................. - // gap // .............................................................................................................................................................................. - trn1 v12.4S, v25.4S, v29.4S // ................e............................................................................................................................................................. - sqrdmulh v0.4S, v19.4S, v16.4S // .................................e............................................................................................................................................ - // gap // .............................................................................................................................................................................. 
+ sqrdmulh v11.4S, v14.4S, v3.4S // .................................e............................................................................................................................................ + mul v10.4S, v14.4S, v24.4S // ................................e............................................................................................................................................. + ldr q1, [x5, #16] // .........................e.................................................................................................................................................... + ldr q24, [x2, #48] // ...............e.............................................................................................................................................................. + ldr q23, [x5, #176] // .......................................................e...................................................................................................................... + sub v18.4S, v20.4S, v25.4S // ........................................e..................................................................................................................................... + add v3.4S, v20.4S, v25.4S // .........................................e.................................................................................................................................... + ldr q20, [x5], #(12*16) // ........................e..................................................................................................................................................... + trn1 v15.4S, v0.4S, v26.4S // ................e............................................................................................................................................................. 
// gap // .............................................................................................................................................................................. - trn1 v28.2D, v9.2D, v15.2D // .......................e...................................................................................................................................................... - mul v29.4S, v1.4S, v10.4S // .....................................e........................................................................................................................................ // gap // .............................................................................................................................................................................. - ldr q22, [x5], #(12*16) // ........................e..................................................................................................................................................... - trn1 v5.2D, v12.2D, v20.2D // ......................e....................................................................................................................................................... + mls v27.4S, v7.4S, v8.S[0] // .......................................e...................................................................................................................................... + trn2 v0.4S, v0.4S, v26.4S // .................e............................................................................................................................................................ + mls v10.4S, v11.4S, v8.S[0] // ..................................e........................................................................................................................................... // gap // .............................................................................................................................................................................. 
- mul v14.4S, v19.4S, v18.4S // ................................e............................................................................................................................................. + ldr q14, [x5, #-80] // ...................................................e.......................................................................................................................... + ldr q5, [x5, #-96] // ..................................................e........................................................................................................................... + trn2 v7.4S, v17.4S, v24.4S // ...................e.......................................................................................................................................................... // gap // .............................................................................................................................................................................. + sqrdmulh v26.4S, v18.4S, v1.4S // ...........................................e.................................................................................................................................. + trn1 v25.4S, v17.4S, v24.4S // ..................e........................................................................................................................................................... // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. - trn2 v7.2D, v9.2D, v15.2D // .....................e........................................................................................................................................................ 
- sqrdmulh v16.4S, v1.4S, v2.4S // ......................................e....................................................................................................................................... - trn2 v13.2D, v12.2D, v20.2D // ....................e......................................................................................................................................................... + mul v11.4S, v18.4S, v20.4S // ..........................................e................................................................................................................................... + trn2 v9.2D, v0.2D, v7.2D // .....................e........................................................................................................................................................ // gap // .............................................................................................................................................................................. - sub v1.4S, v5.4S, v28.4S // ........................................................e..................................................................................................................... // gap // .............................................................................................................................................................................. - sub v12.4S, v21.4S, v23.4S // ........................................e..................................................................................................................................... + sub v24.4S, v10.4S, v27.4S // .............................................e................................................................................................................................ 
+ trn1 v31.2D, v0.2D, v7.2D // .......................e...................................................................................................................................................... + trn2 v18.2D, v15.2D, v25.2D // ....................e......................................................................................................................................................... // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. - mls v14.4S, v0.4S, v8.S[0] // ..................................e........................................................................................................................................... + mul v7.4S, v24.4S, v20.4S // ...............................................e.............................................................................................................................. + sqrdmulh v24.4S, v24.4S, v1.4S // ................................................e............................................................................................................................. // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. - mls v29.4S, v16.4S, v8.S[0] // .......................................e...................................................................................................................................... 
- sqrdmulh v20.4S, v1.4S, v6.4S // ...........................................................e.................................................................................................................. - mul v19.4S, v12.4S, v22.4S // ..........................................e................................................................................................................................... // gap // .............................................................................................................................................................................. + trn1 v1.2D, v15.2D, v25.2D // ......................e....................................................................................................................................................... // gap // .............................................................................................................................................................................. - sub v10.4S, v13.4S, v7.4S // .............................................................e................................................................................................................ - add v18.4S, v5.4S, v28.4S // .........................................................e.................................................................................................................... - mul v5.4S, v1.4S, v26.4S // ..........................................................e................................................................................................................... + sub v25.4S, v18.4S, v9.4S // .............................................................e................................................................................................................ 
// gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. - mul v9.4S, v10.4S, v24.4S // ...............................................................e.............................................................................................................. - ldr q6, [x4, #-48] // .............................................................................................e................................................................................ - sub v15.4S, v14.4S, v29.4S // .............................................e................................................................................................................................ + mls v11.4S, v26.4S, v8.S[0] // ............................................e................................................................................................................................. + add v20.4S, v10.4S, v27.4S // ..............................................e............................................................................................................................... + mls v7.4S, v24.4S, v8.S[0] // .................................................e............................................................................................................................ + sub v0.4S, v1.4S, v31.4S // ........................................................e..................................................................................................................... // gap // .............................................................................................................................................................................. 
- sqrdmulh v31.4S, v12.4S, v17.4S // ...........................................e.................................................................................................................................. // gap // .............................................................................................................................................................................. - sqrdmulh v4.4S, v10.4S, v4.4S // ................................................................e............................................................................................................. // gap // .............................................................................................................................................................................. + trn1 v15.4S, v3.4S, v20.4S // ............................................................................e................................................................................................. // gap // .............................................................................................................................................................................. + trn2 v24.4S, v3.4S, v20.4S // .............................................................................e................................................................................................ // gap // .............................................................................................................................................................................. - mls v5.4S, v20.4S, v8.S[0] // ............................................................e................................................................................................................. - add v21.4S, v14.4S, v29.4S // ..............................................e............................................................................................................................... 
- ldr q23, [x4, #-16] // ...............................................................................................e.............................................................................. - sqrdmulh v24.4S, v15.4S, v17.4S // ................................................e............................................................................................................................. - mul v29.4S, v15.4S, v22.4S // ...............................................e.............................................................................................................................. // gap // .............................................................................................................................................................................. + sqrdmulh v3.4S, v25.4S, v23.4S // ................................................................e............................................................................................................. + mul v23.4S, v0.4S, v21.4S // ..........................................................e................................................................................................................... + trn1 v20.4S, v11.4S, v7.4S // ..............................................................................e............................................................................................... + trn2 v7.4S, v11.4S, v7.4S // ...............................................................................e.............................................................................................. // gap // .............................................................................................................................................................................. 
// gap // .............................................................................................................................................................................. - mls v9.4S, v4.4S, v8.S[0] // .................................................................e............................................................................................................ - trn1 v14.4S, v30.4S, v21.4S // ............................................................................e................................................................................................. - add v12.4S, v13.4S, v7.4S // ..............................................................e............................................................................................................... - ldr q25, [x4, #-32] // ..............................................................................................e............................................................................... + mul v10.4S, v25.4S, v13.4S // ...............................................................e.............................................................................................................. + sqrdmulh v26.4S, v0.4S, v16.4S // ...........................................................e.................................................................................................................. // gap // .............................................................................................................................................................................. - mls v19.4S, v31.4S, v8.S[0] // ............................................e................................................................................................................................. 
- mls v29.4S, v24.4S, v8.S[0] // .................................................e............................................................................................................................ // gap // .............................................................................................................................................................................. + trn1 v11.2D, v15.2D, v20.2D // ..................................................................................e........................................................................................... + trn1 v13.2D, v24.2D, v7.2D // ...................................................................................e.......................................................................................... // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. - trn2 v0.4S, v30.4S, v21.4S // .............................................................................e................................................................................................ + ldr q17, [x4, #32] // ..............................................................................................e............................................................................... + ldr q25, [x4, #48] // ...............................................................................................e.............................................................................. 
+ add v9.4S, v18.4S, v9.4S // ..............................................................e............................................................................................................... + sub v27.4S, v11.4S, v13.4S // ................................................................................................e............................................................................. // gap // .............................................................................................................................................................................. - sub v2.4S, v5.4S, v9.4S // .......................................................................e...................................................................................................... - sub v30.4S, v18.4S, v12.4S // ..................................................................e........................................................................................................... - add v22.4S, v5.4S, v9.4S // ........................................................................e..................................................................................................... + ldr q0, [x4, #16] // .............................................................................................e................................................................................ + mls v23.4S, v26.4S, v8.S[0] // ............................................................e................................................................................................................. + add v21.4S, v1.4S, v31.4S // .........................................................e.................................................................................................................... 
// gap // .............................................................................................................................................................................. + mls v10.4S, v3.4S, v8.S[0] // .................................................................e............................................................................................................ // gap // .............................................................................................................................................................................. + trn2 v2.2D, v24.2D, v7.2D // .................................................................................e............................................................................................ // gap // .............................................................................................................................................................................. - sqrdmulh v13.4S, v2.4S, v11.4S // ..........................................................................e................................................................................................... // gap // .............................................................................................................................................................................. - mul v5.4S, v2.4S, v27.4S // .........................................................................e.................................................................................................... + add v18.4S, v11.4S, v13.4S // .................................................................................................e............................................................................ + add v1.4S, v21.4S, v9.4S // ...................................................................e.......................................................................................................... 
+ sub v7.4S, v21.4S, v9.4S // ..................................................................e........................................................................................................... // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. - sqrdmulh v26.4S, v30.4S, v11.4S // .....................................................................e........................................................................................................ - trn2 v16.4S, v19.4S, v29.4S // ...............................................................................e.............................................................................................. - trn1 v20.4S, v19.4S, v29.4S // ..............................................................................e............................................................................................... + add v11.4S, v23.4S, v10.4S // ........................................................................e..................................................................................................... + sub v24.4S, v23.4S, v10.4S // .......................................................................e...................................................................................................... // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. 
- mul v2.4S, v30.4S, v27.4S // ....................................................................e......................................................................................................... // gap // .............................................................................................................................................................................. + sqrdmulh v13.4S, v7.4S, v14.4S // .....................................................................e........................................................................................................ + mul v23.4S, v7.4S, v5.4S // ....................................................................e......................................................................................................... // gap // .............................................................................................................................................................................. - trn2 v28.2D, v0.2D, v16.2D // .................................................................................e............................................................................................ - add v19.4S, v18.4S, v12.4S // ...................................................................e.......................................................................................................... - trn2 v9.2D, v14.2D, v20.2D // ................................................................................e............................................................................................. + sqrdmulh v7.4S, v24.4S, v14.4S // ..........................................................................e................................................................................................... 
+ mul v24.4S, v24.4S, v5.4S // .........................................................................e.................................................................................................... // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. - mls v5.4S, v13.4S, v8.S[0] // ...........................................................................e.................................................................................................. - trn1 v29.2D, v0.2D, v16.2D // ...................................................................................e.......................................................................................... + trn2 v3.4S, v1.4S, v11.4S // .....................................................................................e........................................................................................ + trn2 v9.2D, v15.2D, v20.2D // ................................................................................e............................................................................................. // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. - mls v2.4S, v26.4S, v8.S[0] // ......................................................................e....................................................................................................... 
+ trn1 v20.4S, v1.4S, v11.4S // ....................................................................................e......................................................................................... // gap // .............................................................................................................................................................................. - sub v27.4S, v9.4S, v28.4S // .....................................................................................................e........................................................................ // gap // .............................................................................................................................................................................. - add v26.4S, v9.4S, v28.4S // ......................................................................................................e....................................................................... - trn1 v21.2D, v14.2D, v20.2D // ..................................................................................e........................................................................................... + mls v23.4S, v13.4S, v8.S[0] // ......................................................................e....................................................................................................... + sub v11.4S, v9.4S, v2.4S // .....................................................................................................e........................................................................ + mls v24.4S, v7.4S, v8.S[0] // ...........................................................................e.................................................................................................. 
// gap // .............................................................................................................................................................................. - trn2 v10.4S, v19.4S, v22.4S // .....................................................................................e........................................................................................ + ldr q16, [x4], #64 // ............................................................................................e................................................................................. + mul v10.4S, v27.4S, v0.S[2] // ..................................................................................................e........................................................................... // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. - trn1 v14.4S, v19.4S, v22.4S // ....................................................................................e......................................................................................... + sqrdmulh v7.4S, v27.4S, v0.S[3] // ...................................................................................................e.......................................................................... + add v9.4S, v9.4S, v2.4S // ......................................................................................................e....................................................................... // gap // .............................................................................................................................................................................. 
- mul v9.4S, v27.4S, v25.S[0] // .......................................................................................................e...................................................................... - trn1 v22.4S, v2.4S, v5.4S // ......................................................................................e....................................................................................... - add v16.4S, v21.4S, v29.4S // .................................................................................................e............................................................................ + sqrdmulh v27.4S, v11.4S, v17.S[1] // ........................................................................................................e..................................................................... // gap // .............................................................................................................................................................................. + trn2 v1.4S, v23.4S, v24.4S // .......................................................................................e...................................................................................... + trn1 v21.4S, v23.4S, v24.4S // ......................................................................................e....................................................................................... // gap // .............................................................................................................................................................................. - trn2 v28.4S, v2.4S, v5.4S // .......................................................................................e...................................................................................... 
// gap // .............................................................................................................................................................................. + mul v24.4S, v11.4S, v17.S[0] // .......................................................................................................e...................................................................... // gap // .............................................................................................................................................................................. - sub v15.4S, v21.4S, v29.4S // ................................................................................................e............................................................................. - sqrdmulh v21.4S, v27.4S, v25.S[1] // ........................................................................................................e..................................................................... // gap // .............................................................................................................................................................................. + mls v10.4S, v7.4S, v8.S[0] // ....................................................................................................e......................................................................... + trn2 v7.2D, v3.2D, v1.2D // .........................................................................................e.................................................................................... + trn2 v13.2D, v20.2D, v21.2D // ........................................................................................e..................................................................................... // gap // .............................................................................................................................................................................. 
- trn1 v20.2D, v14.2D, v22.2D // ..........................................................................................e................................................................................... - add v17.4S, v16.4S, v26.4S // .....................................................................................................................e........................................................ // gap // .............................................................................................................................................................................. + trn1 v26.2D, v3.2D, v1.2D // ...........................................................................................e.................................................................................. + trn1 v19.2D, v20.2D, v21.2D // ..........................................................................................e................................................................................... // gap // .............................................................................................................................................................................. - trn1 v2.2D, v10.2D, v28.2D // ...........................................................................................e.................................................................................. - trn2 v22.2D, v14.2D, v22.2D // ........................................................................................e..................................................................................... // gap // .............................................................................................................................................................................. 
+ add v3.4S, v18.4S, v9.4S // .....................................................................................................................e........................................................ + sub v23.4S, v13.4S, v7.4S // ...............................................................................................................e.............................................................. // gap // .............................................................................................................................................................................. - trn2 v14.2D, v10.2D, v28.2D // .........................................................................................e.................................................................................... - mul v29.4S, v15.4S, v6.S[2] // ..................................................................................................e........................................................................... - sub v13.4S, v20.4S, v2.4S // ..........................................................................................................e................................................................... // gap // .............................................................................................................................................................................. + sub v21.4S, v19.4S, v26.4S // ..........................................................................................................e................................................................... // gap // .............................................................................................................................................................................. 
- sqrdmulh v11.4S, v15.4S, v6.S[3] // ...................................................................................................e.......................................................................... // gap // .............................................................................................................................................................................. + mls v24.4S, v27.4S, v8.S[0] // .........................................................................................................e.................................................................... + mul v11.4S, v23.4S, v25.S[0] // .................................................................................................................e............................................................ // gap // .............................................................................................................................................................................. - add v4.4S, v22.4S, v14.4S // ................................................................................................................e............................................................. // gap // .............................................................................................................................................................................. - sub v5.4S, v22.4S, v14.4S // ...............................................................................................................e.............................................................. - sqrdmulh v31.4S, v13.4S, v25.S[3] // .............................................................................................................e................................................................ 
+ sqrdmulh v27.4S, v23.4S, v25.S[1] // ..................................................................................................................e........................................................... + add v7.4S, v13.4S, v7.4S // ................................................................................................................e............................................................. + mul v1.4S, v21.4S, v17.S[2] // ............................................................................................................e................................................................. // gap // .............................................................................................................................................................................. - mul v0.4S, v13.4S, v25.S[2] // ............................................................................................................e................................................................. - add v12.4S, v20.4S, v2.4S // ...........................................................................................................e.................................................................. // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. - sqrdmulh v28.4S, v5.4S, v23.S[1] // ..................................................................................................................e........................................................... 
- mul v15.4S, v5.4S, v23.S[0] // .................................................................................................................e............................................................ // gap // .............................................................................................................................................................................. + add v20.4S, v19.4S, v26.4S // ...........................................................................................................e.................................................................. + srshr v13.4S, v3.4S, #23 // ........................................................................................................................................e..................................... + sqrdmulh v21.4S, v21.4S, v17.S[3] // .............................................................................................................e................................................................ // gap // .............................................................................................................................................................................. - mls v29.4S, v11.4S, v8.S[0] // ....................................................................................................e......................................................................... // gap // .............................................................................................................................................................................. - add v13.4S, v12.4S, v4.4S // ...............................................................................................................................e.............................................. 
+ add v15.4S, v10.4S, v24.4S // ..........................................................................................................................e................................................... + sub v25.4S, v20.4S, v7.4S // ..............................................................................................................................e............................................... + sub v23.4S, v18.4S, v9.4S // ....................................................................................................................e......................................................... // gap // .............................................................................................................................................................................. - mls v9.4S, v21.4S, v8.S[0] // .........................................................................................................e.................................................................... - mls v0.4S, v31.4S, v8.S[0] // ..............................................................................................................e............................................................... // gap // .............................................................................................................................................................................. + mls v11.4S, v27.4S, v8.S[0] // ...................................................................................................................e.......................................................... // gap // .............................................................................................................................................................................. 
- sub v21.4S, v12.4S, v4.4S // ..............................................................................................................................e............................................... // gap // .............................................................................................................................................................................. + add v20.4S, v20.4S, v7.4S // ...............................................................................................................................e.............................................. + mls v1.4S, v21.4S, v8.S[0] // ..............................................................................................................e............................................................... // gap // .............................................................................................................................................................................. - mls v15.4S, v28.4S, v8.S[0] // ...................................................................................................................e.......................................................... - srshr v30.4S, v17.4S, #23 // ........................................................................................................................................e..................................... // gap // .............................................................................................................................................................................. - sub v14.4S, v16.4S, v26.4S // ....................................................................................................................e......................................................... 
+ srshr v27.4S, v15.4S, #23 // ..........................................................................................................................................e................................... // gap // .............................................................................................................................................................................. + mul v26.4S, v25.4S, v0.S[0] // ................................................................................................................................e............................................. + srshr v7.4S, v20.4S, #23 // ............................................................................................................................................e................................. // gap // .............................................................................................................................................................................. - sub v10.4S, v29.4S, v9.4S // .........................................................................................................................e.................................................... + mls v3.4S, v13.4S, v8.4S // .........................................................................................................................................e.................................... + sqrdmulh v25.4S, v25.4S, v0.S[1] // .................................................................................................................................e............................................ // gap // .............................................................................................................................................................................. 
- srshr v7.4S, v13.4S, #23 // ............................................................................................................................................e................................. - sub v16.4S, v0.4S, v15.4S // ...................................................................................................................................e.......................................... // gap // .............................................................................................................................................................................. - mul v31.4S, v14.4S, v3.S[2] // ......................................................................................................................e....................................................... + add v9.4S, v1.4S, v11.4S // ....................................................................................................................................e......................................... + sub v24.4S, v10.4S, v24.4S // .........................................................................................................................e.................................................... // gap // .............................................................................................................................................................................. - mls v17.4S, v30.4S, v8.4S // .........................................................................................................................................e.................................... - add v5.4S, v0.4S, v15.4S // ....................................................................................................................................e......................................... // gap // .............................................................................................................................................................................. 
+ sub v1.4S, v1.4S, v11.4S // ...................................................................................................................................e.......................................... // gap // .............................................................................................................................................................................. - mls v13.4S, v7.4S, v8.4S // .............................................................................................................................................e................................ // gap // .............................................................................................................................................................................. - mul v19.4S, v10.4S, v3.S[2] // ...........................................................................................................................e.................................................. + mls v15.4S, v27.4S, v8.4S // ...........................................................................................................................................e.................................. // gap // .............................................................................................................................................................................. - sqrdmulh v4.4S, v16.4S, v6.S[1] // ......................................................................................................................................e....................................... - add v27.4S, v29.4S, v9.4S // ..........................................................................................................................e................................................... 
+ srshr v27.4S, v9.4S, #23 // ..............................................................................................................................................e............................... + sqrdmulh v11.4S, v24.4S, v16.S[3] // ............................................................................................................................e................................................. // gap // .............................................................................................................................................................................. + mls v20.4S, v7.4S, v8.4S // .............................................................................................................................................e................................ + mul v13.4S, v24.4S, v16.S[2] // ...........................................................................................................................e.................................................. // gap // .............................................................................................................................................................................. - sqrdmulh v9.4S, v14.4S, v3.S[3] // .......................................................................................................................e...................................................... - sqrdmulh v10.4S, v10.4S, v3.S[3] // ............................................................................................................................e................................................. // gap // .............................................................................................................................................................................. 
+ mul v24.4S, v1.4S, v0.S[0] // .....................................................................................................................................e........................................ + sqrdmulh v7.4S, v1.4S, v0.S[1] // ......................................................................................................................................e....................................... // gap // .............................................................................................................................................................................. - srshr v1.4S, v5.4S, #23 // ..............................................................................................................................................e............................... // gap // .............................................................................................................................................................................. - sub v12.4S, v17.4S, v13.4S // ................................................................................................................................................e............................. + mls v9.4S, v27.4S, v8.4S // ...............................................................................................................................................e.............................. // gap // .............................................................................................................................................................................. - srshr v7.4S, v27.4S, #23 // ..........................................................................................................................................e................................... 
- sqrdmulh v26.4S, v21.4S, v6.S[1] // .................................................................................................................................e............................................ // gap // .............................................................................................................................................................................. + mul v10.4S, v23.4S, v16.S[2] // ......................................................................................................................e....................................................... + mls v13.4S, v11.4S, v8.S[0] // .............................................................................................................................e................................................ // gap // .............................................................................................................................................................................. - mul v18.4S, v12.4S, v3.S[0] // ..................................................................................................................................................e........................... - sqrdmulh v14.4S, v12.4S, v3.S[1] // ...................................................................................................................................................e.......................... // gap // .............................................................................................................................................................................. + sqrdmulh v23.4S, v23.4S, v16.S[3] // .......................................................................................................................e...................................................... 
+ mls v24.4S, v7.4S, v8.S[0] // .......................................................................................................................................e...................................... // gap // .............................................................................................................................................................................. - mul v0.4S, v21.4S, v6.S[0] // ................................................................................................................................e............................................. - mls v5.4S, v1.4S, v8.4S // ...............................................................................................................................................e.............................. // gap // .............................................................................................................................................................................. + mls v26.4S, v25.4S, v8.S[0] // ..................................................................................................................................e........................................... + sub v11.4S, v15.4S, v9.4S // .....................................................................................................................................................e........................ // gap // .............................................................................................................................................................................. - mls v27.4S, v7.4S, v8.4S // ...........................................................................................................................................e.................................. - mul v1.4S, v16.4S, v6.S[0] // .....................................................................................................................................e........................................ 
+ add v27.4S, v3.4S, v20.4S // .................................................................................................................................................e............................ // gap // .............................................................................................................................................................................. + mls v10.4S, v23.4S, v8.S[0] // ........................................................................................................................e..................................................... + add v9.4S, v15.4S, v9.4S // ......................................................................................................................................................e....................... // gap // .............................................................................................................................................................................. - mls v18.4S, v14.4S, v8.S[0] // ....................................................................................................................................................e......................... - mls v19.4S, v10.4S, v8.S[0] // .............................................................................................................................e................................................ // gap // .............................................................................................................................................................................. + add v23.4S, v13.4S, v24.4S // ................................................................................................................................................................e............. // gap // .............................................................................................................................................................................. 
- mls v31.4S, v9.4S, v8.S[0] // ........................................................................................................................e..................................................... - mls v0.4S, v26.4S, v8.S[0] // ..................................................................................................................................e........................................... // gap // .............................................................................................................................................................................. + sub v24.4S, v13.4S, v24.4S // ...............................................................................................................................................................e.............. + str q27, [x1], #(16*4) // ....................................................................................................................................................................e......... + sqrdmulh v7.4S, v11.4S, v16.S[1] // ........................................................................................................................................................e..................... // gap // .............................................................................................................................................................................. - mls v1.4S, v4.4S, v8.S[0] // .......................................................................................................................................e...................................... + mul v25.4S, v11.4S, v16.S[0] // .......................................................................................................................................................e...................... 
+ add v22.4S, v10.4S, v26.4S // ...........................................................................................................................................................e.................. + str q9, [x1, #-48] // .....................................................................................................................................................................e........ // gap // .............................................................................................................................................................................. - add v22.4S, v27.4S, v5.4S // ......................................................................................................................................................e....................... - // gap // .............................................................................................................................................................................. - add v28.4S, v17.4S, v13.4S // .................................................................................................................................................e............................ - // gap // .............................................................................................................................................................................. - str q18, [x2], #(16*4) // ........................................................................................................................................................................e..... - sub v20.4S, v27.4S, v5.4S // .....................................................................................................................................................e........................ - str q22, [x1, #16] // .....................................................................................................................................................................e........ 
- sub v10.4S, v31.4S, v0.4S // ..........................................................................................................................................................e................... - // gap // .............................................................................................................................................................................. - add v5.4S, v31.4S, v0.4S // ...........................................................................................................................................................e.................. - str q28, [x1], #(16*4) // ....................................................................................................................................................................e......... - // gap // .............................................................................................................................................................................. - add v14.4S, v19.4S, v1.4S // ................................................................................................................................................................e............. - sub v17.4S, v19.4S, v1.4S // ...............................................................................................................................................................e.............. - mul v9.4S, v10.4S, v3.S[0] // ............................................................................................................................................................e................. - sqrdmulh v30.4S, v10.4S, v3.S[1] // .............................................................................................................................................................e................ 
- str q5, [x1, #-32] // ......................................................................................................................................................................e....... - // gap // .............................................................................................................................................................................. - - // original source code - // ldr q9, [x1, #0] // .....e....................................................................................................................................................................|........e................. - // ldr q10, [x1, #16] // ......e...................................................................................................................................................................|.........e................ - // ldr q11, [x1, #32] // .e........................................................................................................................................................................|....e..................... - // ldr q12, [x1, #48] // e.........................................................................................................................................................................|...e...................... - // trn1 v25.4s, v9.4s, v10.4s // ...................e......................................................................................................................................................|......................e... - // trn2 v26.4s, v9.4s, v10.4s // ....................e.....................................................................................................................................................|.......................e.. 
- // trn1 v27.4s, v11.4s, v12.4s // ...............e..........................................................................................................................................................|..................e....... - // trn2 v28.4s, v11.4s, v12.4s // .............e............................................................................................................................................................|................e......... - // trn2 v11.2d, v25.2d, v27.2d // ...........................e..............................................................................................................................................|.......................... - // trn2 v12.2d, v26.2d, v28.2d // ..........................e...............................................................................................................................................|.......................... - // trn1 v9.2d, v25.2d, v27.2d // ...............................e..........................................................................................................................................|.......................... - // trn1 v10.2d, v26.2d, v28.2d // .............................e............................................................................................................................................|.......................... - // ldr q13, [x2, #0] // ..............................e...........................................................................................................................................|.......................... - // ldr q14, [x2, #16] // ............................e.............................................................................................................................................|.......................... 
- // ldr q15, [x2, #32] // ........................e.................................................................................................................................................|.......................... - // ldr q16, [x2, #48] // .........................e................................................................................................................................................|.......................... - // trn1 v25.4s, v13.4s, v14.4s // .............................................e............................................................................................................................|.......................... - // trn2 v26.4s, v13.4s, v14.4s // ............................................e.............................................................................................................................|.......................... - // trn1 v27.4s, v15.4s, v16.4s // .........................................e................................................................................................................................|.......................... - // trn2 v28.4s, v15.4s, v16.4s // .......................................e..................................................................................................................................|.......................... - // trn2 v15.2d, v25.2d, v27.2d // ......................................................e...................................................................................................................|.......................... - // trn2 v16.2d, v26.2d, v28.2d // ....................................................e.....................................................................................................................|.......................... 
- // trn1 v13.2d, v25.2d, v27.2d // ..................................................e.......................................................................................................................|.......................... - // trn1 v14.2d, v26.2d, v28.2d // ...............................................e..........................................................................................................................|.......................... - // ldr q0, [x5], #(12*16) // .................................................e........................................................................................................................|.......................... - // ldr q4, [x5, #(-12*16 + 1*16)] // ..............e...........................................................................................................................................................|.................e........ - // ldr q1, [x5, #(-12*16 + 2*16)] // ........................................e.................................................................................................................................|.......................... - // ldr q5, [x5, #(-12*16 + 3*16)] // ................................e.........................................................................................................................................|.......................... - // ldr q2, [x5, #(-12*16 + 4*16)] // ......................................e...................................................................................................................................|.......................... - // ldr q6, [x5, #(-12*16 + 5*16)] // ........e.................................................................................................................................................................|...........e.............. 
- // sub v24.4s, v9.4s, v10.4s // ....................................e.....................................................................................................................................|.......................... - // add v9.4s, v9.4s, v10.4s // .....................................e....................................................................................................................................|.......................... - // mul v10.4s, v24.4s, v1.4s // ...................................................e......................................................................................................................|.......................... - // sqrdmulh v24.4s, v24.4s, v5.4s // ..............................................e...........................................................................................................................|.......................... - // mls v10.4s, v24.4s, v8.s[0] // .........................................................e................................................................................................................|.......................... - // sub v24.4s, v11.4s, v12.4s // .................................e........................................................................................................................................|.......................... - // add v11.4s, v11.4s, v12.4s // ..................................e.......................................................................................................................................|.......................... - // mul v12.4s, v24.4s, v2.4s // ................................................e.........................................................................................................................|.......................... 
- // sqrdmulh v24.4s, v24.4s, v6.4s // .....................................................e....................................................................................................................|.......................... - // mls v12.4s, v24.4s, v8.s[0] // ..........................................................e...............................................................................................................|.......................... - // sub v24.4s, v9.4s, v11.4s // ........................................................e.................................................................................................................|.......................... - // add v9.4s, v9.4s, v11.4s // ...........................................e..............................................................................................................................|.......................... - // mul v11.4s, v24.4s, v0.4s // ............................................................e.............................................................................................................|.......................... - // sqrdmulh v24.4s, v24.4s, v4.4s // ...................................................................e......................................................................................................|.......................... - // mls v11.4s, v24.4s, v8.s[0] // ..............................................................................e...........................................................................................|.......................... - // sub v24.4s, v10.4s, v12.4s // ..................................................................e.......................................................................................................|.......................... 
- // add v10.4s, v10.4s, v12.4s // ......................................................................e...................................................................................................|.......................... - // mul v12.4s, v24.4s, v0.4s // .........................................................................e................................................................................................|.......................... - // sqrdmulh v24.4s, v24.4s, v4.4s // ........................................................................e.................................................................................................|.......................... - // mls v12.4s, v24.4s, v8.s[0] // ...............................................................................e..........................................................................................|.......................... - // ldr q0, [x5, #(-12*16 + 6*16)] // ...................................e......................................................................................................................................|.......................... - // ldr q4, [x5, #(-12*16 + 7*16)] // ...........e..............................................................................................................................................................|..............e........... - // ldr q1, [x5, #(-12*16 + 8*16)] // ..........e...............................................................................................................................................................|.............e............ - // ldr q5, [x5, #(-12*16 + 9*16)] // .......................e..................................................................................................................................................|.......................... 
- // ldr q2, [x5, #(-12*16 + 10*16)] // ..................e.......................................................................................................................................................|.....................e.... - // ldr q6, [x5, #(-12*16 + 11*16)] // .......e..................................................................................................................................................................|..........e............... - // sub v24.4s, v13.4s, v14.4s // .......................................................e..................................................................................................................|.......................... - // add v13.4s, v13.4s, v14.4s // ..............................................................e...........................................................................................................|.......................... - // mul v14.4s, v24.4s, v1.4s // ...............................................................e..........................................................................................................|.......................... - // sqrdmulh v24.4s, v24.4s, v5.4s // ...........................................................e..............................................................................................................|.......................... - // mls v14.4s, v24.4s, v8.s[0] // .....................................................................e....................................................................................................|.......................... - // sub v24.4s, v15.4s, v16.4s // .............................................................e............................................................................................................|.......................... 
- // add v15.4s, v15.4s, v16.4s // ............................................................................e.............................................................................................|.......................... - // mul v16.4s, v24.4s, v2.4s // ................................................................e.........................................................................................................|.......................... - // sqrdmulh v24.4s, v24.4s, v6.4s // ....................................................................e.....................................................................................................|.......................... - // mls v16.4s, v24.4s, v8.s[0] // ..........................................................................e...............................................................................................|.......................... - // sub v24.4s, v13.4s, v15.4s // ..................................................................................e.......................................................................................|.......................... - // add v13.4s, v13.4s, v15.4s // ...........................................................................................e..............................................................................|.......................... - // mul v15.4s, v24.4s, v0.4s // .........................................................................................e................................................................................|.......................... - // sqrdmulh v24.4s, v24.4s, v4.4s // ......................................................................................e...................................................................................|.......................... 
- // mls v15.4s, v24.4s, v8.s[0] // ...............................................................................................e..........................................................................|.......................... - // sub v24.4s, v14.4s, v16.4s // .................................................................................e........................................................................................|.......................... - // add v14.4s, v14.4s, v16.4s // ...................................................................................e......................................................................................|.......................... - // mul v16.4s, v24.4s, v0.4s // .....................................................................................e....................................................................................|.......................... - // sqrdmulh v24.4s, v24.4s, v4.4s // ....................................................................................e.....................................................................................|.......................... - // mls v16.4s, v24.4s, v8.s[0] // .............................................................................................e............................................................................|.......................... - // trn1 v25.4s, v9.4s, v10.4s // ...........................................................................e..............................................................................................|.......................... - // trn2 v26.4s, v9.4s, v10.4s // ................................................................................e.........................................................................................|.......................... 
- // trn1 v27.4s, v11.4s, v12.4s // ........................................................................................e.................................................................................|.......................... - // trn2 v28.4s, v11.4s, v12.4s // .......................................................................................e..................................................................................|.......................... - // trn2 v11.2d, v25.2d, v27.2d // ............................................................................................e.............................................................................|.......................... - // trn2 v12.2d, v26.2d, v28.2d // ..........................................................................................e...............................................................................|.......................... - // trn1 v9.2d, v25.2d, v27.2d // ..................................................................................................e.......................................................................|.......................... - // trn1 v10.2d, v26.2d, v28.2d // ..............................................................................................e...........................................................................|.......................... - // trn1 v25.4s, v13.4s, v14.4s // ....................................................................................................e.....................................................................|.......................... - // trn2 v26.4s, v13.4s, v14.4s // ...................................................................................................e......................................................................|.......................... 
- // trn1 v27.4s, v15.4s, v16.4s // ......................................................................................................e...................................................................|.......................... - // trn2 v28.4s, v15.4s, v16.4s // ........................................................................................................e.................................................................|.......................... - // trn2 v15.2d, v25.2d, v27.2d // ..............................................................................................................e...........................................................|.......................... - // trn2 v16.2d, v26.2d, v28.2d // ...............................................................................................................e..........................................................|.......................... - // trn1 v13.2d, v25.2d, v27.2d // ...........................................................................................................e..............................................................|.......................... - // trn1 v14.2d, v26.2d, v28.2d // .............................................................................................................e............................................................|.......................... - // ldr q0, [x4], #64 // ..........................................e...............................................................................................................................|.......................... - // ldr q1, [x4, #(-64 + 16)] // .................................................................e........................................................................................................|.......................... 
- // ldr q2, [x4, #(-64 + 32)] // .............................................................................e............................................................................................|.......................... - // ldr q3, [x4, #(-64 + 48)] // .......................................................................e..................................................................................................|.......................... - // sub v24.4s, v9.4s, v10.4s // .........................................................................................................e................................................................|.......................... - // add v9.4s, v9.4s, v10.4s // .......................................................................................................e..................................................................|.......................... - // mul v10.4s, v24.4s, v1.s[2] // ................................................................................................................e.........................................................|.......................... - // sqrdmulh v24.4s, v24.4s, v1.s[3] // ..................................................................................................................e.......................................................|.......................... - // mls v10.4s, v24.4s, v8.s[0] // ..........................................................................................................................e...............................................|.......................... - // sub v24.4s, v11.4s, v12.4s // ................................................................................................e.........................................................................|.......................... 
- // add v11.4s, v11.4s, v12.4s // .................................................................................................e........................................................................|.......................... - // mul v12.4s, v24.4s, v2.s[0] // .....................................................................................................e....................................................................|.......................... - // sqrdmulh v24.4s, v24.4s, v2.s[1] // ..........................................................................................................e...............................................................|.......................... - // mls v12.4s, v24.4s, v8.s[0] // ............................................................................................................................e.............................................|.......................... - // sub v24.4s, v13.4s, v14.4s // .................................................................................................................e........................................................|.......................... - // add v13.4s, v13.4s, v14.4s // .......................................................................................................................e..................................................|.......................... - // mul v14.4s, v24.4s, v2.s[2] // ......................................................................................................................e...................................................|.......................... - // sqrdmulh v24.4s, v24.4s, v2.s[3] // .....................................................................................................................e....................................................|.......................... 
- // mls v14.4s, v24.4s, v8.s[0] // .............................................................................................................................e............................................|.......................... - // sub v24.4s, v15.4s, v16.4s // ....................................................................................................................e.....................................................|.......................... - // add v15.4s, v15.4s, v16.4s // ...................................................................................................................e......................................................|.......................... - // mul v16.4s, v24.4s, v3.s[0] // .........................................................................................................................e................................................|.......................... - // sqrdmulh v24.4s, v24.4s, v3.s[1] // ........................................................................................................................e.................................................|.......................... - // mls v16.4s, v24.4s, v8.s[0] // ...............................................................................................................................e..........................................|.......................... - // sub v24.4s, v9.4s, v11.4s // .................................................................................................................................e........................................|.......................... - // add v9.4s, v9.4s, v11.4s // ............................................................................................................e.............................................................|.......................... 
- // mul v11.4s, v24.4s, v0.s[2] // .....................................................................................................................................e....................................|.......................... - // sqrdmulh v24.4s, v24.4s, v0.s[3] // ............................................................................................................................................e.............................|.......................... - // mls v11.4s, v24.4s, v8.s[0] // ..........................................................................................................................................................e...............|.......................... - // sub v24.4s, v10.4s, v12.4s // ..................................................................................................................................e.......................................|.......................... - // add v10.4s, v10.4s, v12.4s // ...........................................................................................................................................e..............................|.......................... - // mul v12.4s, v24.4s, v0.s[2] // .........................................................................................................................................e................................|.......................... - // sqrdmulh v24.4s, v24.4s, v0.s[3] // .............................................................................................................................................e............................|.......................... - // mls v12.4s, v24.4s, v8.s[0] // .........................................................................................................................................................e................|.......................... 
- // sub v24.4s, v13.4s, v15.4s // ..............................................................................................................................e...........................................|.......................... - // add v13.4s, v13.4s, v15.4s // ...........................................................................................................................e..............................................|.......................... - // mul v15.4s, v24.4s, v1.s[0] // ....................................................................................................................................................e.....................|.......................... - // sqrdmulh v24.4s, v24.4s, v1.s[1] // .................................................................................................................................................e........................|.......................... - // mls v15.4s, v24.4s, v8.s[0] // ...........................................................................................................................................................e..............|.......................... - // sub v24.4s, v14.4s, v16.4s // ....................................................................................................................................e.....................................|.......................... - // add v14.4s, v14.4s, v16.4s // .......................................................................................................................................e..................................|.......................... - // mul v16.4s, v24.4s, v1.s[0] // .......................................................................................................................................................e..................|.......................... 
- // sqrdmulh v24.4s, v24.4s, v1.s[1] // ..........................................................................................................................................e...............................|.......................... - // mls v16.4s, v24.4s, v8.s[0] // ............................................................................................................................................................e.............|.......................... - // srshr v24.4S, v9.4S, #23 // ................................................................................................................................e.........................................|.......................... - // mls v9.4s, v24.4s, v8.4s // ......................................................................................................................................e...................................|.......................... - // srshr v24.4S, v10.4S, #23 // ................................................................................................................................................e.........................|.......................... - // mls v10.4s, v24.4s, v8.4s // ......................................................................................................................................................e...................|.......................... - // srshr v24.4S, v13.4S, #23 // ...................................................................................................................................e......................................|.......................... - // mls v13.4s, v24.4s, v8.4s // ........................................................................................................................................e.................................|.......................... 
- // srshr v24.4S, v14.4S, #23 // ..............................................................................................................................................e...........................|.......................... - // mls v14.4s, v24.4s, v8.4s // .....................................................................................................................................................e....................|.......................... - // sub v24.4s, v9.4s, v13.4s // ...............................................................................................................................................e..........................|.......................... - // add v9.4s, v9.4s, v13.4s // ..............................................................................................................................................................e...........|.......................... - // mul v13.4s, v24.4s, v0.s[0] // ..................................................................................................................................................e.......................|.......................... - // sqrdmulh v24.4s, v24.4s, v0.s[1] // ...................................................................................................................................................e......................|.......................... - // mls v13.4s, v24.4s, v8.s[0] // ........................................................................................................................................................e.................|.......................... - // sub v24.4s, v10.4s, v14.4s // ................................................................................................................................................................e.........|.......................... 
- // add v10.4s, v10.4s, v14.4s // .............................................................................................................................................................e............|.......................... - // mul v14.4s, v24.4s, v0.s[0] // ..*.......................................................................................................................................................................|.....*.................... - // sqrdmulh v24.4s, v24.4s, v0.s[1] // ...*......................................................................................................................................................................|......*................... - // mls v14.4s, v24.4s, v8.s[0] // ............*.............................................................................................................................................................|...............*.......... - // sub v24.4s, v11.4s, v15.4s // ..................................................................................................................................................................e.......|.......................... - // add v11.4s, v11.4s, v15.4s // ...................................................................................................................................................................e......|.......................... - // mul v15.4s, v24.4s, v0.s[0] // .......................................................................................................................................................................e..|.......................... - // sqrdmulh v24.4s, v24.4s, v0.s[1] // ........................................................................................................................................................................e.|.......................... 
- // mls v15.4s, v24.4s, v8.s[0] // ....*.....................................................................................................................................................................|.......*.................. - // sub v24.4s, v12.4s, v16.4s // ......................................................................................................................................................................e...|.......................... - // add v12.4s, v12.4s, v16.4s // .....................................................................................................................................................................e....|.......................... - // mul v16.4s, v24.4s, v0.s[0] // ..........................................................................................................................................................................|..*....................... - // sqrdmulh v24.4s, v24.4s, v0.s[1] // ..........................................................................................................................................................................|.*........................ - // mls v16.4s, v24.4s, v8.s[0] // .........*................................................................................................................................................................|............*............. - // str q9, [x1], #(16*4) // ....................................................................................................................................................................e.....|.......................... - // str q10, [x1, #(-16*4 + 1*16)] // .................................................................................................................................................................e........|.......................... 
- // str q11, [x1, #(-16*4 + 2*16)] // .........................................................................................................................................................................e|.......................... - // str q12, [x1, #(-16*4 + 3*16)] // ..........................................................................................................................................................................*.......................... - // str q13, [x2], #(16*4) // ...............................................................................................................................................................e..........|.......................... - // str q14, [x2, #(-16*4 + 1*16)] // .....................*....................................................................................................................................................|........................*. - // str q15, [x2, #(-16*4 + 2*16)] // ................*.........................................................................................................................................................|...................*...... - // str q16, [x2, #(-16*4 + 3*16)] // .................*........................................................................................................................................................|....................*..... - // add x1, x1, #64 // ..........................................................................................................................................................................|*......................... 
- // add x2, x2, #64 // ......................*...................................................................................................................................................|.........................* + sqrdmulh v27.4S, v24.4S, v16.S[1] // ..................................................................................................................................................................e........... + str q23, [x1, #-16] // .......................................................................................................................................................................e...... + sub v1.4S, v3.4S, v20.4S // ................................................................................................................................................e............................. + mul v11.4S, v24.4S, v16.S[0] // .................................................................................................................................................................e............ + // gap // .............................................................................................................................................................................. + mls v25.4S, v7.4S, v8.S[0] // .........................................................................................................................................................e.................... + str q22, [x1, #-32] // ......................................................................................................................................................................e....... + add x1, x1, #64 // ............................................................................................................................................................................e. 
+ sub v23.4S, v10.4S, v26.4S // ..........................................................................................................................................................e................... + + // ---------------------------------------------------------------------------------------------- new position -----------------------------------------------------------------------------------------------> + // 0 25 50 75 100 125 150 175 200 + // |------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|--- + // ldr q9, [x1, #0] // e............................................................................................................................................................................'~............................. + // ldr q10, [x1, #16] // ..e..........................................................................................................................................................................'..~........................... + // ldr q11, [x1, #32] // ....e........................................................................................................................................................................'....~......................... + // ldr q12, [x1, #48] // .....e.......................................................................................................................................................................'.....~........................ + // trn1 v25.4s, v9.4s, v10.4s // ...........e.................................................................................................................................................................'...........~.................. 
+ // trn2 v26.4s, v9.4s, v10.4s // ..........e..................................................................................................................................................................'..........~................... + // trn1 v27.4s, v11.4s, v12.4s // ...............e.............................................................................................................................................................'...............~.............. + // trn2 v28.4s, v11.4s, v12.4s // ..............e..............................................................................................................................................................'..............~............... + // trn2 v11.2d, v25.2d, v27.2d // .....................e.......................................................................................................................................................'.....................~........ + // trn2 v12.2d, v26.2d, v28.2d // ....................e........................................................................................................................................................'....................~......... + // trn1 v9.2d, v25.2d, v27.2d // .........................e...................................................................................................................................................'.........................~.... + // trn1 v10.2d, v26.2d, v28.2d // ........................e....................................................................................................................................................'........................~..... + // ldr q13, [x2, #0] // ...................................e.........................................................................................................................................'.............................. 
+ // ldr q14, [x2, #16] // .................................e...........................................................................................................................................'.............................. + // ldr q15, [x2, #32] // ....................................e........................................................................................................................................'.............................. + // ldr q16, [x2, #48] // ..........................................e..................................................................................................................................'.............................. + // trn1 v25.4s, v13.4s, v14.4s // ...............................................e.............................................................................................................................'.............................. + // trn2 v26.4s, v13.4s, v14.4s // .................................................e...........................................................................................................................'.............................. + // trn1 v27.4s, v15.4s, v16.4s // .......................................................e.....................................................................................................................'.............................. + // trn2 v28.4s, v15.4s, v16.4s // .....................................................e.......................................................................................................................'.............................. + // trn2 v15.2d, v25.2d, v27.2d // ............................................................e................................................................................................................'.............................. 
+ // trn2 v16.2d, v26.2d, v28.2d // .........................................................e...................................................................................................................'.............................. + // trn1 v13.2d, v25.2d, v27.2d // ...............................................................e.............................................................................................................'.............................. + // trn1 v14.2d, v26.2d, v28.2d // ...........................................................e.................................................................................................................'.............................. + // ldr q0, [x5], #(12*16) // ..............................................e..............................................................................................................................'.............................. + // ldr q4, [x5, #(-12*16 + 1*16)] // .........................................e...................................................................................................................................'.............................. + // ldr q1, [x5, #(-12*16 + 2*16)] // ......................e......................................................................................................................................................'......................~....... + // ldr q5, [x5, #(-12*16 + 3*16)] // ..................e..........................................................................................................................................................'..................~........... + // ldr q2, [x5, #(-12*16 + 4*16)] // ............e................................................................................................................................................................'............~................. 
+ // ldr q6, [x5, #(-12*16 + 5*16)] // .......................e.....................................................................................................................................................'.......................~...... + // sub v24.4s, v9.4s, v10.4s // ................................e............................................................................................................................................'.............................. + // add v9.4s, v9.4s, v10.4s // ..................................e..........................................................................................................................................'.............................. + // mul v10.4s, v24.4s, v1.4s // ........................................e....................................................................................................................................'.............................. + // sqrdmulh v24.4s, v24.4s, v5.4s // .......................................e.....................................................................................................................................'.............................. + // mls v10.4s, v24.4s, v8.s[0] // ..................................................e..........................................................................................................................'.............................. + // sub v24.4s, v11.4s, v12.4s // ..............................e..............................................................................................................................................'.............................. + // add v11.4s, v11.4s, v12.4s // ...............................e.............................................................................................................................................'.............................. 
+ // mul v12.4s, v24.4s, v2.4s // .....................................e.......................................................................................................................................'.............................. + // sqrdmulh v24.4s, v24.4s, v6.4s // ......................................e......................................................................................................................................'.............................. + // mls v12.4s, v24.4s, v8.s[0] // ................................................e............................................................................................................................'.............................. + // sub v24.4s, v9.4s, v11.4s // ............................................e................................................................................................................................'.............................. + // add v9.4s, v9.4s, v11.4s // .............................................e...............................................................................................................................'.............................. + // mul v11.4s, v24.4s, v0.4s // ........................................................e....................................................................................................................'.............................. + // sqrdmulh v24.4s, v24.4s, v4.4s // ......................................................e......................................................................................................................'.............................. + // mls v11.4s, v24.4s, v8.s[0] // .................................................................e...........................................................................................................'.............................. 
+ // sub v24.4s, v10.4s, v12.4s // ..........................................................e..................................................................................................................'.............................. + // add v10.4s, v10.4s, v12.4s // ..................................................................e..........................................................................................................'.............................. + // mul v12.4s, v24.4s, v0.4s // .............................................................e...............................................................................................................'.............................. + // sqrdmulh v24.4s, v24.4s, v4.4s // ..............................................................e..............................................................................................................'.............................. + // mls v12.4s, v24.4s, v8.s[0] // ...................................................................e.........................................................................................................'.............................. + // ldr q0, [x5, #(-12*16 + 6*16)] // ....................................................e........................................................................................................................'.............................. + // ldr q4, [x5, #(-12*16 + 7*16)] // ...................................................e.........................................................................................................................'.............................. + // ldr q1, [x5, #(-12*16 + 8*16)] // ................e............................................................................................................................................................'................~............. 
+ // ldr q5, [x5, #(-12*16 + 9*16)] // ...........................e.................................................................................................................................................'...........................~.. + // ldr q2, [x5, #(-12*16 + 10*16)] // ...................e.........................................................................................................................................................'...................~.......... + // ldr q6, [x5, #(-12*16 + 11*16)] // ...........................................e.................................................................................................................................'.............................. + // sub v24.4s, v13.4s, v14.4s // ....................................................................e........................................................................................................'.............................. + // add v13.4s, v13.4s, v14.4s // .....................................................................................e.......................................................................................'.............................. + // mul v14.4s, v24.4s, v1.4s // ........................................................................e....................................................................................................'.............................. + // sqrdmulh v24.4s, v24.4s, v5.4s // ............................................................................e................................................................................................'.............................. + // mls v14.4s, v24.4s, v8.s[0] // ....................................................................................e........................................................................................'.............................. 
+ // sub v24.4s, v15.4s, v16.4s // ................................................................e............................................................................................................'.............................. + // add v15.4s, v15.4s, v16.4s // .................................................................................e...........................................................................................'.............................. + // mul v16.4s, v24.4s, v2.4s // ...........................................................................e.................................................................................................'.............................. + // sqrdmulh v24.4s, v24.4s, v6.4s // .......................................................................e.....................................................................................................'.............................. + // mls v16.4s, v24.4s, v8.s[0] // ......................................................................................e......................................................................................'.............................. + // sub v24.4s, v13.4s, v15.4s // ..........................................................................................e..................................................................................'.............................. + // add v13.4s, v13.4s, v15.4s // .........................................................................................e...................................................................................'.............................. + // mul v15.4s, v24.4s, v0.4s // ..............................................................................................e..............................................................................'.............................. 
+ // sqrdmulh v24.4s, v24.4s, v4.4s // .............................................................................................e...............................................................................'.............................. + // mls v15.4s, v24.4s, v8.s[0] // ....................................................................................................e........................................................................'.............................. + // sub v24.4s, v14.4s, v16.4s // ............................................................................................e................................................................................'.............................. + // add v14.4s, v14.4s, v16.4s // ...........................................................................................e.................................................................................'.............................. + // mul v16.4s, v24.4s, v0.4s // ................................................................................................e............................................................................'.............................. + // sqrdmulh v24.4s, v24.4s, v4.4s // ...............................................................................................e.............................................................................'.............................. + // mls v16.4s, v24.4s, v8.s[0] // ......................................................................................................e......................................................................'.............................. + // trn1 v25.4s, v9.4s, v10.4s // .....................................................................e.......................................................................................................'.............................. 
+ // trn2 v26.4s, v9.4s, v10.4s // ......................................................................e......................................................................................................'.............................. + // trn1 v27.4s, v11.4s, v12.4s // .........................................................................e...................................................................................................'.............................. + // trn2 v28.4s, v11.4s, v12.4s // ..........................................................................e..................................................................................................'.............................. + // trn2 v11.2d, v25.2d, v27.2d // ..................................................................................................e..........................................................................'.............................. + // trn2 v12.2d, v26.2d, v28.2d // .......................................................................................e.....................................................................................'.............................. + // trn1 v9.2d, v25.2d, v27.2d // .............................................................................e...............................................................................................'.............................. + // trn1 v10.2d, v26.2d, v28.2d // ..............................................................................e..............................................................................................'.............................. + // trn1 v25.4s, v13.4s, v14.4s // ...................................................................................................e.........................................................................'.............................. 
+ // trn2 v26.4s, v13.4s, v14.4s // .................................................................................................e...........................................................................'.............................. + // trn1 v27.4s, v15.4s, v16.4s // .............................................................................................................e...............................................................'.............................. + // trn2 v28.4s, v15.4s, v16.4s // ............................................................................................................e................................................................'.............................. + // trn2 v15.2d, v25.2d, v27.2d // .................................................................................................................e...........................................................'.............................. + // trn2 v16.2d, v26.2d, v28.2d // ................................................................................................................e............................................................'.............................. + // trn1 v13.2d, v25.2d, v27.2d // ...................................................................................................................e.........................................................'.............................. + // trn1 v14.2d, v26.2d, v28.2d // ..................................................................................................................e..........................................................'.............................. + // ldr q0, [x4], #64 // .......................................................................................................e.....................................................................'.............................. 
+ // ldr q1, [x4, #(-64 + 16)] // ...................................................................................e.........................................................................................'.............................. + // ldr q2, [x4, #(-64 + 32)] // ...............................................................................e.............................................................................................'.............................. + // ldr q3, [x4, #(-64 + 48)] // ................................................................................e............................................................................................'.............................. + // sub v24.4s, v9.4s, v10.4s // ..................................................................................e..........................................................................................'.............................. + // add v9.4s, v9.4s, v10.4s // ........................................................................................e....................................................................................'.............................. + // mul v10.4s, v24.4s, v1.s[2] // ........................................................................................................e....................................................................'.............................. + // sqrdmulh v24.4s, v24.4s, v1.s[3] // .........................................................................................................e...................................................................'.............................. + // mls v10.4s, v24.4s, v8.s[0] // ...............................................................................................................e.............................................................'.............................. 
+ // sub v24.4s, v11.4s, v12.4s // .....................................................................................................e.......................................................................'.............................. + // add v11.4s, v11.4s, v12.4s // ..........................................................................................................e..................................................................'.............................. + // mul v12.4s, v24.4s, v2.s[0] // ..............................................................................................................e..............................................................'.............................. + // sqrdmulh v24.4s, v24.4s, v2.s[1] // ...........................................................................................................e.................................................................'.............................. + // mls v12.4s, v24.4s, v8.s[0] // .......................................................................................................................e.....................................................'.............................. + // sub v24.4s, v13.4s, v14.4s // ......................................................................................................................e......................................................'.............................. + // add v13.4s, v13.4s, v14.4s // ............................................................................................................................e................................................'.............................. + // mul v14.4s, v24.4s, v2.s[2] // ...........................................................................................................................e.................................................'.............................. 
+ // sqrdmulh v24.4s, v24.4s, v2.s[3] // ..............................................................................................................................e..............................................'.............................. + // mls v14.4s, v24.4s, v8.s[0] // ....................................................................................................................................e........................................'.............................. + // sub v24.4s, v15.4s, v16.4s // .....................................................................................................................e.......................................................'.............................. + // add v15.4s, v15.4s, v16.4s // ..........................................................................................................................e..................................................'.............................. + // mul v16.4s, v24.4s, v3.s[0] // ........................................................................................................................e....................................................'.............................. + // sqrdmulh v24.4s, v24.4s, v3.s[1] // .........................................................................................................................e...................................................'.............................. + // mls v16.4s, v24.4s, v8.s[0] // ..................................................................................................................................e..........................................'.............................. + // sub v24.4s, v9.4s, v11.4s // .................................................................................................................................e...........................................'.............................. 
+ // add v9.4s, v9.4s, v11.4s // ....................................................................................................................e........................................................'.............................. + // mul v11.4s, v24.4s, v0.s[2] // .....................................................................................................................................................e.......................'.............................. + // sqrdmulh v24.4s, v24.4s, v0.s[3] // .......................................................................................................................................................e.....................'.............................. + // mls v11.4s, v24.4s, v8.s[0] // ............................................................................................................................................................e................'.............................. + // sub v24.4s, v10.4s, v12.4s // ...........................................................................................................................................e.................................'.............................. + // add v10.4s, v10.4s, v12.4s // ...............................................................................................................................e.............................................'.............................. + // mul v12.4s, v24.4s, v0.s[2] // .................................................................................................................................................e...........................'.............................. + // sqrdmulh v24.4s, v24.4s, v0.s[3] // ...............................................................................................................................................e.............................'.............................. 
+ // mls v12.4s, v24.4s, v8.s[0] // ......................................................................................................................................................e......................'.............................. + // sub v24.4s, v13.4s, v15.4s // ................................................................................................................................e............................................'.............................. + // add v13.4s, v13.4s, v15.4s // ...................................................................................................................................e.........................................'.............................. + // mul v15.4s, v24.4s, v1.s[0] // ......................................................................................................................................e......................................'.............................. + // sqrdmulh v24.4s, v24.4s, v1.s[1] // .........................................................................................................................................e...................................'.............................. + // mls v15.4s, v24.4s, v8.s[0] // .........................................................................................................................................................e...................'.............................. + // sub v24.4s, v14.4s, v16.4s // ............................................................................................................................................e................................'.............................. + // add v14.4s, v14.4s, v16.4s // ..........................................................................................................................................e..................................'.............................. 
+ // mul v16.4s, v24.4s, v1.s[0] // ..................................................................................................................................................e..........................'.............................. + // sqrdmulh v24.4s, v24.4s, v1.s[1] // ...................................................................................................................................................e.........................'.............................. + // mls v16.4s, v24.4s, v8.s[0] // ........................................................................................................................................................e....................'.............................. + // srshr v24.4S, v9.4S, #23 // .............................................................................................................................e...............................................'.............................. + // mls v9.4s, v24.4s, v8.4s // ........................................................................................................................................e....................................'.............................. + // srshr v24.4S, v10.4S, #23 // .....................................................................................................................................e.......................................'.............................. + // mls v10.4s, v24.4s, v8.4s // .............................................................................................................................................e...............................'.............................. + // srshr v24.4S, v13.4S, #23 // .......................................................................................................................................e.....................................'.............................. 
+ // mls v13.4s, v24.4s, v8.4s // ................................................................................................................................................e............................'.............................. + // srshr v24.4S, v14.4S, #23 // ..............................................................................................................................................e..............................'.............................. + // mls v14.4s, v24.4s, v8.4s // ....................................................................................................................................................e........................'.............................. + // sub v24.4s, v9.4s, v13.4s // .......................................................................................................................................................................e.....'.............................. + // add v9.4s, v9.4s, v13.4s // ...........................................................................................................................................................e.................'.............................. + // mul v13.4s, v24.4s, v0.s[0] // .~...........................................................................................................................................................................'.*............................ + // sqrdmulh v24.4s, v24.4s, v0.s[1] // .............................................................................................................................................................................*.............................. + // mls v13.4s, v24.4s, v8.s[0] // .........~...................................................................................................................................................................'.........*.................... 
+ // sub v24.4s, v10.4s, v14.4s // ..........................................................................................................................................................e..................'.............................. + // add v10.4s, v10.4s, v14.4s // .............................................................................................................................................................e...............'.............................. + // mul v14.4s, v24.4s, v0.s[0] // ..................................................................................................................................................................e..........'.............................. + // sqrdmulh v24.4s, v24.4s, v0.s[1] // .................................................................................................................................................................e...........'.............................. + // mls v14.4s, v24.4s, v8.s[0] // .........................................................................................................................................................................e...'.............................. + // sub v24.4s, v11.4s, v15.4s // ............................................................................................................................................................................e'.............................. + // add v11.4s, v11.4s, v15.4s // ...................................................................................................................................................................e.........'.............................. + // mul v15.4s, v24.4s, v0.s[0] // ........~....................................................................................................................................................................'........*..................... 
+ // sqrdmulh v24.4s, v24.4s, v0.s[1] // ...~.........................................................................................................................................................................'...*.......................... + // mls v15.4s, v24.4s, v8.s[0] // .................~...........................................................................................................................................................'.................*............ + // sub v24.4s, v12.4s, v16.4s // ...............................................................................................................................................................e.............'.............................. + // add v12.4s, v12.4s, v16.4s // ..............................................................................................................................................................e..............'.............................. + // mul v16.4s, v24.4s, v0.s[0] // ........................................................................................................................................................................e....'.............................. + // sqrdmulh v24.4s, v24.4s, v0.s[1] // .....................................................................................................................................................................e.......'.............................. + // mls v16.4s, v24.4s, v8.s[0] // .......~.....................................................................................................................................................................'.......*...................... + // str q9, [x1], #(16*4) // ................................................................................................................................................................e............'.............................. 
+ // str q10, [x1, #(-16*4 + 1*16)] // ....................................................................................................................................................................e........'.............................. + // str q11, [x1, #(-16*4 + 2*16)] // ..........................................................................................................................................................................e..'.............................. + // str q12, [x1, #(-16*4 + 3*16)] // ......................................................................................................................................................................e......'.............................. + // str q13, [x2], #(16*4) // ..........................~..................................................................................................................................................'..........................*... + // str q14, [x2, #(-16*4 + 1*16)] // ......~......................................................................................................................................................................'......*....................... + // str q15, [x2, #(-16*4 + 2*16)] // ............................~................................................................................................................................................'............................*. + // str q16, [x2, #(-16*4 + 3*16)] // .............~...............................................................................................................................................................'.............*................ + // add x1, x1, #64 // ...........................................................................................................................................................................e.'.............................. 
+ // add x2, x2, #64 // .............................~...............................................................................................................................................'.............................* sub count, count, #1 cbnz count, layer45678_start - // gap // ............. - // gap // ............. - sqrdmulh v19.4S, v17.4S, v3.S[1] // ..*.......... - mul v17.4S, v17.4S, v3.S[0] // ...*......... - // gap // ............. - // gap // ............. - mul v29.4S, v20.4S, v3.S[0] // ....*........ - sqrdmulh v10.4S, v20.4S, v3.S[1] // .....*....... - // gap // ............. - // gap // ............. - // gap // ............. - mls v9.4S, v30.4S, v8.S[0] // ......*...... - // gap // ............. - // gap // ............. - // gap // ............. - mls v17.4S, v19.4S, v8.S[0] // .......*..... - // gap // ............. - // gap // ............. - str q14, [x1, #-16] // *............ - mls v29.4S, v10.4S, v8.S[0] // ........*.... - // gap // ............. - // gap // ............. - // gap // ............. - str q9, [x2, #-32] // .........*... - // gap // ............. - // gap // ............. - // gap // ............. - str q17, [x2, #-16] // ..........*.. - // gap // ............. - add x1, x1, #64 // .*........... - str q29, [x2, #-48] // ...........*. - add x2, x2, #64 // ............* - - // original source code - // str q14, [x1, #-16] // ......*...... - // add x1, x1, #64 // ..........*.. - // sqrdmulh v0.4S, v17.4S, v3.S[1] // *............ - // mul v19.4S, v17.4S, v3.S[0] // .*........... - // mul v1.4S, v20.4S, v3.S[0] // ..*.......... - // sqrdmulh v25.4S, v20.4S, v3.S[1] // ...*......... - // mls v9.4S, v30.4S, v8.S[0] // ....*........ - // mls v19.4S, v0.4S, v8.S[0] // .....*....... - // mls v1.4S, v25.4S, v8.S[0] // .......*..... - // str q9, [x2, #-32] // ........*.... - // str q19, [x2, #-16] // .........*... - // str q1, [x2, #-48] // ...........*. 
- // add x2, x2, #64 // ............* + // Instructions: 12 + // Expected cycles: 8 + // Expected IPC: 1.50 + // + // Wall time: 0.07s + // User time: 0.07s + // + // ----- original position -----> + // 0 25 + // |------------------------|---- + mul v10.4S, v1.4S, v16.S[0] // .*............................ + sqrdmulh v24.4S, v1.4S, v16.S[1] // *............................. + // gap // .............................. + // gap // .............................. + mul v7.4S, v23.4S, v16.S[0] // .....*........................ + sqrdmulh v9.4S, v23.4S, v16.S[1] // ..*........................... + // gap // .............................. + // gap // .............................. + mls v11.4S, v27.4S, v8.S[0] // ....*......................... + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + mls v10.4S, v24.4S, v8.S[0] // ......*....................... + mls v7.4S, v9.4S, v8.S[0] // ........*..................... + // gap // .............................. + // gap // .............................. + str q25, [x2, #16] // ...*.......................... + // gap // .............................. + // gap // .............................. + str q11, [x2, #48] // .......*...................... + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + str q10, [x2], #(16*4) // .........*.................... + // gap // .............................. + str q7, [x2, #-32] // ..........*................... + // gap // .............................. + add x2, x2, #64 // ...........*.................. + + // -------- new position --------> + // 0 25 + // |------------------------|----- + // sqrdmulh v9.4S, v1.4S, v16.S[1] // .*............................. 
+ // mul v1.4S, v1.4S, v16.S[0] // *.............................. + // sqrdmulh v24.4S, v23.4S, v16.S[1] // ...*........................... + // str q25, [x2, #16] // .......*....................... + // mls v11.4S, v27.4S, v8.S[0] // ....*.......................... + // mul v7.4S, v23.4S, v16.S[0] // ..*............................ + // mls v1.4S, v9.4S, v8.S[0] // .....*......................... + // str q11, [x2, #48] // ........*...................... + // mls v7.4S, v24.4S, v8.S[0] // ......*........................ + // str q1, [x2], #(16*4) // .........*..................... + // str q7, [x2, #-32] // ..........*.................... + // add x2, x2, #64 // ...........*................... // ----------------------------------------------------------------------------- @@ -1377,7 +1401,7 @@ layer45678_start: ASM_LOAD(xtmp, ninv_tw_addr) ld1r {ninv_tw.4s}, [xtmp] - ushr modulus_half.4S, modulus.4S, #1 + ushr modulus_half.4S, consts.4S, #1 neg neg_modulus_half.4S, modulus_half.4S mov count, #8 @@ -1385,706 +1409,746 @@ layer45678_start: load_roots_123 .p2align 2 - ldr q28, [x0, #256] // .....*........ - // gap // .............. - // gap // .............. - ldr q9, [x0, #384] // .*............ - ldr q14, [x0, #512] // ..*........... - // gap // .............. - // gap // .............. - ldr q19, [x0, #640] // ....*......... - ldr q22, [x0, #768] // ...*.......... - // gap // .............. - // gap // .............. - // gap // .............. - // gap // .............. - // gap // .............. - // gap // .............. - ldr q16, [x0, #896] // *............. - sub v15.4S, v28.4S, v9.4S // .......*...... - add v6.4S, v28.4S, v9.4S // .............* - // gap // .............. - // gap // .............. - add v9.4S, v14.4S, v19.4S // ............*. - // gap // .............. - // gap // .............. - sub v4.4S, v14.4S, v19.4S // ......*....... - // gap // .............. - mul v17.4S, v15.4S, v2.S[0] // ...........*.. 
- sqrdmulh v21.4S, v15.4S, v2.S[1] // ..........*... - // gap // .............. - // gap // .............. - // gap // .............. - mul v28.4S, v4.4S, v2.S[2] // ........*..... - add v7.4S, v22.4S, v16.4S // .........*.... - - // original source code - // ldr q16, [x0, #896] // .....*........ - // ldr q27, [x0, #384] // .*............ - // ldr q20, [x0, #512] // ..*........... - // ldr q22, [x0, #768] // ....*......... - // ldr q6, [x0, #640] // ...*.......... - // ldr q18, [x0, #256] // *............. - // sub v4.4S, v20.4S, v6.4S // .........*.... - // sub v19.4S, v18.4S, v27.4S // ......*....... - // mul v28.4S, v4.4S, v2.S[2] // ............*. - // add v7.4S, v22.4S, v16.4S // .............* - // sqrdmulh v21.4S, v19.4S, v2.S[1] // ...........*.. - // mul v17.4S, v19.4S, v2.S[0] // ..........*... - // add v9.4S, v20.4S, v6.4S // ........*..... - // add v6.4S, v18.4S, v27.4S // .......*...... + // Instructions: 95 + // Expected cycles: 47 + // Expected IPC: 2.02 + // + // Wall time: 38.83s + // User time: 38.83s + // + // ------------------------------------- original position --------------------------------------> + // 0 25 50 75 + // |------------------------|------------------------|------------------------|------------------- + // gap // ............................................................................................... + // gap // ............................................................................................... + ldr q7, [x0, #768] // .*............................................................................................. + ldr q11, [x0, #896] // ....*.......................................................................................... + ldr q23, [x0, #640] // ...*........................................................................................... + // gap // ............................................................................................... 
+ // gap // ............................................................................................... + ldr q20, [x0, #512] // ........*...................................................................................... + ldr q21, [x0, #128] // .....*......................................................................................... + // gap // ............................................................................................... + // gap // ............................................................................................... + ldr q9, [x0, #0] // ......*........................................................................................ + ldr q15, [x0, #256] // ..*............................................................................................ + // gap // ............................................................................................... + // gap // ............................................................................................... + // gap // ............................................................................................... + sub v27.4S, v7.4S, v11.4S // ..............................*................................................................ + add v5.4S, v7.4S, v11.4S // .........*..................................................................................... + ldr q28, [x0, #384] // *.............................................................................................. + // gap // ............................................................................................... + sub v7.4S, v20.4S, v23.4S // ..............*................................................................................ + // gap // ............................................................................................... + // gap // ............................................................................................... 
+ add v17.4S, v20.4S, v23.4S // ...............*............................................................................... + // gap // ............................................................................................... + // gap // ............................................................................................... + mul v23.4S, v27.4S, v3.S[0] // ....................................*.......................................................... + sqrdmulh v27.4S, v27.4S, v3.S[1] // .....................................*......................................................... + sqrdmulh v11.4S, v7.4S, v2.S[3] // ..........................*.................................................................... + mul v13.4S, v7.4S, v2.S[2] // ............................*.................................................................. + // gap // ............................................................................................... + // gap // ............................................................................................... + sub v16.4S, v9.4S, v21.4S // ............*.................................................................................. + add v24.4S, v15.4S, v28.4S // ...........*................................................................................... + // gap // ............................................................................................... + // gap // ............................................................................................... + sub v22.4S, v15.4S, v28.4S // .......*....................................................................................... + // gap // ............................................................................................... + // gap // ............................................................................................... 
+ add v7.4S, v9.4S, v21.4S // .............*................................................................................. + mls v13.4S, v11.4S, v8.S[0] // .........................................*..................................................... + // gap // ............................................................................................... + // gap // ............................................................................................... + mls v23.4S, v27.4S, v8.S[0] // ..........................................*.................................................... + mul v20.4S, v16.4S, v1.S[2] // .........................*..................................................................... + // gap // ............................................................................................... + sub v27.4S, v7.4S, v24.4S // .................*............................................................................. + // gap // ............................................................................................... + add v18.4S, v7.4S, v24.4S // ........................*...................................................................... + sub v9.4S, v17.4S, v5.4S // ...................*........................................................................... + // gap // ............................................................................................... + // gap // ............................................................................................... + sqrdmulh v24.4S, v22.4S, v2.S[1] // .................................*............................................................. + // gap // ............................................................................................... + // gap // ............................................................................................... 
+ sub v11.4S, v13.4S, v23.4S // ....................................................*.......................................... + mul v14.4S, v27.4S, v0.S[2] // ....................*.......................................................................... + // gap // ............................................................................................... + // gap // ............................................................................................... + sqrdmulh v15.4S, v27.4S, v0.S[3] // .....................*......................................................................... + sqrdmulh v27.4S, v11.4S, v1.S[1] // ........................................................*...................................... + mul v7.4S, v11.4S, v1.S[0] // .......................................................*....................................... + // gap // ............................................................................................... + // gap // ............................................................................................... + mul v21.4S, v9.4S, v1.S[0] // .......................*....................................................................... + sqrdmulh v11.4S, v9.4S, v1.S[1] // ......................*........................................................................ + // gap // ............................................................................................... + // gap // ............................................................................................... + sqrdmulh v16.4S, v16.4S, v1.S[3] // ................*.............................................................................. + mul v9.4S, v22.4S, v2.S[0] // ..........*.................................................................................... + // gap // ............................................................................................... 
+ // gap // ............................................................................................... + mls v7.4S, v27.4S, v8.S[0] // ..............................................................*................................ + add v27.4S, v17.4S, v5.4S // ..................*............................................................................ + // gap // ............................................................................................... + // gap // ............................................................................................... + // gap // ............................................................................................... + mls v21.4S, v11.4S, v8.S[0] // .............................*................................................................. + mls v14.4S, v15.4S, v8.S[0] // ...........................*................................................................... + // gap // ............................................................................................... + mls v9.4S, v24.4S, v8.S[0] // ........................................*...................................................... + mls v20.4S, v16.4S, v8.S[0] // ................................*.............................................................. + // gap // ............................................................................................... + // gap // ............................................................................................... + sub v4.4S, v18.4S, v27.4S // ...............................*............................................................... + add v17.4S, v18.4S, v27.4S // ..................................*............................................................ + // gap // ............................................................................................... + // gap // ............................................................................................... 
+ add v24.4S, v13.4S, v23.4S // ...............................................*............................................... + // gap // ............................................................................................... + // gap // ............................................................................................... + sub v11.4S, v14.4S, v21.4S // ...................................*........................................................... + add v13.4S, v20.4S, v9.4S // ..............................................*................................................ + sub v27.4S, v20.4S, v9.4S // .............................................*................................................. + // gap // ............................................................................................... + // gap // ............................................................................................... + mul v22.4S, v11.4S, v0.S[0] // ......................................*........................................................ + sqrdmulh v20.4S, v11.4S, v0.S[1] // .......................................*....................................................... + // gap // ............................................................................................... + // gap // ............................................................................................... + sub v11.4S, v13.4S, v24.4S // ..........................................................*.................................... + mul v10.4S, v4.4S, v0.S[0] // ............................................*.................................................. + // gap // ............................................................................................... + // gap // ............................................................................................... 
+ sqrdmulh v23.4S, v27.4S, v0.S[3] // .....................................................*......................................... + // gap // ............................................................................................... + // gap // ............................................................................................... + mul v9.4S, v27.4S, v0.S[2] // ...................................................*........................................... + mls v22.4S, v20.4S, v8.S[0] // ...........................................*................................................... + mul v27.4S, v11.4S, v0.S[0] // ................................................................*.............................. + // gap // ............................................................................................... + // gap // ............................................................................................... + sqrdmulh v15.4S, v11.4S, v0.S[1] // .............................................................*................................. + add v18.4S, v13.4S, v24.4S // ............................................................*.................................. + // gap // ............................................................................................... + // gap // ............................................................................................... + // gap // ............................................................................................... + // gap // ............................................................................................... + mls v9.4S, v23.4S, v8.S[0] // ...........................................................*................................... + add v11.4S, v14.4S, v21.4S // .......................................................................*....................... 
+ cmge v20.4S, v31.4S, v22.4S // ..................................................*............................................ + mul v24.4S, v18.4S, v25.4S // ...................................................................*........................... + // gap // ............................................................................................... + // gap // ............................................................................................... + sqrdmulh v21.4S, v11.4S, v26.4S // ..............................................................................*................ + // gap // ............................................................................................... + cmge v16.4S, v22.4S, v30.4S // .................................................*............................................. + // gap // ............................................................................................... + // gap // ............................................................................................... + // gap // ............................................................................................... + sub v13.4S, v9.4S, v7.4S // ....................................................................*.......................... + add v23.4S, v9.4S, v7.4S // .....................................................................*......................... + mul v9.4S, v11.4S, v25.4S // ...........................................................................*................... + sub v11.4S, v20.4S, v16.4S // ......................................................*........................................ + // gap // ............................................................................................... + // gap // ............................................................................................... 
+ mul v7.4S, v13.4S, v0.S[0] // ..........................................................................*.................... + sqrdmulh v20.4S, v13.4S, v0.S[1] // ........................................................................*...................... + // gap // ............................................................................................... + // gap // ............................................................................................... + // gap // ............................................................................................... + // gap // ............................................................................................... + mls v27.4S, v15.4S, v8.S[0] // ......................................................................*........................ + mul v13.4S, v23.4S, v25.4S // .........................................................................*..................... + sqrdmulh v15.4S, v23.4S, v26.4S // .............................................................................*................. + mls v22.4S, v11.4S, v8.4S // .........................................................*..................................... + // gap // ............................................................................................... + // gap // ............................................................................................... + // gap // ............................................................................................... + // gap // ............................................................................................... + mls v9.4S, v21.4S, v8.S[0] // ...................................................................................*........... + mls v7.4S, v20.4S, v8.S[0] // ................................................................................*.............. 
+ sqrdmulh v11.4S, v4.4S, v0.S[1] // ................................................*.............................................. + // gap // ............................................................................................... + // gap // ............................................................................................... + sqrdmulh v18.4S, v18.4S, v26.4S // .................................................................*............................. + mls v13.4S, v15.4S, v8.S[0] // .................................................................................*............. + str q22, [x0, #768] // ...............................................................*............................... + // gap // ............................................................................................... + cmge v28.4S, v31.4S, v27.4S // ............................................................................*.................. + cmge v20.4S, v7.4S, v30.4S // .....................................................................................*......... + cmge v21.4S, v31.4S, v7.4S // ....................................................................................*.......... + // gap // ............................................................................................... + // gap // ............................................................................................... + // gap // ............................................................................................... + // gap // ............................................................................................... + mls v10.4S, v11.4S, v8.S[0] // ..................................................................*............................ + cmge v22.4S, v31.4S, v9.4S // ...........................................................................................*... 
+ cmge v11.4S, v13.4S, v30.4S // .......................................................................................*....... + // gap // ............................................................................................... + // gap // ............................................................................................... + cmge v23.4S, v31.4S, v13.4S // ......................................................................................*........ + sub v20.4S, v21.4S, v20.4S // ........................................................................................*...... + // gap // ............................................................................................... + // gap // ............................................................................................... + mls v24.4S, v18.4S, v8.S[0] // ...............................................................................*............... + sub v11.4S, v23.4S, v11.4S // ..........................................................................................*.... + mul v23.4S, v17.4S, v25.4S // ..............................................................................................* + // gap // ............................................................................................... + // gap // ............................................................................................... + // gap // ............................................................................................... + // gap // ............................................................................................... + cmge v29.4S, v31.4S, v10.4S // ..................................................................................*............ + mls v7.4S, v20.4S, v8.4S // ............................................................................................*.. 
+ mls v13.4S, v11.4S, v8.4S // .............................................................................................*. + // gap // ............................................................................................... + // gap // ............................................................................................... + cmge v14.4S, v24.4S, v30.4S // .........................................................................................*..... + + // ---------------------------------------- new position ----------------------------------------> + // 0 25 50 75 + // |------------------------|------------------------|------------------------|------------------- + // ldr q15, [x0, #384] // .........*..................................................................................... + // ldr q12, [x0, #768] // *.............................................................................................. + // ldr q20, [x0, #256] // ......*........................................................................................ + // ldr q19, [x0, #640] // ..*............................................................................................ + // ldr q4, [x0, #896] // .*............................................................................................. + // ldr q11, [x0, #128] // ....*.......................................................................................... + // ldr q5, [x0, #0] // .....*......................................................................................... + // sub v14.4S, v20.4S, v15.4S // ..................*............................................................................ + // ldr q17, [x0, #512] // ...*........................................................................................... + // add v28.4S, v12.4S, v4.4S // ........*...................................................................................... 
+ // mul v21.4S, v14.4S, v2.S[0] // ...................................*........................................................... + // add v15.4S, v20.4S, v15.4S // .................*............................................................................. + // sub v20.4S, v5.4S, v11.4S // ................*.............................................................................. + // add v5.4S, v5.4S, v11.4S // ...................*........................................................................... + // sub v29.4S, v17.4S, v19.4S // ..........*.................................................................................... + // add v11.4S, v17.4S, v19.4S // ...........*................................................................................... + // sqrdmulh v19.4S, v20.4S, v1.S[3] // ..................................*............................................................ + // sub v6.4S, v5.4S, v15.4S // .......................*....................................................................... + // add v7.4S, v11.4S, v28.4S // .....................................*......................................................... + // sub v28.4S, v11.4S, v28.4S // .........................*..................................................................... + // mul v11.4S, v6.4S, v0.S[2] // ............................*.................................................................. + // sqrdmulh v17.4S, v6.4S, v0.S[3] // .............................*................................................................. + // sqrdmulh v6.4S, v28.4S, v1.S[1] // .................................*............................................................. + // mul v18.4S, v28.4S, v1.S[0] // ................................*.............................................................. + // add v27.4S, v5.4S, v15.4S // ........................*...................................................................... 
+ // mul v20.4S, v20.4S, v1.S[2] // ......................*........................................................................ + // sqrdmulh v28.4S, v29.4S, v2.S[3] // ..............*................................................................................ + // mls v11.4S, v17.4S, v8.S[0] // .......................................*....................................................... + // mul v15.4S, v29.4S, v2.S[2] // ...............*............................................................................... + // mls v18.4S, v6.4S, v8.S[0] // ......................................*........................................................ + // sub v5.4S, v12.4S, v4.4S // .......*....................................................................................... + // sub v6.4S, v27.4S, v7.4S // ..........................................*.................................................... + // mls v20.4S, v19.4S, v8.S[0] // .........................................*..................................................... + // sqrdmulh v29.4S, v14.4S, v2.S[1] // ..........................*.................................................................... + // add v17.4S, v27.4S, v7.4S // ...........................................*................................................... + // sub v10.4S, v11.4S, v18.4S // .............................................*................................................. + // mul v7.4S, v5.4S, v3.S[0] // ............*.................................................................................. + // sqrdmulh v19.4S, v5.4S, v3.S[1] // .............*................................................................................. + // mul v27.4S, v10.4S, v0.S[0] // ................................................*.............................................. + // sqrdmulh v10.4S, v10.4S, v0.S[1] // .................................................*............................................. 
+ // mls v21.4S, v29.4S, v8.S[0] // ........................................*...................................................... + // mls v15.4S, v28.4S, v8.S[0] // ....................*.......................................................................... + // mls v7.4S, v19.4S, v8.S[0] // .....................*......................................................................... + // mls v27.4S, v10.4S, v8.S[0] // ......................................................*........................................ + // mul v10.4S, v6.4S, v0.S[0] // ...................................................*........................................... + // sub v29.4S, v20.4S, v21.4S // ...............................................*............................................... + // add v22.4S, v20.4S, v21.4S // ..............................................*................................................ + // add v4.4S, v15.4S, v7.4S // ............................................*.................................................. + // sqrdmulh v5.4S, v6.4S, v0.S[1] // ............................................................................*.................. + // cmge v12.4S, v27.4S, v30.4S // ...............................................................*............................... + // cmge v19.4S, v31.4S, v27.4S // ............................................................*.................................. + // mul v21.4S, v29.4S, v0.S[2] // .....................................................*......................................... + // sub v28.4S, v15.4S, v7.4S // ...........................*................................................................... + // sqrdmulh v6.4S, v29.4S, v0.S[3] // ....................................................*.......................................... + // sub v29.4S, v19.4S, v12.4S // ...................................................................*........................... 
+ // mul v20.4S, v28.4S, v1.S[0] // ...............................*............................................................... + // sqrdmulh v14.4S, v28.4S, v1.S[1] // ..............................*................................................................ + // mls v27.4S, v29.4S, v8.4S // .........................................................................*..................... + // sub v12.4S, v22.4S, v4.4S // ..................................................*............................................ + // mls v21.4S, v6.4S, v8.S[0] // ..........................................................*.................................... + // add v29.4S, v22.4S, v4.4S // .........................................................*..................................... + // sqrdmulh v22.4S, v12.4S, v0.S[1] // ........................................................*...................................... + // mls v20.4S, v14.4S, v8.S[0] // ....................................*.......................................................... + // str q27, [x0, #768] // ...............................................................................*............... + // mul v27.4S, v12.4S, v0.S[0] // .......................................................*....................................... + // sqrdmulh v6.4S, v29.4S, v26.4S // .............................................................................*................. + // mls v10.4S, v5.4S, v8.S[0] // ...................................................................................*........... + // mul v24.4S, v29.4S, v25.4S // .............................................................*................................. + // sub v29.4S, v21.4S, v20.4S // ................................................................*.............................. + // add v12.4S, v21.4S, v20.4S // .................................................................*............................. 
+ // mls v27.4S, v22.4S, v8.S[0] // ......................................................................*........................ + // add v14.4S, v11.4S, v18.4S // ...........................................................*................................... + // sqrdmulh v19.4S, v29.4S, v0.S[1] // .....................................................................*......................... + // mul v13.4S, v12.4S, v25.4S // .......................................................................*....................... + // mul v7.4S, v29.4S, v0.S[0] // ....................................................................*.......................... + // mul v9.4S, v14.4S, v25.4S // ..................................................................*............................ + // cmge v28.4S, v31.4S, v27.4S // ................................................................................*.............. + // sqrdmulh v12.4S, v12.4S, v26.4S // ........................................................................*...................... + // sqrdmulh v20.4S, v14.4S, v26.4S // ..............................................................*................................ + // mls v24.4S, v6.4S, v8.S[0] // ........................................................................................*...... + // mls v7.4S, v19.4S, v8.S[0] // ...........................................................................*................... + // mls v13.4S, v12.4S, v8.S[0] // ..............................................................................*................ + // cmge v29.4S, v31.4S, v10.4S // ...........................................................................................*... + // mls v9.4S, v20.4S, v8.S[0] // ..........................................................................*.................... + // cmge v15.4S, v31.4S, v7.4S // ..................................................................................*............ 
+ // cmge v21.4S, v7.4S, v30.4S // .................................................................................*............. + // cmge v5.4S, v31.4S, v13.4S // ......................................................................................*........ + // cmge v12.4S, v13.4S, v30.4S // .....................................................................................*......... + // sub v15.4S, v15.4S, v21.4S // .......................................................................................*....... + // cmge v14.4S, v24.4S, v30.4S // ..............................................................................................* + // sub v21.4S, v5.4S, v12.4S // .........................................................................................*..... + // cmge v22.4S, v31.4S, v9.4S // ....................................................................................*.......... + // mls v7.4S, v15.4S, v8.4S // ............................................................................................*.. + // mls v13.4S, v21.4S, v8.4S // .............................................................................................*. + // mul v23.4S, v17.4S, v25.4S // ..........................................................................................*.... sub count, count, #1 layer123_start: - sqrdmulh v19.4S, v4.4S, v2.S[3] // .....................*.................................................................................................. - ldr q20, [x0, #128] // .*...................................................................................................................... - sub v23.4S, v22.4S, v16.4S // .......................*................................................................................................ - ldr q4, [x0, #0] // *....................................................................................................................... 
- mls v17.4S, v21.4S, v8.S[0] // .................*...................................................................................................... + // Instructions: 120 + // Expected cycles: 52 + // Expected IPC: 2.31 + // + // Wall time: 122.00s + // User time: 122.00s + // + // -------------------------------------------------- original position --------------------------------------------------> + // 0 25 50 75 100 + // |------------------------|------------------------|------------------------|------------------------|------------------- + sqrdmulh v5.4S, v17.4S, v26.4S // .........................................................................................*.............................. + cmge v18.4S, v10.4S, v30.4S // .....................................................................*.................................................. + ldr q15, [x0, #400] // ...e.................................................................................................................... + ldr q12, [x0, #784] // ......e................................................................................................................. + ldr q20, [x0, #272] // ..e..................................................................................................................... + cmge v16.4S, v9.4S, v30.4S // .............................................................................................................*.......... + cmge v17.4S, v27.4S, v30.4S // .........................................................................*.............................................. + ldr q19, [x0, #656] // .....e.................................................................................................................. + ldr q4, [x0, #912] // .......e................................................................................................................ 
+ sub v6.4S, v29.4S, v18.4S // ......................................................................*................................................. + cmge v21.4S, v31.4S, v24.4S // ........................................................................................................*............... + // gap // ........................................................................................................................ + mls v23.4S, v5.4S, v8.S[0] // ..........................................................................................*............................. + // gap // ........................................................................................................................ + sub v16.4S, v22.4S, v16.4S // ..............................................................................................................*......... + ldr q11, [x0, #144] // .e...................................................................................................................... + ldr q5, [x0, #16] // e....................................................................................................................... + sub v18.4S, v28.4S, v17.4S // ..........................................................................*............................................. + // gap // ........................................................................................................................ + sub v22.4S, v21.4S, v14.4S // ..........................................................................................................*............. + sub v14.4S, v20.4S, v15.4S // .............e.......................................................................................................... // gap // ........................................................................................................................ 
- ldr q16, [x0, #912] // .......e................................................................................................................ - sub v10.4S, v9.4S, v7.4S // ......................................*................................................................................. - ldr q27, [x0, #400] // ...e.................................................................................................................... + mls v9.4S, v16.4S, v8.4S // ...............................................................................................................*........ + ldr q17, [x0, #528] // ....e................................................................................................................... // gap // ........................................................................................................................ - sqrdmulh v5.4S, v23.4S, v3.S[1] // ..........................*............................................................................................. - add v24.4S, v9.4S, v7.4S // .......................................*................................................................................ // gap // ........................................................................................................................ + cmge v16.4S, v31.4S, v23.4S // ....................................................................................................*................... + add v28.4S, v12.4S, v4.4S // ........................e............................................................................................... + mul v21.4S, v14.4S, v2.S[0] // ...............e........................................................................................................ // gap // ........................................................................................................................ 
- mul v15.4S, v23.4S, v3.S[0] // .........................*.............................................................................................. - mls v28.4S, v19.4S, v8.S[0] // ......................*................................................................................................. // gap // ........................................................................................................................ + add v15.4S, v20.4S, v15.4S // ..............e......................................................................................................... + sub v20.4S, v5.4S, v11.4S // ........e............................................................................................................... + add v5.4S, v5.4S, v11.4S // .........e.............................................................................................................. // gap // ........................................................................................................................ - add v19.4S, v4.4S, v20.4S // .........*.............................................................................................................. - sqrdmulh v11.4S, v10.4S, v1.S[1] // .........................................*.............................................................................. - mul v13.4S, v10.4S, v1.S[0] // ........................................*............................................................................... // gap // ........................................................................................................................ // gap // ........................................................................................................................ - sub v22.4S, v4.4S, v20.4S // ........*............................................................................................................... 
- add v4.4S, v19.4S, v6.4S // .............................*.......................................................................................... - ldr q20, [x0, #528] // ....e................................................................................................................... // gap // ........................................................................................................................ - mls v15.4S, v5.4S, v8.S[0] // ...........................*............................................................................................ + sub v29.4S, v17.4S, v19.4S // ..................e..................................................................................................... + add v11.4S, v17.4S, v19.4S // ...................e.................................................................................................... // gap // ........................................................................................................................ + mls v10.4S, v6.4S, v8.4S // .......................................................................*................................................ // gap // ........................................................................................................................ - mul v12.4S, v22.4S, v1.S[2] // ..........*............................................................................................................. - sqrdmulh v10.4S, v22.4S, v1.S[3] // ...........*............................................................................................................ - add v9.4S, v4.4S, v24.4S // .................................................*...................................................................... - sub v7.4S, v19.4S, v6.4S // ............................*........................................................................................... 
+ sqrdmulh v19.4S, v20.4S, v1.S[3] // ...........e............................................................................................................ // gap // ........................................................................................................................ + mls v27.4S, v18.4S, v8.4S // ...........................................................................*............................................ // gap // ........................................................................................................................ + sub v6.4S, v5.4S, v15.4S // ............................e........................................................................................... // gap // ........................................................................................................................ + str q7, [x0, #896] // .......................................................................................*................................ + add v7.4S, v11.4S, v28.4S // .......................................e................................................................................ + sub v28.4S, v11.4S, v28.4S // ......................................e................................................................................. // gap // ........................................................................................................................ - sub v14.4S, v4.4S, v24.4S // ................................................*....................................................................... - sub v6.4S, v28.4S, v15.4S // ...........................................*............................................................................ // gap // ........................................................................................................................ 
+ mul v11.4S, v6.4S, v0.S[2] // ..............................e......................................................................................... + sqrdmulh v17.4S, v6.4S, v0.S[3] // ...............................e........................................................................................ + sqrdmulh v6.4S, v28.4S, v1.S[1] // .........................................e.............................................................................. // gap // ........................................................................................................................ - sqrdmulh v21.4S, v7.4S, v0.S[3] // ...............................*........................................................................................ - mls v12.4S, v10.4S, v8.S[0] // ............*........................................................................................................... + str q27, [x0, #640] // .....................................................................................*.................................. + mul v18.4S, v28.4S, v1.S[0] // ........................................e............................................................................... + add v27.4S, v5.4S, v15.4S // .............................e.......................................................................................... + mul v20.4S, v20.4S, v1.S[2] // ..........e............................................................................................................. // gap // ........................................................................................................................ // gap // ........................................................................................................................ - mul v18.4S, v6.4S, v1.S[0] // .............................................*.......................................................................... 
- sqrdmulh v4.4S, v6.4S, v1.S[1] // ..............................................*......................................................................... - ldr q22, [x0, #784] // ......e................................................................................................................. // gap // ........................................................................................................................ - sqrdmulh v10.4S, v14.4S, v0.S[1] // ...................................................*.................................................................... - mul v5.4S, v7.4S, v0.S[2] // ..............................*......................................................................................... // gap // ........................................................................................................................ + sqrdmulh v28.4S, v29.4S, v2.S[3] // .....................e.................................................................................................. + mls v11.4S, v17.4S, v8.S[0] // ................................e....................................................................................... // gap // ........................................................................................................................ - sqrdmulh v23.4S, v9.4S, v26.4S // .........................................................................................*.............................. - add v24.4S, v28.4S, v15.4S // ............................................*........................................................................... // gap // ........................................................................................................................ + mul v15.4S, v29.4S, v2.S[2] // ....................e................................................................................................... 
+ mls v18.4S, v6.4S, v8.S[0] // ..........................................e............................................................................. + sub v5.4S, v12.4S, v4.4S // .......................e................................................................................................ // gap // ........................................................................................................................ - mls v18.4S, v4.4S, v8.S[0] // ...............................................*........................................................................ - add v6.4S, v12.4S, v17.4S // ..................................*..................................................................................... + sub v6.4S, v27.4S, v7.4S // ................................................e....................................................................... // gap // ........................................................................................................................ + mls v20.4S, v19.4S, v8.S[0] // ............e........................................................................................................... // gap // ........................................................................................................................ - mls v5.4S, v21.4S, v8.S[0] // ................................*....................................................................................... - sub v4.4S, v12.4S, v17.4S // .................................*...................................................................................... // gap // ........................................................................................................................ + sqrdmulh v29.4S, v14.4S, v2.S[1] // ................e....................................................................................................... 
+ add v17.4S, v27.4S, v7.4S // .................................................e...................................................................... // gap // ........................................................................................................................ - add v19.4S, v6.4S, v24.4S // ......................................................*................................................................. - mls v13.4S, v11.4S, v8.S[0] // ..........................................*............................................................................. - sub v7.4S, v6.4S, v24.4S // .....................................................*.................................................................. + str q10, [x0, #512] // ....................................................................................*................................... + sub v10.4S, v11.4S, v18.4S // ..........................................................e............................................................. + mul v7.4S, v5.4S, v3.S[0] // .........................e.............................................................................................. + sqrdmulh v19.4S, v5.4S, v3.S[1] // ..........................e............................................................................................. // gap // ........................................................................................................................ // gap // ........................................................................................................................ - mul v12.4S, v9.4S, v25.4S // ........................................................................................*............................... // gap // ........................................................................................................................ 
// gap // ........................................................................................................................ - mul v15.4S, v19.4S, v25.4S // ...........................................................................................*............................ - sqrdmulh v24.4S, v19.4S, v26.4S // ............................................................................................*........................... - sqrdmulh v6.4S, v4.4S, v0.S[3] // ....................................*................................................................................... + mul v27.4S, v10.4S, v0.S[0] // ............................................................e........................................................... + sqrdmulh v10.4S, v10.4S, v0.S[1] // .............................................................e.......................................................... + mls v21.4S, v29.4S, v8.S[0] // .................e...................................................................................................... // gap // ........................................................................................................................ // gap // ........................................................................................................................ - add v9.4S, v5.4S, v13.4S // ...........................................................*............................................................ - mul v19.4S, v7.4S, v0.S[0] // .......................................................*................................................................ - mul v28.4S, v4.4S, v0.S[2] // ...................................*.................................................................................... + mls v15.4S, v28.4S, v8.S[0] // ......................e................................................................................................. 
// gap // ........................................................................................................................ + mls v7.4S, v19.4S, v8.S[0] // ...........................e............................................................................................ // gap // ........................................................................................................................ + mls v24.4S, v22.4S, v8.4S // ...........................................................................................................*............ + mls v27.4S, v10.4S, v8.S[0] // ..............................................................e......................................................... + mul v10.4S, v6.4S, v0.S[0] // ..................................................e..................................................................... // gap // ........................................................................................................................ // gap // ........................................................................................................................ - sqrdmulh v4.4S, v9.4S, v26.4S // ...............................................................................................*........................ - mls v15.4S, v24.4S, v8.S[0] // .............................................................................................*.......................... - sub v21.4S, v5.4S, v13.4S // ..........................................................*............................................................. - mul v13.4S, v9.4S, v25.4S // ..............................................................................................*......................... // gap // ........................................................................................................................ 
// gap // ........................................................................................................................ - sqrdmulh v9.4S, v7.4S, v0.S[1] // ........................................................*............................................................... - mls v28.4S, v6.4S, v8.S[0] // .....................................*.................................................................................. + sub v29.4S, v20.4S, v21.4S // .................................e...................................................................................... + add v22.4S, v20.4S, v21.4S // ..................................e..................................................................................... + add v4.4S, v15.4S, v7.4S // ............................................e........................................................................... // gap // ........................................................................................................................ // gap // ........................................................................................................................ - cmge v24.4S, v31.4S, v15.4S // ........................................................................................................*............... - mul v14.4S, v14.4S, v0.S[0] // ..................................................*..................................................................... + sqrdmulh v5.4S, v6.4S, v0.S[1] // ...................................................e.................................................................... // gap // ........................................................................................................................ - ldr q6, [x0, #656] // .....e.................................................................................................................. 
- mls v13.4S, v4.4S, v8.S[0] // ................................................................................................*....................... // gap // ........................................................................................................................ + cmge v12.4S, v27.4S, v30.4S // .............................................................................e.......................................... + cmge v19.4S, v31.4S, v27.4S // ............................................................................e........................................... // gap // ........................................................................................................................ - cmge v7.4S, v15.4S, v30.4S // .........................................................................................................*.............. - add v11.4S, v28.4S, v18.4S // ................................................................*....................................................... - sub v4.4S, v28.4S, v18.4S // ...............................................................*........................................................ // gap // ........................................................................................................................ + mul v21.4S, v29.4S, v0.S[2] // ...................................e.................................................................................... + sub v28.4S, v15.4S, v7.4S // ...........................................e............................................................................ + sqrdmulh v6.4S, v29.4S, v0.S[3] // ....................................e................................................................................... // gap // ........................................................................................................................ 
// gap // ........................................................................................................................ + sub v29.4S, v19.4S, v12.4S // ..............................................................................e......................................... + mul v20.4S, v28.4S, v1.S[0] // .............................................e.......................................................................... // gap // ........................................................................................................................ - mls v14.4S, v10.4S, v8.S[0] // ....................................................*................................................................... - mul v5.4S, v21.4S, v0.S[0] // ............................................................*........................................................... // gap // ........................................................................................................................ - cmge v28.4S, v13.4S, v30.4S // .............................................................................................................*.......... - sqrdmulh v10.4S, v11.4S, v26.4S // ..................................................................................................*..................... + sqrdmulh v14.4S, v28.4S, v1.S[1] // ..............................................e......................................................................... + mls v27.4S, v29.4S, v8.4S // ...............................................................................e........................................ + sub v12.4S, v22.4S, v4.4S // .....................................................e.................................................................. // gap // ........................................................................................................................ 
- cmge v17.4S, v31.4S, v13.4S // ............................................................................................................*........... - mul v18.4S, v11.4S, v25.4S // .................................................................................................*...................... // gap // ........................................................................................................................ // gap // ........................................................................................................................ // gap // ........................................................................................................................ - mls v19.4S, v9.4S, v8.S[0] // .........................................................*.............................................................. - mul v9.4S, v4.4S, v0.S[0] // .................................................................*...................................................... + mls v21.4S, v6.4S, v8.S[0] // .....................................e.................................................................................. + add v29.4S, v22.4S, v4.4S // ......................................................e................................................................. + sqrdmulh v22.4S, v12.4S, v0.S[1] // ........................................................e............................................................... // gap // ........................................................................................................................ - sub v28.4S, v17.4S, v28.4S // ..............................................................................................................*......... + mls v20.4S, v14.4S, v8.S[0] // ...............................................e........................................................................ 
// gap // ........................................................................................................................ - sub v24.4S, v24.4S, v7.4S // ..........................................................................................................*............. + str q27, [x0, #784] // ......................................................................................e................................. + mul v27.4S, v12.4S, v0.S[0] // .......................................................e................................................................ // gap // ........................................................................................................................ - sqrdmulh v17.4S, v4.4S, v0.S[1] // ..................................................................*..................................................... + sqrdmulh v6.4S, v29.4S, v26.4S // ............................................................................................e........................... + mls v10.4S, v5.4S, v8.S[0] // ....................................................e................................................................... // gap // ........................................................................................................................ + str q24, [x0, #128] // .....................................................................................................................*.. + mul v24.4S, v29.4S, v25.4S // ...........................................................................................e............................ // gap // ........................................................................................................................ - mls v18.4S, v10.4S, v8.S[0] // ...................................................................................................*.................... 
- mls v13.4S, v28.4S, v29.4S // ...............................................................................................................*........ + sub v29.4S, v21.4S, v20.4S // ...............................................................e........................................................ + add v12.4S, v21.4S, v20.4S // ................................................................e....................................................... // gap // ........................................................................................................................ + mls v27.4S, v22.4S, v8.S[0] // .........................................................e.............................................................. + str q13, [x0, #384] // .......................................................................................................................* // gap // ........................................................................................................................ - cmge v10.4S, v31.4S, v14.4S // ....................................................................*................................................... - sqrdmulh v7.4S, v21.4S, v0.S[1] // .............................................................*.......................................................... + add v14.4S, v11.4S, v18.4S // ...........................................................e............................................................ // gap // ........................................................................................................................ // gap // ........................................................................................................................ - cmge v4.4S, v19.4S, v30.4S // .........................................................................*.............................................. 
- cmge v11.4S, v31.4S, v19.4S // ........................................................................*............................................... + sqrdmulh v19.4S, v29.4S, v0.S[1] // ..................................................................e..................................................... + mul v13.4S, v12.4S, v25.4S // .................................................................................................e...................... + str q9, [x0, #256] // ......................................................................................................................*. + mul v7.4S, v29.4S, v0.S[0] // .................................................................e...................................................... + mul v9.4S, v14.4S, v25.4S // ..............................................................................................e......................... // gap // ........................................................................................................................ // gap // ........................................................................................................................ - cmge v28.4S, v18.4S, v30.4S // .................................................................................................................*...... - str q13, [x0, #256] // ......................................................................................................................*. // gap // ........................................................................................................................ - cmge v13.4S, v31.4S, v18.4S // ................................................................................................................*....... - mls v12.4S, v23.4S, v8.S[0] // ..........................................................................................*............................. 
- mls v5.4S, v7.4S, v8.S[0] // ..............................................................*......................................................... + cmge v28.4S, v31.4S, v27.4S // ........................................................................e............................................... + sqrdmulh v12.4S, v12.4S, v26.4S // ..................................................................................................e..................... + cmge v21.4S, v23.4S, v30.4S // .....................................................................................................*.................. + sqrdmulh v20.4S, v14.4S, v26.4S // ...............................................................................................e........................ // gap // ........................................................................................................................ // gap // ........................................................................................................................ - cmge v23.4S, v14.4S, v30.4S // .....................................................................*.................................................. - mls v9.4S, v17.4S, v8.S[0] // ...................................................................*.................................................... + mls v24.4S, v6.4S, v8.S[0] // .............................................................................................e.......................... + mls v7.4S, v19.4S, v8.S[0] // ...................................................................e.................................................... // gap // ........................................................................................................................ // gap // ........................................................................................................................ 
- sub v7.4S, v13.4S, v28.4S // ..................................................................................................................*..... + mls v13.4S, v12.4S, v8.S[0] // ...................................................................................................e.................... // gap // ........................................................................................................................ // gap // ........................................................................................................................ - sub v28.4S, v11.4S, v4.4S // ..........................................................................*............................................. - sub v23.4S, v10.4S, v23.4S // ......................................................................*................................................. + cmge v29.4S, v31.4S, v10.4S // ....................................................................e................................................... + sub v18.4S, v16.4S, v21.4S // ......................................................................................................*................. + mls v9.4S, v20.4S, v8.S[0] // ................................................................................................e....................... // gap // ........................................................................................................................ // gap // ........................................................................................................................ - cmge v13.4S, v31.4S, v5.4S // ............................................................................*........................................... - cmge v4.4S, v31.4S, v12.4S // ....................................................................................................*................... 
- cmge v21.4S, v31.4S, v9.4S // ................................................................................*....................................... - mls v18.4S, v7.4S, v29.4S // ...................................................................................................................*.... // gap // ........................................................................................................................ // gap // ........................................................................................................................ - cmge v17.4S, v5.4S, v30.4S // .............................................................................*.......................................... - cmge v7.4S, v9.4S, v30.4S // .................................................................................*...................................... + cmge v15.4S, v31.4S, v7.4S // ................................................................................e....................................... + cmge v21.4S, v7.4S, v30.4S // .................................................................................e...................................... // gap // ........................................................................................................................ + cmge v5.4S, v31.4S, v13.4S // ................................................................................................................e....... + cmge v12.4S, v13.4S, v30.4S // .................................................................................................................e...... // gap // ........................................................................................................................ - cmge v11.4S, v12.4S, v30.4S // .....................................................................................................*.................. 
+ mls v23.4S, v18.4S, v8.4S // .......................................................................................................*................ // gap // ........................................................................................................................ - mls v14.4S, v23.4S, v29.4S // .......................................................................*................................................ // gap // ........................................................................................................................ - str q18, [x0, #384] // .......................................................................................................................* - ldr q18, [x0, #272] // ..e..................................................................................................................... - sub v17.4S, v13.4S, v17.4S // ..............................................................................*......................................... - mls v19.4S, v28.4S, v29.4S // ...........................................................................*............................................ - sub v28.4S, v21.4S, v7.4S // ..................................................................................*..................................... - mls v15.4S, v24.4S, v29.4S // ...........................................................................................................*............ + sub v15.4S, v15.4S, v21.4S // ..................................................................................e..................................... + cmge v14.4S, v24.4S, v30.4S // .........................................................................................................e.............. // gap // ........................................................................................................................ 
// gap // ........................................................................................................................ - sub v21.4S, v4.4S, v11.4S // ......................................................................................................*................. + sub v21.4S, v5.4S, v12.4S // ..................................................................................................................e..... + cmge v22.4S, v31.4S, v9.4S // ............................................................................................................e........... // gap // ........................................................................................................................ - str q14, [x0, #512] // ....................................................................................*................................... - mls v5.4S, v17.4S, v29.4S // ...............................................................................*........................................ - mls v9.4S, v28.4S, v29.4S // ...................................................................................*.................................... // gap // ........................................................................................................................ - sub v4.4S, v20.4S, v6.4S // ..................e..................................................................................................... - str q19, [x0, #640] // .....................................................................................*.................................. - mls v12.4S, v21.4S, v29.4S // .......................................................................................................*................ - // gap // ........................................................................................................................ 
- sub v19.4S, v18.4S, v27.4S // .............e.......................................................................................................... - str q15, [x0, #128] // .....................................................................................................................*.. - str q5, [x0, #768] // ......................................................................................*................................. - // gap // ........................................................................................................................ - mul v28.4S, v4.4S, v2.S[2] // ....................e................................................................................................... - add v7.4S, v22.4S, v16.4S // ........................e............................................................................................... - str q9, [x0, #896] // .......................................................................................*................................ - sqrdmulh v21.4S, v19.4S, v2.S[1] // ................e....................................................................................................... - mul v17.4S, v19.4S, v2.S[0] // ...............e........................................................................................................ - // gap // ........................................................................................................................ - str q12, [x0], #(16) // ....................................................................................................................*... - add v9.4S, v20.4S, v6.4S // ...................e.................................................................................................... - // gap // ........................................................................................................................ 
- add v6.4S, v18.4S, v27.4S // ..............e......................................................................................................... - - // original source code - // ldr q9, [x0, #0] // ...................................................................................................................|..*.................................................................................................................. - // ldr q10, [x0, #(1*(1024/8))] // ...................................................................................................................|*.................................................................................................................... - // ldr q11, [x0, #(2*(1024/8))] // ............................................................................................e......................|................................................................................................e.................... - // ldr q12, [x0, #(3*(1024/8))] // ..e................................................................................................................|......e.............................................................................................................. - // ldr q13, [x0, #(4*(1024/8))] // ............e......................................................................................................|................e.................................................................................................... - // ldr q14, [x0, #(5*(1024/8))] // ...................................................e...............................................................|.......................................................e............................................................. 
- // ldr q15, [x0, #(6*(1024/8))] // ........................e..........................................................................................|............................e........................................................................................ - // ldr q16, [x0, #(7*(1024/8))] // e..................................................................................................................|....e................................................................................................................ - // sub v24.4s, v9.4s, v10.4s // ..........*........................................................................................................|..............*...................................................................................................... - // add v9.4s, v9.4s, v10.4s // .......*...........................................................................................................|...........*......................................................................................................... - // mul v10.4s, v24.4s, v1.s[2] // ..............*....................................................................................................|..................*.................................................................................................. - // sqrdmulh v24.4s, v24.4s, v1.s[3] // ...............*...................................................................................................|...................*................................................................................................. - // mls v10.4s, v24.4s, v8.s[0] // .....................*.............................................................................................|.........................*........................................................................................... 
- // sub v24.4s, v11.4s, v12.4s // ........................................................................................................e..........|............................................................................................................e........ - // add v11.4s, v11.4s, v12.4s // ..................................................................................................................e|..................................................................................................................... - // mul v12.4s, v24.4s, v2.s[0] // ...............................................................................................................e...|...................................................................................................................e. - // sqrdmulh v24.4s, v24.4s, v2.s[1] // ..............................................................................................................e....|..................................................................................................................e.. - // mls v12.4s, v24.4s, v8.s[0] // ...................................................................................................................|...*................................................................................................................. - // sub v24.4s, v13.4s, v14.4s // .....................................................................................................e.............|.........................................................................................................e........... - // add v13.4s, v13.4s, v14.4s // .................................................................................................................e.|..................................................................................................................... 
- // mul v14.4s, v24.4s, v2.s[2] // ...........................................................................................................e.......|...............................................................................................................e..... - // sqrdmulh v24.4s, v24.4s, v2.s[3] // ...................................................................................................................*..................................................................................................................... - // mls v14.4s, v24.4s, v8.s[0] // ......*............................................................................................................|..........*.......................................................................................................... - // sub v24.4s, v15.4s, v16.4s // ...................................................................................................................|.*................................................................................................................... - // add v15.4s, v15.4s, v16.4s // ............................................................................................................e......|................................................................................................................e.... - // mul v16.4s, v24.4s, v3.s[0] // .....*.............................................................................................................|.........*........................................................................................................... - // sqrdmulh v24.4s, v24.4s, v3.s[1] // ...*...............................................................................................................|.......*............................................................................................................. 
- // mls v16.4s, v24.4s, v8.s[0] // .............*.....................................................................................................|.................*................................................................................................... - // sub v24.4s, v9.4s, v11.4s // .................*.................................................................................................|.....................*............................................................................................... - // add v9.4s, v9.4s, v11.4s // ...........*.......................................................................................................|...............*..................................................................................................... - // mul v11.4s, v24.4s, v0.s[2] // ..........................*........................................................................................|..............................*...................................................................................... - // sqrdmulh v24.4s, v24.4s, v0.s[3] // ....................*..............................................................................................|........................*............................................................................................ - // mls v11.4s, v24.4s, v8.s[0] // ...............................*...................................................................................|...................................*................................................................................. - // sub v24.4s, v10.4s, v12.4s // ................................*..................................................................................|....................................*................................................................................ 
- // add v10.4s, v10.4s, v12.4s // ..............................*....................................................................................|..................................*.................................................................................. - // mul v12.4s, v24.4s, v0.s[2] // ..........................................*........................................................................|..............................................*...................................................................... - // sqrdmulh v24.4s, v24.4s, v0.s[3] // .......................................*...........................................................................|...........................................*......................................................................... - // mls v12.4s, v24.4s, v8.s[0] // ................................................*..................................................................|....................................................*................................................................ - // sub v24.4s, v13.4s, v15.4s // .*.................................................................................................................|.....*............................................................................................................... - // add v13.4s, v13.4s, v15.4s // ....*..............................................................................................................|........*............................................................................................................ - // mul v15.4s, v24.4s, v1.s[0] // .........*.........................................................................................................|.............*....................................................................................................... 
- // sqrdmulh v24.4s, v24.4s, v1.s[1] // ........*..........................................................................................................|............*........................................................................................................ - // mls v15.4s, v24.4s, v8.s[0] // ..................................*................................................................................|......................................*.............................................................................. - // sub v24.4s, v14.4s, v16.4s // ...................*...............................................................................................|.......................*............................................................................................. - // add v14.4s, v14.4s, v16.4s // ............................*......................................................................................|................................*.................................................................................... - // mul v16.4s, v24.4s, v1.s[0] // ......................*............................................................................................|..........................*.......................................................................................... - // sqrdmulh v24.4s, v24.4s, v1.s[1] // .......................*...........................................................................................|...........................*......................................................................................... - // mls v16.4s, v24.4s, v8.s[0] // .............................*.....................................................................................|.................................*................................................................................... 
- // sub v24.4s, v9.4s, v13.4s // ..................*................................................................................................|......................*.............................................................................................. - // add v9.4s, v9.4s, v13.4s // ................*..................................................................................................|....................*................................................................................................ - // mul v13.4s, v24.4s, v0.s[0] // ..................................................*................................................................|......................................................*.............................................................. - // sqrdmulh v24.4s, v24.4s, v0.s[1] // .........................*.........................................................................................|.............................*....................................................................................... - // mls v13.4s, v24.4s, v8.s[0] // ........................................................*..........................................................|............................................................*........................................................ - // sub v24.4s, v10.4s, v14.4s // ...................................*...............................................................................|.......................................*............................................................................. - // add v10.4s, v10.4s, v14.4s // .................................*.................................................................................|.....................................*............................................................................... 
- // mul v14.4s, v24.4s, v0.s[0] // .........................................*.........................................................................|.............................................*....................................................................... - // sqrdmulh v24.4s, v24.4s, v0.s[1] // ...............................................*...................................................................|...................................................*................................................................. - // mls v14.4s, v24.4s, v8.s[0] // ..............................................................*....................................................|..................................................................*.................................................. - // sub v24.4s, v11.4s, v15.4s // .............................................*.....................................................................|.................................................*................................................................... - // add v11.4s, v11.4s, v15.4s // ........................................*..........................................................................|............................................*........................................................................ - // mul v15.4s, v24.4s, v0.s[0] // .........................................................*.........................................................|.............................................................*....................................................... - // sqrdmulh v24.4s, v24.4s, v0.s[1] // ......................................................................*............................................|..........................................................................*.......................................... 
- // mls v15.4s, v24.4s, v8.s[0] // .............................................................................*.....................................|.................................................................................*................................... - // sub v24.4s, v12.4s, v16.4s // .......................................................*...........................................................|...........................................................*......................................................... - // add v12.4s, v12.4s, v16.4s // ......................................................*............................................................|..........................................................*.......................................................... - // mul v16.4s, v24.4s, v0.s[0] // ...............................................................*...................................................|...................................................................*................................................. - // sqrdmulh v24.4s, v24.4s, v0.s[1] // ..................................................................*................................................|......................................................................*.............................................. - // mls v16.4s, v24.4s, v8.s[0] // ...............................................................................*...................................|...................................................................................*................................. - // cmge v27.4s, v31.4s, v13.4s // .....................................................................*.............................................|.........................................................................*........................................... 
- // cmge v28.4s, v13.4s, v30.4s // ..............................................................................*....................................|..................................................................................*.................................. - // sub v28.4s, v27.4s, v28.4s // ..................................................................................*................................|......................................................................................*.............................. - // mls v13.4s, v28.4s, v29.4s // ..........................................................................................*........................|..............................................................................................*...................... - // cmge v27.4s, v31.4s, v14.4s // ........................................................................*..........................................|............................................................................*........................................ - // cmge v28.4s, v14.4s, v30.4s // .......................................................................*...........................................|...........................................................................*......................................... - // sub v28.4s, v27.4s, v28.4s // .................................................................................*.................................|.....................................................................................*............................... - // mls v14.4s, v28.4s, v29.4s // ..............................................................................................*....................|..................................................................................................*.................. 
- // cmge v27.4s, v31.4s, v15.4s // ...................................................................................*...............................|.......................................................................................*............................. - // cmge v28.4s, v15.4s, v30.4s // .......................................................................................*...........................|...........................................................................................*......................... - // sub v28.4s, v27.4s, v28.4s // .............................................................................................*.....................|.................................................................................................*................... - // mls v15.4s, v28.4s, v29.4s // ...................................................................................................*...............|.......................................................................................................*............. - // cmge v27.4s, v31.4s, v16.4s // .....................................................................................*.............................|.........................................................................................*........................... - // cmge v28.4s, v16.4s, v30.4s // ........................................................................................*..........................|............................................................................................*........................ - // sub v28.4s, v27.4s, v28.4s // ...............................................................................................*...................|...................................................................................................*................. 
- // mls v16.4s, v28.4s, v29.4s // ....................................................................................................*..............|........................................................................................................*............ - // str q13, [x0, #(4*(1024/8))] // ..................................................................................................*................|......................................................................................................*.............. - // str q14, [x0, #(5*(1024/8))] // ......................................................................................................*............|..........................................................................................................*.......... - // str q15, [x0, #(6*(1024/8))] // ..........................................................................................................*........|..............................................................................................................*...... - // str q16, [x0, #(7*(1024/8))] // .............................................................................................................*.....|.................................................................................................................*... - // mul v13.4s, v9.4s, v25.4s // ....................................*..............................................................................|........................................*............................................................................ - // sqrdmulh v9.4s, v9.4s, v26.4s // ...........................*.......................................................................................|...............................*..................................................................................... 
- // mls v13.4s, v9.4s, v8.s[0] // ............................................................................*......................................|................................................................................*.................................... - // mul v14.4s, v10.4s, v25.4s // .....................................*.............................................................................|.........................................*........................................................................... - // sqrdmulh v10.4s, v10.4s, v26.4s // ......................................*............................................................................|..........................................*.......................................................................... - // mls v14.4s, v10.4s, v8.s[0] // ............................................*......................................................................|................................................*.................................................................... - // mul v15.4s, v11.4s, v25.4s // ..............................................*....................................................................|..................................................*.................................................................. - // sqrdmulh v11.4s, v11.4s, v26.4s // ...........................................*.......................................................................|...............................................*..................................................................... - // mls v15.4s, v11.4s, v8.s[0] // ....................................................*..............................................................|........................................................*............................................................ 
- // mul v16.4s, v12.4s, v25.4s // .............................................................*.....................................................|.................................................................*................................................... - // sqrdmulh v12.4s, v12.4s, v26.4s // ...........................................................*.......................................................|...............................................................*..................................................... - // mls v16.4s, v12.4s, v8.s[0] // ...................................................................*...............................................|.......................................................................*............................................. - // cmge v27.4s, v31.4s, v13.4s // ....................................................................................*..............................|........................................................................................*............................ - // cmge v28.4s, v13.4s, v30.4s // .........................................................................................*.........................|.............................................................................................*....................... - // sub v28.4s, v27.4s, v28.4s // .................................................................................................*.................|.....................................................................................................*............... - // mls v13.4s, v28.4s, v29.4s // .......................................................................................................*...........|...........................................................................................................*......... 
- // cmge v27.4s, v31.4s, v14.4s // .................................................*.................................................................|.....................................................*............................................................... - // cmge v28.4s, v14.4s, v30.4s // .....................................................*.............................................................|.........................................................*........................................................... - // sub v28.4s, v27.4s, v28.4s // .................................................................*.................................................|.....................................................................*............................................... - // mls v14.4s, v28.4s, v29.4s // ................................................................................................*..................|....................................................................................................*................ - // cmge v27.4s, v31.4s, v15.4s // ............................................................*......................................................|................................................................*.................................................... - // cmge v28.4s, v15.4s, v30.4s // ..........................................................*........................................................|..............................................................*...................................................... - // sub v28.4s, v27.4s, v28.4s // ................................................................*..................................................|....................................................................*................................................ 
- // mls v15.4s, v28.4s, v29.4s // ....................................................................*..............................................|........................................................................*............................................ - // cmge v27.4s, v31.4s, v16.4s // ...........................................................................*.......................................|...............................................................................*..................................... - // cmge v28.4s, v16.4s, v30.4s // .........................................................................*.........................................|.............................................................................*....................................... - // sub v28.4s, v27.4s, v28.4s // ................................................................................*..................................|....................................................................................*................................ - // mls v16.4s, v28.4s, v29.4s // ......................................................................................*............................|..........................................................................................*.......................... - // str q13, [x0], #(16) // ................................................................................................................*..|....................................................................................................................* - // str q14, [x0, #(-16 + 1*(1024/8))] // .........................................................................................................*.........|.............................................................................................................*....... 
- // str q15, [x0, #(-16 + 2*(1024/8))] // ..........................................................................*........................................|..............................................................................*...................................... - // str q16, [x0, #(-16 + 3*(1024/8))] // ...........................................................................................*.......................|...............................................................................................*..................... + mls v7.4S, v15.4S, v8.4S // ...................................................................................e.................................... + mls v13.4S, v21.4S, v8.4S // ...................................................................................................................e.... + str q23, [x0], #(16) // ....................................................................................................................*... + mul v23.4S, v17.4S, v25.4S // ........................................................................................e............................... + // gap // ........................................................................................................................ 
+ + // --------------------------------------------------------------------------------------------------------------- new position ---------------------------------------------------------------------------------------------------------------> + // 0 25 50 75 100 125 150 175 200 225 + // |------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|----------- + // ldr q9, [x0, #0] // ............e.........................................................................................................'.............~........................................................................................................ + // ldr q10, [x0, #(1*(1024/8))] // ...........e..........................................................................................................'............~......................................................................................................... + // ldr q11, [x0, #(2*(1024/8))] // ..e...................................................................................................................'...~.................................................................................................................. + // ldr q12, [x0, #(3*(1024/8))] // e.....................................................................................................................'.~.................................................................................................................... + // ldr q13, [x0, #(4*(1024/8))] // .................e....................................................................................................'..................~................................................................................................... 
+ // ldr q14, [x0, #(5*(1024/8))] // .....e................................................................................................................'......~............................................................................................................... + // ldr q15, [x0, #(6*(1024/8))] // .e....................................................................................................................'..~................................................................................................................... + // ldr q16, [x0, #(7*(1024/8))] // ......e...............................................................................................................'.......~.............................................................................................................. + // sub v24.4s, v9.4s, v10.4s // ......................e...............................................................................................'.......................~.............................................................................................. + // add v9.4s, v9.4s, v10.4s // .......................e..............................................................................................'........................~............................................................................................. + // mul v10.4s, v24.4s, v1.s[2] // .......................................e..............................................................................'........................................~............................................................................. + // sqrdmulh v24.4s, v24.4s, v1.s[3] // ...........................e..........................................................................................'............................~......................................................................................... 
+ // mls v10.4s, v24.4s, v8.s[0] // ..............................................e.......................................................................'...............................................~...................................................................... + // sub v24.4s, v11.4s, v12.4s // ...............e......................................................................................................'................~..................................................................................................... + // add v11.4s, v11.4s, v12.4s // .....................e................................................................................................'......................~............................................................................................... + // mul v12.4s, v24.4s, v2.s[0] // ....................e.................................................................................................'.....................~................................................................................................ + // sqrdmulh v24.4s, v24.4s, v2.s[1] // ...............................................e......................................................................'................................................~..................................................................... + // mls v12.4s, v24.4s, v8.s[0] // .......................................................e..............................................................'........................................................~............................................................. + // sub v24.4s, v13.4s, v14.4s // ........................e.............................................................................................'.........................~............................................................................................ 
+ // add v13.4s, v13.4s, v14.4s // .........................e............................................................................................'..........................~........................................................................................... + // mul v14.4s, v24.4s, v2.s[2] // ..........................................e...........................................................................'...........................................~.......................................................................... + // sqrdmulh v24.4s, v24.4s, v2.s[3] // ........................................e.............................................................................'.........................................~............................................................................ + // mls v14.4s, v24.4s, v8.s[0] // ........................................................e.............................................................'.........................................................~............................................................ + // sub v24.4s, v15.4s, v16.4s // ............................................e.........................................................................'.............................................~........................................................................ + // add v15.4s, v15.4s, v16.4s // ...................e..................................................................................................'....................~................................................................................................. + // mul v16.4s, v24.4s, v3.s[0] // ...................................................e..................................................................'....................................................~................................................................. 
+ // sqrdmulh v24.4s, v24.4s, v3.s[1] // ....................................................e.................................................................'.....................................................~................................................................ + // mls v16.4s, v24.4s, v8.s[0] // .........................................................e............................................................'..........................................................~........................................................... + // sub v24.4s, v9.4s, v11.4s // .............................e........................................................................................'..............................~....................................................................................... + // add v9.4s, v9.4s, v11.4s // ......................................e...............................................................................'.......................................~.............................................................................. + // mul v11.4s, v24.4s, v0.s[2] // .................................e....................................................................................'..................................~................................................................................... + // sqrdmulh v24.4s, v24.4s, v0.s[3] // ..................................e...................................................................................'...................................~.................................................................................. + // mls v11.4s, v24.4s, v8.s[0] // .........................................e............................................................................'..........................................~........................................................................... 
+ // sub v24.4s, v10.4s, v12.4s // .............................................................e........................................................'..............................................................~....................................................... + // add v10.4s, v10.4s, v12.4s // ..............................................................e.......................................................'...............................................................~...................................................... + // mul v12.4s, v24.4s, v0.s[2] // ...................................................................e..................................................'....................................................................~................................................. + // sqrdmulh v24.4s, v24.4s, v0.s[3] // .....................................................................e................................................'......................................................................~............................................... + // mls v12.4s, v24.4s, v8.s[0] // ...........................................................................e..........................................'............................................................................~......................................... + // sub v24.4s, v13.4s, v15.4s // ................................e.....................................................................................'.................................~.................................................................................... + // add v13.4s, v13.4s, v15.4s // ...............................e......................................................................................'................................~..................................................................................... 
+ // mul v15.4s, v24.4s, v1.s[0] // .....................................e................................................................................'......................................~............................................................................... + // sqrdmulh v24.4s, v24.4s, v1.s[1] // ...................................e..................................................................................'....................................~................................................................................. + // mls v15.4s, v24.4s, v8.s[0] // ...........................................e..........................................................................'............................................~......................................................................... + // sub v24.4s, v14.4s, v16.4s // ....................................................................e.................................................'.....................................................................~................................................ + // add v14.4s, v14.4s, v16.4s // ...............................................................e......................................................'................................................................~..................................................... + // mul v16.4s, v24.4s, v1.s[0] // .......................................................................e..............................................'........................................................................~............................................. + // sqrdmulh v24.4s, v24.4s, v1.s[1] // ........................................................................e.............................................'.........................................................................~............................................ 
+ // mls v16.4s, v24.4s, v8.s[0] // ..............................................................................e.......................................'...............................................................................~...................................... + // sub v24.4s, v9.4s, v13.4s // .............................................e........................................................................'..............................................~....................................................................... + // add v9.4s, v9.4s, v13.4s // ................................................e.....................................................................'.................................................~.................................................................... + // mul v13.4s, v24.4s, v0.s[0] // ............................................................e.........................................................'.............................................................~........................................................ + // sqrdmulh v24.4s, v24.4s, v0.s[1] // ................................................................e.....................................................'.................................................................~.................................................... + // mls v13.4s, v24.4s, v8.s[0] // ..................................................................................e...................................'...................................................................................~.................................. + // sub v24.4s, v10.4s, v14.4s // ..........................................................................e...........................................'...........................................................................~.......................................... 
+ // add v10.4s, v10.4s, v14.4s // ............................................................................e.........................................'.............................................................................~........................................ + // mul v14.4s, v24.4s, v0.s[0] // ................................................................................e.....................................'.................................................................................~.................................... + // sqrdmulh v24.4s, v24.4s, v0.s[1] // .............................................................................e........................................'..............................................................................~....................................... + // mls v14.4s, v24.4s, v8.s[0] // .......................................................................................e..............................'........................................................................................~............................. + // sub v24.4s, v11.4s, v15.4s // ..................................................e...................................................................'...................................................~.................................................................. + // add v11.4s, v11.4s, v15.4s // .........................................................................................e............................'..........................................................................................~........................... + // mul v15.4s, v24.4s, v0.s[0] // .....................................................e................................................................'......................................................~............................................................... 
+ // sqrdmulh v24.4s, v24.4s, v0.s[1] // ......................................................e...............................................................'.......................................................~.............................................................. + // mls v15.4s, v24.4s, v8.s[0] // ...........................................................e..........................................................'............................................................~......................................................... + // sub v24.4s, v12.4s, v16.4s // .....................................................................................e................................'......................................................................................~............................... + // add v12.4s, v12.4s, v16.4s // ......................................................................................e...............................'.......................................................................................~.............................. + // mul v16.4s, v24.4s, v0.s[0] // .............................................................................................e........................'..............................................................................................~....................... + // sqrdmulh v24.4s, v24.4s, v0.s[1] // ..........................................................................................e...........................'...........................................................................................~.......................... + // mls v16.4s, v24.4s, v8.s[0] // ....................................................................................................e.................'.....................................................................................................~................ 
+ // cmge v27.4s, v31.4s, v13.4s // ......................................................................................................e...............'.......................................................................................................~.............. + // cmge v28.4s, v13.4s, v30.4s // ......................................................................................................................'*..................................................................................................................... + // sub v28.4s, v27.4s, v28.4s // .......~..............................................................................................................'........*............................................................................................................. + // mls v13.4s, v28.4s, v8.4s // ..........................~...........................................................................................'...........................*.......................................................................................... + // cmge v27.4s, v31.4s, v14.4s // ...............................................................................................e......................'................................................................................................~..................... + // cmge v28.4s, v14.4s, v30.4s // ....~.................................................................................................................'.....*................................................................................................................ + // sub v28.4s, v27.4s, v28.4s // .............~........................................................................................................'..............*....................................................................................................... 
+ // mls v14.4s, v28.4s, v8.4s // ............................~.........................................................................................'.............................*........................................................................................ + // cmge v27.4s, v31.4s, v15.4s // ..................................................................e...................................................'...................................................................~.................................................. + // cmge v28.4s, v15.4s, v30.4s // .................................................................e....................................................'..................................................................~................................................... + // sub v28.4s, v27.4s, v28.4s // ......................................................................e...............................................'.......................................................................~.............................................. + // mls v15.4s, v28.4s, v8.4s // .........................................................................e............................................'..........................................................................~........................................... + // cmge v27.4s, v31.4s, v16.4s // .........................................................................................................e............'..........................................................................................................~........... + // cmge v28.4s, v16.4s, v30.4s // ..........................................................................................................e...........'...........................................................................................................~.......... 
+ // sub v28.4s, v27.4s, v28.4s // ..............................................................................................................e.......'...............................................................................................................~...... + // mls v16.4s, v28.4s, v8.4s // ..................................................................................................................e...'...................................................................................................................~.. + // str q13, [x0, #(4*(1024/8))] // .................................................~....................................................................'..................................................*................................................................... + // str q14, [x0, #(5*(1024/8))] // ....................................~.................................................................................'.....................................*................................................................................ + // str q15, [x0, #(6*(1024/8))] // ...............................................................................e......................................'................................................................................~..................................... + // str q16, [x0, #(7*(1024/8))] // ..............................~.......................................................................................'...............................*...................................................................................... + // mul v13.4s, v9.4s, v25.4s // .....................................................................................................................e'...................................................................................................................... 
+ // sqrdmulh v9.4s, v9.4s, v26.4s // ......................................................................................................................*...................................................................................................................... + // mls v13.4s, v9.4s, v8.s[0] // .........~............................................................................................................'..........*........................................................................................................... + // mul v14.4s, v10.4s, v25.4s // ....................................................................................e.................................'.....................................................................................~................................ + // sqrdmulh v10.4s, v10.4s, v26.4s // .................................................................................e....................................'..................................................................................~................................... + // mls v14.4s, v10.4s, v8.s[0] // ...................................................................................................e..................'....................................................................................................~................. + // mul v15.4s, v11.4s, v25.4s // ..............................................................................................e.......................'...............................................................................................~...................... + // sqrdmulh v11.4s, v11.4s, v26.4s // ..................................................................................................e...................'...................................................................................................~.................. 
+ // mls v15.4s, v11.4s, v8.s[0] // ........................................................................................................e.............'.........................................................................................................~............ + // mul v16.4s, v12.4s, v25.4s // ...........................................................................................e..........................'............................................................................................~......................... + // sqrdmulh v12.4s, v12.4s, v26.4s // ................................................................................................e.....................'.................................................................................................~.................... + // mls v16.4s, v12.4s, v8.s[0] // .....................................................................................................e................'......................................................................................................~............... + // cmge v27.4s, v31.4s, v13.4s // ..................~...................................................................................................'...................*.................................................................................................. + // cmge v28.4s, v13.4s, v30.4s // .................................................................................................~....................'..................................................................................................*................... + // sub v28.4s, v27.4s, v28.4s // .......................................................................................................~..............'........................................................................................................*............. 
+ // mls v13.4s, v28.4s, v8.4s // .............................................................................................................~........'..............................................................................................................*....... + // cmge v27.4s, v31.4s, v14.4s // ........~.............................................................................................................'.........*............................................................................................................ + // cmge v28.4s, v14.4s, v30.4s // ...............................................................................................................e......'................................................................................................................~..... + // sub v28.4s, v27.4s, v28.4s // ..............~.......................................................................................................'...............*...................................................................................................... + // mls v14.4s, v28.4s, v8.4s // ..........................................................~...........................................................'...........................................................*.......................................................... + // cmge v27.4s, v31.4s, v15.4s // .................................................................................................................e....'..................................................................................................................~... + // cmge v28.4s, v15.4s, v30.4s // ...~..................................................................................................................'....*................................................................................................................. 
+ // sub v28.4s, v27.4s, v28.4s // ..........~...........................................................................................................'...........*.......................................................................................................... + // mls v15.4s, v28.4s, v8.4s // ................~.....................................................................................................'.................*.................................................................................................... + // cmge v27.4s, v31.4s, v16.4s // ...........................................................................................................e..........'............................................................................................................~......... + // cmge v28.4s, v16.4s, v30.4s // ............................................................................................................e.........'.............................................................................................................~........ + // sub v28.4s, v27.4s, v28.4s // ................................................................................................................e.....'.................................................................................................................~.... + // mls v16.4s, v28.4s, v8.4s // ...................................................................................................................e..'....................................................................................................................~. 
+ // str q13, [x0], #(16) // ....................................................................................................................~.'.....................................................................................................................* + // str q14, [x0, #(-16 + 1*(1024/8))] // ...................................................................................~..................................'....................................................................................*................................. + // str q15, [x0, #(-16 + 2*(1024/8))] // ............................................................................................~.........................'.............................................................................................*........................ + // str q16, [x0, #(-16 + 3*(1024/8))] // ........................................................................................~.............................'.........................................................................................*............................ sub count, count, #1 cbnz count, layer123_start - sqrdmulh v15.4S, v4.4S, v2.S[3] // *......................................................................................................... - ldr q13, [x0, #128] // .*........................................................................................................ - sub v16.4S, v22.4S, v16.4S // ..*....................................................................................................... - ldr q19, [x0, #0] // ...*...................................................................................................... - // gap // .......................................................................................................... - // gap // .......................................................................................................... 
- mls v17.4S, v21.4S, v8.S[0] // ....*..................................................................................................... - add v12.4S, v9.4S, v7.4S // .......*.................................................................................................. - sqrdmulh v18.4S, v16.4S, v3.S[1] // ......*................................................................................................... - mul v4.4S, v16.4S, v3.S[0] // ........*................................................................................................. - // gap // .......................................................................................................... - // gap // .......................................................................................................... - mls v28.4S, v15.4S, v8.S[0] // .........*................................................................................................ - sub v11.4S, v9.4S, v7.4S // .....*.................................................................................................... - // gap // .......................................................................................................... - // gap // .......................................................................................................... - // gap // .......................................................................................................... - // gap // .......................................................................................................... - add v23.4S, v19.4S, v13.4S // ..........*............................................................................................... - sub v22.4S, v19.4S, v13.4S // .............*............................................................................................ - mul v7.4S, v11.4S, v1.S[0] // ............*............................................................................................. 
- // gap // .......................................................................................................... - // gap // .......................................................................................................... - mls v4.4S, v18.4S, v8.S[0] // ...............*.......................................................................................... - mul v14.4S, v22.4S, v1.S[2] // ................*......................................................................................... - // gap // .......................................................................................................... - sqrdmulh v21.4S, v22.4S, v1.S[3] // .................*........................................................................................ - // gap // .......................................................................................................... - add v24.4S, v23.4S, v6.4S // ..............*........................................................................................... - // gap // .......................................................................................................... - // gap // .......................................................................................................... - sub v5.4S, v23.4S, v6.4S // ...................*...................................................................................... - // gap // .......................................................................................................... - sqrdmulh v9.4S, v11.4S, v1.S[1] // ...........*.............................................................................................. - // gap // .......................................................................................................... - sub v27.4S, v28.4S, v4.4S // .....................*.................................................................................... 
- sqrdmulh v23.4S, v5.4S, v0.S[3] // ......................*................................................................................... - // gap // .......................................................................................................... - mls v14.4S, v21.4S, v8.S[0] // .......................*.................................................................................. - // gap // .......................................................................................................... - // gap // .......................................................................................................... - mul v5.4S, v5.4S, v0.S[2] // ...........................*.............................................................................. - // gap // .......................................................................................................... - sqrdmulh v20.4S, v27.4S, v1.S[1] // .........................*................................................................................ - // gap // .......................................................................................................... - mls v7.4S, v9.4S, v8.S[0] // ...................................*...................................................................... - mul v19.4S, v27.4S, v1.S[0] // ........................*................................................................................. - // gap // .......................................................................................................... - // gap // .......................................................................................................... - sub v16.4S, v14.4S, v17.4S // .................................*........................................................................ - // gap // .......................................................................................................... 
- add v18.4S, v24.4S, v12.4S // ..................*....................................................................................... - add v21.4S, v14.4S, v17.4S // ...............................*.......................................................................... - mls v5.4S, v23.4S, v8.S[0] // ................................*......................................................................... - // gap // .......................................................................................................... - // gap // .......................................................................................................... - // gap // .......................................................................................................... - mul v23.4S, v16.4S, v0.S[2] // ...........................................*.............................................................. - // gap // .......................................................................................................... - sqrdmulh v15.4S, v16.4S, v0.S[3] // ........................................*................................................................. - mls v19.4S, v20.4S, v8.S[0] // ..............................*........................................................................... - // gap // .......................................................................................................... - add v28.4S, v28.4S, v4.4S // .............................*............................................................................ - // gap // .......................................................................................................... - add v4.4S, v5.4S, v7.4S // .........................................*................................................................ - sub v9.4S, v5.4S, v7.4S // ..............................................*........................................................... 
- // gap // .......................................................................................................... - // gap // .......................................................................................................... - mls v23.4S, v15.4S, v8.S[0] // .................................................*........................................................ - // gap // .......................................................................................................... - // gap // .......................................................................................................... - add v13.4S, v21.4S, v28.4S // ..................................*....................................................................... - mul v15.4S, v9.4S, v0.S[0] // .........................................................*................................................ - sqrdmulh v5.4S, v9.4S, v0.S[1] // ......................................................................*................................... - // gap // .......................................................................................................... - // gap // .......................................................................................................... - sub v16.4S, v21.4S, v28.4S // ....................................*..................................................................... - // gap // .......................................................................................................... - // gap // .......................................................................................................... - sqrdmulh v10.4S, v4.4S, v26.4S // ............................................*............................................................. - sub v9.4S, v23.4S, v19.4S // .......................................................*.................................................. 
- sqrdmulh v17.4S, v18.4S, v26.4S // ............................*............................................................................. - // gap // .......................................................................................................... - // gap // .......................................................................................................... - mls v15.4S, v5.4S, v8.S[0] // .............................................................................*............................ - mul v7.4S, v18.4S, v25.4S // .....................................*.................................................................... - // gap // .......................................................................................................... - // gap // .......................................................................................................... - // gap // .......................................................................................................... - sqrdmulh v5.4S, v9.4S, v0.S[1] // ..................................................................*....................................... - // gap // .......................................................................................................... - mul v27.4S, v9.4S, v0.S[0] // ...............................................................*.......................................... - // gap // .......................................................................................................... - sqrdmulh v28.4S, v16.4S, v0.S[1] // ................................................*......................................................... - // gap // .......................................................................................................... - mul v22.4S, v16.4S, v0.S[0] // ..........................................*............................................................... 
- sub v18.4S, v24.4S, v12.4S // ....................*..................................................................................... - // gap // .......................................................................................................... - mls v7.4S, v17.4S, v8.S[0] // ............................................................................*............................. - // gap // .......................................................................................................... - // gap // .......................................................................................................... - mul v21.4S, v13.4S, v25.4S // ......................................*................................................................... - // gap // .......................................................................................................... - mls v27.4S, v5.4S, v8.S[0] // ...............................................................................*.......................... - mul v17.4S, v18.4S, v0.S[0] // ...................................................*...................................................... - mls v22.4S, v28.4S, v8.S[0] // ..............................................................*........................................... - // gap // .......................................................................................................... - // gap // .......................................................................................................... - sqrdmulh v16.4S, v13.4S, v26.4S // .......................................*.................................................................. - sqrdmulh v24.4S, v18.4S, v0.S[1] // ..........................*............................................................................... - // gap // .......................................................................................................... 
- // gap // .......................................................................................................... - // gap // .......................................................................................................... - cmge v11.4S, v27.4S, v30.4S // ........................................................................................*................. - // gap // .......................................................................................................... - cmge v5.4S, v7.4S, v30.4S // .........................................................................................*................ - cmge v9.4S, v31.4S, v27.4S // .....................................................................................*.................... - // gap // .......................................................................................................... - // gap // .......................................................................................................... - cmge v12.4S, v22.4S, v30.4S // .......................................................................*.................................. - cmge v13.4S, v31.4S, v22.4S // ........................................................................*................................. - cmge v20.4S, v31.4S, v7.4S // ....................................................................................*..................... - // gap // .......................................................................................................... - // gap // .......................................................................................................... - sub v9.4S, v9.4S, v11.4S // ..............................................................................................*........... - mls v21.4S, v16.4S, v8.S[0] // .............................................*............................................................ 
- // gap // .......................................................................................................... - // gap // .......................................................................................................... - mul v6.4S, v4.4S, v25.4S // ...............................................*.......................................................... - sub v5.4S, v20.4S, v5.4S // ................................................................................................*......... - // gap // .......................................................................................................... - // gap // .......................................................................................................... - sub v12.4S, v13.4S, v12.4S // .................................................................................*........................ - mls v27.4S, v9.4S, v29.4S // ...................................................................................................*...... - // gap // .......................................................................................................... - // gap // .......................................................................................................... - mls v17.4S, v24.4S, v8.S[0] // ........................................................*................................................. - mls v7.4S, v5.4S, v29.4S // .....................................................................................................*.... - // gap // .......................................................................................................... - // gap // .......................................................................................................... - cmge v14.4S, v21.4S, v30.4S // .....................................................*.................................................... 
- mls v22.4S, v12.4S, v29.4S // .............................................................................................*............ - // gap // .......................................................................................................... - // gap // .......................................................................................................... - str q27, [x0, #896] // ........................................................................................................*. - add v19.4S, v23.4S, v19.4S // ......................................................*................................................... - mls v6.4S, v10.4S, v8.S[0] // ....................................................*..................................................... - // gap // .......................................................................................................... - str q7, [x0], #(16) // .........................................................................................................* - // gap // .......................................................................................................... - cmge v20.4S, v31.4S, v21.4S // ..................................................*....................................................... - cmge v12.4S, v31.4S, v17.4S // .....................................................................*.................................... - mul v16.4S, v19.4S, v25.4S // .............................................................*............................................ - str q22, [x0, #624] // ....................................................................................................*..... - // gap // .......................................................................................................... - sqrdmulh v13.4S, v19.4S, v26.4S // ...........................................................*.............................................. 
- // gap // .......................................................................................................... - cmge v27.4S, v31.4S, v6.4S // ............................................................*............................................. - // gap // .......................................................................................................... - cmge v10.4S, v6.4S, v30.4S // ..........................................................*............................................... - // gap // .......................................................................................................... - cmge v9.4S, v17.4S, v30.4S // ..............................................................................*........................... - // gap // .......................................................................................................... - cmge v5.4S, v15.4S, v30.4S // .......................................................................................*.................. - mls v16.4S, v13.4S, v8.S[0] // ...................................................................*...................................... - // gap // .......................................................................................................... - // gap // .......................................................................................................... - sub v19.4S, v27.4S, v10.4S // ................................................................*......................................... - cmge v11.4S, v31.4S, v15.4S // ...................................................................................*...................... - sub v13.4S, v12.4S, v9.4S // ..................................................................................*....................... - // gap // .......................................................................................................... 
- // gap // .......................................................................................................... - sub v27.4S, v20.4S, v14.4S // .................................................................*........................................ - // gap // .......................................................................................................... - // gap // .......................................................................................................... - mls v6.4S, v19.4S, v29.4S // ....................................................................*..................................... - cmge v9.4S, v31.4S, v16.4S // ...........................................................................*.............................. - cmge v18.4S, v16.4S, v30.4S // .........................................................................*................................ - // gap // .......................................................................................................... - // gap // .......................................................................................................... - sub v12.4S, v11.4S, v5.4S // ............................................................................................*............. - mls v17.4S, v13.4S, v29.4S // ..........................................................................................*............... - // gap // .......................................................................................................... - // gap // .......................................................................................................... - mls v21.4S, v27.4S, v29.4S // ...............................................................................................*.......... - sub v28.4S, v9.4S, v18.4S // ................................................................................*......................... 
- // gap // .......................................................................................................... - // gap // .......................................................................................................... - mls v15.4S, v12.4S, v29.4S // ..................................................................................................*....... - // gap // .......................................................................................................... - // gap // .......................................................................................................... - str q6, [x0, #240] // ..........................................................................*............................... - str q17, [x0, #496] // .................................................................................................*........ - mls v16.4S, v28.4S, v29.4S // ......................................................................................*................... - // gap // .......................................................................................................... - // gap // .......................................................................................................... - str q21, [x0, #112] // ......................................................................................................*... - // gap // .......................................................................................................... - // gap // .......................................................................................................... - // gap // .......................................................................................................... - str q15, [x0, #752] // .......................................................................................................*.. - // gap // .......................................................................................................... 
- // gap // .......................................................................................................... - // gap // .......................................................................................................... - str q16, [x0, #368] // ...........................................................................................*.............. - // gap // .......................................................................................................... - // gap // .......................................................................................................... - // gap // .......................................................................................................... - - // original source code - // sqrdmulh v19.4S, v4.4S, v2.S[3] // *......................................................................................................... - // ldr q20, [x0, #128] // .*........................................................................................................ - // sub v23.4S, v22.4S, v16.4S // ..*....................................................................................................... - // ldr q4, [x0, #0] // ...*...................................................................................................... - // mls v17.4S, v21.4S, v8.S[0] // ....*..................................................................................................... - // sub v10.4S, v9.4S, v7.4S // .........*................................................................................................ - // sqrdmulh v5.4S, v23.4S, v3.S[1] // ......*................................................................................................... - // add v24.4S, v9.4S, v7.4S // .....*.................................................................................................... 
- // mul v15.4S, v23.4S, v3.S[0] // .......*.................................................................................................. - // mls v28.4S, v19.4S, v8.S[0] // ........*................................................................................................. - // add v19.4S, v4.4S, v20.4S // ..........*............................................................................................... - // sqrdmulh v11.4S, v10.4S, v1.S[1] // ..................*....................................................................................... - // mul v13.4S, v10.4S, v1.S[0] // ............*............................................................................................. - // sub v22.4S, v4.4S, v20.4S // ...........*.............................................................................................. - // add v4.4S, v19.4S, v6.4S // ................*......................................................................................... - // mls v15.4S, v5.4S, v8.S[0] // .............*............................................................................................ - // mul v12.4S, v22.4S, v1.S[2] // ..............*........................................................................................... - // sqrdmulh v10.4S, v22.4S, v1.S[3] // ...............*.......................................................................................... - // add v9.4S, v4.4S, v24.4S // ...........................*.............................................................................. - // sub v7.4S, v19.4S, v6.4S // .................*........................................................................................ - // sub v14.4S, v4.4S, v24.4S // ..................................................*....................................................... - // sub v6.4S, v28.4S, v15.4S // ...................*...................................................................................... 
- // sqrdmulh v21.4S, v7.4S, v0.S[3] // ....................*..................................................................................... - // mls v12.4S, v10.4S, v8.S[0] // .....................*.................................................................................... - // mul v18.4S, v6.4S, v1.S[0] // .........................*................................................................................ - // sqrdmulh v4.4S, v6.4S, v1.S[1] // .......................*.................................................................................. - // sqrdmulh v10.4S, v14.4S, v0.S[1] // .........................................................*................................................ - // mul v5.4S, v7.4S, v0.S[2] // ......................*................................................................................... - // sqrdmulh v23.4S, v9.4S, v26.4S // ...........................................*.............................................................. - // add v24.4S, v28.4S, v15.4S // .................................*........................................................................ - // mls v18.4S, v4.4S, v8.S[0] // ................................*......................................................................... - // add v6.4S, v12.4S, v17.4S // ............................*............................................................................. - // mls v5.4S, v21.4S, v8.S[0] // .............................*............................................................................ - // sub v4.4S, v12.4S, v17.4S // ..........................*............................................................................... - // add v19.4S, v6.4S, v24.4S // .....................................*.................................................................... 
- // mls v13.4S, v11.4S, v8.S[0] // ........................*................................................................................. - // sub v7.4S, v6.4S, v24.4S // ........................................*................................................................. - // mul v12.4S, v9.4S, v25.4S // .............................................*............................................................ - // mul v15.4S, v19.4S, v25.4S // ....................................................*..................................................... - // sqrdmulh v24.4S, v19.4S, v26.4S // ........................................................*................................................. - // sqrdmulh v6.4S, v4.4S, v0.S[3] // ...............................*.......................................................................... - // add v9.4S, v5.4S, v13.4S // ..................................*....................................................................... - // mul v19.4S, v7.4S, v0.S[0] // .................................................*........................................................ - // mul v28.4S, v4.4S, v0.S[2] // ..............................*........................................................................... - // sqrdmulh v4.4S, v9.4S, v26.4S // .........................................*................................................................ - // mls v15.4S, v24.4S, v8.S[0] // .................................................................*........................................ - // sub v21.4S, v5.4S, v13.4S // ...................................*...................................................................... - // mul v13.4S, v9.4S, v25.4S // ..................................................................*....................................... - // sqrdmulh v9.4S, v7.4S, v0.S[1] // ................................................*......................................................... 
- // mls v28.4S, v6.4S, v8.S[0] // ....................................*..................................................................... - // cmge v24.4S, v31.4S, v15.4S // ..............................................................................*........................... - // mul v14.4S, v14.4S, v0.S[0] // ......................................................*................................................... - // mls v13.4S, v4.4S, v8.S[0] // ............................................................................*............................. - // cmge v7.4S, v15.4S, v30.4S // ........................................................................*................................. - // add v11.4S, v28.4S, v18.4S // ...........................................................................*.............................. - // sub v4.4S, v28.4S, v18.4S // ..........................................*............................................................... - // mls v14.4S, v10.4S, v8.S[0] // ......................................................................*................................... - // mul v5.4S, v21.4S, v0.S[0] // ......................................*................................................................... - // cmge v28.4S, v13.4S, v30.4S // ....................................................................................*..................... - // sqrdmulh v10.4S, v11.4S, v26.4S // ..................................................................................*....................... - // cmge v17.4S, v31.4S, v13.4S // ...................................................................................*...................... - // mul v18.4S, v11.4S, v25.4S // ................................................................................*......................... - // mls v19.4S, v9.4S, v8.S[0] // .......................................................*.................................................. 
- // mul v9.4S, v4.4S, v0.S[0] // ...............................................*.......................................................... - // sub v28.4S, v17.4S, v28.4S // ........................................................................................*................. - // sub v24.4S, v24.4S, v7.4S // ...........................................................................................*.............. - // sqrdmulh v17.4S, v4.4S, v0.S[1] // ..............................................*........................................................... - // mls v18.4S, v10.4S, v8.S[0] // .......................................................................................*.................. - // mls v13.4S, v28.4S, v29.4S // ............................................................................................*............. - // cmge v10.4S, v31.4S, v14.4S // ...............................................................................*.......................... - // sqrdmulh v7.4S, v21.4S, v0.S[1] // .......................................*.................................................................. - // cmge v4.4S, v19.4S, v30.4S // .............................................................*............................................ - // cmge v11.4S, v31.4S, v19.4S // ..............................................................*........................................... - // cmge v28.4S, v18.4S, v30.4S // ..............................................................................................*........... - // str q13, [x0, #256] // ....................................................................................................*..... - // cmge v13.4S, v31.4S, v18.4S // .............................................................................................*............ - // mls v12.4S, v23.4S, v8.S[0] // ...................................................*...................................................... 
- // mls v5.4S, v7.4S, v8.S[0] // ............................................*............................................................. - // cmge v23.4S, v14.4S, v30.4S // .....................................................................................*.................... - // mls v9.4S, v17.4S, v8.S[0] // .....................................................*.................................................... - // sub v7.4S, v13.4S, v28.4S // ..................................................................................................*....... - // sub v28.4S, v11.4S, v4.4S // ....................................................................*..................................... - // sub v23.4S, v10.4S, v23.4S // ..........................................................................................*............... - // cmge v13.4S, v31.4S, v5.4S // .........................................................................................*................ - // cmge v4.4S, v31.4S, v12.4S // ...............................................................*.......................................... - // cmge v21.4S, v31.4S, v9.4S // ............................................................*............................................. - // mls v18.4S, v7.4S, v29.4S // ......................................................................................................*... - // cmge v17.4S, v5.4S, v30.4S // ......................................................................................*................... - // cmge v7.4S, v9.4S, v30.4S // ..........................................................*............................................... - // cmge v11.4S, v12.4S, v30.4S // ...........................................................*.............................................. - // mls v14.4S, v23.4S, v29.4S // ................................................................................................*......... 
- // str q18, [x0, #384] // .........................................................................................................* - // sub v17.4S, v13.4S, v17.4S // ...............................................................................................*.......... - // mls v19.4S, v28.4S, v29.4S // .........................................................................*................................ - // sub v28.4S, v21.4S, v7.4S // ................................................................*......................................... - // mls v15.4S, v24.4S, v29.4S // .................................................................................................*........ - // sub v21.4S, v4.4S, v11.4S // ...................................................................*...................................... - // str q14, [x0, #512] // .....................................................................................................*.... - // mls v5.4S, v17.4S, v29.4S // ...................................................................................................*...... - // mls v9.4S, v28.4S, v29.4S // .....................................................................*.................................... - // str q19, [x0, #640] // .................................................................................*........................ - // mls v12.4S, v21.4S, v29.4S // .......................................................................*.................................. - // str q15, [x0, #128] // .......................................................................................................*.. - // str q5, [x0, #768] // ........................................................................................................*. - // str q9, [x0, #896] // ..........................................................................*............................... 
- // str q12, [x0], #(16) // .............................................................................*............................ + // Instructions: 25 + // Expected cycles: 14 + // Expected IPC: 1.79 + // + // Wall time: 0.08s + // User time: 0.08s + // + // ----- original position -----> + // 0 25 + // |------------------------|---- + str q7, [x0, #896] // ..............*............... + sqrdmulh v7.4S, v17.4S, v26.4S // *............................. + cmge v5.4S, v9.4S, v30.4S // ..*........................... + // gap // .............................. + str q13, [x0, #384] // ...................*.......... + cmge v11.4S, v27.4S, v30.4S // ...*.......................... + // gap // .............................. + // gap // .............................. + sub v17.4S, v22.4S, v5.4S // .......*...................... + cmge v22.4S, v31.4S, v24.4S // .....*........................ + // gap // .............................. + // gap // .............................. + mls v23.4S, v7.4S, v8.S[0] // ......*....................... + sub v7.4S, v28.4S, v11.4S // ........*..................... + // gap // .............................. + // gap // .............................. + mls v9.4S, v17.4S, v8.4S // ..........*................... + sub v17.4S, v22.4S, v14.4S // .........*.................... + // gap // .............................. + // gap // .............................. + mls v27.4S, v7.4S, v8.4S // .............*................ + cmge v28.4S, v10.4S, v30.4S // .*............................ + // gap // .............................. + // gap // .............................. + cmge v11.4S, v31.4S, v23.4S // ...........*.................. + cmge v21.4S, v23.4S, v30.4S // .....................*........ + // gap // .............................. + // gap // .............................. + sub v28.4S, v29.4S, v28.4S // ....*......................... + mls v24.4S, v17.4S, v8.4S // .................*............ 
+ str q9, [x0, #256] // ....................*......... + // gap // .............................. + str q27, [x0, #640] // ...............*.............. + sub v16.4S, v11.4S, v21.4S // ......................*....... + // gap // .............................. + // gap // .............................. + mls v10.4S, v28.4S, v8.4S // ............*................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + mls v23.4S, v16.4S, v8.4S // .......................*...... + str q24, [x0, #128] // ..................*........... + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + str q10, [x0, #512] // ................*............. + // gap // .............................. + // gap // .............................. + // gap // .............................. + str q23, [x0], #(16) // ........................*..... + // gap // .............................. + // gap // .............................. + // gap // .............................. + + // -------- new position --------> + // 0 25 + // |------------------------|----- + // sqrdmulh v5.4S, v17.4S, v26.4S // .*............................. + // cmge v18.4S, v10.4S, v30.4S // ............*.................. + // cmge v16.4S, v9.4S, v30.4S // ..*............................ + // cmge v17.4S, v27.4S, v30.4S // ....*.......................... + // sub v6.4S, v29.4S, v18.4S // ...............*............... + // cmge v21.4S, v31.4S, v24.4S // ......*........................ + // mls v23.4S, v5.4S, v8.S[0] // .......*....................... + // sub v16.4S, v22.4S, v16.4S // .....*......................... + // sub v18.4S, v28.4S, v17.4S // ........*...................... 
+ // sub v22.4S, v21.4S, v14.4S // ..........*.................... + // mls v9.4S, v16.4S, v8.4S // .........*..................... + // cmge v16.4S, v31.4S, v23.4S // .............*................. + // mls v10.4S, v6.4S, v8.4S // ....................*.......... + // mls v27.4S, v18.4S, v8.4S // ...........*................... + // str q7, [x0, #896] // *.............................. + // str q27, [x0, #640] // ..................*............ + // str q10, [x0, #512] // .......................*....... + // mls v24.4S, v22.4S, v8.4S // ................*.............. + // str q24, [x0, #128] // ......................*........ + // str q13, [x0, #384] // ...*........................... + // str q9, [x0, #256] // .................*............. + // cmge v21.4S, v23.4S, v30.4S // ..............*................ + // sub v18.4S, v16.4S, v21.4S // ...................*........... + // mls v23.4S, v18.4S, v8.4S // .....................*......... + // str q23, [x0], #(16) // ........................*...... 
pop_stack diff --git a/tests/ntt_dilithium/manual/ntt_dilithium_1234_5678_manual_st4.s b/tests/ntt_dilithium/manual/ntt_dilithium_1234_5678_manual_st4.s index 814609b..f793d2b 100644 --- a/tests/ntt_dilithium/manual/ntt_dilithium_1234_5678_manual_st4.s +++ b/tests/ntt_dilithium/manual/ntt_dilithium_1234_5678_manual_st4.s @@ -143,7 +143,7 @@ trn1 \data1\().2d, t1.2d, t3.2d .endm -.macro save_gprs // slothy:no-unfold +.macro save_gprs // @slothy:no-unfold sub sp, sp, #(16*6) stp x19, x20, [sp, #16*0] stp x19, x20, [sp, #16*0] @@ -154,7 +154,7 @@ str x29, [sp, #16*5] .endm -.macro restore_gprs // slothy:no-unfold +.macro restore_gprs // @slothy:no-unfold ldp x19, x20, [sp, #16*0] ldp x21, x22, [sp, #16*1] ldp x23, x24, [sp, #16*2] @@ -164,7 +164,7 @@ add sp, sp, #(16*6) .endm -.macro save_vregs // slothy:no-unfold +.macro save_vregs // @slothy:no-unfold sub sp, sp, #(16*4) stp d8, d9, [sp, #16*0] stp d10, d11, [sp, #16*1] @@ -172,7 +172,7 @@ stp d14, d15, [sp, #16*3] .endm -.macro restore_vregs // slothy:no-unfold +.macro restore_vregs // @slothy:no-unfold ldp d8, d9, [sp, #16*0] ldp d10, d11, [sp, #16*1] ldp d12, d13, [sp, #16*2] @@ -183,19 +183,19 @@ #define STACK_SIZE 16 #define STACK0 0 -.macro restore a, loc // slothy:no-unfold +.macro restore a, loc // @slothy:no-unfold ldr \a, [sp, #\loc\()] .endm -.macro save loc, a // slothy:no-unfold +.macro save loc, a // @slothy:no-unfold str \a, [sp, #\loc\()] .endm -.macro push_stack // slothy:no-unfold +.macro push_stack // @slothy:no-unfold save_gprs save_vregs sub sp, sp, #STACK_SIZE .endm -.macro pop_stack // slothy:no-unfold +.macro pop_stack // @slothy:no-unfold add sp, sp, #STACK_SIZE restore_vregs restore_gprs From 06a7216289a934cf0bd228ee43209a4869db6ee7 Mon Sep 17 00:00:00 2001 From: Amin Abdulrahman Date: Tue, 9 Apr 2024 11:58:21 +0200 Subject: [PATCH 17/18] Make Dilithium invNTT test aware of canonicality of the result --- tests/ntt_dilithium/main.c | 36 ++++++++++++++++++------------------ 1 file changed, 18 
insertions(+), 18 deletions(-) diff --git a/tests/ntt_dilithium/main.c b/tests/ntt_dilithium/main.c index b6759fb..6f9900d 100644 --- a/tests/ntt_dilithium/main.c +++ b/tests/ntt_dilithium/main.c @@ -369,20 +369,20 @@ MAKE_TEST(asm_123_45678_opt_a55,0,ntt_dilithium_123_45678_opt_a55,ntt_u32_C,0,0, MAKE_TEST(asm_123_45678_manual_st4_opt_a55,0,ntt_dilithium_123_45678_manual_st4_opt_a55,ntt_u32_C,0,0,1) MAKE_TEST(asm_123_45678_w_scalar_opt_a55,0,ntt_dilithium_123_45678_w_scalar_opt_a55,ntt_u32_C,0,0,1) -MAKE_TEST(asm_123_45678_inv_opt_a55,1,intt_dilithium_123_45678_opt_a55,invntt_u32_tomont_C,0,0,1) -MAKE_TEST(asm_123_45678_inv_manual_ld4_opt_a55,1,intt_dilithium_123_45678_manual_ld4_opt_a55,invntt_u32_tomont_C,0,0,1) -MAKE_TEST(asm_1234_5678_inv_opt_a55,1,intt_dilithium_1234_5678_opt_a55,invntt_u32_tomont_C,0,0,1) -MAKE_TEST(asm_1234_5678_inv_manual_ld4_opt_a55,1,intt_dilithium_1234_5678_manual_ld4_opt_a55,invntt_u32_tomont_C,0,0,1) +MAKE_TEST(asm_123_45678_inv_opt_a55,1,intt_dilithium_123_45678_opt_a55,invntt_u32_tomont_C,0,1,1) +MAKE_TEST(asm_123_45678_inv_manual_ld4_opt_a55,1,intt_dilithium_123_45678_manual_ld4_opt_a55,invntt_u32_tomont_C,0,1,1) +MAKE_TEST(asm_1234_5678_inv_opt_a55,1,intt_dilithium_1234_5678_opt_a55,invntt_u32_tomont_C,0,1,1) +MAKE_TEST(asm_1234_5678_inv_manual_ld4_opt_a55,1,intt_dilithium_1234_5678_manual_ld4_opt_a55,invntt_u32_tomont_C,0,1,1) // A72 MAKE_TEST(asm_123_45678_opt_a72,0,ntt_dilithium_123_45678_opt_a72,ntt_u32_C,0,0,1) MAKE_TEST(asm_123_45678_manual_st4_opt_a72,0,ntt_dilithium_123_45678_manual_st4_opt_a72,ntt_u32_C,0,0,1) MAKE_TEST(asm_1234_5678_opt_a72,0,ntt_dilithium_1234_5678_opt_a72,ntt_u32_C,0,0,1) -MAKE_TEST(asm_123_45678_inv_opt_a72,1,intt_dilithium_123_45678_opt_a72,invntt_u32_tomont_C,0,0,1) -MAKE_TEST(asm_123_45678_inv_manual_ld4_opt_a72,1,intt_dilithium_123_45678_manual_ld4_opt_a72,invntt_u32_tomont_C,0,0,1) -MAKE_TEST(asm_1234_5678_inv_opt_a72,1,intt_dilithium_1234_5678_opt_a72,invntt_u32_tomont_C,0,0,1) 
-MAKE_TEST(asm_1234_5678_inv_manual_ld4_opt_a72,1,intt_dilithium_1234_5678_manual_ld4_opt_a72,invntt_u32_tomont_C,0,0,1) +MAKE_TEST(asm_123_45678_inv_opt_a72,1,intt_dilithium_123_45678_opt_a72,invntt_u32_tomont_C,0,1,1) +MAKE_TEST(asm_123_45678_inv_manual_ld4_opt_a72,1,intt_dilithium_123_45678_manual_ld4_opt_a72,invntt_u32_tomont_C,0,1,1) +MAKE_TEST(asm_1234_5678_inv_opt_a72,1,intt_dilithium_1234_5678_opt_a72,invntt_u32_tomont_C,0,1,1) +MAKE_TEST(asm_1234_5678_inv_manual_ld4_opt_a72,1,intt_dilithium_1234_5678_manual_ld4_opt_a72,invntt_u32_tomont_C,0,1,1) // M1 Firestorm MAKE_TEST(asm_123_45678_opt_m1_firestorm,0,ntt_dilithium_123_45678_opt_m1_firestorm,ntt_u32_C,0,0,1) @@ -390,10 +390,10 @@ MAKE_TEST(asm_123_45678_manual_st4_opt_m1_firestorm,0,ntt_dilithium_123_45678_ma MAKE_TEST(asm_1234_5678_opt_m1_firestorm,0,ntt_dilithium_1234_5678_opt_m1_firestorm,ntt_u32_C,0,0,1) MAKE_TEST(asm_1234_5678_manual_st4_opt_m1_firestorm,0,ntt_dilithium_1234_5678_manual_st4_opt_m1_firestorm,ntt_u32_C,0,0,1) -MAKE_TEST(asm_123_45678_inv_opt_m1_firestorm,1,intt_dilithium_123_45678_opt_m1_firestorm,invntt_u32_tomont_C,0,0,1) -MAKE_TEST(asm_123_45678_inv_manual_ld4_opt_m1_firestorm,1,intt_dilithium_123_45678_manual_ld4_opt_m1_firestorm,invntt_u32_tomont_C,0,0,1) -MAKE_TEST(asm_1234_5678_inv_opt_m1_firestorm,1,intt_dilithium_1234_5678_opt_m1_firestorm,invntt_u32_tomont_C,0,0,1) -MAKE_TEST(asm_1234_5678_inv_manual_ld4_opt_m1_firestorm,1,intt_dilithium_1234_5678_manual_ld4_opt_m1_firestorm,invntt_u32_tomont_C,0,0,1) +MAKE_TEST(asm_123_45678_inv_opt_m1_firestorm,1,intt_dilithium_123_45678_opt_m1_firestorm,invntt_u32_tomont_C,0,1,1) +MAKE_TEST(asm_123_45678_inv_manual_ld4_opt_m1_firestorm,1,intt_dilithium_123_45678_manual_ld4_opt_m1_firestorm,invntt_u32_tomont_C,0,1,1) +MAKE_TEST(asm_1234_5678_inv_opt_m1_firestorm,1,intt_dilithium_1234_5678_opt_m1_firestorm,invntt_u32_tomont_C,0,1,1) 
+MAKE_TEST(asm_1234_5678_inv_manual_ld4_opt_m1_firestorm,1,intt_dilithium_1234_5678_manual_ld4_opt_m1_firestorm,invntt_u32_tomont_C,0,1,1) // M1 Icestorm MAKE_TEST(asm_123_45678_opt_m1_icestorm,0,ntt_dilithium_123_45678_opt_m1_icestorm,ntt_u32_C,0,0,1) @@ -402,16 +402,16 @@ MAKE_TEST(asm_123_45678_w_scalar_opt_m1_icestorm,0,ntt_dilithium_123_45678_w_sca MAKE_TEST(asm_1234_5678_opt_m1_icestorm,0,ntt_dilithium_1234_5678_opt_m1_icestorm,ntt_u32_C,0,0,1) MAKE_TEST(asm_1234_5678_manual_st4_opt_m1_icestorm,0,ntt_dilithium_1234_5678_manual_st4_opt_m1_icestorm,ntt_u32_C,0,0,1) -MAKE_TEST(asm_123_45678_inv_opt_m1_icestorm,1,intt_dilithium_123_45678_opt_m1_icestorm,invntt_u32_tomont_C,0,0,1) -MAKE_TEST(asm_123_45678_inv_manual_ld4_opt_m1_icestorm,1,intt_dilithium_123_45678_manual_ld4_opt_m1_icestorm,invntt_u32_tomont_C,0,0,1) -MAKE_TEST(asm_1234_5678_inv_opt_m1_icestorm,1,intt_dilithium_1234_5678_opt_m1_icestorm,invntt_u32_tomont_C,0,0,1) -MAKE_TEST(asm_1234_5678_inv_manual_ld4_opt_m1_icestorm,1,intt_dilithium_1234_5678_manual_ld4_opt_m1_icestorm,invntt_u32_tomont_C,0,0,1) +MAKE_TEST(asm_123_45678_inv_opt_m1_icestorm,1,intt_dilithium_123_45678_opt_m1_icestorm,invntt_u32_tomont_C,0,1,1) +MAKE_TEST(asm_123_45678_inv_manual_ld4_opt_m1_icestorm,1,intt_dilithium_123_45678_manual_ld4_opt_m1_icestorm,invntt_u32_tomont_C,0,1,1) +MAKE_TEST(asm_1234_5678_inv_opt_m1_icestorm,1,intt_dilithium_1234_5678_opt_m1_icestorm,invntt_u32_tomont_C,0,1,1) +MAKE_TEST(asm_1234_5678_inv_manual_ld4_opt_m1_icestorm,1,intt_dilithium_1234_5678_manual_ld4_opt_m1_icestorm,invntt_u32_tomont_C,0,1,1) // Other MAKE_TEST(neonntt_fwd,0,ntt,ntt_u32_C,0,0,1) MAKE_TEST(pqclean_ntt_fwd,0,pqclean_ntt,ntt_u32_C,0,0,1) -MAKE_TEST(neonntt_inv,1,invntt_tomont,invntt_u32_tomont_C,0,0,1) -MAKE_TEST(pqclean_ntt_inv,1,pqclean_invntt_tomont,invntt_u32_tomont_C,0,0,1) +MAKE_TEST(neonntt_inv,1,invntt_tomont,invntt_u32_tomont_C,0,1,1) +MAKE_TEST(pqclean_ntt_inv,1,pqclean_invntt_tomont,invntt_u32_tomont_C,0,1,1) uint64_t t0, t1; 
uint64_t cycles[TEST_COUNT]; From 28664103d7219c6596beab92c677b485113a0703 Mon Sep 17 00:00:00 2001 From: Amin Abdulrahman Date: Thu, 11 Apr 2024 09:53:07 +0200 Subject: [PATCH 18/18] Update asm in test folders for consistency --- .../manual/intt_dilithium_1234_5678.s | 69 ++++----- .../intt_dilithium_1234_5678_manual_ld4.s | 69 ++++----- ...t_dilithium_1234_5678_manual_ld4_opt_a55.s | 2 +- ...t_dilithium_1234_5678_manual_ld4_opt_a72.s | 2 +- ...um_1234_5678_manual_ld4_opt_m1_firestorm.s | 2 +- ...ium_1234_5678_manual_ld4_opt_m1_icestorm.s | 2 +- .../manual/intt_dilithium_1234_5678_opt_a55.s | 2 +- .../manual/intt_dilithium_1234_5678_opt_a72.s | 2 +- ...ntt_dilithium_1234_5678_opt_m1_firestorm.s | 2 +- ...intt_dilithium_1234_5678_opt_m1_icestorm.s | 2 +- .../manual/intt_dilithium_123_45678.s | 45 +++--- .../intt_dilithium_123_45678_manual_ld4.s | 45 +++--- ...t_dilithium_123_45678_manual_ld4_opt_a55.s | 2 +- ...t_dilithium_123_45678_manual_ld4_opt_a72.s | 2 +- ...um_123_45678_manual_ld4_opt_m1_firestorm.s | 2 +- ...ium_123_45678_manual_ld4_opt_m1_icestorm.s | 2 +- .../manual/intt_dilithium_123_45678_opt_a55.s | 2 +- .../manual/intt_dilithium_123_45678_opt_a72.s | 2 +- ...ntt_dilithium_123_45678_opt_m1_firestorm.s | 2 +- ...intt_dilithium_123_45678_opt_m1_icestorm.s | 2 +- .../ntt_dilithium_1234_5678_manual_st4.s | 14 +- tests/ntt_kyber/manual/intt_kyber_123_4567.s | 49 ++++--- .../manual/intt_kyber_123_4567_manual_ld4.s | 49 ++++--- tests/ntt_kyber/manual/ntt_kyber_1234_567.s | 133 ++++++------------ .../manual/ntt_kyber_1234_567_twiddles.s | 3 +- tests/ntt_kyber/manual/ntt_kyber_123_4567.s | 94 ++++--------- .../manual/ntt_kyber_123_4567_scalar_load.s | 105 ++++---------- .../manual/ntt_kyber_123_45_67_twiddles.s | 1 - 28 files changed, 304 insertions(+), 404 deletions(-) diff --git a/tests/ntt_dilithium/manual/intt_dilithium_1234_5678.s b/tests/ntt_dilithium/manual/intt_dilithium_1234_5678.s index 8228b2c..1e8008f 100644 --- 
a/tests/ntt_dilithium/manual/intt_dilithium_1234_5678.s +++ b/tests/ntt_dilithium/manual/intt_dilithium_1234_5678.s @@ -85,18 +85,18 @@ .endm .macro mulmodq dst, src, const, idx0, idx1 + vqrdmulhq t2, \src, \const, \idx1 vmulq \dst, \src, \const, \idx0 - vqrdmulhq \src, \src, \const, \idx1 - vmls \dst, \src, modulus + vmls \dst, t2, modulus .endm .macro mulmod dst, src, const, const_twisted - vmul \dst, \src, \const - vqrdmulh \src, \src, \const_twisted - vmls \dst, \src, modulus + vqrdmulh t2, \src, \const_twisted + mul \dst\().4s, \src\().4s, \const\().4s + vmls \dst, t2, modulus .endm -.macro montg_reduce a +.macro barrett_reduce_single a srshr tmp.4S, \a\().4S, #23 vmls \a, tmp, modulus .endm @@ -114,12 +114,6 @@ mulmodq \b, tmp, \root, \idx0, \idx1 .endm -.macro mulmod_v dst, src, const, const_twisted - vmul \dst, \src, \const - vqrdmulh \src, \src, \const_twisted - vmls \dst, \src, modulus -.endm - .macro gs_butterfly_v a, b, root, root_twisted vsub tmp, \a, \b vadd \a, \a, \b @@ -235,6 +229,12 @@ restore_gprs .endm +// For comparability reasons, the output range for the coefficients of this +// invNTT code is supposed to match the implementation from PQClean on commit +// ee71d2c823982bfcf54686f3cf1d666f396dc9aa. After the invNTT, the coefficients +// are canonically reduced. The ordering of the coefficients is canonical, also +// matching PQClean. 
+ .data .p2align 4 roots: @@ -360,8 +360,8 @@ layer5678_start: gs_butterfly data0, data2, root1, 0, 1 gs_butterfly data1, data3, root1, 0, 1 - montg_reduce data0 - montg_reduce data1 + barrett_reduce_single data0 + barrett_reduce_single data1 str_vi data0, inp, (16*4) str_vo data1, inp, (-16*4 + 1*16) @@ -486,25 +486,28 @@ layer1234_start: str_vo data14, in, (14*(512/8)) str_vo data15, in, (15*(512/8)) - mul_ninv data8, data9, data10, data11, data12, data13, data14, data15, data0, data1, data2, data3, data4, data5, data6, data7 - - canonical_reduce data8, modulus_half, neg_modulus_half, t2, t3 - canonical_reduce data9, modulus_half, neg_modulus_half, t2, t3 - canonical_reduce data10, modulus_half, neg_modulus_half, t2, t3 - canonical_reduce data11, modulus_half, neg_modulus_half, t2, t3 - canonical_reduce data12, modulus_half, neg_modulus_half, t2, t3 - canonical_reduce data13, modulus_half, neg_modulus_half, t2, t3 - canonical_reduce data14, modulus_half, neg_modulus_half, t2, t3 - canonical_reduce data15, modulus_half, neg_modulus_half, t2, t3 - - str_vi data8, in, (16) - str_vo data9, in, (-16 + 1*(512/8)) - str_vo data10, in, (-16 + 2*(512/8)) - str_vo data11, in, (-16 + 3*(512/8)) - str_vo data12, in, (-16 + 4*(512/8)) - str_vo data13, in, (-16 + 5*(512/8)) - str_vo data14, in, (-16 + 6*(512/8)) - str_vo data15, in, (-16 + 7*(512/8)) + // Scale half the coeffs by 1/n; for the other half, the scaling has + // been merged into the multiplication with the twiddle factor on the + // last layer. 
+ mul_ninv data0, data1, data2, data3, data4, data5, data6, data7, data0, data1, data2, data3, data4, data5, data6, data7 + + canonical_reduce data0, modulus_half, neg_modulus_half, t2, t3 + canonical_reduce data1, modulus_half, neg_modulus_half, t2, t3 + canonical_reduce data2, modulus_half, neg_modulus_half, t2, t3 + canonical_reduce data3, modulus_half, neg_modulus_half, t2, t3 + canonical_reduce data4, modulus_half, neg_modulus_half, t2, t3 + canonical_reduce data5, modulus_half, neg_modulus_half, t2, t3 + canonical_reduce data6, modulus_half, neg_modulus_half, t2, t3 + canonical_reduce data7, modulus_half, neg_modulus_half, t2, t3 + + str_vi data0, in, (16) + str_vo data1, in, (-16 + 1*(512/8)) + str_vo data2, in, (-16 + 2*(512/8)) + str_vo data3, in, (-16 + 3*(512/8)) + str_vo data4, in, (-16 + 4*(512/8)) + str_vo data5, in, (-16 + 5*(512/8)) + str_vo data6, in, (-16 + 6*(512/8)) + str_vo data7, in, (-16 + 7*(512/8)) // layer1234_end: subs count, count, #1 diff --git a/tests/ntt_dilithium/manual/intt_dilithium_1234_5678_manual_ld4.s b/tests/ntt_dilithium/manual/intt_dilithium_1234_5678_manual_ld4.s index 153895c..22d2475 100644 --- a/tests/ntt_dilithium/manual/intt_dilithium_1234_5678_manual_ld4.s +++ b/tests/ntt_dilithium/manual/intt_dilithium_1234_5678_manual_ld4.s @@ -85,18 +85,18 @@ .endm .macro mulmodq dst, src, const, idx0, idx1 + vqrdmulhq t2, \src, \const, \idx1 vmulq \dst, \src, \const, \idx0 - vqrdmulhq \src, \src, \const, \idx1 - vmls \dst, \src, modulus + vmls \dst, t2, modulus .endm .macro mulmod dst, src, const, const_twisted - vmul \dst, \src, \const - vqrdmulh \src, \src, \const_twisted - vmls \dst, \src, modulus + vqrdmulh t2, \src, \const_twisted + mul \dst\().4s, \src\().4s, \const\().4s + vmls \dst, t2, modulus .endm -.macro montg_reduce a +.macro barrett_reduce_single a srshr tmp.4S, \a\().4S, #23 vmls \a, tmp, modulus .endm @@ -114,12 +114,6 @@ mulmodq \b, tmp, \root, \idx0, \idx1 .endm -.macro mulmod_v dst, src, const, const_twisted - 
vmul \dst, \src, \const - vqrdmulh \src, \src, \const_twisted - vmls \dst, \src, modulus -.endm - .macro gs_butterfly_v a, b, root, root_twisted vsub tmp, \a, \b vadd \a, \a, \b @@ -235,6 +229,12 @@ restore_gprs .endm +// For comparability reasons, the output range for the coefficients of this +// invNTT code is supposed to match the implementation from PQClean on commit +// ee71d2c823982bfcf54686f3cf1d666f396dc9aa. After the invNTT, the coefficients +// are canonically reduced. The ordering of the coefficients is canonical, also +// matching PQClean. + .data .p2align 4 roots: @@ -358,8 +358,8 @@ layer5678_start: gs_butterfly data0, data2, root1, 0, 1 gs_butterfly data1, data3, root1, 0, 1 - montg_reduce data0 - montg_reduce data1 + barrett_reduce_single data0 + barrett_reduce_single data1 str_vi data0, inp, (16*4) str_vo data1, inp, (-16*4 + 1*16) @@ -484,25 +484,28 @@ layer1234_start: str_vo data14, in, (14*(512/8)) str_vo data15, in, (15*(512/8)) - mul_ninv data8, data9, data10, data11, data12, data13, data14, data15, data0, data1, data2, data3, data4, data5, data6, data7 - - canonical_reduce data8, modulus_half, neg_modulus_half, t2, t3 - canonical_reduce data9, modulus_half, neg_modulus_half, t2, t3 - canonical_reduce data10, modulus_half, neg_modulus_half, t2, t3 - canonical_reduce data11, modulus_half, neg_modulus_half, t2, t3 - canonical_reduce data12, modulus_half, neg_modulus_half, t2, t3 - canonical_reduce data13, modulus_half, neg_modulus_half, t2, t3 - canonical_reduce data14, modulus_half, neg_modulus_half, t2, t3 - canonical_reduce data15, modulus_half, neg_modulus_half, t2, t3 - - str_vi data8, in, (16) - str_vo data9, in, (-16 + 1*(512/8)) - str_vo data10, in, (-16 + 2*(512/8)) - str_vo data11, in, (-16 + 3*(512/8)) - str_vo data12, in, (-16 + 4*(512/8)) - str_vo data13, in, (-16 + 5*(512/8)) - str_vo data14, in, (-16 + 6*(512/8)) - str_vo data15, in, (-16 + 7*(512/8)) + // Scale half the coeffs by 1/n; for the other half, the scaling has + // been 
merged into the multiplication with the twiddle factor on the + // last layer. + mul_ninv data0, data1, data2, data3, data4, data5, data6, data7, data0, data1, data2, data3, data4, data5, data6, data7 + + canonical_reduce data0, modulus_half, neg_modulus_half, t2, t3 + canonical_reduce data1, modulus_half, neg_modulus_half, t2, t3 + canonical_reduce data2, modulus_half, neg_modulus_half, t2, t3 + canonical_reduce data3, modulus_half, neg_modulus_half, t2, t3 + canonical_reduce data4, modulus_half, neg_modulus_half, t2, t3 + canonical_reduce data5, modulus_half, neg_modulus_half, t2, t3 + canonical_reduce data6, modulus_half, neg_modulus_half, t2, t3 + canonical_reduce data7, modulus_half, neg_modulus_half, t2, t3 + + str_vi data0, in, (16) + str_vo data1, in, (-16 + 1*(512/8)) + str_vo data2, in, (-16 + 2*(512/8)) + str_vo data3, in, (-16 + 3*(512/8)) + str_vo data4, in, (-16 + 4*(512/8)) + str_vo data5, in, (-16 + 5*(512/8)) + str_vo data6, in, (-16 + 6*(512/8)) + str_vo data7, in, (-16 + 7*(512/8)) // layer1234_end: subs count, count, #1 diff --git a/tests/ntt_dilithium/manual/intt_dilithium_1234_5678_manual_ld4_opt_a55.s b/tests/ntt_dilithium/manual/intt_dilithium_1234_5678_manual_ld4_opt_a55.s index 6298588..4b948c7 100644 --- a/tests/ntt_dilithium/manual/intt_dilithium_1234_5678_manual_ld4_opt_a55.s +++ b/tests/ntt_dilithium/manual/intt_dilithium_1234_5678_manual_ld4_opt_a55.s @@ -96,7 +96,7 @@ vmls \dst, \src, modulus .endm -.macro montg_reduce a +.macro barrett_reduce_single a srshr tmp.4S, \a\().4S, #23 vmls \a, tmp, modulus .endm diff --git a/tests/ntt_dilithium/manual/intt_dilithium_1234_5678_manual_ld4_opt_a72.s b/tests/ntt_dilithium/manual/intt_dilithium_1234_5678_manual_ld4_opt_a72.s index 5c8b133..568e8ff 100644 --- a/tests/ntt_dilithium/manual/intt_dilithium_1234_5678_manual_ld4_opt_a72.s +++ b/tests/ntt_dilithium/manual/intt_dilithium_1234_5678_manual_ld4_opt_a72.s @@ -96,7 +96,7 @@ vmls \dst, \src, modulus .endm -.macro montg_reduce a +.macro 
barrett_reduce_single a srshr tmp.4S, \a\().4S, #23 vmls \a, tmp, modulus .endm diff --git a/tests/ntt_dilithium/manual/intt_dilithium_1234_5678_manual_ld4_opt_m1_firestorm.s b/tests/ntt_dilithium/manual/intt_dilithium_1234_5678_manual_ld4_opt_m1_firestorm.s index 864bab4..c23659d 100644 --- a/tests/ntt_dilithium/manual/intt_dilithium_1234_5678_manual_ld4_opt_m1_firestorm.s +++ b/tests/ntt_dilithium/manual/intt_dilithium_1234_5678_manual_ld4_opt_m1_firestorm.s @@ -96,7 +96,7 @@ vmls \dst, \src, modulus .endm -.macro montg_reduce a +.macro barrett_reduce_single a srshr tmp.4S, \a\().4S, #23 vmls \a, tmp, modulus .endm diff --git a/tests/ntt_dilithium/manual/intt_dilithium_1234_5678_manual_ld4_opt_m1_icestorm.s b/tests/ntt_dilithium/manual/intt_dilithium_1234_5678_manual_ld4_opt_m1_icestorm.s index 53d25aa..e23094c 100644 --- a/tests/ntt_dilithium/manual/intt_dilithium_1234_5678_manual_ld4_opt_m1_icestorm.s +++ b/tests/ntt_dilithium/manual/intt_dilithium_1234_5678_manual_ld4_opt_m1_icestorm.s @@ -96,7 +96,7 @@ vmls \dst, \src, modulus .endm -.macro montg_reduce a +.macro barrett_reduce_single a srshr tmp.4S, \a\().4S, #23 vmls \a, tmp, modulus .endm diff --git a/tests/ntt_dilithium/manual/intt_dilithium_1234_5678_opt_a55.s b/tests/ntt_dilithium/manual/intt_dilithium_1234_5678_opt_a55.s index e1ba3ba..92b51ec 100644 --- a/tests/ntt_dilithium/manual/intt_dilithium_1234_5678_opt_a55.s +++ b/tests/ntt_dilithium/manual/intt_dilithium_1234_5678_opt_a55.s @@ -96,7 +96,7 @@ vmls \dst, \src, modulus .endm -.macro montg_reduce a +.macro barrett_reduce_single a srshr tmp.4S, \a\().4S, #23 vmls \a, tmp, modulus .endm diff --git a/tests/ntt_dilithium/manual/intt_dilithium_1234_5678_opt_a72.s b/tests/ntt_dilithium/manual/intt_dilithium_1234_5678_opt_a72.s index 3440ef6..8ae4f0b 100644 --- a/tests/ntt_dilithium/manual/intt_dilithium_1234_5678_opt_a72.s +++ b/tests/ntt_dilithium/manual/intt_dilithium_1234_5678_opt_a72.s @@ -96,7 +96,7 @@ vmls \dst, \src, modulus .endm -.macro 
montg_reduce a +.macro barrett_reduce_single a srshr tmp.4S, \a\().4S, #23 vmls \a, tmp, modulus .endm diff --git a/tests/ntt_dilithium/manual/intt_dilithium_1234_5678_opt_m1_firestorm.s b/tests/ntt_dilithium/manual/intt_dilithium_1234_5678_opt_m1_firestorm.s index 3d99a3b..ff52d0f 100644 --- a/tests/ntt_dilithium/manual/intt_dilithium_1234_5678_opt_m1_firestorm.s +++ b/tests/ntt_dilithium/manual/intt_dilithium_1234_5678_opt_m1_firestorm.s @@ -96,7 +96,7 @@ vmls \dst, \src, modulus .endm -.macro montg_reduce a +.macro barrett_reduce_single a srshr tmp.4S, \a\().4S, #23 vmls \a, tmp, modulus .endm diff --git a/tests/ntt_dilithium/manual/intt_dilithium_1234_5678_opt_m1_icestorm.s b/tests/ntt_dilithium/manual/intt_dilithium_1234_5678_opt_m1_icestorm.s index e734625..d449be9 100644 --- a/tests/ntt_dilithium/manual/intt_dilithium_1234_5678_opt_m1_icestorm.s +++ b/tests/ntt_dilithium/manual/intt_dilithium_1234_5678_opt_m1_icestorm.s @@ -96,7 +96,7 @@ vmls \dst, \src, modulus .endm -.macro montg_reduce a +.macro barrett_reduce_single a srshr tmp.4S, \a\().4S, #23 vmls \a, tmp, modulus .endm diff --git a/tests/ntt_dilithium/manual/intt_dilithium_123_45678.s b/tests/ntt_dilithium/manual/intt_dilithium_123_45678.s index efd5336..598a1a9 100644 --- a/tests/ntt_dilithium/manual/intt_dilithium_123_45678.s +++ b/tests/ntt_dilithium/manual/intt_dilithium_123_45678.s @@ -47,18 +47,18 @@ xtmp1 .req x11 .endm .macro mulmodq dst, src, const, idx0, idx1 + vqrdmulhq t2, \src, \const, \idx1 vmulq \dst, \src, \const, \idx0 - vqrdmulhq \src, \src, \const, \idx1 - vmlsq \dst, \src, consts, 0 + vmlsq \dst, t2, consts, 0 .endm .macro mulmod dst, src, const, const_twisted + vqrdmulh t2, \src, \const_twisted mul \dst\().4s, \src\().4s, \const\().4s - vqrdmulh \src, \src, \const_twisted - vmlsq \dst, \src, consts, 0 + vmlsq \dst, t2, consts, 0 .endm -.macro montg_reduce a +.macro barrett_reduce_single a srshr tmp.4S, \a\().4S, #23 vmls \a, tmp, consts .endm @@ -245,6 +245,12 @@ xtmp1 .req x11 
restore_gprs .endm +// For comparability reasons, the output range for the coefficients of this +// invNTT code is supposed to match the implementation from PQClean on commit +// ee71d2c823982bfcf54686f3cf1d666f396dc9aa. After the invNTT, the coefficients +// are canonically reduced. The ordering of the coefficients is canonical, also +// matching PQClean. + .data .p2align 4 roots: @@ -428,10 +434,10 @@ layer45678_start: gs_butterfly data5, data7, root1, 0, 1 // Interm. Reduction - montg_reduce data0 - montg_reduce data1 - montg_reduce data4 - montg_reduce data5 + barrett_reduce_single data0 + barrett_reduce_single data1 + barrett_reduce_single data4 + barrett_reduce_single data5 // Layer 4 gs_butterfly data0, data4, root0, 0, 1 @@ -514,17 +520,20 @@ layer123_start: str_vo data6, in, (6*(1024/8)) str_vo data7, in, (7*(1024/8)) - mul_ninv data4, data5, data6, data7, data0, data1, data2, data3 + // Scale half the coeffs by 1/n; for the other half, the scaling has + // been merged into the multiplication with the twiddle factor on the + // last layer. 
+ mul_ninv data0, data1, data2, data3, data0, data1, data2, data3 - canonical_reduce data4, modulus_half, neg_modulus_half, t2, t3 - canonical_reduce data5, modulus_half, neg_modulus_half, t2, t3 - canonical_reduce data6, modulus_half, neg_modulus_half, t2, t3 - canonical_reduce data7, modulus_half, neg_modulus_half, t2, t3 + canonical_reduce data0, modulus_half, neg_modulus_half, t2, t3 + canonical_reduce data1, modulus_half, neg_modulus_half, t2, t3 + canonical_reduce data2, modulus_half, neg_modulus_half, t2, t3 + canonical_reduce data3, modulus_half, neg_modulus_half, t2, t3 - str_vi data4, in, (16) - str_vo data5, in, (-16 + 1*(1024/8)) - str_vo data6, in, (-16 + 2*(1024/8)) - str_vo data7, in, (-16 + 3*(1024/8)) + str_vi data0, in, (16) + str_vo data1, in, (-16 + 1*(1024/8)) + str_vo data2, in, (-16 + 2*(1024/8)) + str_vo data3, in, (-16 + 3*(1024/8)) subs count, count, #1 cbnz count, layer123_start diff --git a/tests/ntt_dilithium/manual/intt_dilithium_123_45678_manual_ld4.s b/tests/ntt_dilithium/manual/intt_dilithium_123_45678_manual_ld4.s index 464e047..69dc2c2 100644 --- a/tests/ntt_dilithium/manual/intt_dilithium_123_45678_manual_ld4.s +++ b/tests/ntt_dilithium/manual/intt_dilithium_123_45678_manual_ld4.s @@ -47,18 +47,18 @@ xtmp1 .req x11 .endm .macro mulmodq dst, src, const, idx0, idx1 + vqrdmulhq t2, \src, \const, \idx1 vmulq \dst, \src, \const, \idx0 - vqrdmulhq \src, \src, \const, \idx1 - vmlsq \dst, \src, consts, 0 + vmlsq \dst, t2, consts, 0 .endm .macro mulmod dst, src, const, const_twisted + vqrdmulh t2, \src, \const_twisted mul \dst\().4s, \src\().4s, \const\().4s - vqrdmulh \src, \src, \const_twisted - vmlsq \dst, \src, consts, 0 + vmlsq \dst, t2, consts, 0 .endm -.macro montg_reduce a +.macro barrett_reduce_single a srshr tmp.4S, \a\().4S, #23 vmls \a, tmp, consts .endm @@ -245,6 +245,12 @@ xtmp1 .req x11 restore_gprs .endm +// For comparability reasons, the output range for the coefficients of this +// invNTT code is supposed to match the 
implementation from PQClean on commit +// ee71d2c823982bfcf54686f3cf1d666f396dc9aa. After the invNTT, the coefficients +// are canonically reduced. The ordering of the coefficients is canonical, also +// matching PQClean. + .data .p2align 4 roots: @@ -437,10 +443,10 @@ layer45678_start: gs_butterfly data5, data7, root1, 0, 1 // Interm. Reduction - montg_reduce data0 - montg_reduce data1 - montg_reduce data4 - montg_reduce data5 + barrett_reduce_single data0 + barrett_reduce_single data1 + barrett_reduce_single data4 + barrett_reduce_single data5 // Layer 4 gs_butterfly data0, data4, root0, 0, 1 @@ -523,17 +529,20 @@ layer123_start: str_vo data6, in, (6*(1024/8)) str_vo data7, in, (7*(1024/8)) - mul_ninv data4, data5, data6, data7, data0, data1, data2, data3 + // Scale half the coeffs by 1/n; for the other half, the scaling has + // been merged into the multiplication with the twiddle factor on the + // last layer. + mul_ninv data0, data1, data2, data3, data0, data1, data2, data3 - canonical_reduce data4, modulus_half, neg_modulus_half, t2, t3 - canonical_reduce data5, modulus_half, neg_modulus_half, t2, t3 - canonical_reduce data6, modulus_half, neg_modulus_half, t2, t3 - canonical_reduce data7, modulus_half, neg_modulus_half, t2, t3 + canonical_reduce data0, modulus_half, neg_modulus_half, t2, t3 + canonical_reduce data1, modulus_half, neg_modulus_half, t2, t3 + canonical_reduce data2, modulus_half, neg_modulus_half, t2, t3 + canonical_reduce data3, modulus_half, neg_modulus_half, t2, t3 - str_vi data4, in, (16) - str_vo data5, in, (-16 + 1*(1024/8)) - str_vo data6, in, (-16 + 2*(1024/8)) - str_vo data7, in, (-16 + 3*(1024/8)) + str_vi data0, in, (16) + str_vo data1, in, (-16 + 1*(1024/8)) + str_vo data2, in, (-16 + 2*(1024/8)) + str_vo data3, in, (-16 + 3*(1024/8)) subs count, count, #1 cbnz count, layer123_start diff --git a/tests/ntt_dilithium/manual/intt_dilithium_123_45678_manual_ld4_opt_a55.s 
b/tests/ntt_dilithium/manual/intt_dilithium_123_45678_manual_ld4_opt_a55.s index cb0727d..12c8552 100644 --- a/tests/ntt_dilithium/manual/intt_dilithium_123_45678_manual_ld4_opt_a55.s +++ b/tests/ntt_dilithium/manual/intt_dilithium_123_45678_manual_ld4_opt_a55.s @@ -58,7 +58,7 @@ xtmp1 .req x11 vmlsq \dst, \src, consts, 0 .endm -.macro montg_reduce a +.macro barrett_reduce_single a srshr tmp.4S, \a\().4S, #23 vmls \a, tmp, consts .endm diff --git a/tests/ntt_dilithium/manual/intt_dilithium_123_45678_manual_ld4_opt_a72.s b/tests/ntt_dilithium/manual/intt_dilithium_123_45678_manual_ld4_opt_a72.s index b31538c..e58113a 100644 --- a/tests/ntt_dilithium/manual/intt_dilithium_123_45678_manual_ld4_opt_a72.s +++ b/tests/ntt_dilithium/manual/intt_dilithium_123_45678_manual_ld4_opt_a72.s @@ -58,7 +58,7 @@ xtmp1 .req x11 vmlsq \dst, \src, consts, 0 .endm -.macro montg_reduce a +.macro barrett_reduce_single a srshr tmp.4S, \a\().4S, #23 vmls \a, tmp, consts .endm diff --git a/tests/ntt_dilithium/manual/intt_dilithium_123_45678_manual_ld4_opt_m1_firestorm.s b/tests/ntt_dilithium/manual/intt_dilithium_123_45678_manual_ld4_opt_m1_firestorm.s index 844e4e4..f05c8e9 100644 --- a/tests/ntt_dilithium/manual/intt_dilithium_123_45678_manual_ld4_opt_m1_firestorm.s +++ b/tests/ntt_dilithium/manual/intt_dilithium_123_45678_manual_ld4_opt_m1_firestorm.s @@ -58,7 +58,7 @@ xtmp1 .req x11 vmlsq \dst, \src, consts, 0 .endm -.macro montg_reduce a +.macro barrett_reduce_single a srshr tmp.4S, \a\().4S, #23 vmls \a, tmp, consts .endm diff --git a/tests/ntt_dilithium/manual/intt_dilithium_123_45678_manual_ld4_opt_m1_icestorm.s b/tests/ntt_dilithium/manual/intt_dilithium_123_45678_manual_ld4_opt_m1_icestorm.s index fa7a305..d6caacd 100644 --- a/tests/ntt_dilithium/manual/intt_dilithium_123_45678_manual_ld4_opt_m1_icestorm.s +++ b/tests/ntt_dilithium/manual/intt_dilithium_123_45678_manual_ld4_opt_m1_icestorm.s @@ -58,7 +58,7 @@ xtmp1 .req x11 vmlsq \dst, \src, consts, 0 .endm -.macro montg_reduce a 
+.macro barrett_reduce_single a srshr tmp.4S, \a\().4S, #23 vmls \a, tmp, consts .endm diff --git a/tests/ntt_dilithium/manual/intt_dilithium_123_45678_opt_a55.s b/tests/ntt_dilithium/manual/intt_dilithium_123_45678_opt_a55.s index d8e53dc..efd56dc 100644 --- a/tests/ntt_dilithium/manual/intt_dilithium_123_45678_opt_a55.s +++ b/tests/ntt_dilithium/manual/intt_dilithium_123_45678_opt_a55.s @@ -58,7 +58,7 @@ xtmp1 .req x11 vmlsq \dst, \src, consts, 0 .endm -.macro montg_reduce a +.macro barrett_reduce_single a srshr tmp.4S, \a\().4S, #23 vmls \a, tmp, consts .endm diff --git a/tests/ntt_dilithium/manual/intt_dilithium_123_45678_opt_a72.s b/tests/ntt_dilithium/manual/intt_dilithium_123_45678_opt_a72.s index 11cceb8..d3b6904 100644 --- a/tests/ntt_dilithium/manual/intt_dilithium_123_45678_opt_a72.s +++ b/tests/ntt_dilithium/manual/intt_dilithium_123_45678_opt_a72.s @@ -58,7 +58,7 @@ xtmp1 .req x11 vmlsq \dst, \src, consts, 0 .endm -.macro montg_reduce a +.macro barrett_reduce_single a srshr tmp.4S, \a\().4S, #23 vmls \a, tmp, consts .endm diff --git a/tests/ntt_dilithium/manual/intt_dilithium_123_45678_opt_m1_firestorm.s b/tests/ntt_dilithium/manual/intt_dilithium_123_45678_opt_m1_firestorm.s index 78aa47d..4d58912 100644 --- a/tests/ntt_dilithium/manual/intt_dilithium_123_45678_opt_m1_firestorm.s +++ b/tests/ntt_dilithium/manual/intt_dilithium_123_45678_opt_m1_firestorm.s @@ -58,7 +58,7 @@ xtmp1 .req x11 vmlsq \dst, \src, consts, 0 .endm -.macro montg_reduce a +.macro barrett_reduce_single a srshr tmp.4S, \a\().4S, #23 vmls \a, tmp, consts .endm diff --git a/tests/ntt_dilithium/manual/intt_dilithium_123_45678_opt_m1_icestorm.s b/tests/ntt_dilithium/manual/intt_dilithium_123_45678_opt_m1_icestorm.s index 671e038..eb5264d 100644 --- a/tests/ntt_dilithium/manual/intt_dilithium_123_45678_opt_m1_icestorm.s +++ b/tests/ntt_dilithium/manual/intt_dilithium_123_45678_opt_m1_icestorm.s @@ -58,7 +58,7 @@ xtmp1 .req x11 vmlsq \dst, \src, consts, 0 .endm -.macro montg_reduce a 
+.macro barrett_reduce_single a srshr tmp.4S, \a\().4S, #23 vmls \a, tmp, consts .endm diff --git a/tests/ntt_dilithium/manual/ntt_dilithium_1234_5678_manual_st4.s b/tests/ntt_dilithium/manual/ntt_dilithium_1234_5678_manual_st4.s index f793d2b..2f5d42a 100644 --- a/tests/ntt_dilithium/manual/ntt_dilithium_1234_5678_manual_st4.s +++ b/tests/ntt_dilithium/manual/ntt_dilithium_1234_5678_manual_st4.s @@ -67,15 +67,15 @@ .endm .macro mulmodq dst, src, const, idx0, idx1 + vqrdmulhq t2, \src, \const, \idx1 vmulq \dst, \src, \const, \idx0 - vqrdmulhq \src, \src, \const, \idx1 - vmla \dst, \src, modulus + vmla \dst, t2, modulus .endm .macro mulmod dst, src, const, const_twisted + vqrdmulh t2, \src, \const_twisted mul \dst\().4s, \src\().4s, \const\().4s - vqrdmulh \src, \src, \const_twisted - vmlaq \dst, \src, modulus, 0 + vmla \dst, t2, modulus .endm .macro ct_butterfly a, b, root, idx0, idx1 @@ -84,12 +84,6 @@ add \a\().4s, \a\().4s, tmp.4s .endm -.macro mulmod_v dst, src, const, const_twisted - mul \dst\().4s, \src\().4s, \const\().4s - vqrdmulh \src, \src, \const_twisted - vmlaq \dst, \src, modulus, 0 -.endm - .macro ct_butterfly_v a, b, root, root_twisted mulmod tmp, \b, \root, \root_twisted sub \b\().4s, \a\().4s, tmp.4s diff --git a/tests/ntt_kyber/manual/intt_kyber_123_4567.s b/tests/ntt_kyber/manual/intt_kyber_123_4567.s index 77029b5..9cb0b6c 100644 --- a/tests/ntt_kyber/manual/intt_kyber_123_4567.s +++ b/tests/ntt_kyber/manual/intt_kyber_123_4567.s @@ -67,15 +67,15 @@ .endm .macro mulmodq dst, src, const, idx0, idx1 + vqrdmulhq t2, \src, \const, \idx1 vmulq \dst, \src, \const, \idx0 - vqrdmulhq \src, \src, \const, \idx1 - vmlsq \dst, \src, consts, 0 + vmlsq \dst, t2, consts, 0 .endm .macro mulmod dst, src, const, const_twisted + vqrdmulh t2, \src, \const_twisted mul \dst\().8h, \src\().8h, \const\().8h - vqrdmulh \src, \src, \const_twisted - vmlsq \dst, \src, consts, 0 + vmlsq \dst, t2, consts, 0 .endm .macro gs_butterfly a, b, root, idx0, idx1 @@ -84,12 +84,6 @@ 
mulmodq \b, tmp, \root, \idx0, \idx1 .endm -.macro mulmod_v dst, src, const, const_twisted - mul \dst\().8h, \src\().8h, \const\().8h - vqrdmulh \src, \src, \const_twisted - vmlsq \dst, \src, consts, 0 -.endm - .macro gs_butterfly_v a, b, root, root_twisted sub tmp.8h, \a\().8h, \b\().8h add \a\().8h, \a\().8h, \b\().8h @@ -146,7 +140,7 @@ trn2 \data_out\()3.4s, \data_in\()2.4s, \data_in\()3.4s .endm -.macro save_gprs // slothy:no-unfold +.macro save_gprs // @slothy:no-unfold sub sp, sp, #(16*6) stp x19, x20, [sp, #16*0] stp x19, x20, [sp, #16*0] @@ -157,7 +151,7 @@ str x29, [sp, #16*5] .endm -.macro restore_gprs // slothy:no-unfold +.macro restore_gprs // @slothy:no-unfold ldp x19, x20, [sp, #16*0] ldp x21, x22, [sp, #16*1] ldp x23, x24, [sp, #16*2] @@ -167,7 +161,7 @@ add sp, sp, #(16*6) .endm -.macro save_vregs // slothy:no-unfold +.macro save_vregs // @slothy:no-unfold sub sp, sp, #(16*4) stp d8, d9, [sp, #16*0] stp d10, d11, [sp, #16*1] @@ -175,7 +169,7 @@ stp d14, d15, [sp, #16*3] .endm -.macro restore_vregs // slothy:no-unfold +.macro restore_vregs // @slothy:no-unfold ldp d8, d9, [sp, #16*0] ldp d10, d11, [sp, #16*1] ldp d12, d13, [sp, #16*2] @@ -186,24 +180,30 @@ #define STACK_SIZE 16 #define STACK0 0 -.macro restore a, loc // slothy:no-unfold +.macro restore a, loc // @slothy:no-unfold ldr \a, [sp, #\loc\()] .endm -.macro save loc, a // slothy:no-unfold +.macro save loc, a // @slothy:no-unfold str \a, [sp, #\loc\()] .endm -.macro push_stack // slothy:no-unfold +.macro push_stack // @slothy:no-unfold save_gprs save_vregs sub sp, sp, #STACK_SIZE .endm -.macro pop_stack // slothy:no-unfold +.macro pop_stack // @slothy:no-unfold add sp, sp, #STACK_SIZE restore_vregs restore_gprs .endm +// For comparability reasons, the output range for the coefficients of this +// invNTT code is supposed to match the implementation from PQClean on commit +// ee71d2c823982bfcf54686f3cf1d666f396dc9aa. After the invNTT, the coefficients +// are NOT canonically reduced. 
The ordering of the coefficients is canonical, +// also matching PQClean. + .data .p2align 4 roots: @@ -443,12 +443,15 @@ layer123_start: str_vo data6, in, (6*(512/8)) str_vo data7, in, (7*(512/8)) - mul_ninv data4, data5, data6, data7, data0, data1, data2, data3 + // Scale half the coeffs by 1/n; for the other half, the scaling has + // been merged into the multiplication with the twiddle factor on the + // last layer. + mul_ninv data0, data1, data2, data3, data0, data1, data2, data3 - str_vi data4, in, (16) - str_vo data5, in, (-16 + 1*(512/8)) - str_vo data6, in, (-16 + 2*(512/8)) - str_vo data7, in, (-16 + 3*(512/8)) + str_vi data0, in, (16) + str_vo data1, in, (-16 + 1*(512/8)) + str_vo data2, in, (-16 + 2*(512/8)) + str_vo data3, in, (-16 + 3*(512/8)) subs count, count, #1 diff --git a/tests/ntt_kyber/manual/intt_kyber_123_4567_manual_ld4.s b/tests/ntt_kyber/manual/intt_kyber_123_4567_manual_ld4.s index 4f8df6a..2bcc941 100644 --- a/tests/ntt_kyber/manual/intt_kyber_123_4567_manual_ld4.s +++ b/tests/ntt_kyber/manual/intt_kyber_123_4567_manual_ld4.s @@ -67,15 +67,15 @@ .endm .macro mulmodq dst, src, const, idx0, idx1 + vqrdmulhq t2, \src, \const, \idx1 vmulq \dst, \src, \const, \idx0 - vqrdmulhq \src, \src, \const, \idx1 - vmlsq \dst, \src, consts, 0 + vmlsq \dst, t2, consts, 0 .endm .macro mulmod dst, src, const, const_twisted + vqrdmulh t2, \src, \const_twisted mul \dst\().8h, \src\().8h, \const\().8h - vqrdmulh \src, \src, \const_twisted - vmlsq \dst, \src, consts, 0 + vmlsq \dst, t2, consts, 0 .endm .macro gs_butterfly a, b, root, idx0, idx1 @@ -84,12 +84,6 @@ mulmodq \b, tmp, \root, \idx0, \idx1 .endm -.macro mulmod_v dst, src, const, const_twisted - mul \dst\().8h, \src\().8h, \const\().8h - vqrdmulh \src, \src, \const_twisted - vmlsq \dst, \src, consts, 0 -.endm - .macro gs_butterfly_v a, b, root, root_twisted sub tmp.8h, \a\().8h, \b\().8h add \a\().8h, \a\().8h, \b\().8h @@ -146,7 +140,7 @@ trn2 \data_out\()3.4s, \data_in\()2.4s, \data_in\()3.4s .endm 
-.macro save_gprs // slothy:no-unfold +.macro save_gprs // @slothy:no-unfold sub sp, sp, #(16*6) stp x19, x20, [sp, #16*0] stp x19, x20, [sp, #16*0] @@ -157,7 +151,7 @@ str x29, [sp, #16*5] .endm -.macro restore_gprs // slothy:no-unfold +.macro restore_gprs // @slothy:no-unfold ldp x19, x20, [sp, #16*0] ldp x21, x22, [sp, #16*1] ldp x23, x24, [sp, #16*2] @@ -167,7 +161,7 @@ add sp, sp, #(16*6) .endm -.macro save_vregs // slothy:no-unfold +.macro save_vregs // @slothy:no-unfold sub sp, sp, #(16*4) stp d8, d9, [sp, #16*0] stp d10, d11, [sp, #16*1] @@ -175,7 +169,7 @@ stp d14, d15, [sp, #16*3] .endm -.macro restore_vregs // slothy:no-unfold +.macro restore_vregs // @slothy:no-unfold ldp d8, d9, [sp, #16*0] ldp d10, d11, [sp, #16*1] ldp d12, d13, [sp, #16*2] @@ -186,24 +180,30 @@ #define STACK_SIZE 16 #define STACK0 0 -.macro restore a, loc // slothy:no-unfold +.macro restore a, loc // @slothy:no-unfold ldr \a, [sp, #\loc\()] .endm -.macro save loc, a // slothy:no-unfold +.macro save loc, a // @slothy:no-unfold str \a, [sp, #\loc\()] .endm -.macro push_stack // slothy:no-unfold +.macro push_stack // @slothy:no-unfold save_gprs save_vregs sub sp, sp, #STACK_SIZE .endm -.macro pop_stack // slothy:no-unfold +.macro pop_stack // @slothy:no-unfold add sp, sp, #STACK_SIZE restore_vregs restore_gprs .endm +// For comparability reasons, the output range for the coefficients of this +// invNTT code is supposed to match the implementation from PQClean on commit +// ee71d2c823982bfcf54686f3cf1d666f396dc9aa. After the invNTT, the coefficients +// are NOT canonically reduced. The ordering of the coefficients is canonical, +// also matching PQClean. 
+ .data .p2align 4 roots: @@ -438,12 +438,15 @@ layer123_start: str_vo data6, in, (6*(512/8)) str_vo data7, in, (7*(512/8)) - mul_ninv data4, data5, data6, data7, data0, data1, data2, data3 + // Scale half the coeffs by 1/n; for the other half, the scaling has + // been merged into the multiplication with the twiddle factor on the + // last layer. + mul_ninv data0, data1, data2, data3, data0, data1, data2, data3 - str_vi data4, in, (16) - str_vo data5, in, (-16 + 1*(512/8)) - str_vo data6, in, (-16 + 2*(512/8)) - str_vo data7, in, (-16 + 3*(512/8)) + str_vi data0, in, (16) + str_vo data1, in, (-16 + 1*(512/8)) + str_vo data2, in, (-16 + 2*(512/8)) + str_vo data3, in, (-16 + 3*(512/8)) subs count, count, #1 diff --git a/tests/ntt_kyber/manual/ntt_kyber_1234_567.s b/tests/ntt_kyber/manual/ntt_kyber_1234_567.s index 9b43d6f..77bb34d 100644 --- a/tests/ntt_kyber/manual/ntt_kyber_1234_567.s +++ b/tests/ntt_kyber/manual/ntt_kyber_1234_567.s @@ -108,36 +108,6 @@ vmlaq \a, t0, consts, 0 .endm -.macro load_roots_123 - ldr_vi root0, r_ptr0, 32 - ldr_vo root1, r_ptr0, -16 -.endm - -.macro load_next_roots_45 root0, r_ptr0 - ldr_vi \root0, \r_ptr0, 16 -.endm - -.macro load_next_roots_67 root0, root0_tw, root1, root1_tw, root2, root2_tw, r_ptr1 - ldr_vi \root0, \r_ptr1, (6*16) - ldr_vo \root0_tw, \r_ptr1, (-6*16 + 1*16) - ldr_vo \root1, \r_ptr1, (-6*16 + 2*16) - ldr_vo \root1_tw, \r_ptr1, (-6*16 + 3*16) - ldr_vo \root2, \r_ptr1, (-6*16 + 4*16) - ldr_vo \root2_tw, \r_ptr1, (-6*16 + 5*16) -.endm - -.macro transpose4 data - trn1 t0.4s, \data\()0.4s, \data\()1.4s - trn2 t1.4s, \data\()0.4s, \data\()1.4s - trn1 t2.4s, \data\()2.4s, \data\()3.4s - trn2 t3.4s, \data\()2.4s, \data\()3.4s - - trn2 \data\()2.2d, t0.2d, t2.2d - trn2 \data\()3.2d, t1.2d, t3.2d - trn1 \data\()0.2d, t0.2d, t2.2d - trn1 \data\()1.2d, t1.2d, t3.2d -.endm - .macro save_gprs // @slothy:no-unfold sub sp, sp, #(16*6) stp x19, x20, [sp, #16*0] @@ -196,29 +166,6 @@ restore_gprs .endm -.data -.p2align 4 -roots: 
-#include "ntt_kyber_1234_567_twiddles.s" -.text - - .global ntt_kyber_1234_567 - .global _ntt_kyber_1234_567 - -.p2align 4 -const_addr: .short -3329 - .short 20159 - .short 0 - .short 0 - .short 0 - .short 0 - .short 0 - .short 0 - -ntt_kyber_1234_567: -_ntt_kyber_1234_567: - push_stack - in .req x0 inp .req x1 count .req x2 @@ -228,20 +175,6 @@ _ntt_kyber_1234_567: src0 .req x6 src1 .req x7 - src2 .req x8 - src3 .req x9 - src4 .req x10 - src5 .req x11 - src6 .req x12 - src7 .req x13 - src8 .req x14 - src9 .req x15 - src10 .req x16 - src11 .req x17 - src12 .req x18 - src13 .req x19 - src14 .req x20 - src15 .req x21 qform_v0 .req q0 qform_v1 .req q1 @@ -336,17 +269,43 @@ _ntt_kyber_1234_567: consts .req v8 - ASM_LOAD(r_ptr0, roots) +.data +.p2align 4 +roots: +#include "ntt_kyber_1234_567_twiddles.s" +.text + .global ntt_kyber_1234_567 + .global _ntt_kyber_1234_567 + +.p2align 4 +const_addr: .short -3329 + .short 20159 + .short 0 + .short 0 + .short 0 + .short 0 + .short 0 + .short 0 + +ntt_kyber_1234_567: +_ntt_kyber_1234_567: + push_stack + + ASM_LOAD(r_ptr0, roots) + ASM_LOAD(r_ptr1, roots_l456) ASM_LOAD(xtmp, const_addr) ld1 {consts.8h}, [xtmp] save STACK0, in add src0, x0, #32*0 - add src8, x0, #32*8 + add src1, x0, #32*8 - ld1 { root0.8h, root1.8h, root2.8h, root3.8h}, [r_ptr0], #64 + ldr_vo root0, r_ptr0, 0 + ldr_vo root1, r_ptr0, 16 + ldr_vo root2, r_ptr0, 32 + ldr_vo root3, r_ptr0, 48 mov count, #2 @@ -362,14 +321,14 @@ layer1234_start: ldr_vo data6, src0, 6*32 ldr_vo data7, src0, 7*32 - ldr_vo data8, src8, 0 - ldr_vo data9, src8, 1*32 - ldr_vo data10, src8, 2*32 - ldr_vo data11, src8, 3*32 - ldr_vo data12, src8, 4*32 - ldr_vo data13, src8, 5*32 - ldr_vo data14, src8, 6*32 - ldr_vo data15, src8, 7*32 + ldr_vo data8, src1, 0 + ldr_vo data9, src1, 1*32 + ldr_vo data10, src1, 2*32 + ldr_vo data11, src1, 3*32 + ldr_vo data12, src1, 4*32 + ldr_vo data13, src1, 5*32 + ldr_vo data14, src1, 6*32 + ldr_vo data15, src1, 7*32 ct_butterfly data0, data8, root0, 0, 1 
ct_butterfly data1, data9, root0, 0, 1 @@ -416,14 +375,14 @@ layer1234_start: str_vo data6, src0, -16+6*32 str_vo data7, src0, -16+7*32 - str_vi data8, src8, 16 - str_vo data9, src8, -16+1*32 - str_vo data10, src8, -16+2*32 - str_vo data11, src8, -16+3*32 - str_vo data12, src8, -16+4*32 - str_vo data13, src8, -16+5*32 - str_vo data14, src8, -16+6*32 - str_vo data15, src8, -16+7*32 + str_vi data8, src1, 16 + str_vo data9, src1, -16+1*32 + str_vo data10, src1, -16+2*32 + str_vo data11, src1, -16+3*32 + str_vo data12, src1, -16+4*32 + str_vo data13, src1, -16+5*32 + str_vo data14, src1, -16+6*32 + str_vo data15, src1, -16+7*32 subs count, count, #1 cbnz count, layer1234_start @@ -431,8 +390,6 @@ layer1234_start: restore inp, STACK0 mov count, #4 - ASM_LOAD(r_ptr1, roots_l456) - add src0, inp, #256*0 add src1, inp, #256*1 diff --git a/tests/ntt_kyber/manual/ntt_kyber_1234_567_twiddles.s b/tests/ntt_kyber/manual/ntt_kyber_1234_567_twiddles.s index ceb916f..80629ad 100644 --- a/tests/ntt_kyber/manual/ntt_kyber_1234_567_twiddles.s +++ b/tests/ntt_kyber/manual/ntt_kyber_1234_567_twiddles.s @@ -30,6 +30,7 @@ roots_l0123: .short -1583 .short -15582 +.p2align 4 roots_l456: .short 296 .short 296 @@ -478,4 +479,4 @@ roots_l456: .short 6309 .short 6309 .short -11566 -.short -11566 \ No newline at end of file +.short -11566 diff --git a/tests/ntt_kyber/manual/ntt_kyber_123_4567.s b/tests/ntt_kyber/manual/ntt_kyber_123_4567.s index 778841d..52d7a56 100644 --- a/tests/ntt_kyber/manual/ntt_kyber_123_4567.s +++ b/tests/ntt_kyber/manual/ntt_kyber_123_4567.s @@ -139,27 +139,6 @@ trn2 \data_out\()3.4s, \data_in\()2.4s, \data_in\()3.4s .endm -.macro save_gprs // @slothy:no-unfold - sub sp, sp, #(16*6) - stp x19, x20, [sp, #16*0] - stp x19, x20, [sp, #16*0] - stp x21, x22, [sp, #16*1] - stp x23, x24, [sp, #16*2] - stp x25, x26, [sp, #16*3] - stp x27, x28, [sp, #16*4] - str x29, [sp, #16*5] -.endm - -.macro restore_gprs // @slothy:no-unfold - ldp x19, x20, [sp, #16*0] - ldp x21, x22, [sp, 
#16*1] - ldp x23, x24, [sp, #16*2] - ldp x25, x26, [sp, #16*3] - ldp x27, x28, [sp, #16*4] - ldr x29, [sp, #16*5] - add sp, sp, #(16*6) -.endm - .macro save_vregs // @slothy:no-unfold sub sp, sp, #(16*4) stp d8, d9, [sp, #16*0] @@ -176,51 +155,16 @@ add sp, sp, #(16*4) .endm -#define STACK_SIZE 16 -#define STACK0 0 - -.macro restore a, loc // @slothy:no-unfold - ldr \a, [sp, #\loc\()] -.endm -.macro save loc, a // @slothy:no-unfold - str \a, [sp, #\loc\()] -.endm .macro push_stack // @slothy:no-unfold - save_gprs save_vregs - sub sp, sp, #STACK_SIZE .endm .macro pop_stack // @slothy:no-unfold - add sp, sp, #STACK_SIZE restore_vregs - restore_gprs .endm -.data -.p2align 4 -roots: -#include "ntt_kyber_123_45_67_twiddles.s" -.text - - .global ntt_kyber_123_4567 - .global _ntt_kyber_123_4567 - -.p2align 4 -const_addr: .short 3329 - .short 20159 - .short 0 - .short 0 - .short 0 - .short 0 - .short 0 - .short 0 -ntt_kyber_123_4567: -_ntt_kyber_123_4567: - push_stack - in .req x0 - inp .req x1 + in_orig .req x1 count .req x2 r_ptr0 .req x3 r_ptr1 .req x4 @@ -318,13 +262,35 @@ _ntt_kyber_123_4567: t2 .req v27 t3 .req v28 +.data +.p2align 4 +roots: +#include "ntt_kyber_123_45_67_twiddles.s" +.text + + .global ntt_kyber_123_4567 + .global _ntt_kyber_123_4567 + +.p2align 4 +const_addr: .short 3329 + .short 20159 + .short 0 + .short 0 + .short 0 + .short 0 + .short 0 + .short 0 +ntt_kyber_123_4567: +_ntt_kyber_123_4567: + push_stack + ASM_LOAD(r_ptr0, roots) ASM_LOAD(r_ptr1, roots_l56) ASM_LOAD(xtmp, const_addr) ld1 {consts.8h}, [xtmp] - save STACK0, in + mov in_orig, in mov count, #4 load_roots_123 @@ -368,15 +334,15 @@ layer123_start: subs count, count, #1 cbnz count, layer123_start - restore inp, STACK0 + mov in, in_orig mov count, #8 .p2align 2 layer4567_start: - ldr_vo data0, inp, (16*0) - ldr_vo data1, inp, (16*1) - ldr_vo data2, inp, (16*2) - ldr_vo data3, inp, (16*3) + ldr_vo data0, in, (16*0) + ldr_vo data1, in, (16*1) + ldr_vo data2, in, (16*2) + ldr_vo data3, in, 
(16*3) load_next_roots_45 @@ -397,7 +363,7 @@ layer4567_start: barrett_reduce data1 barrett_reduce data2 barrett_reduce data3 - st4 {data0.4S, data1.4S, data2.4S, data3.4S}, [inp], #64 + st4 {data0.4S, data1.4S, data2.4S, data3.4S}, [in], #64 subs count, count, #1 cbnz count, layer4567_start diff --git a/tests/ntt_kyber/manual/ntt_kyber_123_4567_scalar_load.s b/tests/ntt_kyber/manual/ntt_kyber_123_4567_scalar_load.s index 781b049..992e876 100644 --- a/tests/ntt_kyber/manual/ntt_kyber_123_4567_scalar_load.s +++ b/tests/ntt_kyber/manual/ntt_kyber_123_4567_scalar_load.s @@ -151,27 +151,6 @@ xtmp1 .req x11 trn2 \data_out\()3.4s, \data_in\()2.4s, \data_in\()3.4s .endm -.macro save_gprs // @slothy:no-unfold - sub sp, sp, #(16*6) - stp x19, x20, [sp, #16*0] - stp x19, x20, [sp, #16*0] - stp x21, x22, [sp, #16*1] - stp x23, x24, [sp, #16*2] - stp x25, x26, [sp, #16*3] - stp x27, x28, [sp, #16*4] - str x29, [sp, #16*5] -.endm - -.macro restore_gprs // @slothy:no-unfold - ldp x19, x20, [sp, #16*0] - ldp x21, x22, [sp, #16*1] - ldp x23, x24, [sp, #16*2] - ldp x25, x26, [sp, #16*3] - ldp x27, x28, [sp, #16*4] - ldr x29, [sp, #16*5] - add sp, sp, #(16*6) -.endm - .macro save_vregs // @slothy:no-unfold sub sp, sp, #(16*4) stp d8, d9, [sp, #16*0] @@ -188,9 +167,6 @@ xtmp1 .req x11 add sp, sp, #(16*4) .endm -#define STACK_SIZE 16 -#define STACK0 0 - .macro restore a, loc // @slothy:no-unfold ldr \a, [sp, #\loc\()] .endm @@ -198,41 +174,15 @@ xtmp1 .req x11 str \a, [sp, #\loc\()] .endm .macro push_stack // @slothy:no-unfold - save_gprs save_vregs - sub sp, sp, #STACK_SIZE .endm .macro pop_stack // @slothy:no-unfold - add sp, sp, #STACK_SIZE restore_vregs - restore_gprs .endm -.data -.p2align 4 -roots: -#include "ntt_kyber_123_45_67_twiddles.s" -.text - - .global ntt_kyber_123_4567_scalar_load - .global _ntt_kyber_123_4567_scalar_load - -.p2align 4 -const_addr: .short 3329 - .short 20159 - .short 0 - .short 0 - .short 0 - .short 0 - .short 0 - .short 0 
-ntt_kyber_123_4567_scalar_load: -_ntt_kyber_123_4567_scalar_load: - push_stack - in .req x0 - inp .req x1 + in_orig .req x1 count .req x2 r_ptr0 .req x3 r_ptr1 .req x4 @@ -280,24 +230,6 @@ _ntt_kyber_123_4567_scalar_load: data6 .req v14 data7 .req v15 - x_00 .req x10 - x_01 .req x11 - x_10 .req x12 - x_11 .req x13 - x_20 .req x14 - x_21 .req x15 - x_30 .req x16 - x_31 .req x17 - - xt_00 .req x_00 - xt_01 .req x_20 - xt_10 .req x_10 - xt_11 .req x_30 - xt_20 .req x_01 - xt_21 .req x_21 - xt_30 .req x_11 - xt_31 .req x_31 - qform_data0 .req q8 qform_data1 .req q9 qform_data2 .req q10 @@ -330,13 +262,34 @@ _ntt_kyber_123_4567_scalar_load: t2 .req v27 t3 .req v28 +.data +.p2align 4 +roots: +#include "ntt_kyber_123_45_67_twiddles.s" +.text + + .global ntt_kyber_123_4567_scalar_load + .global _ntt_kyber_123_4567_scalar_load + +.p2align 4 +const_addr: .short 3329 + .short 20159 + .short 0 + .short 0 + .short 0 + .short 0 + .short 0 + .short 0 +ntt_kyber_123_4567_scalar_load: +_ntt_kyber_123_4567_scalar_load: + push_stack ASM_LOAD(r_ptr0, roots) ASM_LOAD(r_ptr1, roots_l56) ASM_LOAD(xtmp, const_addr) ld1 {consts.8h}, [xtmp] - save STACK0, in + mov in_orig, in mov count, #4 load_roots_123 @@ -380,15 +333,15 @@ layer123_start: subs count, count, #1 cbnz count, layer123_start - restore inp, STACK0 + mov in, in_orig mov count, #8 .p2align 2 layer4567_start: - ldr_vo data0, inp, (16*0) - ldr_vo data1, inp, (16*1) - ldr_vo data2, inp, (16*2) - ldr_vo data3, inp, (16*3) + ldr_vo data0, in, (16*0) + ldr_vo data1, in, (16*1) + ldr_vo data2, in, (16*2) + ldr_vo data3, in, (16*3) load_next_roots_45 @@ -409,7 +362,7 @@ layer4567_start: barrett_reduce data1 barrett_reduce data2 barrett_reduce data3 - st4 {data0.4S, data1.4S, data2.4S, data3.4S}, [inp], #64 + st4 {data0.4S, data1.4S, data2.4S, data3.4S}, [in], #64 subs count, count, #1 cbnz count, layer4567_start diff --git a/tests/ntt_kyber/manual/ntt_kyber_123_45_67_twiddles.s b/tests/ntt_kyber/manual/ntt_kyber_123_45_67_twiddles.s 
index 6015bf4..9fa7dab 100644 --- a/tests/ntt_kyber/manual/ntt_kyber_123_45_67_twiddles.s +++ b/tests/ntt_kyber/manual/ntt_kyber_123_45_67_twiddles.s @@ -1,4 +1,3 @@ - /// /// Copyright (c) 2022 Arm Limited /// Copyright (c) 2022 Hanno Becker